// Compile-time regular expression parser, version 2.0
// Author: Don Clugston, borrowing many ideas from the 1.0 parser written by Eric Anderton.
// This version uses mixins for simplicity, flexibility, and high performance.

// Terminology:
// a regex "sequence" is a set of consecutive "term"s, 
// each of which consists of an "atom", optionally followed by
// a "repeater" (*,+,?, {m}, or {m,n}).

// Points of interest:
// * The parser is able to treat all 'repeater's in a single mixin function, while still applying
//   optimisations (eg, there's absolutely no difference between {1,} and "+").
// * There is absolutely no parameter passing inside the regexp engine. Even functions which
//   can't be inlined will have very low calling cost.
// * Consequently, the speed is excellent. The main unnecessary operations are the checks to see whether we
//   are at the end of the string.
//   This could be greatly improved by precalculating the minimum length required for a match,
//   at least for subsequences of fixed length.
// * Since each mixin can be given access to any desired runtime or compile-time parameters,
//   the scheme is extremely flexible.

//---------------------------------------------------------------------
// Part 0 : Functions from the meta library
//---------------------------------------------------------------------

/******************************************************
 *  ulong atoui!(char [] s);
 *
 *  Converts an ASCII string to an uint.
 */
template atoui(char [] s, uint result=0, int indx=0)
{
    static if (s.length==indx)
        const uint atoui = result;
    else static if (s[indx]<'0' || s[indx]>'9')
        const uint atoui = result;
    else 
        const uint atoui = atoui!(s, result * 10 + s[indx]-'0', indx+1);
}

//---------------------------------------------------------------------
// Part I : Functions for parsing a regular expression string literal.
//---------------------------------------------------------------------
// None of these generate any code.

// retuns index of first char in regstr which equals ch, or -1 if not found
// escaped chars are ignored
template unescapedFindFirst(char [] regstr, char ch, int indx=0)
{
    static if (regstr.length<=indx)
        const int unescapedFindFirst = -1; // not found
    else static if (regstr[indx]==ch) const int unescapedFindFirst=indx;
    else static if (regstr[indx]=='\\')
        // if it's escaped, prevent it from matching.
        const int unescapedunescapedFindFirst = unescapedFindFirst!(regstr, ch, indx+2);
    else const int unescapedFindFirst = unescapedFindFirst!(regstr, ch, indx+1);
}

// Returns the number of chars at the start of regstr which are made up by
// a repetition expression (+, *, ?, {,} )
template repeaterConsumed(char [] regstr)
{
    static if (regstr.length==0) const int repeaterConsumed = 0;
    else static if (regstr[0]=='+' || regstr[0]=='*' || regstr[0]=='?') const int repeaterConsumed = 1;
    else static if (regstr[0]=='{') {
        static if (unescapedFindFirst!(regstr, '}')==-1) {
            pragma(msg, "Error: unmatched { in regular expression");
            static assert(0);
        } else const int repeaterConsumed = 1 + unescapedFindFirst!(regstr, '}');
    } else const int repeaterConsumed = 0;
}

// The minimum allowable number of repetitions for this repeater
template repeaterMin(char [] regstr)
{
    static if (regstr[0]=='*' || regstr[0]=='?') const uint repeaterMin = 0;
    else static if (regstr[0]=='+') const uint repeaterMin = 1;
    else { 
        static assert (regstr[0]=='{') ;
        const uint repeaterMin = atoui!(regstr[1..$]);
     }
}

// The maximum allowable number of repetitions for this repeater
template repeaterMax(char [] regstr)
{
    static if (regstr[0]=='*' || regstr[0]=='+') const uint repeaterMax = uint.max;
    else static if (regstr[0]=='?') const uint repeaterMax = 0;
    else static if (regstr[0]=='{') {
        static if (unescapedFindFirst!(regstr, ',')==-1) // "{n}"
            const uint repeaterMax = repeaterMin!(regstr);
        else static if (regstr[$-2]==',') // "{n,}"
            const uint repeaterMax = uint.max;
        else // "{n,m}" 
            const uint repeaterMax = atoui!(regstr[ 1+unescapedFindFirst!(regstr, ',') .. $]);
    } else {
        pragma(msg, "Error: unsupported repeater " ~ regstr);
        static assert(0);
    }
}

// find the index of the first |, or -1 if not found.
// ignores escaped items, and anything in parentheses.
template findUnion(char [] regstr, int indx=0, int numopenparens=0)
{
    static if (indx>=regstr.length)
        const int findUnion = -1;
    else static if (numopenparens==0 && regstr[indx]=='|')
        const int findUnion = indx;
    else static if (regstr[indx]==')')
        const int findUnion = findUnion!(regstr, indx+1, numopenparens-1);
    else static if (regstr[indx]=='(')
        const int findUnion = findUnion!(regstr, indx+1, numopenparens+1);
    else static if (regstr[indx]=='\\') // skip the escaped character
        const int findUnion = findUnion!(regstr, indx+2, numopenparens);
    else 
        const int findUnion = findUnion!(regstr, indx+1, numopenparens);
}

// keeps going until the number of ) parens equals the number of ( parens.
// All escaped characters are ignored.
// BUG: what about inside [-] ?
template parenConsumed(char [] regstr, int numopenparens=0)
{
    static if (regstr.length==0) {
        pragma(msg, "Unmatched parenthesis");
        static assert(0);
    } else static if (regstr[0]==')') {
        static if (numopenparens==1) const int parenConsumed=1; // finished!
        else const int parenConsumed = 1 + parenConsumed!(regstr[1..$], numopenparens-1);
    } else static if (regstr[0]=='(') {
        const int parenConsumed = 1 + parenConsumed!(regstr[1..$], numopenparens+1);
    } else static if (regstr[0]=='\\' && regstr.length>1)
        // ignore \(, \).
        const int parenConsumed = 2 + parenConsumed!(regstr[2..$], numopenparens);
    else 
        const int parenConsumed = 1 + parenConsumed!(regstr[1..$], numopenparens);    
}

// the term before multiplier
template atomConsumed(char [] regstr)
{
    static if (regstr[0]=='(') const int atomConsumed = parenConsumed!(regstr);
    else static if (regstr[0]=='[') const int atomConsumed = 1 + unescapedFindFirst!(regstr, ']');
    else static if (regstr[0]=='\\') {
        static if (regstr.length>0) {
            const int atomConsumed=2;
        } else  {
            pragma(msg, "Error: Regexp must not end with \\");
            static assert(0);
        }
    } else static if (regstr[0]=='$')  { // NONSTANDARD: referenced parameter
        const int atomConsumed=2;
    } else const int atomConsumed=1; // match single char
}

// parses a term from the front of regstr (which must not be empty).
// consisting of an atom, optionally followed by a repeater.
template termConsumed(char [] regstr)
{
    const int termConsumed = atomConsumed!(regstr) + repeaterConsumed!(regstr[atomConsumed!(regstr)..$]);
}

//---------------------------------------------------------------------
// Part II: mixins which generate the final code
//---------------------------------------------------------------------
// The code is generated by a set of nested mixins.
// At compile time, each mixin is passed a subset of a regexp string.
// It generates a member function bool fn(), which updates a pointer p,
// and returns true if a match was found.

// Each mixin has access to the following values:
// At compile time: 
//     fullpattern -- the complete, unparsed regular expression string
// At run time:
//     searchstr (read only) -- the string being searched
//     p --- the first character in searchstr which is not yet matched.
//     param[0..8] -- the quasi-static parameter strings @1..@9 to match.

// Additional variables or constants can be added as desired.

// Note that unless p is reset to 0, it will automatically behave as a global search,
// continuing from the last place it left off.


// regstr is a sequence of productions, possibly containing a union
template regSequence(char [] regstr)
{
    static if (findUnion!(regstr)==-1) {
        // No unions to worry about
        mixin regNoUnions!(regstr);
    } else {
        pragma(msg, "Parsing union " ~ regstr);
        bool fn() {
            mixin regSequence!(regstr[0..findUnion!(regstr)]) a;
            mixin regSequence!(regstr[findUnion!(regstr)+1..$]) b;
            return a.fn() || b.fn(); 
        }
    }
}

// regstr is a sequence of productions, all of which must be matched
// Does not contain any unions
template regNoUnions(char [] regstr)
{
    static if (regstr.length == termConsumed!(regstr)) {
        // there's only a single item (possibly including a repeater)
        mixin regTerm!(regstr);
    } else {
        bool fn() {
            pragma(msg, "Parsing sequence without unions: " ~ regstr);
            mixin regTerm!(regstr[0..termConsumed!(regstr)]) a;
            mixin regSequence!(regstr[termConsumed!(regstr)..$]) b;
            int oldp = p; // need to roll it back, if not all of them match
            if (a.fn() && b.fn()) return true;
            p = oldp;
            return false;
        }
    }
}

// the atom, optionally followed by a repeater.
// Here we deal with all kinds of repeats,
// but we make different optimisations depending on the allowable repeats.
template regTerm(char [] regstr)
{
    static if (atomConsumed!(regstr)==regstr.length) {
        pragma(msg, "Simple atom " ~ regstr);
        // there is no multiplier, just use the atom
        mixin regAtom!(regstr);
    } else {
        pragma(msg, "Repeated atom " ~ regstr);
        bool fn() {
            const int t = atomConsumed!(regstr);
            const uint repmin = repeaterMin!(regstr[t..$]);
            const uint repmax = repeaterMax!(regstr[t..$]);
            
            mixin regAtom!(regstr[0..t]) a;
            
            static if (repmin == 0 && repmax == 1) { 
                // "?", or "{0,1}". Worth optimising seperately
                a.fn();
                return true;
            } else static if (repmin==0 && repmax == uint.max) {
                // optimise for "*", "{0,}"
                while (a.fn()) {};
                return true;
            } else { // "+", or "{m,n}"
                int numreps=0; // how many repeats have we found?
                while (a.fn()) {
                    numreps++;
                    static if (repmax<uint.max) {
                        // optimise for "+", "{n,}"
                        if (numreps == repmax) return true;
                    }
                }
                return (numreps>=repmin);
           }
        }
    }
}

// generate a parser for an atom
// IN: regstr is a valid atom, without a repeat
// OUT: if atom is matched, return true, and update p.
//      if atom is not matched, return false, and leave p unchanged.
template regAtom(char [] regstr)
{
    static if (regstr[0]=='(') {
            mixin regSequence!(regstr[1..$-1]) a;
    } else static if (regstr[0]=='[') {
        static if (regstr[1]=='^')
            mixin regInvertedRange!(regstr[2..$-1]);
        else
            mixin regRange!(regstr[1..$-1]);
    } else static if (regstr[0]=='.') {
        bool fn() {
            if (p==searchstr.length) return false;
            p++;
            return true;
        }
    } else static if (regstr[0]=='$')  { // NONSTANDARD: referenced parameter
        mixin regParameter!(atoui!(regstr[1..$])-1);
    } else {
        // match single character
        bool fn() {
            if (p==searchstr.length || searchstr[p]!=regstr[0]) return false;
            p++;
            return true;
        }
    }
}

// match a variable string, which will be passed as a parameter.
template regParameter(int parmnum)
{
    bool fn() {
        if (p + param[parmnum].length > searchstr.length) return false;
        if (searchstr[p..p+param[parmnum].length] != param[parmnum]) return false;
        p+=param[parmnum].length;
        return true;
    }
}

template regRange(char [] regstr)
{
    bool fn() {
        pragma(msg, "Parsing range: " ~ regstr ~ "   NOT YET IMPLEMENTED");
        return true;        
    }
}

template regInvertedRange(char [] regstr)
{
    bool fn() {
        pragma(msg, "Parsing inverted range: " ~ regstr ~ "   NOT YET IMPLEMENTED");
        return true;        
    }
}

//---------------------------------------------------------------------
// Part III: the public interface of the regexp engine
//---------------------------------------------------------------------

// Does the regexp match the pattern?
template test(char [] fullpattern)
{
    bool test(char [] searchstr, char [][] param...) {
        int p = 0; // start at the beginning of the string
        mixin regSequence!(fullpattern) a;
        return a.fn();
    }
}

/// Return first substring which matches the pattern, or "" if none.
template search(char [] fullpattern)
{
    char [] search(char [] searchstr, char [][] param...) {
        int p; // next index to test
        mixin regSequence!(fullpattern) a;
        for (int x=0; x<searchstr.length; ++x) {
            p=x;
            if (a.fn()) return searchstr[x..p];
        }
        return ""; // no match
    }
}

//---------------------------------------------------------------------
//                         EXAMPLE
//---------------------------------------------------------------------

import std.stdio;

void main()
{

    writefln("Test 1: The match is: ", search!("ac?b+d*c?a*")("dsafjabbbbcaac"));    
    writefln("Test 2: The match is: ", search!("e?(ab)*b+")("eaaababbbbaac"));
    writefln("Test 3: The match is: ", search!("(abc)*abb")("abbbbaac"));
    writefln("Test 4: The match is: ", search!("a{3}b+")("aaabbbbcaac"));
    writefln("Test 5: The match is: ", search!("(z|a{3})b+|m.*")("aaabbbbcaac"));
    
    char [] quasistatic;
    quasistatic = "x";
    for (int i=0; i<3; ++i) {
        if (i==1) quasistatic = "a";
        if (i==2) quasistatic = "m";
        writefln("Test 6: The match is: ", 
            search!("($1.{10}$1.*)|g.*")("they said D can't do metaprogramming? ", quasistatic));
    }
    writefln("Test 7: The match is: ", search!("(z|a{3})b+|M.*")("zand this is Metaprogramming in D"));
}