// Compile-time regular expression parser, version 2.0 // Author: Don Clugston, borrowing many ideas from the 1.0 parser written by Eric Anderton. // This version uses mixins for simplicity, flexibility, and high performance. // Terminology: // a regex "sequence" is a set of consecutive "term"s, // each of which consists of an "atom", optionally followed by // a "repeater" (*,+,?, {m}, or {m,n}). // Points of interest: // * The parser is able to treat all 'repeater's in a single mixin function, while still applying // optimisations (eg, there's absolutely no difference between {1,} and "+"). // * There is absolutely no parameter passing inside the regexp engine. Even functions which // can't be inlined will have very low calling cost. // * Consequently, the speed is excellent. The main unnecessary operations are the checks to see whether we // are at the end of the string. // This could be greatly improved by precalculating the minimum length required for a match, // at least for subsequences of fixed length. // * Since each mixin can be given access to any desired runtime or compile-time parameters, // the scheme is extremely flexible. //--------------------------------------------------------------------- // Part 0 : Functions from the meta library //--------------------------------------------------------------------- /****************************************************** * ulong atoui!(char [] s); * * Converts an ASCII string to an uint. */ template atoui(char [] s, uint result=0, int indx=0) { static if (s.length==indx) const uint atoui = result; else static if (s[indx]<'0' || s[indx]>'9') const uint atoui = result; else const uint atoui = atoui!(s, result * 10 + s[indx]-'0', indx+1); } //--------------------------------------------------------------------- // Part I : Functions for parsing a regular expression string literal. //--------------------------------------------------------------------- // None of these generate any code. // retuns index of first char in regstr which equals ch, or -1 if not found // escaped chars are ignored template unescapedFindFirst(char [] regstr, char ch, int indx=0) { static if (regstr.length<=indx) const int unescapedFindFirst = -1; // not found else static if (regstr[indx]==ch) const int unescapedFindFirst=indx; else static if (regstr[indx]=='\\') // if it's escaped, prevent it from matching. const int unescapedunescapedFindFirst = unescapedFindFirst!(regstr, ch, indx+2); else const int unescapedFindFirst = unescapedFindFirst!(regstr, ch, indx+1); } // Returns the number of chars at the start of regstr which are made up by // a repetition expression (+, *, ?, {,} ) template repeaterConsumed(char [] regstr) { static if (regstr.length==0) const int repeaterConsumed = 0; else static if (regstr[0]=='+' || regstr[0]=='*' || regstr[0]=='?') const int repeaterConsumed = 1; else static if (regstr[0]=='{') { static if (unescapedFindFirst!(regstr, '}')==-1) { pragma(msg, "Error: unmatched { in regular expression"); static assert(0); } else const int repeaterConsumed = 1 + unescapedFindFirst!(regstr, '}'); } else const int repeaterConsumed = 0; } // The minimum allowable number of repetitions for this repeater template repeaterMin(char [] regstr) { static if (regstr[0]=='*' || regstr[0]=='?') const uint repeaterMin = 0; else static if (regstr[0]=='+') const uint repeaterMin = 1; else { static assert (regstr[0]=='{') ; const uint repeaterMin = atoui!(regstr[1..$]); } } // The maximum allowable number of repetitions for this repeater template repeaterMax(char [] regstr) { static if (regstr[0]=='*' || regstr[0]=='+') const uint repeaterMax = uint.max; else static if (regstr[0]=='?') const uint repeaterMax = 0; else static if (regstr[0]=='{') { static if (unescapedFindFirst!(regstr, ',')==-1) // "{n}" const uint repeaterMax = repeaterMin!(regstr); else static if (regstr[$-2]==',') // "{n,}" const uint repeaterMax = uint.max; else // "{n,m}" const uint repeaterMax = atoui!(regstr[ 1+unescapedFindFirst!(regstr, ',') .. $]); } else { pragma(msg, "Error: unsupported repeater " ~ regstr); static assert(0); } } // find the index of the first |, or -1 if not found. // ignores escaped items, and anything in parentheses. template findUnion(char [] regstr, int indx=0, int numopenparens=0) { static if (indx>=regstr.length) const int findUnion = -1; else static if (numopenparens==0 && regstr[indx]=='|') const int findUnion = indx; else static if (regstr[indx]==')') const int findUnion = findUnion!(regstr, indx+1, numopenparens-1); else static if (regstr[indx]=='(') const int findUnion = findUnion!(regstr, indx+1, numopenparens+1); else static if (regstr[indx]=='\\') // skip the escaped character const int findUnion = findUnion!(regstr, indx+2, numopenparens); else const int findUnion = findUnion!(regstr, indx+1, numopenparens); } // keeps going until the number of ) parens equals the number of ( parens. // All escaped characters are ignored. // BUG: what about inside [-] ? template parenConsumed(char [] regstr, int numopenparens=0) { static if (regstr.length==0) { pragma(msg, "Unmatched parenthesis"); static assert(0); } else static if (regstr[0]==')') { static if (numopenparens==1) const int parenConsumed=1; // finished! else const int parenConsumed = 1 + parenConsumed!(regstr[1..$], numopenparens-1); } else static if (regstr[0]=='(') { const int parenConsumed = 1 + parenConsumed!(regstr[1..$], numopenparens+1); } else static if (regstr[0]=='\\' && regstr.length>1) // ignore \(, \). const int parenConsumed = 2 + parenConsumed!(regstr[2..$], numopenparens); else const int parenConsumed = 1 + parenConsumed!(regstr[1..$], numopenparens); } // the term before multiplier template atomConsumed(char [] regstr) { static if (regstr[0]=='(') const int atomConsumed = parenConsumed!(regstr); else static if (regstr[0]=='[') const int atomConsumed = 1 + unescapedFindFirst!(regstr, ']'); else static if (regstr[0]=='\\') { static if (regstr.length>0) { const int atomConsumed=2; } else { pragma(msg, "Error: Regexp must not end with \\"); static assert(0); } } else static if (regstr[0]=='$') { // NONSTANDARD: referenced parameter const int atomConsumed=2; } else const int atomConsumed=1; // match single char } // parses a term from the front of regstr (which must not be empty). // consisting of an atom, optionally followed by a repeater. template termConsumed(char [] regstr) { const int termConsumed = atomConsumed!(regstr) + repeaterConsumed!(regstr[atomConsumed!(regstr)..$]); } //--------------------------------------------------------------------- // Part II: mixins which generate the final code //--------------------------------------------------------------------- // The code is generated by a set of nested mixins. // At compile time, each mixin is passed a subset of a regexp string. // It generates a member function bool fn(), which updates a pointer p, // and returns true if a match was found. // Each mixin has access to the following values: // At compile time: // fullpattern -- the complete, unparsed regular expression string // At run time: // searchstr (read only) -- the string being searched // p --- the first character in searchstr which is not yet matched. // param[0..8] -- the quasi-static parameter strings @1..@9 to match. // Additional variables or constants can be added as desired. // Note that unless p is reset to 0, it will automatically behave as a global search, // continuing from the last place it left off. // regstr is a sequence of productions, possibly containing a union template regSequence(char [] regstr) { static if (findUnion!(regstr)==-1) { // No unions to worry about mixin regNoUnions!(regstr); } else { pragma(msg, "Parsing union " ~ regstr); bool fn() { mixin regSequence!(regstr[0..findUnion!(regstr)]) a; mixin regSequence!(regstr[findUnion!(regstr)+1..$]) b; return a.fn() || b.fn(); } } } // regstr is a sequence of productions, all of which must be matched // Does not contain any unions template regNoUnions(char [] regstr) { static if (regstr.length == termConsumed!(regstr)) { // there's only a single item (possibly including a repeater) mixin regTerm!(regstr); } else { bool fn() { pragma(msg, "Parsing sequence without unions: " ~ regstr); mixin regTerm!(regstr[0..termConsumed!(regstr)]) a; mixin regSequence!(regstr[termConsumed!(regstr)..$]) b; int oldp = p; // need to roll it back, if not all of them match if (a.fn() && b.fn()) return true; p = oldp; return false; } } } // the atom, optionally followed by a repeater. // Here we deal with all kinds of repeats, // but we make different optimisations depending on the allowable repeats. template regTerm(char [] regstr) { static if (atomConsumed!(regstr)==regstr.length) { pragma(msg, "Simple atom " ~ regstr); // there is no multiplier, just use the atom mixin regAtom!(regstr); } else { pragma(msg, "Repeated atom " ~ regstr); bool fn() { const int t = atomConsumed!(regstr); const uint repmin = repeaterMin!(regstr[t..$]); const uint repmax = repeaterMax!(regstr[t..$]); mixin regAtom!(regstr[0..t]) a; static if (repmin == 0 && repmax == 1) { // "?", or "{0,1}". Worth optimising seperately a.fn(); return true; } else static if (repmin==0 && repmax == uint.max) { // optimise for "*", "{0,}" while (a.fn()) {}; return true; } else { // "+", or "{m,n}" int numreps=0; // how many repeats have we found? while (a.fn()) { numreps++; static if (repmax=repmin); } } } } // generate a parser for an atom // IN: regstr is a valid atom, without a repeat // OUT: if atom is matched, return true, and update p. // if atom is not matched, return false, and leave p unchanged. template regAtom(char [] regstr) { static if (regstr[0]=='(') { mixin regSequence!(regstr[1..$-1]) a; } else static if (regstr[0]=='[') { static if (regstr[1]=='^') mixin regInvertedRange!(regstr[2..$-1]); else mixin regRange!(regstr[1..$-1]); } else static if (regstr[0]=='.') { bool fn() { if (p==searchstr.length) return false; p++; return true; } } else static if (regstr[0]=='$') { // NONSTANDARD: referenced parameter mixin regParameter!(atoui!(regstr[1..$])-1); } else { // match single character bool fn() { if (p==searchstr.length || searchstr[p]!=regstr[0]) return false; p++; return true; } } } // match a variable string, which will be passed as a parameter. template regParameter(int parmnum) { bool fn() { if (p + param[parmnum].length > searchstr.length) return false; if (searchstr[p..p+param[parmnum].length] != param[parmnum]) return false; p+=param[parmnum].length; return true; } } template regRange(char [] regstr) { bool fn() { pragma(msg, "Parsing range: " ~ regstr ~ " NOT YET IMPLEMENTED"); return true; } } template regInvertedRange(char [] regstr) { bool fn() { pragma(msg, "Parsing inverted range: " ~ regstr ~ " NOT YET IMPLEMENTED"); return true; } } //--------------------------------------------------------------------- // Part III: the public interface of the regexp engine //--------------------------------------------------------------------- // Does the regexp match the pattern? template test(char [] fullpattern) { bool test(char [] searchstr, char [][] param...) { int p = 0; // start at the beginning of the string mixin regSequence!(fullpattern) a; return a.fn(); } } /// Return first substring which matches the pattern, or "" if none. template search(char [] fullpattern) { char [] search(char [] searchstr, char [][] param...) { int p; // next index to test mixin regSequence!(fullpattern) a; for (int x=0; x