Recursive-descent parsing

Sat Dec 24 06:14:48 PST 2016

On Saturday, 24 December 2016 at 12:42:31 UTC, Andrew Edwards 
wrote:
> The authors of "The Art of Java" present, as a first coding 
> example, a recursive-descent parser to demonstrate Java's 
> ability to facilitate low level programming commonly performed 
> in C and C++.
>
> I took the opportunity to port the code to D. By doing this, I 
> now have an understanding of how a recursive-descent parser is 
> written and works. What I am now interested in, is learning 
> about what makes this particular implementation a poor or 
> strong one, how to improve upon it and other, possibly 
> superior, implementations made possible by D.
>
> Comments are invited and appreciated.
>
> Andrew Edwards
> If at first you don't succeed...
>
> =========================
> rdp2.d
> =========================
> //module rdp2;
>
> class ParserException : Throwable
> {
>     this(string msg)
>     {
>         super(msg);
>     }
> }
>
> // These are the token types.
> enum
> {
>     NONE,
>     DELIMITER,
>     VARIABLE,
>     NUMBER
> }
>
> // These are the types of syntax errors.
> enum
> {
>     SYNTAX,
>     UNBALPARENS,
>     NOEXP,
>     DIVBYZERO
> }
>
> // This token indecates end-of-expression.
> enum EOE = ";";
>
> string exp;     // refers to expression string
> int ndx;     // current index into the expression
> string token;   // holds current token
> int tokType;    // holds token's types
>
> // Array for variables.
> double[] vars = new double[26];
>
> // Parser entry poing.
> double evaluate(string expstr)
> {
>     double result;
>     exp = expstr;
>     ndx = 0;
>
>     getToken();
>     if (token == EOE)
>         handleErr(NOEXP); // no expression present
>
>     // Parse and evaluate the expression.
>     result = evalExp1();
>
>     if (token != EOE)
>         handleErr(SYNTAX); // last token must be EOE
>
>     return result;
> }
>
> // Process an assignment.
> double evalExp1()
> {
>     double result;
>     int varNdx;
>     int ttokType;
>     string tmpToken;
>
>     if (tokType == VARIABLE)
>     {
>         // save old token
>         tmpToken = token;
>         ttokType = tokType;
>
>         // Compute the index of the variable.
>         import std.ascii: toUpper;
>         varNdx = token[0].toUpper - 'A';
>
>         getToken();
>         if (token != "=")
>         {
>             putBack(); // return current token
>             // restore old token -- not an assignment
>             token = tmpToken;
>             tokType = ttokType;
>         }
>         else
>         {
>             getToken(); // get next part of exp
>             result = evalExp2();
>             vars[varNdx] = result;
>             return result;
>         }
>     }
>     return evalExp2();
> }
>
> // Add or subtract two terms
> double evalExp2()
> {
>     char op;
>     double result;
>     double partialResult;
>
>     result = evalExp3();
>
>     while ((op = token[0]) == '+' || op == '-')
>     {
>         getToken();
>         partialResult = evalExp3();
>         final switch (op)
>         {
>             case '-':
>             {
>                 result -= partialResult;
>                 break;
>             }
>             case '+':
>             {
>                 result += partialResult;
>                 break;
>             }
>         }
>     }
>     return result;
> }
>
> // Multiply or divide two factors
> double evalExp3()
> {
>     char op;
>     double result;
>     double partialResult;
>
>     result = evalExp4();
>
>     while ((op = token[0]) == '*' || op == '/' || op == '%')
>     {
>         getToken();
>         partialResult = evalExp4();
>         final switch (op)
>         {
>             case '*':
>             {
>                 result *= partialResult;
>                 break;
>             }
>             case '/':
>             {
>                 if (partialResult == 0.0)
>                     handleErr(DIVBYZERO);
>                 result /= partialResult;
>                 break;
>             }
>             case '%':
>             {
>                 if (partialResult == 0.0)
>                     handleErr(DIVBYZERO);
>                 result %= partialResult;
>                 break;
>             }
>         }
>     }
>     return result;
> }
>
> // Process an exponent.
> double evalExp4()
> {
>     double result;
>     double partialResult;
>     double ex;
>
>     result = evalExp5();
>
>     if (token == "^")
>     {
>         getToken();
>         partialResult = evalExp4();
>         ex = result;
>         if (partialResult == 0.0)
>         {
>             result = 1.0;
>         }
>         else
>         {
>             for (int t = cast(int)partialResult-1; t > 0; t--)
>                 result *= ex;
>         }
>     }
>     return result;
> }
>
> // Evaluate a unary + or -
> double evalExp5()
> {
>     double result;
>     string op;
>
>     op = null;
>     if ((tokType == DELIMITER) && token == "+" || token == "-")
>     {
>
>         op = token;
>         getToken();
>     }
>     result = evalExp6();
>
>     //if (op == "-") result = -result;
>
>     return (op == "-") ? -result : result;
> }
>
> // Process a parenthesized expression.
> double evalExp6()
> {
>     double result;
>
>     if (token == "(")
>     {
>         getToken();
>         result = evalExp2();
>         if (token != ")")
>             handleErr(UNBALPARENS);
>         getToken();
>     }
>     else
>         result = atom();
>
>     return result;
> }
>
> // Get the value of a number.
> double atom()
> {
>     double result = 0.0;
>
>     switch (tokType)
>     {
>         case NUMBER:
>             try
>             {
>                 import std.conv: parse;
>                 result = token.parse!double;
>             }
>             catch (Exception e)
>             {
>                 handleErr(SYNTAX);
>             }
>             getToken();
>             break;
>         case VARIABLE:
>             result = findVar(token);
>             getToken();
>             break;
>         default:
>             handleErr(SYNTAX);
>             break;
>     }
>     return result;
> }
>
> // Return the value of a variable.
> double findVar(string vname)
> {
>     import std.ascii: isAlpha, toUpper;
>     if (!vname[0].isAlpha)
>     {
>         handleErr(SYNTAX);
>         return 0.0;
>     }
>
>     return vars[(vname[0] - 'A').toUpper];
> }
>
> // Return a token to the input stream.
> void putBack()
> {
>     if (token == EOE) return;
>     foreach(i; 0 .. token.length) ndx--;
> }
>
> // Handle an error.
> void handleErr(int error)
> {
>     string[] err = [
>         "Syntax Error",
>         "Unbalanced Parentheses",
>         "No Expression Present",
>         "Division by Zero"
>     ];
>
>     throw new ParserException(err[error]);
> }
>
> // Obtain the next token.
> void getToken()
> {
>     import std.ascii: isAlpha, isDigit, isWhite;
>     tokType = NONE;
>     token = null;
>
>     // Check for end of expression
>     if (ndx == exp.length)
>     {
>         token = EOE.dup;
>         return;
>     }
>
>     // Skip over white space
>     while (ndx < exp.length && exp[ndx].isWhite) ++ndx;
>
>     // Trailing whitespace ends expression.
>     if (ndx == exp.length)
>     {
>         token = EOE.dup;
>         return;
>     }
>
>     if (exp[ndx].isDelim) // is operator
>     {
>         token ~= exp[ndx];
>         ndx++;
>         tokType = DELIMITER;
>     }
>     else if (exp[ndx].isAlpha) // is variable
>     {
>         while (!exp[ndx].isDelim)
>         {
>             token ~= exp[ndx];
>             ndx++;
>             if (ndx >= exp.length) break;
>         }
>         tokType = VARIABLE;
>     }
>     else if (exp[ndx].isDigit) // is number
>     {
>         while (!exp[ndx].isDelim)
>         {
>             token ~= exp[ndx];
>             ndx++;
>             if (ndx >= exp.length) break;
>
>         }
>         tokType = NUMBER;
>     }
>     else // unknown character terminates expression
>     {
>         token = EOE.dup;
>         return;
>     }
> }
>
> // Returns true if c is a delimiter.
> bool isDelim(char c)
> {
>     import std.algorithm: canFind;
>     return canFind(" +-/*%^=()", c);
> }
>
> =========================
> demordp.d
> =========================
> mixin(import("rdp2.d"));
>
> void main(string[] args)
> {
>     string expr;
>
>     import std.stdio: write, writeln, readln;
>     writeln("Enter an empty expression to stop.");
>
>     for(;;)
>     {
>         write("Enter an expression: ");
>         expr = readln();
>         if (expr == "\n") break;
>         try
>         {
>             writeln("Result: ", expr.evaluate);
>         }
>         catch (ParserException e)
>         {
>             e.msg.writeln;
>         }
>     }
> }

All of the performance characteristics crucially depend on the 
language you want to parse.
In general you want to match as many tokes at as you can in a 
single parse-tree node.
Since that will reduce the number of function calls you have to 
make.
You want to preallocate buffers.
Use the append-functionality sparingly since that involves a call 
into a shared library.
Avoid or encapsulate global state if you can since it can get 
tricky to mange.
Other then that there is not much general advice I can give.

Try to tag your parse-nodes since then you can match patterns 
using a nested switch, which is one of the cleanest routes to do 
it.