Recursive-descent parsing
Stefan Koch via Digitalmars-d-learn
digitalmars-d-learn at puremagic.com
Sat Dec 24 06:14:48 PST 2016
On Saturday, 24 December 2016 at 12:42:31 UTC, Andrew Edwards
wrote:
> The authors of "The Art of Java" present, as a first coding
> example, a recursive-descent parser to demonstrate Java's
> ability to facilitate low level programming commonly performed
> in C and C++.
>
> I took the opportunity to port the code to D. By doing this, I
> now have an understanding of how a recursive-descent parser is
> written and works. What I am now interested in, is learning
> about what makes this particular implementation a poor or
> strong one, how to improve upon it and other, possibly
> superior, implementations made possible by D.
>
> Comments are invited and appreciated.
>
> Andrew Edwards
> If at first you don't succeed...
>
> =========================
> rdp2.d
> =========================
> //module rdp2;
>
> class ParserException : Throwable
> {
> this(string msg)
> {
> super(msg);
> }
> }
>
> // These are the token types.
> enum
> {
> NONE,
> DELIMITER,
> VARIABLE,
> NUMBER
> }
>
> // These are the types of syntax errors.
> enum
> {
> SYNTAX,
> UNBALPARENS,
> NOEXP,
> DIVBYZERO
> }
>
> // This token indecates end-of-expression.
> enum EOE = ";";
>
> string exp; // refers to expression string
> int ndx; // current index into the expression
> string token; // holds current token
> int tokType; // holds token's types
>
> // Array for variables.
> double[] vars = new double[26];
>
> // Parser entry poing.
> double evaluate(string expstr)
> {
> double result;
> exp = expstr;
> ndx = 0;
>
> getToken();
> if (token == EOE)
> handleErr(NOEXP); // no expression present
>
> // Parse and evaluate the expression.
> result = evalExp1();
>
> if (token != EOE)
> handleErr(SYNTAX); // last token must be EOE
>
> return result;
> }
>
> // Process an assignment.
> double evalExp1()
> {
> double result;
> int varNdx;
> int ttokType;
> string tmpToken;
>
> if (tokType == VARIABLE)
> {
> // save old token
> tmpToken = token;
> ttokType = tokType;
>
> // Compute the index of the variable.
> import std.ascii: toUpper;
> varNdx = token[0].toUpper - 'A';
>
> getToken();
> if (token != "=")
> {
> putBack(); // return current token
> // restore old token -- not an assignment
> token = tmpToken;
> tokType = ttokType;
> }
> else
> {
> getToken(); // get next part of exp
> result = evalExp2();
> vars[varNdx] = result;
> return result;
> }
> }
> return evalExp2();
> }
>
> // Add or subtract two terms
> double evalExp2()
> {
> char op;
> double result;
> double partialResult;
>
> result = evalExp3();
>
> while ((op = token[0]) == '+' || op == '-')
> {
> getToken();
> partialResult = evalExp3();
> final switch (op)
> {
> case '-':
> {
> result -= partialResult;
> break;
> }
> case '+':
> {
> result += partialResult;
> break;
> }
> }
> }
> return result;
> }
>
> // Multiply or divide two factors
> double evalExp3()
> {
> char op;
> double result;
> double partialResult;
>
> result = evalExp4();
>
> while ((op = token[0]) == '*' || op == '/' || op == '%')
> {
> getToken();
> partialResult = evalExp4();
> final switch (op)
> {
> case '*':
> {
> result *= partialResult;
> break;
> }
> case '/':
> {
> if (partialResult == 0.0)
> handleErr(DIVBYZERO);
> result /= partialResult;
> break;
> }
> case '%':
> {
> if (partialResult == 0.0)
> handleErr(DIVBYZERO);
> result %= partialResult;
> break;
> }
> }
> }
> return result;
> }
>
> // Process an exponent.
> double evalExp4()
> {
> double result;
> double partialResult;
> double ex;
>
> result = evalExp5();
>
> if (token == "^")
> {
> getToken();
> partialResult = evalExp4();
> ex = result;
> if (partialResult == 0.0)
> {
> result = 1.0;
> }
> else
> {
> for (int t = cast(int)partialResult-1; t > 0; t--)
> result *= ex;
> }
> }
> return result;
> }
>
> // Evaluate a unary + or -
> double evalExp5()
> {
> double result;
> string op;
>
> op = null;
> if ((tokType == DELIMITER) && token == "+" || token == "-")
> {
>
> op = token;
> getToken();
> }
> result = evalExp6();
>
> //if (op == "-") result = -result;
>
> return (op == "-") ? -result : result;
> }
>
> // Process a parenthesized expression.
> double evalExp6()
> {
> double result;
>
> if (token == "(")
> {
> getToken();
> result = evalExp2();
> if (token != ")")
> handleErr(UNBALPARENS);
> getToken();
> }
> else
> result = atom();
>
> return result;
> }
>
> // Get the value of a number.
> double atom()
> {
> double result = 0.0;
>
> switch (tokType)
> {
> case NUMBER:
> try
> {
> import std.conv: parse;
> result = token.parse!double;
> }
> catch (Exception e)
> {
> handleErr(SYNTAX);
> }
> getToken();
> break;
> case VARIABLE:
> result = findVar(token);
> getToken();
> break;
> default:
> handleErr(SYNTAX);
> break;
> }
> return result;
> }
>
> // Return the value of a variable.
> double findVar(string vname)
> {
> import std.ascii: isAlpha, toUpper;
> if (!vname[0].isAlpha)
> {
> handleErr(SYNTAX);
> return 0.0;
> }
>
> return vars[(vname[0] - 'A').toUpper];
> }
>
> // Return a token to the input stream.
> void putBack()
> {
> if (token == EOE) return;
> foreach(i; 0 .. token.length) ndx--;
> }
>
> // Handle an error.
> void handleErr(int error)
> {
> string[] err = [
> "Syntax Error",
> "Unbalanced Parentheses",
> "No Expression Present",
> "Division by Zero"
> ];
>
> throw new ParserException(err[error]);
> }
>
> // Obtain the next token.
> void getToken()
> {
> import std.ascii: isAlpha, isDigit, isWhite;
> tokType = NONE;
> token = null;
>
> // Check for end of expression
> if (ndx == exp.length)
> {
> token = EOE.dup;
> return;
> }
>
> // Skip over white space
> while (ndx < exp.length && exp[ndx].isWhite) ++ndx;
>
> // Trailing whitespace ends expression.
> if (ndx == exp.length)
> {
> token = EOE.dup;
> return;
> }
>
> if (exp[ndx].isDelim) // is operator
> {
> token ~= exp[ndx];
> ndx++;
> tokType = DELIMITER;
> }
> else if (exp[ndx].isAlpha) // is variable
> {
> while (!exp[ndx].isDelim)
> {
> token ~= exp[ndx];
> ndx++;
> if (ndx >= exp.length) break;
> }
> tokType = VARIABLE;
> }
> else if (exp[ndx].isDigit) // is number
> {
> while (!exp[ndx].isDelim)
> {
> token ~= exp[ndx];
> ndx++;
> if (ndx >= exp.length) break;
>
> }
> tokType = NUMBER;
> }
> else // unknown character terminates expression
> {
> token = EOE.dup;
> return;
> }
> }
>
> // Returns true if c is a delimiter.
> bool isDelim(char c)
> {
> import std.algorithm: canFind;
> return canFind(" +-/*%^=()", c);
> }
>
> =========================
> demordp.d
> =========================
> mixin(import("rdp2.d"));
>
> void main(string[] args)
> {
> string expr;
>
> import std.stdio: write, writeln, readln;
> writeln("Enter an empty expression to stop.");
>
> for(;;)
> {
> write("Enter an expression: ");
> expr = readln();
> if (expr == "\n") break;
> try
> {
> writeln("Result: ", expr.evaluate);
> }
> catch (ParserException e)
> {
> e.msg.writeln;
> }
> }
> }
All of the performance characteristics crucially depend on the
language you want to parse.
In general you want to match as many tokes at as you can in a
single parse-tree node.
Since that will reduce the number of function calls you have to
make.
You want to preallocate buffers.
Use the append-functionality sparingly since that involves a call
into a shared library.
Avoid or encapsulate global state if you can since it can get
tricky to mange.
Other then that there is not much general advice I can give.
Try to tag your parse-nodes since then you can match patterns
using a nested switch, which is one of the cleanest routes to do
it.
More information about the Digitalmars-d-learn
mailing list