Move parser combinators into their own file

dgreensp · dgreensp · commit b869ecbd80d5 · 2012-09-11T16:49:47.000-07:00
diff --git a/packages/jsparse/package.js b/packages/jsparse/package.js
@@ -3,7 +3,6 @@ Package.describe({
 });
 
 Package.on_use(function (api) {
-  api.add_files([
-    'parser.js',
-    'lexer.js'], ['client', 'server']);
+  api.add_files(['lexer.js', 'parserlib.js', 'parser.js'],
+                ['client', 'server']);
 });
diff --git a/packages/jsparse/parser.js b/packages/jsparse/parser.js
@@ -1,381 +1,6 @@
+///// JAVASCRIPT PARSER
 
-// NOTES
-
-// push/pop lexer position
-// need to support [no LineTerminator here]
-// need to be able to look back at original whitespace later,
-//   find all the whitespace before a token
-// "token" means anything but whitespace, newline, or comment
-// multiline comments produce virtual newlines
-// maybe conform to the spec's token input to the syntactic grammar?
-
-// XXX track line/col position, for errors and maybe token info
-
-var isArray = function (obj) {
-  return obj && (typeof obj === 'object') && (typeof obj.length === 'number');
-};
-
-Tokenizer = function (codeOrLexer) {
-  // XXX rethink codeOrLexer later
-  this.lexer = (codeOrLexer instanceof Lexer ? codeOrLexer :
-                new Lexer(codeOrLexer));
-  this.peekType = null;
-  this.peekText = null;
-  this.tokenType = null;
-  this.tokenText = null;
-  this.lastPos = 0;
-  this.pos = 0;
-  this.isLineTerminatorHere = false;
-
-  // load peekType and peekText
-  this.consume();
-};
-
-_.extend(Tokenizer.prototype, {
-  // consumes the token (peekType, peekText) and moves
-  // it into (type, text), loading the next token
-  // into (peekType, peekText).  A token is a lexeme
-  // besides WHITESPACE, COMMENT, and NEWLINE.
-  consume: function () {
-    var self = this;
-    var lexer = self.lexer;
-    self.type = self.peekType;
-    self.text = self.peekText;
-    self.lastPos = self.pos;
-    self.isLineTerminatorHere = false;
-    do {
-      lexer.next();
-      if (lexer.type === "ERROR")
-        throw new Error("Bad token at position " + lexer.lastPos +
-                        ", text `" + lexer.text + "`");
-      else if (lexer.type === "NEWLINE")
-        self.isLineTerminatorHere = true;
-      else if (lexer.type === "COMMENT" && ! /^.*$/.test(lexer.text))
-        // multiline comments containing line terminators count
-        // as line terminators.
-        self.isLineTerminatorHere = true;
-    } while (lexer.type !== "EOF" && ! Lexer.isToken(lexer.type));
-    self.peekType = lexer.type;
-    self.peekText = lexer.text;
-    self.pos = lexer.lastPos;
-  }
-});
-
-// A parser that consume()s has to succeed.
-// Similarly, a parser that fails can't have consumed.
-
-// mutates the parser; don't describe an existing parser.
-var describe = function (description, parser) {
-  parser.description = description;
-  return parser;
-};
-
-// Call this as `throw parseError(...)`.
-// `expected` is a parser, `after` is a string.
-var parseError = function (t, expected, after) {
-  var str = (expected.description ? "Expected " + expected.description :
-             // all parsers that might error should have descriptions,
-             // but just in case:
-             "Unexpected token");
-  if (after)
-    str += " after " + (after.text ? "`" + after.text + "`" : after);
-  var pos = t.pos;
-  str += " at position " + pos;
-  str += ", found " + (t.peekText ? "`" + t.peekText + "`" : "EOF");
-  var e = new Error(str);
-  return e;
-};
-
-///// TERMINAL PARSER CONSTRUCTORS
-
-var _tokenClassImpl = function (type, text, dontConsume) {
-  var textSet = (text ? makeSet(text.split(' ')) : null);
-  var description = (text ? text.split(' ').join(', ') : type);
-  return describe(
-    description,
-    function (t) {
-      if (t.peekType == type && (!text || textSet[t.peekText])) {
-        if (dontConsume)
-          return [];
-        var ret = {text: t.peekText, pos: t.pos};
-        t.consume();
-        return ret;
-      }
-      return null;
-    });
-};
-
-var _tokenImpl = function (text, dontConsume) {
-  if (/\w/.test(text))
-    return _tokenClassImpl('KEYWORD', text, dontConsume);
-  return _tokenClassImpl('PUNCTUATION', text, dontConsume);
-};
-
-var tokenClass = function (type, text) {
-  if (type === "ERROR" || type === "EOF")
-    throw new Error("Can't create EOF or ERROR tokens, can only look ahead");
-  return _tokenClassImpl(type, text);
-};
-
-var token = function (text) {
-  return _tokenImpl(text);
-};
-
-// Like token, but marks tokens that need to defy the lexer's
-// heuristic about whether the next '/' is a division or
-// starts a regex.
-var preSlashToken = function (text, divisionNotRegex) {
-  var impl = _tokenImpl(text);
-  return describe(impl.description,
-                  function (t) {
-                    // temporarily set divisionPermitted,
-                    // restoring it if we don't match.
-                    var oldValue = t.lexer.divisionPermitted;
-                    var result;
-                    try {
-                      t.lexer.divisionPermitted = divisionNotRegex;
-                      result = impl(t);
-                      return result;
-                    } finally {
-                      if (! result)
-                        t.lexer.divisionPermitted = oldValue;
-                    }
-                  });
-};
-
-// NON-CONSUMING PARSER CONSTRUCTORS
-
-var lookAheadTokenClass = function (type, text) {
-  return _tokenClassImpl(type, text, true);
-};
-
-var lookAheadToken = function (text) {
-  return _tokenImpl(text, true);
-};
-
-///// NON-TERMINAL PARSER CONSTRUCTORS
-
-// call as: runRequired(parser, tokenizer[, prevToken])
-// to run parser(tokenizer) and assert it matches
-var runRequired = function (parser, tokenizer, prevToken) {
-  return revalue(
-    tokenizer ? parser(tokenizer) : parser,
-    function (v, t) {
-      if (! v)
-        throw parseError(t || tokenizer, parser, prevToken);
-      return v;
-    });
-};
-
-var runMaybeRequired = function (require, parser, tokenizer, prevToken) {
-  if (require)
-    return runRequired(parser, tokenizer, prevToken);
-  else
-    return parser(tokenizer);
-};
-
-// Polymorphic in parsers and results; an experiment.
-var named = function (name, parserOrResult) {
-  return describe(
-    name,
-    revalue(
-      parserOrResult,
-      function (value) {
-        if (! value)
-          return null;
-
-        var result;
-        if (isArray(value) && ! value.named)
-          // bare array, prepend the name
-          result = [name].concat(Array.prototype.slice.call(value));
-        else
-          // token or named array; construct a new named array
-          result = [name, value];
-
-        // don't name the same thing twice
-        result.named = true;
-
-        return result;
-      }));
-};
-
-var or = function (/*parsers*/) {
-  var args = arguments;
-  return function (t) {
-    var result;
-    for(var i = 0, N = args.length; i < N; i++) {
-      result = args[i](t);
-      if (result)
-        return result;
-    }
-    return null;
-  };
-};
-
-// Parses a left-recursive expression with zero or more occurrences
-// of a binary op.  Leaves the term unwrapped if no op.  For example
-// (in a hypothetical use case):
-// `1` => "1"
-// `1+2` => ["binary", "1", "+", "2"]
-// `1+2+3` => ["binary", ["binary", "1", "+", "2"], "+", "3"]
-//
-// opParser can also be an array of op parsers from high to low
-// precedence (tightest-binding first)
-var binaryLeft = function (termParser, opParser) {
-  if (isArray(opParser)) {
-    if (opParser.length === 1) {
-      // take single opParser out of its array
-      opParser = opParser[0];
-    } else {
-      // pop off last opParser (non-destructively) and replace
-      // termParser with a recursive binaryLeft on the remaining
-      // ops.
-      termParser = binaryLeft(termParser, opParser.slice(0, -1));
-      opParser = opParser[opParser.length - 1];
-    }
-  }
-
-  return describe(
-    termParser.description,
-    function (t) {
-      var result = termParser(t);
-      if (! result)
-        return null;
-
-      var op;
-      while ((op = opParser(t))) {
-        result = named(
-          'binary',
-          [result, op, runRequired(termParser, t, op)]);
-      }
-      return result;
-    });
-};
-
-// Parses a list of one or more items with a separator, listing the
-// items and separators.  (Separator is optional.)  For example:
-// `x` => ["x"]
-// `x,y` => ["x", ",", "y"]
-// `x,y,z` => ["x", ",", "y", ",", "z"]
-var list = function (itemParser, sepParser) {
-  return describe(
-    itemParser.description,
-    function (t) {
-      var result = [itemParser(t)];
-      if (! result[0])
-        return null;
-
-      if (sepParser) {
-        var sep;
-        while ((sep = sepParser(t)))
-          result.push(sep, runRequired(itemParser, t, sep));
-      } else {
-        var item;
-        while ((item = itemParser(t)))
-          result.push(item);
-      }
-      return result;
-    });
-};
-
-var seq = function (/*parsers*/) {
-  var args = arguments;
-  if (! args.length)
-    return describe("(empty)",
-                    function (t) { return []; });
-
-  var description = args[0].description;
-  for (var i = 1; i < args.length; i++)
-    description += " " + args[i].description;
-  return describe(
-    description,
-    function (t) {
-      var result = [];
-      for (var i = 0, N = args.length; i < N; i++) {
-        // first item in sequence can fail, and we
-        // fail (without error); after that, error on failure
-        var r = runMaybeRequired(i > 0, args[i], t);
-        if (! r)
-          return null;
-
-        if (r.unpack) // append array!
-          result.push.apply(result, r);
-        else
-          result.push(r);
-      }
-      return result;
-    });
-};
-
-var unpack = function (arrayParser) {
-  return revalue(arrayParser, function (v) {
-    if (v && isArray(v))
-      v.unpack = true;
-    return v;
-  });
-};
-
-// lookAhead parser must never consume
-var lookAhead = function (lookAheadParser, nextParser) {
-  return describe(
-    lookAheadParser.description,
-    function (t) {
-      if (! lookAheadParser(t))
-        return null;
-      return nextParser(t);
-    });
-};
-var negLookAhead = function (lookAheadParser, nextParser) {
-  if (! nextParser)
-    return function (t) {
-      return lookAheadParser(t) ? null : [];
-    };
-
-  return describe(
-    nextParser.description,
-    function (t) {
-      if (lookAheadParser(t))
-        return null;
-      return nextParser(t);
-    });
-};
-
-// parser that looks at nothing and returns result
-var constant = function (result) {
-  // no description
-  return function (t) {
-    return result;
-  };
-};
-
-// afterLookAhead allows the parser to fail rather than
-// succeed if would otherwise fail at a position where
-// afterLookAhead doesn't match, potentially providing
-// a better error message.  For example, the illegal
-// object literal `{true:1}` will stop at the `true`
-// and say something like "expected property name"
-// instead of "expected }".  As another example,
-// `for(;var;) {}` will lead to "Expected expression"
-// instead of "Expected ;" when the optional expression
-// turns out to be an illegal `var`.
-var opt = function (parser, afterLookAhead) {
-  return describe(parser.description,
-                  or(parser, afterLookAhead ? afterLookAhead : seq()));
-};
-
-// note: valueTransformFunc gets the tokenizer as a second argument
-// if it's called on a parser.  This func is allowed to then
-// run more parsers.
-var revalue = function (parserOrValue, valueTransformFunc) {
-  if (typeof parserOrValue === 'function')
-    // it's a parser
-    return describe(parserOrValue.description,
-                    function (t) {
-                      return valueTransformFunc(parserOrValue(t), t);
-                    });
-  else
-    return valueTransformFunc(parserOrValue);
-};
+// XXX unit tests
 
 var parse = function (tokenizer) {
   var noLineTerminatorHere = describe(
diff --git a/packages/jsparse/parserlib.js b/packages/jsparse/parserlib.js