|
| 1 | +///// JAVASCRIPT PARSER |
1 | 2 |
|
2 | | -// NOTES |
3 | | - |
4 | | -// push/pop lexer position |
5 | | -// need to support [no LineTerminator here] |
6 | | -// need to be able to look back at original whitespace later, |
7 | | -// find all the whitespace before a token |
8 | | -// "token" means anything but whitespace, newline, or comment |
9 | | -// multiline comments produce virtual newlines |
10 | | -// maybe conform to the spec's token input to the syntactic grammar? |
11 | | - |
12 | | -// XXX track line/col position, for errors and maybe token info |
13 | | - |
14 | | -var isArray = function (obj) { |
15 | | - return obj && (typeof obj === 'object') && (typeof obj.length === 'number'); |
16 | | -}; |
17 | | - |
18 | | -Tokenizer = function (codeOrLexer) { |
19 | | - // XXX rethink codeOrLexer later |
20 | | - this.lexer = (codeOrLexer instanceof Lexer ? codeOrLexer : |
21 | | - new Lexer(codeOrLexer)); |
22 | | - this.peekType = null; |
23 | | - this.peekText = null; |
24 | | - this.tokenType = null; |
25 | | - this.tokenText = null; |
26 | | - this.lastPos = 0; |
27 | | - this.pos = 0; |
28 | | - this.isLineTerminatorHere = false; |
29 | | - |
30 | | - // load peekType and peekText |
31 | | - this.consume(); |
32 | | -}; |
33 | | - |
34 | | -_.extend(Tokenizer.prototype, { |
35 | | - // consumes the token (peekType, peekText) and moves |
36 | | - // it into (type, text), loading the next token |
37 | | - // into (peekType, peekText). A token is a lexeme |
38 | | - // besides WHITESPACE, COMMENT, and NEWLINE. |
39 | | - consume: function () { |
40 | | - var self = this; |
41 | | - var lexer = self.lexer; |
42 | | - self.type = self.peekType; |
43 | | - self.text = self.peekText; |
44 | | - self.lastPos = self.pos; |
45 | | - self.isLineTerminatorHere = false; |
46 | | - do { |
47 | | - lexer.next(); |
48 | | - if (lexer.type === "ERROR") |
49 | | - throw new Error("Bad token at position " + lexer.lastPos + |
50 | | - ", text `" + lexer.text + "`"); |
51 | | - else if (lexer.type === "NEWLINE") |
52 | | - self.isLineTerminatorHere = true; |
53 | | - else if (lexer.type === "COMMENT" && ! /^.*$/.test(lexer.text)) |
54 | | - // multiline comments containing line terminators count |
55 | | - // as line terminators. |
56 | | - self.isLineTerminatorHere = true; |
57 | | - } while (lexer.type !== "EOF" && ! Lexer.isToken(lexer.type)); |
58 | | - self.peekType = lexer.type; |
59 | | - self.peekText = lexer.text; |
60 | | - self.pos = lexer.lastPos; |
61 | | - } |
62 | | -}); |
63 | | - |
64 | | -// A parser that consume()s has to succeed. |
65 | | -// Similarly, a parser that fails can't have consumed. |
66 | | - |
67 | | -// mutates the parser; don't describe an existing parser. |
68 | | -var describe = function (description, parser) { |
69 | | - parser.description = description; |
70 | | - return parser; |
71 | | -}; |
72 | | - |
73 | | -// Call this as `throw parseError(...)`. |
74 | | -// `expected` is a parser, `after` is a string. |
75 | | -var parseError = function (t, expected, after) { |
76 | | - var str = (expected.description ? "Expected " + expected.description : |
77 | | - // all parsers that might error should have descriptions, |
78 | | - // but just in case: |
79 | | - "Unexpected token"); |
80 | | - if (after) |
81 | | - str += " after " + (after.text ? "`" + after.text + "`" : after); |
82 | | - var pos = t.pos; |
83 | | - str += " at position " + pos; |
84 | | - str += ", found " + (t.peekText ? "`" + t.peekText + "`" : "EOF"); |
85 | | - var e = new Error(str); |
86 | | - return e; |
87 | | -}; |
88 | | - |
89 | | -///// TERMINAL PARSER CONSTRUCTORS |
90 | | - |
91 | | -var _tokenClassImpl = function (type, text, dontConsume) { |
92 | | - var textSet = (text ? makeSet(text.split(' ')) : null); |
93 | | - var description = (text ? text.split(' ').join(', ') : type); |
94 | | - return describe( |
95 | | - description, |
96 | | - function (t) { |
97 | | - if (t.peekType == type && (!text || textSet[t.peekText])) { |
98 | | - if (dontConsume) |
99 | | - return []; |
100 | | - var ret = {text: t.peekText, pos: t.pos}; |
101 | | - t.consume(); |
102 | | - return ret; |
103 | | - } |
104 | | - return null; |
105 | | - }); |
106 | | -}; |
107 | | - |
108 | | -var _tokenImpl = function (text, dontConsume) { |
109 | | - if (/\w/.test(text)) |
110 | | - return _tokenClassImpl('KEYWORD', text, dontConsume); |
111 | | - return _tokenClassImpl('PUNCTUATION', text, dontConsume); |
112 | | -}; |
113 | | - |
114 | | -var tokenClass = function (type, text) { |
115 | | - if (type === "ERROR" || type === "EOF") |
116 | | - throw new Error("Can't create EOF or ERROR tokens, can only look ahead"); |
117 | | - return _tokenClassImpl(type, text); |
118 | | -}; |
119 | | - |
120 | | -var token = function (text) { |
121 | | - return _tokenImpl(text); |
122 | | -}; |
123 | | - |
124 | | -// Like token, but marks tokens that need to defy the lexer's |
125 | | -// heuristic about whether the next '/' is a division or |
126 | | -// starts a regex. |
127 | | -var preSlashToken = function (text, divisionNotRegex) { |
128 | | - var impl = _tokenImpl(text); |
129 | | - return describe(impl.description, |
130 | | - function (t) { |
131 | | - // temporarily set divisionPermitted, |
132 | | - // restoring it if we don't match. |
133 | | - var oldValue = t.lexer.divisionPermitted; |
134 | | - var result; |
135 | | - try { |
136 | | - t.lexer.divisionPermitted = divisionNotRegex; |
137 | | - result = impl(t); |
138 | | - return result; |
139 | | - } finally { |
140 | | - if (! result) |
141 | | - t.lexer.divisionPermitted = oldValue; |
142 | | - } |
143 | | - }); |
144 | | -}; |
145 | | - |
146 | | -// NON-CONSUMING PARSER CONSTRUCTORS |
147 | | - |
148 | | -var lookAheadTokenClass = function (type, text) { |
149 | | - return _tokenClassImpl(type, text, true); |
150 | | -}; |
151 | | - |
152 | | -var lookAheadToken = function (text) { |
153 | | - return _tokenImpl(text, true); |
154 | | -}; |
155 | | - |
156 | | -///// NON-TERMINAL PARSER CONSTRUCTORS |
157 | | - |
158 | | -// call as: runRequired(parser, tokenizer[, prevToken]) |
159 | | -// to run parser(tokenizer) and assert it matches |
160 | | -var runRequired = function (parser, tokenizer, prevToken) { |
161 | | - return revalue( |
162 | | - tokenizer ? parser(tokenizer) : parser, |
163 | | - function (v, t) { |
164 | | - if (! v) |
165 | | - throw parseError(t || tokenizer, parser, prevToken); |
166 | | - return v; |
167 | | - }); |
168 | | -}; |
169 | | - |
170 | | -var runMaybeRequired = function (require, parser, tokenizer, prevToken) { |
171 | | - if (require) |
172 | | - return runRequired(parser, tokenizer, prevToken); |
173 | | - else |
174 | | - return parser(tokenizer); |
175 | | -}; |
176 | | - |
177 | | -// Polymorphic in parsers and results; an experiment. |
178 | | -var named = function (name, parserOrResult) { |
179 | | - return describe( |
180 | | - name, |
181 | | - revalue( |
182 | | - parserOrResult, |
183 | | - function (value) { |
184 | | - if (! value) |
185 | | - return null; |
186 | | - |
187 | | - var result; |
188 | | - if (isArray(value) && ! value.named) |
189 | | - // bare array, prepend the name |
190 | | - result = [name].concat(Array.prototype.slice.call(value)); |
191 | | - else |
192 | | - // token or named array; construct a new named array |
193 | | - result = [name, value]; |
194 | | - |
195 | | - // don't name the same thing twice |
196 | | - result.named = true; |
197 | | - |
198 | | - return result; |
199 | | - })); |
200 | | -}; |
201 | | - |
202 | | -var or = function (/*parsers*/) { |
203 | | - var args = arguments; |
204 | | - return function (t) { |
205 | | - var result; |
206 | | - for(var i = 0, N = args.length; i < N; i++) { |
207 | | - result = args[i](t); |
208 | | - if (result) |
209 | | - return result; |
210 | | - } |
211 | | - return null; |
212 | | - }; |
213 | | -}; |
214 | | - |
215 | | -// Parses a left-recursive expression with zero or more occurrences |
216 | | -// of a binary op. Leaves the term unwrapped if no op. For example |
217 | | -// (in a hypothetical use case): |
218 | | -// `1` => "1" |
219 | | -// `1+2` => ["binary", "1", "+", "2"] |
220 | | -// `1+2+3` => ["binary", ["binary", "1", "+", "2"], "+", "3"] |
221 | | -// |
222 | | -// opParser can also be an array of op parsers from high to low |
223 | | -// precedence (tightest-binding first) |
224 | | -var binaryLeft = function (termParser, opParser) { |
225 | | - if (isArray(opParser)) { |
226 | | - if (opParser.length === 1) { |
227 | | - // take single opParser out of its array |
228 | | - opParser = opParser[0]; |
229 | | - } else { |
230 | | - // pop off last opParser (non-destructively) and replace |
231 | | - // termParser with a recursive binaryLeft on the remaining |
232 | | - // ops. |
233 | | - termParser = binaryLeft(termParser, opParser.slice(0, -1)); |
234 | | - opParser = opParser[opParser.length - 1]; |
235 | | - } |
236 | | - } |
237 | | - |
238 | | - return describe( |
239 | | - termParser.description, |
240 | | - function (t) { |
241 | | - var result = termParser(t); |
242 | | - if (! result) |
243 | | - return null; |
244 | | - |
245 | | - var op; |
246 | | - while ((op = opParser(t))) { |
247 | | - result = named( |
248 | | - 'binary', |
249 | | - [result, op, runRequired(termParser, t, op)]); |
250 | | - } |
251 | | - return result; |
252 | | - }); |
253 | | -}; |
254 | | - |
255 | | -// Parses a list of one or more items with a separator, listing the |
256 | | -// items and separators. (Separator is optional.) For example: |
257 | | -// `x` => ["x"] |
258 | | -// `x,y` => ["x", ",", "y"] |
259 | | -// `x,y,z` => ["x", ",", "y", ",", "z"] |
260 | | -var list = function (itemParser, sepParser) { |
261 | | - return describe( |
262 | | - itemParser.description, |
263 | | - function (t) { |
264 | | - var result = [itemParser(t)]; |
265 | | - if (! result[0]) |
266 | | - return null; |
267 | | - |
268 | | - if (sepParser) { |
269 | | - var sep; |
270 | | - while ((sep = sepParser(t))) |
271 | | - result.push(sep, runRequired(itemParser, t, sep)); |
272 | | - } else { |
273 | | - var item; |
274 | | - while ((item = itemParser(t))) |
275 | | - result.push(item); |
276 | | - } |
277 | | - return result; |
278 | | - }); |
279 | | -}; |
280 | | - |
281 | | -var seq = function (/*parsers*/) { |
282 | | - var args = arguments; |
283 | | - if (! args.length) |
284 | | - return describe("(empty)", |
285 | | - function (t) { return []; }); |
286 | | - |
287 | | - var description = args[0].description; |
288 | | - for (var i = 1; i < args.length; i++) |
289 | | - description += " " + args[i].description; |
290 | | - return describe( |
291 | | - description, |
292 | | - function (t) { |
293 | | - var result = []; |
294 | | - for (var i = 0, N = args.length; i < N; i++) { |
295 | | - // first item in sequence can fail, and we |
296 | | - // fail (without error); after that, error on failure |
297 | | - var r = runMaybeRequired(i > 0, args[i], t); |
298 | | - if (! r) |
299 | | - return null; |
300 | | - |
301 | | - if (r.unpack) // append array! |
302 | | - result.push.apply(result, r); |
303 | | - else |
304 | | - result.push(r); |
305 | | - } |
306 | | - return result; |
307 | | - }); |
308 | | -}; |
309 | | - |
310 | | -var unpack = function (arrayParser) { |
311 | | - return revalue(arrayParser, function (v) { |
312 | | - if (v && isArray(v)) |
313 | | - v.unpack = true; |
314 | | - return v; |
315 | | - }); |
316 | | -}; |
317 | | - |
318 | | -// lookAhead parser must never consume |
319 | | -var lookAhead = function (lookAheadParser, nextParser) { |
320 | | - return describe( |
321 | | - lookAheadParser.description, |
322 | | - function (t) { |
323 | | - if (! lookAheadParser(t)) |
324 | | - return null; |
325 | | - return nextParser(t); |
326 | | - }); |
327 | | -}; |
328 | | -var negLookAhead = function (lookAheadParser, nextParser) { |
329 | | - if (! nextParser) |
330 | | - return function (t) { |
331 | | - return lookAheadParser(t) ? null : []; |
332 | | - }; |
333 | | - |
334 | | - return describe( |
335 | | - nextParser.description, |
336 | | - function (t) { |
337 | | - if (lookAheadParser(t)) |
338 | | - return null; |
339 | | - return nextParser(t); |
340 | | - }); |
341 | | -}; |
342 | | - |
343 | | -// parser that looks at nothing and returns result |
344 | | -var constant = function (result) { |
345 | | - // no description |
346 | | - return function (t) { |
347 | | - return result; |
348 | | - }; |
349 | | -}; |
350 | | - |
351 | | -// afterLookAhead allows the parser to fail rather than |
352 | | -// succeed if would otherwise fail at a position where |
353 | | -// afterLookAhead doesn't match, potentially providing |
354 | | -// a better error message. For example, the illegal |
355 | | -// object literal `{true:1}` will stop at the `true` |
356 | | -// and say something like "expected property name" |
357 | | -// instead of "expected }". As another example, |
358 | | -// `for(;var;) {}` will lead to "Expected expression" |
359 | | -// instead of "Expected ;" when the optional expression |
360 | | -// turns out to be an illegal `var`. |
361 | | -var opt = function (parser, afterLookAhead) { |
362 | | - return describe(parser.description, |
363 | | - or(parser, afterLookAhead ? afterLookAhead : seq())); |
364 | | -}; |
365 | | - |
366 | | -// note: valueTransformFunc gets the tokenizer as a second argument |
367 | | -// if it's called on a parser. This func is allowed to then |
368 | | -// run more parsers. |
369 | | -var revalue = function (parserOrValue, valueTransformFunc) { |
370 | | - if (typeof parserOrValue === 'function') |
371 | | - // it's a parser |
372 | | - return describe(parserOrValue.description, |
373 | | - function (t) { |
374 | | - return valueTransformFunc(parserOrValue(t), t); |
375 | | - }); |
376 | | - else |
377 | | - return valueTransformFunc(parserOrValue); |
378 | | -}; |
| 3 | +// XXX unit tests |
379 | 4 |
|
380 | 5 | var parse = function (tokenizer) { |
381 | 6 | var noLineTerminatorHere = describe( |
|
0 commit comments