From bc48c02a185c496505c999fc1f265acefb3913bc Mon Sep 17 00:00:00 2001 From: Marijn Haverbeke Date: Mon, 23 Feb 2015 15:38:06 +0100 Subject: [PATCH] Move the parser state into an object Makes almost everything in acorn.js a method of this object. Performance is not significantly affected on a modern V8. Makes the code reentrant and allows us to more easily expose more methods externally in the future. On the other hand, the proliferation of `this.` is unfortunate. --- acorn.js | 3256 ++++++++++++++++++++++++------------------------ test/driver.js | 3 +- 2 files changed, 1607 insertions(+), 1652 deletions(-) diff --git a/acorn.js b/acorn.js index 2e33050623..dac6d21f94 100644 --- a/acorn.js +++ b/acorn.js @@ -1,9 +1,7 @@ // Acorn is a tiny, fast JavaScript parser written in JavaScript. // // Acorn was written by Marijn Haverbeke and various contributors and -// released under an MIT license. The Unicode regexps (for identifiers -// and whitespace) were taken from [Esprima](http://esprima.org) by -// Ariya Hidayat. +// released under an MIT license. // // Git repositories for Acorn are available at // @@ -33,19 +31,16 @@ // The main exported interface (under `self.acorn` when in the // browser) is a `parse` function that takes a code string and // returns an abstract syntax tree as specified by [Mozilla parser - // API][api], with the caveat that inline XML is not recognized. + // API][api]. // // [api]: https://developer.mozilla.org/en-US/docs/SpiderMonkey/Parser_API - var options, input, inputLen, sourceFile; - - exports.parse = function(inpt, opts) { - input = String(inpt); inputLen = input.length; - setOptions(opts); - initTokenState(); - var startPos = options.locations ? [tokPos, curPosition()] : tokPos; - initParserState(); - return parseTopLevel(options.program || startNodeAt(startPos)); + exports.parse = function(input, options) { + var p = new Parser(options, input); + var startPos = p.options.locations ? [p.pos, p.curPosition()] : p.pos; + p.skipSpace(); + p.readToken(); + return p.parseTopLevel(p.options.program || p.startNodeAt(startPos)); }; // A second optional argument can be given to further configure @@ -129,51 +124,13 @@ // offset in a string. Useful for parsing mixed-language formats // that embed JavaScript expressions. - exports.parseExpressionAt = function(inpt, pos, opts) { - input = String(inpt); inputLen = input.length; - setOptions(opts); - initTokenState(pos); - initParserState(); - return parseExpression(); + exports.parseExpressionAt = function(input, pos, options) { + var p = new Parser(options, input, pos); + p.skipSpace(); + p.readToken(); + return p.parseExpression(); }; - var isArray = function (obj) { - return Object.prototype.toString.call(obj) === "[object Array]"; - }; - - function setOptions(opts) { - options = {}; - for (var opt in defaultOptions) - options[opt] = opts && has(opts, opt) ? opts[opt] : defaultOptions[opt]; - sourceFile = options.sourceFile || null; - if (isArray(options.onToken)) { - var tokens = options.onToken; - options.onToken = function (token) { - tokens.push(token); - }; - } - if (isArray(options.onComment)) { - var comments = options.onComment; - options.onComment = function (block, text, start, end, startLoc, endLoc) { - var comment = { - type: block ? 'Block' : 'Line', - value: text, - start: start, - end: end - }; - if (options.locations) { - comment.loc = new SourceLocation(); - comment.loc.start = startLoc; - comment.loc.end = endLoc; - } - if (options.ranges) - comment.range = [start, end]; - comments.push(comment); - }; - } - isKeyword = options.ecmaVersion >= 6 ? isEcma6Keyword : isEcma5AndLessKeyword; - } - // The `getLineInfo` function is mostly useful when the // `locations` option is off (for performance reasons) and you // want to find the line/column position for a given character @@ -189,58 +146,61 @@ cur = match.index + match[0].length; } else break; } - return {line: line, column: offset - cur}; + return new Position(line, offset - cur); }; - function Token() { - this.type = tokType; - this.value = tokVal; - this.start = tokStart; - this.end = tokEnd; - if (options.locations) { - this.loc = new SourceLocation(); - this.loc.end = tokEndLoc; - } - if (options.ranges) - this.range = [tokStart, tokEnd]; - } + // Object type used to represent tokens. Note that normally, tokens + // simply exist as properties on the parser object. This is only + // used for the onToken callback and the external tokenizer. - exports.Token = Token; + var Token = exports.Token = function(p) { + this.type = p.type; + this.value = p.value; + this.start = p.start; + this.end = p.end; + if (p.options.locations) { + this.loc = new SourceLocation(p); + this.loc.end = p.endLoc; + } + if (p.options.ranges) + this.range = [p.start, p.end]; + }; // Acorn is organized as a tokenizer and a recursive-descent parser. // The `tokenize` export provides an interface to the tokenizer. // Because the tokenizer is optimized for being efficiently used by // the Acorn parser itself, this interface is somewhat crude and not - // very modular. Performing another parse or call to `tokenize` will - // reset the internal state, and invalidate existing tokenizers. + // very modular. - exports.tokenize = function(inpt, opts) { - input = String(inpt); inputLen = input.length; - setOptions(opts); - initTokenState(); - skipSpace(); + exports.tokenize = function(input, options) { + var p = new Parser(options, input); + p.skipSpace(); function getToken() { - lastEnd = tokEnd; - readToken(); - return new Token(); + p.lastTokEnd = p.end; + p.readToken(); + return new Token(p); } + + getToken.current = function() { return new Token(p); }; + getToken.jumpTo = function(pos, exprAllowed) { - tokPos = pos; - if (options.locations) { - tokCurLine = 1; - tokLineStart = lineBreak.lastIndex = 0; + p.pos = pos; + if (p.options.locations) { + p.curLine = 1; + p.lineStart = lineBreak.lastIndex = 0; var match; - while ((match = lineBreak.exec(input)) && match.index < pos) { - ++tokCurLine; - tokLineStart = match.index + match[0].length; + while ((match = lineBreak.exec(p.input)) && match.index < pos) { + ++p.curLine; + p.lineStart = match.index + match[0].length; } } - tokExprAllowed = !!exprAllowed; - skipSpace(); + p.exprAllowed = !!exprAllowed; + p.skipSpace(); }; - getToken.current = function() { return new Token(); }; - if (typeof Symbol !== 'undefined') { + + // If we're in an ES6 environment, make this an iterator. + if (typeof Symbol !== "undefined") { getToken[Symbol.iterator] = function () { return { next: function () { @@ -253,98 +213,45 @@ }; }; } - getToken.options = options; + + getToken.options = p.options; return getToken; }; - // State is kept in (closure-)global variables. We already saw the - // `options`, `input`, and `inputLen` variables above. + // Interpret and default an options object - // The current position of the tokenizer in the input. + function parseOptions(opts) { + var options = {}; + for (var opt in defaultOptions) + options[opt] = opts && has(opts, opt) ? opts[opt] : defaultOptions[opt]; - var tokPos; + if (isArray(options.onToken)) { + var tokens = options.onToken; + options.onToken = function (token) { tokens.push(token); }; + } + if (isArray(options.onComment)) + options.onComment = pushComment(options, options.onComment); - // The start and end offsets of the current token. - - var tokStart, tokEnd; - - // When `options.locations` is true, these hold objects - // containing the tokens start and end line/column pairs. - - var tokStartLoc, tokEndLoc; - - // The type and value of the current token. Token types are objects, - // named by variables against which they can be compared, and - // holding properties that describe them (indicating, for example, - // the precedence of an infix operator, and the original name of a - // keyword token). The kind of value that's held in `tokVal` depends - // on the type of the token. For literals, it is the literal value, - // for operators, the operator name, and so on. - - var tokType, tokVal; - - // Internal state for the tokenizer. To distinguish between division - // operators and regular expressions, it remembers whether the last - // token was one that is allowed to be followed by an expression. In - // some cases, notably after ')' or '}' tokens, the situation - // depends on the context before the matching opening bracket, so - // tokContext keeps a stack of information about current bracketed - // forms. - - var tokContext, tokExprAllowed; - - // When `options.locations` is true, these are used to keep - // track of the current line, and know when a new line has been - // entered. - - var tokCurLine, tokLineStart; - - // These store the position of the previous token, which is useful - // when finishing a node and assigning its `end` position. - - var lastStart, lastEnd, lastEndLoc; - - // This is the parser's state. `inFunction` is used to reject - // `return` statements outside of functions, `inGenerator` to - // reject `yield`s outside of generators, `labels` to verify - // that `break` and `continue` have somewhere to jump to, and - // `strict` indicates whether strict mode is on. - - var inFunction, inGenerator, labels, strict; - - function initParserState() { - lastStart = lastEnd = tokPos; - if (options.locations) lastEndLoc = curPosition(); - inFunction = inGenerator = false; - labels = []; - skipSpace(); - readToken(); + return options; } - // This function is used to raise exceptions on parse errors. It - // takes an offset integer (into the current `input`) to indicate - // the location of the error, attaches the position to the end - // of the error message, and then raises a `SyntaxError` with that - // message. - - function raise(pos, message) { - var loc = getLineInfo(input, pos); - message += " (" + loc.line + ":" + loc.column + ")"; - var err = new SyntaxError(message); - err.pos = pos; err.loc = loc; err.raisedAt = tokPos; - throw err; - } - - function fullCharCodeAtPos() { - var code = input.charCodeAt(tokPos); - if (code <= 0xd7ff || code >= 0xe000) return code; - var next = input.charCodeAt(tokPos + 1); - return (code << 10) + next - 0x35fdc00; - } - - function skipChar(code) { - if (code <= 0xffff) tokPos++; - else tokPos += 2; + function pushComment(options, array) { + return function (block, text, start, end, startLoc, endLoc) { + var comment = { + type: block ? 'Block' : 'Line', + value: text, + start: start, + end: end + }; + if (options.locations) { + comment.loc = new SourceLocation(this); + comment.loc.start = startLoc; + comment.loc.end = endLoc; + } + if (options.ranges) + comment.range = [start, end]; + array.push(comment); + }; } // Reused empty array added for node fields that are always empty. @@ -545,8 +452,6 @@ var isEcma6Keyword = makePredicate(ecma5AndLessKeywords + " let const class extends export import yield"); - var isKeyword = isEcma5AndLessKeyword; - // ## Character categories // Big ugly regular expressions that match characters in the @@ -626,7 +531,7 @@ // ## Tokenizer // These are used when `options.locations` is on, for the - // `tokStartLoc` and `tokEndLoc` properties. + // `startLoc` and `endLoc` properties. function Position(line, col) { this.line = line; @@ -637,670 +542,6 @@ return new Position(this.line, this.column + n); }; - function curPosition() { - return new Position(tokCurLine, tokPos - tokLineStart); - } - - // Reset the token state. Used at the start of a parse. - - function initTokenState(pos) { - if (pos) { - tokPos = pos; - tokLineStart = Math.max(0, input.lastIndexOf("\n", pos)); - tokCurLine = input.slice(0, tokLineStart).split(newline).length; - } else { - tokCurLine = 1; - tokPos = tokLineStart = 0; - } - tokType = _eof; - tokContext = [b_stat]; - tokExprAllowed = true; - strict = false; - if (tokPos === 0 && options.allowHashBang && input.slice(0, 2) === '#!') { - skipLineComment(2); - } - } - - // The algorithm used to determine whether a regexp can appear at a - // given point in the program is loosely based on sweet.js' approach. - // See https://github.com/mozilla/sweet.js/wiki/design - - var b_stat = {token: "{", isExpr: false}, b_expr = {token: "{", isExpr: true}, b_tmpl = {token: "${", isExpr: true}; - var p_stat = {token: "(", isExpr: false}, p_expr = {token: "(", isExpr: true}; - var q_tmpl = {token: "`", isExpr: true}, f_expr = {token: "function", isExpr: true}; - - function curTokContext() { - return tokContext[tokContext.length - 1]; - } - - function braceIsBlock(prevType) { - var parent; - if (prevType === _colon && (parent = curTokContext()).token == "{") - return !parent.isExpr; - if (prevType === _return) - return newline.test(input.slice(lastEnd, tokStart)); - if (prevType === _else || prevType === _semi || prevType === _eof) - return true; - if (prevType == _braceL) - return curTokContext() === b_stat; - return !tokExprAllowed; - } - - // Called at the end of every token. Sets `tokEnd`, `tokVal`, and - // maintains `tokContext` and `tokExprAllowed`, and skips the space - // after the token, so that the next one's `tokStart` will point at - // the right position. - - function finishToken(type, val) { - tokEnd = tokPos; - if (options.locations) tokEndLoc = curPosition(); - var prevType = tokType, preserveSpace = false; - tokType = type; - tokVal = val; - - // Update context info - if (type === _parenR || type === _braceR) { - var out = tokContext.pop(); - if (out === b_tmpl) { - preserveSpace = true; - } else if (out === b_stat && curTokContext() === f_expr) { - tokContext.pop(); - tokExprAllowed = false; - } else { - tokExprAllowed = !(out && out.isExpr); - } - } else if (type === _braceL) { - tokContext.push(braceIsBlock(prevType) ? b_stat : b_expr); - tokExprAllowed = true; - } else if (type === _dollarBraceL) { - tokContext.push(b_tmpl); - tokExprAllowed = true; - } else if (type == _parenL) { - var statementParens = prevType === _if || prevType === _for || prevType === _with || prevType === _while; - tokContext.push(statementParens ? p_stat : p_expr); - tokExprAllowed = true; - } else if (type == _incDec) { - // tokExprAllowed stays unchanged - } else if (type.keyword && prevType == _dot) { - tokExprAllowed = false; - } else if (type == _function) { - if (curTokContext() !== b_stat) { - tokContext.push(f_expr); - } - tokExprAllowed = false; - } else if (type === _backQuote) { - if (curTokContext() === q_tmpl) { - tokContext.pop(); - } else { - tokContext.push(q_tmpl); - preserveSpace = true; - } - tokExprAllowed = false; - } else { - tokExprAllowed = type.beforeExpr; - } - - if (!preserveSpace) skipSpace(); - } - - function skipBlockComment() { - var startLoc = options.onComment && options.locations && curPosition(); - var start = tokPos, end = input.indexOf("*/", tokPos += 2); - if (end === -1) raise(tokPos - 2, "Unterminated comment"); - tokPos = end + 2; - if (options.locations) { - lineBreak.lastIndex = start; - var match; - while ((match = lineBreak.exec(input)) && match.index < tokPos) { - ++tokCurLine; - tokLineStart = match.index + match[0].length; - } - } - if (options.onComment) - options.onComment(true, input.slice(start + 2, end), start, tokPos, - startLoc, options.locations && curPosition()); - } - - function skipLineComment(startSkip) { - var start = tokPos; - var startLoc = options.onComment && options.locations && curPosition(); - var ch = input.charCodeAt(tokPos+=startSkip); - while (tokPos < inputLen && ch !== 10 && ch !== 13 && ch !== 8232 && ch !== 8233) { - ++tokPos; - ch = input.charCodeAt(tokPos); - } - if (options.onComment) - options.onComment(false, input.slice(start + startSkip, tokPos), start, tokPos, - startLoc, options.locations && curPosition()); - } - - // Called at the start of the parse and after every token. Skips - // whitespace and comments, and. - - function skipSpace() { - while (tokPos < inputLen) { - var ch = input.charCodeAt(tokPos); - if (ch === 32) { // ' ' - ++tokPos; - } else if (ch === 13) { - ++tokPos; - var next = input.charCodeAt(tokPos); - if (next === 10) { - ++tokPos; - } - if (options.locations) { - ++tokCurLine; - tokLineStart = tokPos; - } - } else if (ch === 10 || ch === 8232 || ch === 8233) { - ++tokPos; - if (options.locations) { - ++tokCurLine; - tokLineStart = tokPos; - } - } else if (ch > 8 && ch < 14) { - ++tokPos; - } else if (ch === 47) { // '/' - var next = input.charCodeAt(tokPos + 1); - if (next === 42) { // '*' - skipBlockComment(); - } else if (next === 47) { // '/' - skipLineComment(2); - } else break; - } else if (ch === 160) { // '\xa0' - ++tokPos; - } else if (ch >= 5760 && nonASCIIwhitespace.test(String.fromCharCode(ch))) { - ++tokPos; - } else { - break; - } - } - } - - // ### Token reading - - // This is the function that is called to fetch the next token. It - // is somewhat obscure, because it works in character codes rather - // than characters, and because operator parsing has been inlined - // into it. - // - // All in the name of speed. - // - function readToken_dot() { - var next = input.charCodeAt(tokPos + 1); - if (next >= 48 && next <= 57) return readNumber(true); - var next2 = input.charCodeAt(tokPos + 2); - if (options.ecmaVersion >= 6 && next === 46 && next2 === 46) { // 46 = dot '.' - tokPos += 3; - return finishToken(_ellipsis); - } else { - ++tokPos; - return finishToken(_dot); - } - } - - function readToken_slash() { // '/' - var next = input.charCodeAt(tokPos + 1); - if (tokExprAllowed) {++tokPos; return readRegexp();} - if (next === 61) return finishOp(_assign, 2); - return finishOp(_slash, 1); - } - - function readToken_mult_modulo(code) { // '%*' - var next = input.charCodeAt(tokPos + 1); - if (next === 61) return finishOp(_assign, 2); - return finishOp(code === 42 ? _star : _modulo, 1); - } - - function readToken_pipe_amp(code) { // '|&' - var next = input.charCodeAt(tokPos + 1); - if (next === code) return finishOp(code === 124 ? _logicalOR : _logicalAND, 2); - if (next === 61) return finishOp(_assign, 2); - return finishOp(code === 124 ? _bitwiseOR : _bitwiseAND, 1); - } - - function readToken_caret() { // '^' - var next = input.charCodeAt(tokPos + 1); - if (next === 61) return finishOp(_assign, 2); - return finishOp(_bitwiseXOR, 1); - } - - function readToken_plus_min(code) { // '+-' - var next = input.charCodeAt(tokPos + 1); - if (next === code) { - if (next == 45 && input.charCodeAt(tokPos + 2) == 62 && - newline.test(input.slice(lastEnd, tokPos))) { - // A `-->` line comment - skipLineComment(3); - skipSpace(); - return readToken(); - } - return finishOp(_incDec, 2); - } - if (next === 61) return finishOp(_assign, 2); - return finishOp(_plusMin, 1); - } - - function readToken_lt_gt(code) { // '<>' - var next = input.charCodeAt(tokPos + 1); - var size = 1; - if (next === code) { - size = code === 62 && input.charCodeAt(tokPos + 2) === 62 ? 3 : 2; - if (input.charCodeAt(tokPos + size) === 61) return finishOp(_assign, size + 1); - return finishOp(_bitShift, size); - } - if (next == 33 && code == 60 && input.charCodeAt(tokPos + 2) == 45 && - input.charCodeAt(tokPos + 3) == 45) { - // `` line comment + this.skipLineComment(3); + this.skipSpace(); + return this.readToken(); + } + return this.finishOp(_incDec, 2); + } + if (next === 61) return this.finishOp(_assign, 2); + return this.finishOp(_plusMin, 1); + }; + + pp.readToken_lt_gt = function(code) { // '<>' + var next = this.input.charCodeAt(this.pos + 1); + var size = 1; + if (next === code) { + size = code === 62 && this.input.charCodeAt(this.pos + 2) === 62 ? 3 : 2; + if (this.input.charCodeAt(this.pos + size) === 61) return this.finishOp(_assign, size + 1); + return this.finishOp(_bitShift, size); + } + if (next == 33 && code == 60 && this.input.charCodeAt(this.pos + 2) == 45 && + this.input.charCodeAt(this.pos + 3) == 45) { + // `