diff --git a/src/state.js b/src/state.js
new file mode 100644
index 0000000000..cb5e4b4fbe
--- /dev/null
+++ b/src/state.js
@@ -0,0 +1,72 @@
+import {reservedWords, keywords} from "./identifier"
+import {types as tt} from "./tokentype"
+
+// Matches a single line break (CRLF counts as one). Declared here
+// because this module does not import the tokenizer's `lineBreak`.
+// Deliberately not global: it is only used with `split`.
+const lineTerminator = /\r\n?|\n|\u2028|\u2029/
+
+// Constructs a parser/tokenizer state object.
+//
+// - `options`: settings object. Read here: `plugins`, `sourceFile`,
+//   `ecmaVersion`, `sourceType` and `allowHashBang`.
+// - `input`: the source text to tokenize.
+// - `startPos`: optional offset to start at. When falsy, tokenizing
+//   starts at offset 0, line 1.
+export default function Parser(options, input, startPos) {
+  this.options = options
+  this.loadPlugins(this.options.plugins)
+  this.sourceFile = this.options.sourceFile || null
+  this.isKeyword = keywords[this.options.ecmaVersion >= 6 ? 6 : 5]
+  this.isReservedWord = reservedWords[this.options.ecmaVersion]
+  this.input = input
+
+  // Set up token state
+
+  // The current position of the tokenizer in the input.
+  if (startPos) {
+    this.pos = startPos;
+    // The current line begins right after the last "\n" that precedes
+    // `startPos`; `lastIndexOf` yields -1 when there is none, which
+    // the `+ 1` turns into offset 0.
+    this.lineStart = this.input.lastIndexOf("\n", startPos - 1) + 1;
+    // One more line than there are line breaks before `lineStart`.
+    this.curLine = this.input.slice(0, this.lineStart).split(lineTerminator).length;
+  } else {
+    this.pos = this.lineStart = 0;
+    this.curLine = 1;
+  }
+
+  // Properties of the current token:
+  // Its type
+  this.type = tt.eof;
+  // For tokens that include more information than their type, the value
+  this.value = null;
+  // Its start and end offset
+  this.start = this.end = this.pos;
+  // And, if locations are used, the {line, column} object
+  // corresponding to those offsets
+  this.startLoc = this.endLoc = null;
+
+  // Position information for the previous token
+  this.lastTokEndLoc = this.lastTokStartLoc = null;
+  this.lastTokStart = this.lastTokEnd = this.pos;
+
+  // The context stack is used to superficially track syntactic
+  // context to predict whether a regular expression is allowed in a
+  // given position.
+  this.context = this.initialContext()
+  this.exprAllowed = true
+
+  // Figure out if it's a module code.
+  this.strict = this.inModule = this.options.sourceType === "module";
+
+  // Flags to track whether we are in a function, a generator.
+  this.inFunction = this.inGenerator = false;
+  // Labels in scope.
+  this.labels = [];
+
+  // If enabled, skip leading hashbang line.
+  if (this.pos === 0 && this.options.allowHashBang && this.input.slice(0, 2) === '#!')
+    this.skipLineComment(2);
+}
diff --git a/src/tokencontext.js b/src/tokencontext.js
new file mode 100644
index 0000000000..870f3f1325
--- /dev/null
+++ b/src/tokencontext.js
@@ -0,0 +1,120 @@
+// The algorithm used to determine whether a regexp can appear at a
+// given point in the program is loosely based on sweet.js' approach.
+// See https://github.com/mozilla/sweet.js/wiki/design
+
+import Parser from "./state"
+import {types as tt} from "./tokentype"
+
+// Matches a line break anywhere in a string. Local and non-global, so
+// `test` carries no `lastIndex` state from one call to the next.
+const containsLineBreak = /\r\n?|\n|\u2028|\u2029/
+
+// One entry of the parser's context stack.
+// - `token`: the text of the token that opened the context
+// - `isExpr`: whether the context is an expression context
+// - `preserveSpace`: when truthy, `nextToken` does not skip whitespace
+// - `override`: optional function that reads the next token itself
+export class TokContext {
+  constructor(token, isExpr, preserveSpace, override) {
+    this.token = token
+    this.isExpr = isExpr
+    this.preserveSpace = preserveSpace
+    this.override = override
+  }
+}
+
+// The known contexts: `b_` for braces, `p_` for parentheses, `q_tmpl`
+// for backquotes and `f_expr` for `function`.
+export const types = {
+  b_stat: new TokContext("{", false),
+  b_expr: new TokContext("{", true),
+  b_tmpl: new TokContext("${", true, null, p => p.readTmplToken()),
+  p_stat: new TokContext("(", false),
+  p_expr: new TokContext("(", true),
+  q_tmpl: new TokContext("`", true, true),
+  f_expr: new TokContext("function", true)
+}
+
+const pp = Parser.prototype
+
+// The context stack a new parser starts with.
+pp.initialContext = function() {
+  return [types.b_stat]
+}
+
+// Decides whether a `{` that follows a token of type `prevType` opens
+// a statement block (true) or an expression (false).
+pp.braceIsBlock = function(prevType) {
+  var parent;
+  // After a colon inside braces, follow the kind of those braces.
+  if (prevType === tt.colon && (parent = this.curContext()) && parent.token == "{")
+    return !parent.isExpr;
+  // After `return`, only a line break before the brace makes it a block.
+  if (prevType === tt._return)
+    return containsLineBreak.test(this.input.slice(this.lastTokEnd, this.start));
+  if (prevType === tt._else || prevType === tt.semi || prevType === tt.eof)
+    return true;
+  if (prevType == tt.braceL)
+    return this.curContext() === types.b_stat;
+  return !this.exprAllowed;
+};
+
+// Updates `exprAllowed` (and possibly the context stack) after a token
+// has been read. `prevType` is the type of the token before it.
+pp.updateContext = function(prevType) {
+  var update, type = this.type;
+  if (type.keyword && prevType == tt.dot)
+    this.exprAllowed = false;
+  else if (update = type.updateContext)
+    update.call(this, prevType);
+  else
+    this.exprAllowed = type.beforeExpr;
+};
+
+// Token-specific context update code
+
+tt.parenR.updateContext = tt.braceR.updateContext = function() {
+  var out = this.context.pop();
+  if (out === types.b_stat && this.curContext() === types.f_expr) {
+    this.context.pop();
+    this.exprAllowed = false;
+  } else if (out === types.b_tmpl) {
+    this.exprAllowed = true;
+  } else {
+    this.exprAllowed = !(out && out.isExpr);
+  }
+};
+
+tt.braceL.updateContext = function(prevType) {
+  this.context.push(this.braceIsBlock(prevType) ? types.b_stat : types.b_expr);
+  this.exprAllowed = true;
+};
+
+tt.dollarBraceL.updateContext = function() {
+  this.context.push(types.b_tmpl);
+  this.exprAllowed = true;
+};
+
+tt.parenL.updateContext = function(prevType) {
+  var statementParens = prevType === tt._if || prevType === tt._for || prevType === tt._with || prevType === tt._while;
+  this.context.push(statementParens ? types.p_stat : types.p_expr);
+  this.exprAllowed = true;
+};
+
+tt.incDec.updateContext = function() {
+  // exprAllowed stays unchanged
+};
+
+tt._function.updateContext = function() {
+  if (this.curContext() !== types.b_stat)
+    this.context.push(types.f_expr);
+  this.exprAllowed = false;
+};
+
+tt.backQuote.updateContext = function() {
+  if (this.curContext() === types.q_tmpl)
+    this.context.pop();
+  else
+    this.context.push(types.q_tmpl);
+  this.exprAllowed = false;
+};
diff --git a/src/tokenize.js b/src/tokenize.js
new file mode 100644
index 0000000000..9a2007e15d
--- /dev/null
+++ b/src/tokenize.js
@@ -0,0 +1,724 @@
+import {isIdentifierStart, isIdentifierChar} from "./identifier"
+import {types as tt, keywords as keywordTypes} from "./tokentype"
+import Parser from "./state"
+import "./tokencontext"
+
+// Object type used to represent tokens. Note that normally, tokens
+// simply exist as properties on the parser object. This is only
+// used for the onToken callback and the external tokenizer.
+
+export class Token {
+  // Snapshots the current token of parser `p`: its `type`, `value`,
+  // `start` and `end`, plus `loc` when `p.options.locations` is set
+  // and `range` when `p.options.ranges` is set.
+  constructor(p) {
+    this.type = p.type;
+    this.value = p.value;
+    this.start = p.start;
+    this.end = p.end;
+    if (p.options.locations)
+      this.loc = new SourceLocation(p, p.startLoc, p.endLoc);
+    if (p.options.ranges)
+      this.range = [p.start, p.end];
+  }
+}
+
+// Matches a whole line break (where CRLF is considered a single
+// line break). Used to count lines. The regexp is global, so `exec`
+// and `test` resume from its `lastIndex`; set `lastIndex` before
+// relying on either.
+
+export const lineBreak = /\r\n?|\n|\u2028|\u2029/g
+
+// Tells whether the character code `code` is a line terminator
+// (LF, CR, U+2028 or U+2029).
+export function isNewLine(code) {
+  return code === 10 || code === 13 || code === 0x2028 || code === 0x2029
+}
+
+// ## Tokenizer
+
+// These are used when `options.locations` is on, for the
+// `startLoc` and `endLoc` properties.
+
+class Position {
+  constructor(line, col) {
+    this.line = line
+    this.column = col
+  }
+
+  // Returns a new position `n` columns further along the same line.
+  offset(n) {
+    return new Position(this.line, this.column + n)
+  }
+}
+
+// Shorthand because we are going to be adding a _lot_ of methods to
+// this.
+const pp = Parser.prototype
+
+// Replaces the method called `name` with whatever `f` returns when
+// given the current implementation, so that `f` can wrap it.
+pp.extend = function(name, f) {
+  this[name] = f(this[name]);
+};
+
+// Runs every plugin named by a key of `plugins`, passing it this
+// parser and the value stored under that key. Throws an Error for a
+// name that is not registered.
+// NOTE(review): the registry is read from `exports.plugins`, which is
+// not defined in the visible part of this file -- confirm where it is
+// populated.
+pp.loadPlugins = function(plugins) {
+  for (var name in plugins) {
+    var plugin = exports.plugins[name];
+    if (!plugin) throw new Error("Plugin '" + name + "' not found");
+    plugin(this, plugins[name]);
+  }
+};
+
+// Move to the next token. Reports the current token to
+// `options.onToken` first (when set) and remembers its offsets and
+// locations as the `lastTok*` properties.
+
+pp.next = function() {
+  if (this.options.onToken)
+    this.options.onToken(new Token(this));
+
+  this.lastTokEnd = this.end;
+  this.lastTokStart = this.start;
+  this.lastTokEndLoc = this.endLoc;
+  this.lastTokStartLoc = this.startLoc;
+  this.nextToken();
+};
+
+// Advances to the next token and returns it as a `Token` object.
+pp.getToken = function() {
+  this.next();
+  return new Token(this);
+};
+
+// If we're in an ES6 environment, make parsers iterable. Iteration is
+// done once a token of type `tt.eof` has been read.
+if (typeof Symbol !== "undefined")
+  pp[Symbol.iterator] = function () {
+    var self = this;
+    return {next: function () {
+      var token = self.getToken();
+      return {
+        done: token.type === tt.eof,
+        value: token
+      };
+    }};
+  };
+
+// Toggle strict mode. Re-reads the next number or string to please
+// pedantic tests (`"use strict"; 010;` should fail).
+
+pp.setStrict = function(strict) {
+  this.strict = strict;
+  if (this.type !== tt.num && this.type !== tt.string) return;
+  this.pos = this.start;
+  if (this.options.locations) {
+    // Rewinding may cross line starts; undo the line bookkeeping.
+    while (this.pos < this.lineStart) {
+      this.lineStart = this.input.lastIndexOf("\n", this.lineStart - 2) + 1;
+      --this.curLine;
+    }
+  }
+  this.nextToken();
+};
+
+// The innermost entry of the context stack, or undefined when the
+// stack is empty.
+pp.curContext = function() {
+  return this.context[this.context.length - 1];
+};
+
+// Read a single token, updating the parser object's token-related
+// properties.
+
+pp.nextToken = function() {
+  let curContext = this.curContext()
+  if (!curContext || !curContext.preserveSpace) this.skipSpace()
+
+  this.start = this.pos
+  if (this.options.locations) this.startLoc = this.curPosition()
+  if (this.pos >= this.input.length) return this.finishToken(tt.eof)
+
+  // The stack can be empty (see the check above), so only consult
+  // `override` when there is a context to ask.
+  if (curContext && curContext.override) return curContext.override(this)
+  else this.readToken(this.fullCharCodeAtPos())
+}
+
+// Reads the token that starts with the character code `code`.
+pp.readToken = function(code) {
+  // Identifier or keyword. '\uXXXX' sequences are allowed in
+  // identifiers, so '\' also dispatches to that.
+  if (isIdentifierStart(code, this.options.ecmaVersion >= 6) || code === 92 /* '\' */)
+    return this.readWord();
+
+  return this.getTokenFromCode(code);
+};
+
+// Returns the character code at `pos`. A code in the surrogate range
+// is combined with the code unit after it into a single code point.
+pp.fullCharCodeAtPos = function() {
+  var code = this.input.charCodeAt(this.pos);
+  if (code <= 0xd7ff || code >= 0xe000) return code;
+  var next = this.input.charCodeAt(this.pos + 1);
+  return (code << 10) + next - 0x35fdc00;
+};
+
+// Skips the block comment that starts at `pos`, raising "Unterminated
+// comment" when no closing `*/` follows. With `options.locations` on,
+// `curLine` and `lineStart` are updated for each line break inside
+// the comment. `options.onComment`, when set, receives `true`, the
+// comment text, its start and end offsets and its start and end
+// positions.
+pp.skipBlockComment = function() {
+  var startLoc = this.options.onComment && this.options.locations && this.curPosition();
+  var start = this.pos, end = this.input.indexOf("*/", this.pos += 2);
+  if (end === -1) this.raise(this.pos - 2, "Unterminated comment");
+  this.pos = end + 2;
+  if (this.options.locations) {
+    lineBreak.lastIndex = start;
+    var match;
+    while ((match = lineBreak.exec(this.input)) && match.index < this.pos) {
+      ++this.curLine;
+      this.lineStart = match.index + match[0].length;
+    }
+  }
+  if (this.options.onComment)
+    this.options.onComment(true, this.input.slice(start + 2, end), start, this.pos,
+                           startLoc, this.options.locations && this.curPosition());
+};
+
+// Skips a line comment whose opening marker is `startSkip` characters
+// long, stopping in front of the next line terminator (or at the end
+// of the input). `options.onComment`, when set, receives `false`, the
+// comment text, its start and end offsets and its start and end
+// positions.
+pp.skipLineComment = function(startSkip) {
+  var start = this.pos;
+  var startLoc = this.options.onComment && this.options.locations && this.curPosition();
+  var ch = this.input.charCodeAt(this.pos+=startSkip);
+  while (this.pos < this.input.length && ch !== 10 && ch !== 13 && ch !== 8232 && ch !== 8233) {
+    ++this.pos;
+    ch = this.input.charCodeAt(this.pos);
+  }
+  if (this.options.onComment)
+    this.options.onComment(false, this.input.slice(start + startSkip, this.pos), start, this.pos,
+                           startLoc, this.options.locations && this.curPosition());
+};
+
+// Called at the start of the parse and after every token. Skips
+// whitespace and comments, and keeps `curLine` and `lineStart` up to
+// date when `options.locations` is on.
+
+pp.skipSpace = function() {
+  for (;;) {
+    if (this.pos >= this.input.length) return;
+    var code = this.input.charCodeAt(this.pos);
+    switch (code) {
+    case 32: // ' '
+    case 160: // '\xa0'
+    case 9: case 11: case 12:
+      ++this.pos;
+      break;
+    case 13:
+      // A '\r' directly followed by '\n' is consumed as one line break
+      this.pos += this.input.charCodeAt(this.pos + 1) === 10 ? 2 : 1;
+      if (this.options.locations) {
+        ++this.curLine;
+        this.lineStart = this.pos;
+      }
+      break;
+    case 10: case 8232: case 8233:
+      ++this.pos;
+      if (this.options.locations) {
+        ++this.curLine;
+        this.lineStart = this.pos;
+      }
+      break;
+    case 47: // '/'
+      var after = this.input.charCodeAt(this.pos + 1);
+      if (after === 42) this.skipBlockComment(); // '*'
+      else if (after === 47) this.skipLineComment(2); // '/'
+      else return;
+      break;
+    default:
+      if (code >= 5760 && nonASCIIwhitespace.test(String.fromCharCode(code)))
+        ++this.pos;
+      else
+        return;
+    }
+  }
+};
+
+// The current {line, column} position, derived from `curLine`, `pos`
+// and `lineStart`.
+pp.curPosition = function() {
+  var column = this.pos - this.lineStart;
+  return new Position(this.curLine, column);
+};
+
+// Called at the end of every token. Records `end` (and `endLoc` when
+// locations are on), stores the new `type` and `value`, and hands the
+// previous token type to `updateContext`, which maintains `context`
+// and `exprAllowed`.
+
+pp.finishToken = function(type, val) {
+  this.end = this.pos;
+  if (this.options.locations)
+    this.endLoc = this.curPosition();
+  var previousType = this.type;
+  this.type = type;
+  this.value = val;
+
+  this.updateContext(previousType);
+};
+
+// ### Token reading
+
+// This is the function that is called to fetch the next token. It
+// is somewhat obscure, because it works in character codes rather
+// than characters, and because operator parsing has been inlined
+// into it.
+//
+// All in the name of speed.
+//
+
+// Reads a token starting with '.': a number when a digit follows,
+// `...` (ecmaVersion 6 and up), or a single dot.
+pp.readToken_dot = function() {
+  var next = this.input.charCodeAt(this.pos + 1);
+  if (next >= 48 && next <= 57) return this.readNumber(true);
+  var next2 = this.input.charCodeAt(this.pos + 2);
+  if (this.options.ecmaVersion >= 6 && next === 46 && next2 === 46) { // 46 = dot '.'
+    this.pos += 3;
+    return this.finishToken(tt.ellipsis);
+  } else {
+    ++this.pos;
+    return this.finishToken(tt.dot);
+  }
+};
+
+// Reads a regexp when an expression is allowed here, otherwise `/=`
+// or `/`.
+pp.readToken_slash = function() { // '/'
+  var next = this.input.charCodeAt(this.pos + 1);
+  if (this.exprAllowed) {++this.pos; return this.readRegexp();}
+  if (next === 61) return this.finishOp(tt.assign, 2);
+  return this.finishOp(tt.slash, 1);
+};
+
+// Reads `*`, `%`, or either one followed by `=`.
+pp.readToken_mult_modulo = function(code) { // '%*'
+  var next = this.input.charCodeAt(this.pos + 1);
+  if (next === 61) return this.finishOp(tt.assign, 2);
+  return this.finishOp(code === 42 ? tt.star : tt.modulo, 1);
+};
+
+// Reads `|` or `&`, doubled (`||`, `&&`), or followed by `=`.
+pp.readToken_pipe_amp = function(code) { // '|&'
+  var next = this.input.charCodeAt(this.pos + 1);
+  if (next === code) return this.finishOp(code === 124 ? tt.logicalOR : tt.logicalAND, 2);
+  if (next === 61) return this.finishOp(tt.assign, 2);
+  return this.finishOp(code === 124 ? tt.bitwiseOR : tt.bitwiseAND, 1);
+};
+
+// Reads `^` or `^=`.
+pp.readToken_caret = function() { // '^'
+  var next = this.input.charCodeAt(this.pos + 1);
+  if (next === 61) return this.finishOp(tt.assign, 2);
+  return this.finishOp(tt.bitwiseXOR, 1);
+};
+
+// Reads `+`, `-`, `++`, `--`, `+=` or `-=`. A `-->` that has a line
+// break between it and the previous token is skipped as a line
+// comment instead.
+pp.readToken_plus_min = function(code) { // '+-'
+  var next = this.input.charCodeAt(this.pos + 1);
+  if (next === code) {
+    // `search` always scans from the start of the string, whereas
+    // `test` on the global `lineBreak` would resume from whatever
+    // `lastIndex` an earlier match left behind.
+    if (next == 45 && this.input.charCodeAt(this.pos + 2) == 62 &&
+        this.input.slice(this.lastTokEnd, this.pos).search(lineBreak) !== -1) {
+      // A `-->` line comment
+      this.skipLineComment(3);
+      this.skipSpace();
+      return this.nextToken();
+    }
+    return this.finishOp(tt.incDec, 2);
+  }
+  if (next === 61) return this.finishOp(tt.assign, 2);
+  return this.finishOp(tt.plusMin, 1);
+};
+
+pp.readToken_lt_gt = function(code) { // '<>'
+  var next = this.input.charCodeAt(this.pos + 1);
+  var size = 1;
+  if (next === code) {
+    size = code === 62 && this.input.charCodeAt(this.pos + 2) === 62 ? 3 : 2;
+    if (this.input.charCodeAt(this.pos + size) === 61) return this.finishOp(tt.assign, size + 1);
+    return this.finishOp(tt.bitShift, size);
+  }
+  if (next == 33 && code == 60 && this.input.charCodeAt(this.pos + 2) == 45 &&
+      this.input.charCodeAt(this.pos + 3) == 45) {
+    if (this.inModule) unexpected();
+    // `