Faster tokenizer lookahead (#13341)

* refactor: simplify token context structure * add benchmark * perf: return a sub-state on tokenizer lookahead * Update packages/babel-parser/src/tokenizer/index.js Co-authored-by: Brian Ng <bng412@gmail.com> * Update packages/babel-parser/src/tokenizer/index.js Co-authored-by: Brian Ng <bng412@gmail.com> * remove irrelevant comment * fix: guard curPosition with isLookahead * add test cases Co-authored-by: Brian Ng <bng412@gmail.com>
2021-05-25 21:12:38 -04:00
parent b1f57e5fb5
commit acf2a10899
10 changed files with 684 additions and 29 deletions
--- a/packages/babel-parser/src/tokenizer/context.js
+++ b/packages/babel-parser/src/tokenizer/context.js
@@ -7,22 +7,15 @@
 import { types as tt } from "./types";

 export class TokContext {
-  constructor(
-    token: string,
-    isExpr?: boolean,
-    preserveSpace?: boolean,
-    override?: ?Function, // Takes a Tokenizer as a this-parameter, and returns void.
-  ) {
+  constructor(token: string, isExpr?: boolean, preserveSpace?: boolean) {
    this.token = token;
    this.isExpr = !!isExpr;
    this.preserveSpace = !!preserveSpace;
-    this.override = override;
  }

  token: string;
  isExpr: boolean;
  preserveSpace: boolean;
-  override: ?Function;
 }

 export const types: {
@@ -34,7 +27,7 @@ export const types: {
  templateQuasi: new TokContext("${", false),
  parenStatement: new TokContext("(", false),
  parenExpression: new TokContext("(", true),
-  template: new TokContext("`", true, true, p => p.readTmplToken()),
+  template: new TokContext("`", true, true),
  functionExpression: new TokContext("function", true),
  functionStatement: new TokContext("function", false),
 };
--- a/packages/babel-parser/src/tokenizer/index.js
+++ b/packages/babel-parser/src/tokenizer/index.js
@@ -19,6 +19,7 @@ import {
  skipWhiteSpace,
 } from "../util/whitespace";
 import State from "./state";
+import type { LookaheadState } from "./state";

 const VALID_REGEX_FLAGS = new Set(["g", "m", "s", "i", "y", "u"]);

@@ -144,11 +145,9 @@ export default class Tokenizer extends ParserErrors {
  // Move to the next token

  next(): void {
-    if (!this.isLookahead) {
-      this.checkKeywordEscapes();
-      if (this.options.tokens) {
-        this.pushToken(new Token(this.state));
-      }
+    this.checkKeywordEscapes();
+    if (this.options.tokens) {
+      this.pushToken(new Token(this.state));
    }

    this.state.lastTokEnd = this.state.end;
@@ -175,14 +174,51 @@ export default class Tokenizer extends ParserErrors {
    return this.state.type === type;
  }

-  // TODO
+  /**
+   * Create a LookaheadState from current parser state
+   *
+   * @param {State} state
+   * @returns {LookaheadState}
+   * @memberof Tokenizer
+   */
+  createLookaheadState(state: State): LookaheadState {
+    return {
+      pos: state.pos,
+      value: null,
+      type: state.type,
+      start: state.start,
+      end: state.end,
+      lastTokEnd: state.end,
+      context: [this.curContext()],
+      exprAllowed: state.exprAllowed,
+      inType: state.inType,
+    };
+  }

-  lookahead(): State {
+  /**
+   * lookahead peeks the next token, skipping changes to token context and
+   * comment stack. For performance it returns a limited LookaheadState
+   * instead of full parser state.
+   *
+   * The { column, line } Loc info is not included in lookahead since such usage
+   * is rare. Although it may return other location properties e.g. `curLine` and
+   * `lineStart`, these properties are not listed in the LookaheadState interface
+   * and thus the returned value is _NOT_ reliable.
+   *
+   * The tokenizer should make best efforts to avoid using any parser state
+   * other than those defined in LookaheadState
+   *
+   * @returns {LookaheadState}
+   * @memberof Tokenizer
+   */
+  lookahead(): LookaheadState {
    const old = this.state;
-    this.state = old.clone(true);
+    // For performance we use a simpified tokenizer state structure
+    // $FlowIgnore
+    this.state = this.createLookaheadState(old);

    this.isLookahead = true;
-    this.next();
+    this.nextToken();
    this.isLookahead = false;

    const curr = this.state;
@@ -247,17 +283,16 @@ export default class Tokenizer extends ParserErrors {

  nextToken(): void {
    const curContext = this.curContext();
-    if (!curContext?.preserveSpace) this.skipSpace();
+    if (!curContext.preserveSpace) this.skipSpace();
    this.state.start = this.state.pos;
-    this.state.startLoc = this.state.curPosition();
+    if (!this.isLookahead) this.state.startLoc = this.state.curPosition();
    if (this.state.pos >= this.length) {
      this.finishToken(tt.eof);
      return;
    }

-    const override = curContext?.override;
-    if (override) {
-      override(this);
+    if (curContext === ct.template) {
+      this.readTmplToken();
    } else {
      this.getTokenFromCode(this.codePointAtPos(this.state.pos));
    }
@@ -285,7 +320,8 @@ export default class Tokenizer extends ParserErrors {
  }

  skipBlockComment(): void {
-    const startLoc = this.state.curPosition();
+    let startLoc;
+    if (!this.isLookahead) startLoc = this.state.curPosition();
    const start = this.state.pos;
    const end = this.input.indexOf("*/", this.state.pos + 2);
    if (end === -1) throw this.raise(start, Errors.UnterminatedComment);
@@ -304,6 +340,7 @@ export default class Tokenizer extends ParserErrors {
    // If we are doing a lookahead right now we need to advance the position (above code)
    // but we do not want to push the comment to the state.
    if (this.isLookahead) return;
+    /*:: invariant(startLoc) */

    this.pushComment(
      true,
@@ -317,7 +354,8 @@ export default class Tokenizer extends ParserErrors {

  skipLineComment(startSkip: number): void {
    const start = this.state.pos;
-    const startLoc = this.state.curPosition();
+    let startLoc;
+    if (!this.isLookahead) startLoc = this.state.curPosition();
    let ch = this.input.charCodeAt((this.state.pos += startSkip));
    if (this.state.pos < this.length) {
      while (!isNewLine(ch) && ++this.state.pos < this.length) {
@@ -328,6 +366,7 @@ export default class Tokenizer extends ParserErrors {
    // If we are doing a lookahead right now we need to advance the position (above code)
    // but we do not want to push the comment to the state.
    if (this.isLookahead) return;
+    /*:: invariant(startLoc) */

    this.pushComment(
      false,
@@ -398,12 +437,14 @@ export default class Tokenizer extends ParserErrors {

  finishToken(type: TokenType, val: any): void {
    this.state.end = this.state.pos;
-    this.state.endLoc = this.state.curPosition();
    const prevType = this.state.type;
    this.state.type = type;
    this.state.value = val;

-    if (!this.isLookahead) this.updateContext(prevType);
+    if (!this.isLookahead) {
+      this.state.endLoc = this.state.curPosition();
+      this.updateContext(prevType);
+    }
  }

  // ### Token reading
--- a/packages/babel-parser/src/tokenizer/state.js
+++ b/packages/babel-parser/src/tokenizer/state.js
@@ -178,3 +178,14 @@ export default class State {
    return state;
  }
 }
+
+export type LookaheadState = {
+  pos: number,
+  value: any,
+  type: TokenType,
+  start: number,
+  end: number,
+  /* Used only in readSlashToken */
+  exprAllowed: boolean,
+  inType: boolean,
+};