diff --git a/acorn.js b/acorn.js index ede0d9e2bb..2e33050623 100644 --- a/acorn.js +++ b/acorn.js @@ -566,7 +566,8 @@ // These are a run-length and offset encoded representation of the // >0xffff code points that are a valid part of identifiers. The // offset starts at 0x10000, and each pair of numbers represents an - // offset to the next range, and then a size of the range. + // offset to the next range, and then a size of the range. They were + // generated by tools/generate-identifier-regex.js var astralIdentifierStartCodes = [0,11,2,25,2,18,2,1,2,14,3,13,35,122,70,52,268,28,4,48,48,31,17,26,6,37,11,29,3,35,5,7,2,4,43,157,99,39,9,51,157,310,10,21,11,7,153,5,3,0,2,43,2,1,4,0,3,22,11,22,10,30,98,21,11,25,71,55,7,1,65,0,16,3,2,2,2,26,45,28,4,28,36,7,2,27,28,53,11,21,11,18,14,17,111,72,955,52,76,44,33,24,27,35,42,34,4,0,13,47,15,3,22,0,38,17,2,24,133,46,39,7,3,1,3,21,2,6,2,1,2,4,4,0,32,4,287,47,21,1,2,0,185,46,82,47,21,0,60,42,502,63,32,0,449,56,1288,920,104,110,2962,1070,13266,568,8,30,114,29,19,47,17,3,32,20,6,18,881,68,12,0,67,12,16481,1,3071,106,6,12,4,8,8,9,5991,84,2,70,2,1,3,0,3,1,3,3,2,11,2,0,2,6,2,64,2,3,3,7,2,6,2,27,2,3,2,4,2,0,4,6,2,339,3,24,2,24,2,30,2,24,2,30,2,24,2,30,2,24,2,30,2,24,2,7,4149,196,1340,3,2,26,2,1,2,0,3,0,2,9,2,3,2,0,2,0,7,0,5,0,2,0,2,0,2,2,2,1,2,0,3,0,2,0,2,0,2,0,2,0,2,1,2,0,3,3,2,6,2,3,2,3,2,0,2,9,2,16,6,2,2,4,2,16,4421,42710,42,4148,12,221,16355,541]; var astralIdentifierCodes = [509,0,227,0,150,4,294,9,1368,2,2,1,6,3,41,2,5,0,166,1,1306,2,54,14,32,9,16,3,46,10,54,9,7,2,37,13,2,9,52,0,13,2,49,13,16,9,83,11,168,11,6,9,8,2,57,0,2,6,3,1,3,2,10,0,11,1,3,6,4,4,316,19,13,9,214,6,3,8,112,16,16,9,82,12,9,9,535,9,20855,9,135,4,60,6,26,9,1016,45,17,3,19723,1,5319,4,4,5,9,7,3,6,31,3,149,2,1418,49,4305,6,792618,239]; diff --git a/tools/generate-identifier-regex.js b/tools/generate-identifier-regex.js index 88146e2224..0d7c50fc38 100644 --- a/tools/generate-identifier-regex.js +++ b/tools/generate-identifier-regex.js @@ -1,51 +1,47 @@ -// Based on https://gist.github.com/mathiasbynens/6334847 by @mathias - -var regenerate = require('regenerate'); +// Note: run `npm install unicode-7.0.0` first. // Which Unicode version should be used? -var version = '7.0.0'; // note: also update `package.json` when this changes +var version = '7.0.0'; -// Shorthand function -var get = function(what) { - return require('unicode-' + version + '/' + what + '/code-points'); -}; +var start = require('unicode-' + version + '/properties/ID_Start/code-points') + .filter(function(ch) { return ch > 127; }); +var cont = [0x200c, 0x200d].concat(require('unicode-' + version + '/properties/ID_Continue/code-points') + .filter(function(ch) { return ch > 127 && start.indexOf(ch) == -1; })); -// Unicode categories needed to construct the ES5 regex -var Lu = get('categories/Lu'); -var Ll = get('categories/Ll'); -var Lt = get('categories/Lt'); -var Lm = get('categories/Lm'); -var Lo = get('categories/Lo'); -var Nl = get('categories/Nl'); -var Mn = get('categories/Mn'); -var Mc = get('categories/Mc'); -var Nd = get('categories/Nd'); -var Pc = get('categories/Pc'); +function pad(str, width) { + while (str.length < width) str = "0" + str; + return str; +} -var generateES5Regex = function() { // ES 5.1 - // http://mathiasbynens.be/notes/javascript-identifiers#valid-identifier-names - var identifierStart = regenerate('$', '_') - .add(Lu, Ll, Lt, Lm, Lo, Nl) - .removeRange(0x010000, 0x10FFFF) // remove astral symbols - .removeRange(0x0, 0x7F); // remove ASCII symbols (Acorn-specific) - var identifierPart = regenerate('\u200C', '\u200D', Mn, Mc, Nd, Pc) - .removeRange(0x010000, 0x10FFFF) // remove astral symbols - .remove(identifierStart) // (Acorn-specific) - .removeRange(0x0, 0x7F); // remove ASCII symbols (Acorn-specific) - return { - 'NonAsciiIdentifierStart': identifierStart.toString(), - 'NonAsciiIdentifierPart': identifierPart.toString() - }; -}; +function esc(code) { + var hex = code.toString(16); + if (hex.length <= 2) return "\\x" + pad(hex, 2); + else return "\\u" + pad(hex, 4); +} -var result = generateES5Regex(); -console.log( - '// ECMAScript 5.1/Unicode v%s `nonASCIIidentifierStart`:\n\n%s\n', - version, - result.NonAsciiIdentifierStart -); -console.log( - '// ECMAScript 5.1/Unicode v%s `nonASCIIidentifier`:\n\n%s', - version, - result.NonAsciiIdentifierPart -); +function generate(chars) { + var astral = [], re = ""; + for (var i = 0, at = 0x10000; i < chars.length; i++) { + var from = chars[i], to = from; + while (i < chars.length - 1 && chars[i + 1] == to + 1) { + i++; + to++; + } + if (to <= 0xffff) { + if (from == to) re += esc(from); + else if (from + 1 == to) re += esc(from) + esc(to); + else re += esc(from) + "-" + esc(to); + } else { + astral.push(from - at, to - from); + at = to; + } + } + return {nonASCII: re, astral: astral}; +} + +var startData = generate(start), contData = generate(cont); + +console.log(" var nonASCIIidentifierStartChars = \"" + startData.nonASCII + "\";"); +console.log(" var nonASCIIidentifierChars = \"" + contData.nonASCII + "\";"); +console.log(" var astralIdentifierStartCodes = " + JSON.stringify(startData.astral) + ";"); +console.log(" var astralIdentifierCodes = " + JSON.stringify(contData.astral) + ";");