|
|
- 'use strict';
-
- const generate = require('regjsgen').generate;
- const parse = require('regjsparser').parse;
- const regenerate = require('regenerate');
- const unicodeMatchProperty = require('unicode-match-property-ecmascript');
- const unicodeMatchPropertyValue = require('unicode-match-property-value-ecmascript');
- const iuMappings = require('./data/iu-mappings.js');
- const ESCAPE_SETS = require('./data/character-class-escape-sets.js');
-
// Regenerate set spanning every Unicode code point (U+0000 through U+10FFFF).
// Negated character classes and negated property escapes are computed by
// removing the matched set from a clone of this one.
const UNICODE_SET = regenerate().addRange(0x000000, 0x10FFFF);

// Without the `u` flag, the range stops at U+FFFF, so negation in that mode is
// computed against this smaller set instead.
// https://mths.be/es6#sec-pattern-semantics
const BMP_SET = regenerate().addRange(0x0000, 0xFFFF);

// Code points that `/./u` is supposed to match: all Unicode code points minus
// the four `LineTerminator`s.
// https://mths.be/es6#sec-atom
// https://mths.be/es6#sec-line-terminators
const DOT_SET_UNICODE = UNICODE_SET.clone().remove(
  0x000A, // Line Feed <LF>
  0x000D, // Carriage Return <CR>
  0x2028, // Line Separator <LS>
  0x2029 // Paragraph Separator <PS>
);
-
// Look up the precomputed set for a character class escape. `character` is
// the lookup key; the flags decide which `ESCAPE_SETS` table is consulted:
// `REGULAR` without `unicode`, otherwise `UNICODE_IGNORE_CASE` or `UNICODE`
// depending on `ignoreCase`.
const getCharacterClassEscapeSet = (character, unicode, ignoreCase) => {
  let table = 'REGULAR';
  if (unicode) {
    table = ignoreCase ? 'UNICODE_IGNORE_CASE' : 'UNICODE';
  }
  return ESCAPE_SETS[table].get(character);
};
-
// Choose the set used to rewrite `.` in Unicode mode: the full `UNICODE_SET`
// when `dotAll` is truthy, `DOT_SET_UNICODE` otherwise. The shared set itself
// is returned, not a clone.
const getUnicodeDotSet = (dotAll) => {
  if (dotAll) {
    return UNICODE_SET;
  }
  return DOT_SET_UNICODE;
};
-
- const getUnicodePropertyValueSet = (property, value) => {
- const path = value ?
- `${ property }/${ value }` :
- `Binary_Property/${ property }`;
- try {
- return require(`regenerate-unicode-properties/${ path }.js`);
- } catch (exception) {
- throw new Error(
- `Failed to recognize value \`${ value }\` for property ` +
- `\`${ property }\`.`
- );
- }
- };
-
// Resolve a property escape that has no `=`: the text is either a
// `General_Category` value or the name of a binary property.
const handleLoneUnicodePropertyNameOrValue = (value) => {
  const generalCategory = 'General_Category';
  try {
    // First attempt: interpret `value` as a `General_Category` value.
    // `unicodeMatchPropertyValue` throws on invalid values; any failure in
    // this attempt (including loading the set) falls through to the binary
    // property lookup below.
    return getUnicodePropertyValueSet(
      generalCategory,
      unicodeMatchPropertyValue(generalCategory, value)
    );
  } catch (exception) {}
  // Second attempt: interpret `value` as a binary property name.
  // `unicodeMatchProperty` throws on invalid properties, and that error is
  // allowed to propagate.
  const binaryProperty = unicodeMatchProperty(value);
  return getUnicodePropertyValueSet(binaryProperty);
};
-
// Build the set for a Unicode property escape. `value` is the text of the
// escape, either `Name` or `Property=Value`. Always returns a fresh set: a
// clone of the resolved set, or, when `isNegative` is truthy, a clone of
// `UNICODE_SET` with the resolved set removed.
const getUnicodePropertyEscapeSet = (value, isNegative) => {
  const segments = value.split('=');
  const resolveSet = () => {
    if (segments.length === 1) {
      // A lone name: a `General_Category` value or a binary property.
      return handleLoneUnicodePropertyNameOrValue(segments[0]);
    }
    // The pattern consists of two parts, i.e. `Property=Value`.
    const property = unicodeMatchProperty(segments[0]);
    const propertyValue = unicodeMatchPropertyValue(property, segments[1]);
    return getUnicodePropertyValueSet(property, propertyValue);
  };
  const resolved = resolveSet();
  return isNegative ?
    UNICODE_SET.clone().remove(resolved) :
    resolved.clone();
};
-
// Adds an `iuAddRange(min, max)` method to regenerate sets: for each code
// point from `min` up to and including `max`, look up its case-folded
// counterpart via `caseFold` and, if there is one, add it to the set.
// Returns the set so calls can be chained.
regenerate.prototype.iuAddRange = function(min, max) {
  let codePoint = min;
  // do/while: `min` itself is always examined, even before the bound check.
  do {
    const mapped = caseFold(codePoint);
    if (mapped) {
      this.add(mapped);
    }
  } while (++codePoint <= max);
  return this;
};
-
// Parse `pattern` and copy the resulting node's properties onto `item`
// (via `Object.assign`, so `item` keeps its identity). The pattern is parsed
// with the `u` flag only when `config.useUnicodeFlag` is set. Parsed nodes
// other than a character class, a group or a single value are first wrapped
// in a non-capturing group.
const update = (item, pattern) => {
  const parsed = parse(pattern, config.useUnicodeFlag ? 'u' : '');
  const isSelfContained =
    parsed.type === 'characterClass' ||
    parsed.type === 'group' ||
    parsed.type === 'value';
  const replacement = isSelfContained ? parsed : wrap(parsed, pattern);
  Object.assign(item, replacement);
};
-
// Build a non-capturing group node (`behavior: 'ignore'`) whose only child is
// `tree` and whose raw source is `pattern` wrapped in `(?:…)`.
const wrap = (tree, pattern) => ({
  'type': 'group',
  'behavior': 'ignore',
  'body': [tree],
  'raw': `(?:${ pattern })`
});
-
// Return the entry `iuMappings` holds for `codePoint`, or `false` when there
// is no entry (or the entry is falsy).
const caseFold = (codePoint) => {
  const mapped = iuMappings.get(codePoint);
  return mapped ? mapped : false;
};
-
// Rewrite a character class node in place. The members of the class are
// collected into a single regenerate set, the set is inverted for negated
// classes, and the node is then replaced (via `update`) with the pattern the
// set serializes to under `regenerateOptions`. Returns the same node.
const processCharacterClass = (characterClassItem, regenerateOptions) => {
  // Case-folded counterparts are only added for `iu` patterns whose `u` flag
  // is not kept in the output.
  const shouldFold =
    config.ignoreCase && config.unicode && !config.useUnicodeFlag;
  let set = regenerate();
  const addFolded = (codePoint) => {
    const folded = caseFold(codePoint);
    if (folded) {
      set.add(folded);
    }
  };
  for (const member of characterClassItem.body) {
    switch (member.type) {
      case 'value': {
        set.add(member.codePoint);
        if (shouldFold) {
          addFolded(member.codePoint);
        }
        break;
      }
      case 'characterClassRange': {
        const from = member.min.codePoint;
        const to = member.max.codePoint;
        set.addRange(from, to);
        if (shouldFold) {
          set.iuAddRange(from, to);
        }
        break;
      }
      case 'characterClassEscape': {
        const escapeSet = getCharacterClassEscapeSet(
          member.value,
          config.unicode,
          config.ignoreCase
        );
        set.add(escapeSet);
        break;
      }
      case 'unicodePropertyEscape': {
        const propertySet = getUnicodePropertyEscapeSet(
          member.value,
          member.negative
        );
        set.add(propertySet);
        break;
      }
      // The `default` clause is only here as a safeguard; it should never be
      // reached. Code coverage tools should ignore it.
      /* istanbul ignore next */
      default:
        throw new Error(`Unknown term type: ${ member.type }`);
    }
  }
  if (characterClassItem.negative) {
    // Invert against all of Unicode in `u` mode, against the BMP otherwise.
    const universe = config.unicode ? UNICODE_SET : BMP_SET;
    set = universe.clone().remove(set);
  }
  update(characterClassItem, set.toString(regenerateOptions));
  return characterClassItem;
};
-
// Turn a named reference node into a numbered one: record the group's index
// as `matchIndex` and drop the `name` property. Mutates `item`.
const updateNamedReference = (item, index) => {
  item.matchIndex = index;
  delete item.name;
};
-
// Throw if `groups.unmatchedReferences` still has any keys, i.e. some named
// references never found their group. The error lists the offending names.
const assertNoUnmatchedReferences = (groups) => {
  const pendingNames = Object.keys(groups.unmatchedReferences);
  if (pendingNames.length === 0) {
    return;
  }
  throw new Error(`Unknown group names: ${pendingNames}`);
};
-
// Rewrite one node of the parsed pattern in place and return it; container
// nodes are processed recursively.
//
// - `item`: the node to process. It is mutated, as are its descendants.
// - `regenerateOptions`: passed through to every set's `toString` call.
// - `groups`: bookkeeping shared across the whole pattern. This function
//   reads and writes `lastIndex`, `names` and `unmatchedReferences`, and
//   calls `onNamedGroup(name, index)` if it is set.
//
// Throws when two groups share a name, or when a node has an unknown type.
const processTerm = (item, regenerateOptions, groups) => {
  switch (item.type) {
    case 'dot':
      if (config.unicode) {
        update(
          item,
          getUnicodeDotSet(config.dotAll).toString(regenerateOptions)
        );
      } else if (config.dotAll) {
        // TODO: consider changing this at the regenerate level.
        update(item, '[\\s\\S]');
      }
      break;
    case 'characterClass':
      item = processCharacterClass(item, regenerateOptions);
      break;
    case 'unicodePropertyEscape':
      update(
        item,
        getUnicodePropertyEscapeSet(item.value, item.negative)
          .toString(regenerateOptions)
      );
      break;
    case 'characterClassEscape':
      update(
        item,
        getCharacterClassEscapeSet(
          item.value,
          config.unicode,
          config.ignoreCase
        ).toString(regenerateOptions)
      );
      break;
    case 'group':
      // Only groups with `normal` behavior advance the group counter.
      if (item.behavior === 'normal') {
        groups.lastIndex++;
      }
      if (item.name) {
        const name = item.name.value;

        if (groups.names[name]) {
          throw new Error(
            `Multiple groups with the same name (${ name }) are not allowed.`
          );
        }

        // Drop the name from the node and remember which index it maps to.
        const index = groups.lastIndex;
        delete item.name;

        groups.names[name] = index;
        if (groups.onNamedGroup) {
          groups.onNamedGroup.call(null, name, index);
        }

        // Resolve references to this name that appeared before the group.
        if (groups.unmatchedReferences[name]) {
          groups.unmatchedReferences[name].forEach(reference => {
            updateNamedReference(reference, index);
          });
          delete groups.unmatchedReferences[name];
        }
      }
      /* falls through */
    case 'alternative':
    case 'disjunction':
    case 'quantifier':
      item.body = item.body.map(term => {
        return processTerm(term, regenerateOptions, groups);
      });
      break;
    case 'value': {
      // Braces keep these declarations local to this case.
      const codePoint = item.codePoint;
      const set = regenerate(codePoint);
      if (config.ignoreCase && config.unicode && !config.useUnicodeFlag) {
        const folded = caseFold(codePoint);
        if (folded) {
          set.add(folded);
        }
      }
      update(item, set.toString(regenerateOptions));
      break;
    }
    case 'reference':
      if (item.name) {
        const name = item.name.value;
        const index = groups.names[name];
        if (index) {
          updateNamedReference(item, index);
          break;
        }

        if (!groups.unmatchedReferences[name]) {
          groups.unmatchedReferences[name] = [];
        }
        // Keep track of references used before the corresponding group.
        groups.unmatchedReferences[name].push(item);
      }
      break;
    case 'anchor':
    case 'empty':
      // Nothing to do here.
      break;
    // The `default` clause is only here as a safeguard; it should never be
    // reached. Code coverage tools should ignore it.
    /* istanbul ignore next */
    default:
      throw new Error(`Unknown term type: ${ item.type }`);
  }
  return item;
};
-
// Module-level settings for the pattern currently being rewritten. Every
// field starts out `false`; `rewritePattern` overwrites all of them at the
// start of each call, and the processing helpers read them.
const config = {
  ignoreCase: false,
  unicode: false,
  dotAll: false,
  useUnicodeFlag: false
};
// Rewrite `pattern`, written for the given `flags`, and return the source
// that `generate` produces for the processed tree.
//
// `options` is optional. The keys read from it are `unicodePropertyEscape`,
// `namedGroup`, `lookbehind`, `dotAllFlag`, `useUnicodeFlag` and
// `onNamedGroup`. Each call first overwrites the module-level `config`.
// Throws if a named reference is left without a matching group.
const rewritePattern = (pattern, flags, options) => {
  // Both helpers mirror `x && x.y` exactly: when `options` / `flags` is
  // falsy, that falsy value itself is the result.
  const readOption = (key) => options && options[key];
  const hasFlag = (flag) => flags && flags.includes(flag);

  const regjsparserFeatures = {
    'unicodePropertyEscape': readOption('unicodePropertyEscape'),
    'namedGroups': readOption('namedGroup'),
    'lookbehind': readOption('lookbehind')
  };

  config.ignoreCase = hasFlag('i');
  config.unicode = hasFlag('u');
  // The `s` flag only counts when the `dotAllFlag` option is enabled.
  config.dotAll = readOption('dotAllFlag') && hasFlag('s');
  config.useUnicodeFlag = readOption('useUnicodeFlag');

  const regenerateOptions = {
    'hasUnicodeFlag': config.useUnicodeFlag,
    'bmpOnly': !config.unicode
  };
  const groups = {
    'onNamedGroup': readOption('onNamedGroup'),
    'lastIndex': 0,
    'names': Object.create(null), // { [name]: index }
    'unmatchedReferences': Object.create(null) // { [name]: Array<reference> }
  };

  const tree = parse(pattern, flags, regjsparserFeatures);
  // Note: `processTerm` mutates `tree` and `groups`.
  processTerm(tree, regenerateOptions, groups);
  assertNoUnmatchedReferences(groups);
  return generate(tree);
};
-
- module.exports = rewritePattern;
|