-
Notifications
You must be signed in to change notification settings - Fork 13.4k
Correct Regular Expressions Behavior Related to Annex B #58320
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
e049438
358eb30
8facb0a
f5c0b60
cff993f
603c3cf
2e62d25
ed08ef7
8b67d77
b48f0d0
c72f92f
70a3214
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2675,215 +2675,46 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean | |
| const digitsStart = pos; | ||
| scanDigits(); | ||
| const min = tokenValue; | ||
| if (annexB && !min) { | ||
| isPreviousTermQuantifiable = true; | ||
| break; | ||
| } | ||
| if (charCodeChecked(pos) === CharacterCodes.comma) { | ||
| pos++; | ||
| scanDigits(); | ||
| const min = tokenValue; | ||
| if (annexB && !min) { | ||
| isPreviousTermQuantifiable = true; | ||
| break; | ||
| } | ||
| if (text.charCodeAt(pos) === CharacterCodes.comma) { | ||
| pos++; | ||
| scanDigits(); | ||
| const max = tokenValue; | ||
| if (!min) { | ||
| if (max || text.charCodeAt(pos) === CharacterCodes.closeBrace) { | ||
| error(Diagnostics.Incomplete_quantifier_Digit_expected, digitsStart, 0); | ||
| } | ||
| else { | ||
| error(Diagnostics.Unexpected_0_Did_you_mean_to_escape_it_with_backslash, start, 1, String.fromCharCode(ch)); | ||
| isPreviousTermQuantifiable = true; | ||
| break; | ||
| } | ||
| const max = tokenValue; | ||
| if (!min) { | ||
| if (max || charCodeChecked(pos) === CharacterCodes.closeBrace) { | ||
| error(Diagnostics.Incomplete_quantifier_Digit_expected, digitsStart, 0); | ||
| } | ||
| else if (max && Number.parseInt(min) > Number.parseInt(max) && (!annexB || text.charCodeAt(pos) === CharacterCodes.closeBrace)) { | ||
| error(Diagnostics.Numbers_out_of_order_in_quantifier, digitsStart, pos - digitsStart); | ||
| } | ||
| } | ||
| else if (!min) { | ||
| if (!annexB) { | ||
| else { | ||
| error(Diagnostics.Unexpected_0_Did_you_mean_to_escape_it_with_backslash, start, 1, String.fromCharCode(ch)); | ||
| } | ||
| isPreviousTermQuantifiable = true; | ||
| break; | ||
| } | ||
| if (text.charCodeAt(pos) !== CharacterCodes.closeBrace) { | ||
| if (annexB) { | ||
| isPreviousTermQuantifiable = true; | ||
| break; | ||
| } | ||
| else { | ||
| error(Diagnostics._0_expected, pos, 0, String.fromCharCode(CharacterCodes.closeBrace)); | ||
| pos--; | ||
| } | ||
| } | ||
| // falls through | ||
| case CharacterCodes.asterisk: | ||
| case CharacterCodes.plus: | ||
| case CharacterCodes.question: | ||
| pos++; | ||
| if (text.charCodeAt(pos) === CharacterCodes.question) { | ||
| // Non-greedy | ||
| pos++; | ||
| else if (max && Number.parseInt(min) > Number.parseInt(max) && (!annexB || text.charCodeAt(pos) === CharacterCodes.closeBrace)) { | ||
| error(Diagnostics.Numbers_out_of_order_in_quantifier, digitsStart, pos - digitsStart); | ||
| } | ||
| isPreviousTermQuantifiable = true; | ||
| break; | ||
| case CharacterCodes.openBracket: | ||
| pos++; | ||
| if (unicodeSetsMode) { | ||
| scanClassSetExpression(); | ||
| } | ||
| else { | ||
| scanClassRanges(); | ||
| } | ||
| else if (!min) { | ||
| if (!annexB) { | ||
| error(Diagnostics.Unexpected_0_Did_you_mean_to_escape_it_with_backslash, start, 1, String.fromCharCode(ch)); | ||
| } | ||
| scanExpectedChar(CharacterCodes.closeBracket); | ||
| isPreviousTermQuantifiable = true; | ||
| break; | ||
| case CharacterCodes.closeParen: | ||
| if (isInGroup) { | ||
| return; | ||
| } | ||
| // falls through | ||
| case CharacterCodes.closeBracket: | ||
| case CharacterCodes.closeBrace: | ||
| if (isUnterminated && !isInGroup) { | ||
| // Assume what starting from the character to be outside of the regex | ||
| return; | ||
| } | ||
| if (charCodeChecked(pos) !== CharacterCodes.closeBrace) { | ||
| if (annexB) { | ||
| isPreviousTermQuantifiable = true; | ||
| break; | ||
| } | ||
| if (!annexB || ch === CharacterCodes.closeParen) { | ||
| error(Diagnostics.Unexpected_0_Did_you_mean_to_escape_it_with_backslash, pos, 1, String.fromCharCode(ch)); | ||
| else { | ||
| error(Diagnostics._0_expected, pos, 0, String.fromCharCode(CharacterCodes.closeBrace)); | ||
| pos--; | ||
| } | ||
| pos++; | ||
| isPreviousTermQuantifiable = true; | ||
| break; | ||
| case CharacterCodes.slash: | ||
| case CharacterCodes.bar: | ||
| return; | ||
| default: | ||
| scanSourceCharacter(); | ||
| isPreviousTermQuantifiable = true; | ||
| break; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| function scanPatternModifiers(currFlags: RegularExpressionFlags): RegularExpressionFlags { | ||
| while (pos < end) { | ||
| const ch = text.charCodeAt(pos); | ||
| if (!isIdentifierPart(ch, languageVersion)) { | ||
| break; | ||
| } | ||
| const flag = characterToRegularExpressionFlag(String.fromCharCode(ch)); | ||
| if (flag === undefined) { | ||
| error(Diagnostics.Unknown_regular_expression_flag, pos, 1); | ||
| } | ||
| else if (currFlags & flag) { | ||
| error(Diagnostics.Duplicate_regular_expression_flag, pos, 1); | ||
| } | ||
| else if (!(flag & RegularExpressionFlags.Modifiers)) { | ||
| error(Diagnostics.This_regular_expression_flag_cannot_be_toggled_within_a_subpattern, pos, 1); | ||
| } | ||
| else { | ||
| currFlags |= flag; | ||
| const availableFrom = regExpFlagToFirstAvailableLanguageVersion.get(flag)!; | ||
| if (languageVersion < availableFrom) { | ||
| error(Diagnostics.This_regular_expression_flag_is_only_available_when_targeting_0_or_later, pos, 1, getNameOfScriptTarget(availableFrom)); | ||
| } | ||
| } | ||
| pos++; | ||
| } | ||
| return currFlags; | ||
| } | ||
|
|
||
| // AtomEscape ::= | ||
| // | DecimalEscape | ||
| // | CharacterClassEscape | ||
| // | CharacterEscape | ||
| // | 'k<' RegExpIdentifierName '>' | ||
| function scanAtomEscape() { | ||
| Debug.assertEqual(text.charCodeAt(pos - 1), CharacterCodes.backslash); | ||
| switch (text.charCodeAt(pos)) { | ||
| case CharacterCodes.k: | ||
| pos++; | ||
| if (text.charCodeAt(pos) === CharacterCodes.lessThan) { | ||
| pos++; | ||
| scanGroupName(/*isReference*/ true); | ||
| scanExpectedChar(CharacterCodes.greaterThan); | ||
| } | ||
| else { | ||
| // This is actually allowed in Annex B if there are no named capturing groups in the regex, | ||
| // but if we were going to suppress these errors, we would have to record the positions of all '\k's | ||
| // and defer the errors until after the scanning to know if the regex has any named capturing groups. | ||
| error(Diagnostics.k_must_be_followed_by_a_capturing_group_name_enclosed_in_angle_brackets, pos - 2, 2); | ||
| } | ||
| break; | ||
| case CharacterCodes.q: | ||
| if (unicodeSetsMode) { | ||
| pos++; | ||
| error(Diagnostics.q_is_only_available_inside_character_class, pos - 2, 2); | ||
| break; | ||
| } | ||
| scanExpectedChar(CharacterCodes.closeBrace); | ||
| pos--; | ||
| // falls through | ||
| default: | ||
| // The scanEscapeSequence call in scanCharacterEscape must return non-empty strings | ||
| // since there must not be line breaks in a regex literal | ||
| Debug.assert(scanCharacterClassEscape() || scanDecimalEscape() || scanCharacterEscape(/*atomEscape*/ true)); | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| // DecimalEscape ::= [1-9] [0-9]* | ||
| function scanDecimalEscape(): boolean { | ||
| Debug.assertEqual(text.charCodeAt(pos - 1), CharacterCodes.backslash); | ||
| const ch = text.charCodeAt(pos); | ||
| if (ch >= CharacterCodes._1 && ch <= CharacterCodes._9) { | ||
| const start = pos; | ||
| scanDigits(); | ||
| decimalEscapes.push({ pos: start, end: pos, value: +tokenValue }); | ||
| return true; | ||
| } | ||
| return false; | ||
| } | ||
|
|
||
| // CharacterEscape ::= | ||
| // | `c` ControlLetter | ||
| // | IdentityEscape | ||
| // | (Other sequences handled by `scanEscapeSequence`) | ||
| // IdentityEscape ::= | ||
| // | '^' | '$' | '/' | '\' | '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | ||
| // | [~UnicodeMode] (any other non-identifier characters) | ||
| function scanCharacterEscape(atomEscape: boolean): string { | ||
| Debug.assertEqual(text.charCodeAt(pos - 1), CharacterCodes.backslash); | ||
| let ch = text.charCodeAt(pos); | ||
| switch (ch) { | ||
| case CharacterCodes.c: | ||
| pos++; | ||
| ch = text.charCodeAt(pos); | ||
| if (isASCIILetter(ch)) { | ||
| pos++; | ||
| return String.fromCharCode(ch & 0x1f); | ||
| } | ||
| if (!annexB) { | ||
| error(Diagnostics.c_must_be_followed_by_an_ASCII_letter, pos - 2, 2); | ||
| } | ||
| else if (atomEscape) { | ||
| // Annex B treats | ||
| // | ||
| // ExtendedAtom : `\` [lookahead = `c`] | ||
| // | ||
| // as the single character `\` when `c` isn't followed by a valid control character | ||
| pos--; | ||
| return "\\"; | ||
| } | ||
| return String.fromCharCode(ch); | ||
| case CharacterCodes.caret: | ||
| case CharacterCodes.$: | ||
| case CharacterCodes.slash: | ||
| case CharacterCodes.backslash: | ||
| case CharacterCodes.dot: | ||
| case CharacterCodes.asterisk: | ||
| case CharacterCodes.plus: | ||
| case CharacterCodes.question: | ||
|
|
@@ -2923,7 +2754,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean | |
| // Assume what starting from the character to be outside of the regex | ||
| return; | ||
| } | ||
| if (unicodeMode || ch === CharacterCodes.closeParen) { | ||
| if (!annexB || ch === CharacterCodes.closeParen) { | ||
| error(Diagnostics.Unexpected_0_Did_you_mean_to_escape_it_with_backslash, pos, 1, String.fromCharCode(ch)); | ||
| } | ||
| pos++; | ||
|
|
@@ -2980,7 +2811,10 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean | |
| scanGroupName(/*isReference*/ true); | ||
| scanExpectedChar(CharacterCodes.greaterThan); | ||
| } | ||
| else if (unicodeMode) { | ||
| else { | ||
| // This is actually allowed in Annex B if there are no named capturing groups in the regex, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we keep track of whether we encountered a
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, didn’t think of this clever but dirty solution.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it's better to error on
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If then I would personally lean towards linting also
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Every decimal escape in a character class I've ever seen has been a bug, so it makes sense to error for that case. |
||
| // but if we were going to suppress these errors, we would have to record the positions of all '\k's | ||
| // and defer the errors until after the scanning to know if the regex has any named capturing groups. | ||
| error(Diagnostics.k_must_be_followed_by_a_capturing_group_name_enclosed_in_angle_brackets, pos - 2, 2); | ||
| } | ||
| break; | ||
|
|
@@ -3030,10 +2864,10 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean | |
| pos++; | ||
| return String.fromCharCode(ch & 0x1f); | ||
| } | ||
| if (unicodeMode) { | ||
| if (!annexB) { | ||
| error(Diagnostics.c_must_be_followed_by_an_ASCII_letter, pos - 2, 2); | ||
| } | ||
| else if (atomEscape && annexB) { | ||
| else if (atomEscape) { | ||
| // Annex B treats | ||
| // | ||
| // ExtendedAtom : `\` [lookahead = `c`] | ||
|
|
@@ -3588,15 +3422,39 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean | |
| } | ||
| } | ||
| } | ||
| else if (annexB) { | ||
| pos--; | ||
| return false; | ||
| } | ||
| else { | ||
| error(Diagnostics._0_must_be_followed_by_a_Unicode_property_value_expression_enclosed_in_braces, pos - 2, 2, String.fromCharCode(ch)); | ||
| if (pos === propertyNameOrValueStart) { | ||
| error(Diagnostics.Expected_a_Unicode_property_name_or_value); | ||
| } | ||
| else if (binaryUnicodePropertiesOfStrings.has(propertyNameOrValue)) { | ||
| if (!unicodeSetsMode) { | ||
| error(Diagnostics.Any_Unicode_property_that_would_possibly_match_more_than_a_single_character_is_only_available_when_the_Unicode_Sets_v_flag_is_set, propertyNameOrValueStart, pos - propertyNameOrValueStart); | ||
| } | ||
| else if (isCharacterComplement) { | ||
| error(Diagnostics.Anything_that_would_possibly_match_more_than_a_single_character_is_invalid_inside_a_negated_character_class, propertyNameOrValueStart, pos - propertyNameOrValueStart); | ||
| } | ||
| else { | ||
| mayContainStrings = true; | ||
| } | ||
| } | ||
| else if (!valuesOfNonBinaryUnicodeProperties.General_Category.has(propertyNameOrValue) && !binaryUnicodeProperties.has(propertyNameOrValue)) { | ||
| error(Diagnostics.Unknown_Unicode_property_name_or_value, propertyNameOrValueStart, pos - propertyNameOrValueStart); | ||
| const suggestion = getSpellingSuggestion(propertyNameOrValue, [...valuesOfNonBinaryUnicodeProperties.General_Category, ...binaryUnicodeProperties, ...binaryUnicodePropertiesOfStrings], identity); | ||
| if (suggestion) { | ||
| error(Diagnostics.Did_you_mean_0, propertyNameOrValueStart, pos - propertyNameOrValueStart, suggestion); | ||
| } | ||
| } | ||
| } | ||
| scanExpectedChar(CharacterCodes.closeBrace); | ||
| if (!unicodeMode) { | ||
| error(Diagnostics.Unicode_property_value_expressions_are_only_available_when_the_Unicode_u_flag_or_the_Unicode_Sets_v_flag_is_set, start, pos - start); | ||
| } | ||
| } | ||
| else if (unicodeMode) { | ||
| else if (annexB) { | ||
| pos--; | ||
| return false; | ||
| } | ||
| else { | ||
| error(Diagnostics._0_must_be_followed_by_a_Unicode_property_value_expression_enclosed_in_braces, pos - 2, 2, String.fromCharCode(ch)); | ||
| } | ||
| return true; | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Though it may be redundant, I think it might be better to still indicate `unicodeMode` here so that someone editing this code in the future doesn't mistakenly think this only applies to non-Annex B code. It may be better to use `unicodeMode || !annexB` and remove the `if (unicodeMode) { annexB = false; }` at the top of `scanRegularExpressionWorker`.
The same would go for other uses of `annexB` as well.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are you sure you are really fine with a dozen occurrences of `unicodeMode || !annexB`?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, but
would work.