Skip to content
Merged
Prev Previous commit
Next Next commit
Merge from main
  • Loading branch information
rbuckton committed May 16, 2024
commit 603c3cf18ce89adab5c35a3275934f3ae9324a2c
262 changes: 60 additions & 202 deletions src/compiler/scanner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2675,215 +2675,46 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
const digitsStart = pos;
scanDigits();
const min = tokenValue;
if (annexB && !min) {
isPreviousTermQuantifiable = true;
break;
}
if (charCodeChecked(pos) === CharacterCodes.comma) {
pos++;
scanDigits();
const min = tokenValue;
if (annexB && !min) {
isPreviousTermQuantifiable = true;
break;
}
if (text.charCodeAt(pos) === CharacterCodes.comma) {
pos++;
scanDigits();
const max = tokenValue;
if (!min) {
if (max || text.charCodeAt(pos) === CharacterCodes.closeBrace) {
error(Diagnostics.Incomplete_quantifier_Digit_expected, digitsStart, 0);
}
else {
error(Diagnostics.Unexpected_0_Did_you_mean_to_escape_it_with_backslash, start, 1, String.fromCharCode(ch));
isPreviousTermQuantifiable = true;
break;
}
const max = tokenValue;
if (!min) {
if (max || charCodeChecked(pos) === CharacterCodes.closeBrace) {
error(Diagnostics.Incomplete_quantifier_Digit_expected, digitsStart, 0);
}
else if (max && Number.parseInt(min) > Number.parseInt(max) && (!annexB || text.charCodeAt(pos) === CharacterCodes.closeBrace)) {
error(Diagnostics.Numbers_out_of_order_in_quantifier, digitsStart, pos - digitsStart);
}
}
else if (!min) {
if (!annexB) {
else {
error(Diagnostics.Unexpected_0_Did_you_mean_to_escape_it_with_backslash, start, 1, String.fromCharCode(ch));
}
isPreviousTermQuantifiable = true;
break;
}
if (text.charCodeAt(pos) !== CharacterCodes.closeBrace) {
if (annexB) {
isPreviousTermQuantifiable = true;
break;
}
else {
error(Diagnostics._0_expected, pos, 0, String.fromCharCode(CharacterCodes.closeBrace));
pos--;
}
}
// falls through
case CharacterCodes.asterisk:
case CharacterCodes.plus:
case CharacterCodes.question:
pos++;
if (text.charCodeAt(pos) === CharacterCodes.question) {
// Non-greedy
pos++;
else if (max && Number.parseInt(min) > Number.parseInt(max) && (!annexB || text.charCodeAt(pos) === CharacterCodes.closeBrace)) {
error(Diagnostics.Numbers_out_of_order_in_quantifier, digitsStart, pos - digitsStart);
}
isPreviousTermQuantifiable = true;
break;
case CharacterCodes.openBracket:
pos++;
if (unicodeSetsMode) {
scanClassSetExpression();
}
else {
scanClassRanges();
}
else if (!min) {
if (!annexB) {
Copy link
Copy Markdown
Contributor

@rbuckton rbuckton May 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Though it may be redundant, I think it might be better to still indicate unicodeMode here so that someone editing this code in the future doesn't mistakenly think this only applies to non-Annex B code. It may be better to use unicodeMode || !annexB and remove the if (unicodeMode) { annexB = false; } at the top of scanRegularExpressionWorker.

The same would go for other uses of annexB as well.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you sure you are really fine with a dozen occurrences of unicodeMode || !annexB?

Copy link
Copy Markdown
Contributor

@rbuckton rbuckton May 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, but

const anyUnicodeModeOrNonAnnexB = unicodeMode || !annexB;

would work.

error(Diagnostics.Unexpected_0_Did_you_mean_to_escape_it_with_backslash, start, 1, String.fromCharCode(ch));
}
scanExpectedChar(CharacterCodes.closeBracket);
isPreviousTermQuantifiable = true;
break;
case CharacterCodes.closeParen:
if (isInGroup) {
return;
}
// falls through
case CharacterCodes.closeBracket:
case CharacterCodes.closeBrace:
if (isUnterminated && !isInGroup) {
// Assume that everything starting from this character is outside of the regex
return;
}
if (charCodeChecked(pos) !== CharacterCodes.closeBrace) {
if (annexB) {
isPreviousTermQuantifiable = true;
break;
}
if (!annexB || ch === CharacterCodes.closeParen) {
error(Diagnostics.Unexpected_0_Did_you_mean_to_escape_it_with_backslash, pos, 1, String.fromCharCode(ch));
else {
error(Diagnostics._0_expected, pos, 0, String.fromCharCode(CharacterCodes.closeBrace));
pos--;
}
pos++;
isPreviousTermQuantifiable = true;
break;
case CharacterCodes.slash:
case CharacterCodes.bar:
return;
default:
scanSourceCharacter();
isPreviousTermQuantifiable = true;
break;
}
}
}

// Scans a run of pattern modifier flag characters beginning at `pos`, stopping at the
// first character that is not an identifier part (or at `end`).
//
// `currFlags` holds the flags already seen; the return value is `currFlags` with every
// newly accepted flag added. A character is rejected with a diagnostic (and not added)
// when it is not a known flag, when it repeats a flag already in `currFlags`, or when
// the flag is not one of `RegularExpressionFlags.Modifiers`. An accepted flag that is
// newer than the current `languageVersion` is still added, but also reports a diagnostic.
function scanPatternModifiers(currFlags: RegularExpressionFlags): RegularExpressionFlags {
    for (; pos < end; pos++) {
        const charCode = text.charCodeAt(pos);
        if (!isIdentifierPart(charCode, languageVersion)) {
            break;
        }

        const modifier = characterToRegularExpressionFlag(String.fromCharCode(charCode));
        if (modifier === undefined) {
            error(Diagnostics.Unknown_regular_expression_flag, pos, 1);
            continue;
        }
        if (currFlags & modifier) {
            error(Diagnostics.Duplicate_regular_expression_flag, pos, 1);
            continue;
        }
        if (!(modifier & RegularExpressionFlags.Modifiers)) {
            error(Diagnostics.This_regular_expression_flag_cannot_be_toggled_within_a_subpattern, pos, 1);
            continue;
        }

        // Record the flag first; the version check below only adds a diagnostic.
        currFlags |= modifier;
        const minimumTarget = regExpFlagToFirstAvailableLanguageVersion.get(modifier)!;
        if (languageVersion < minimumTarget) {
            error(Diagnostics.This_regular_expression_flag_is_only_available_when_targeting_0_or_later, pos, 1, getNameOfScriptTarget(minimumTarget));
        }
    }
    return currFlags;
}

// AtomEscape ::=
// | DecimalEscape
// | CharacterClassEscape
// | CharacterEscape
// | 'k<' RegExpIdentifierName '>'
//
// Scans the part of an escape that follows the backslash; the backslash itself must
// already have been consumed (asserted below), so `pos` is on the escaped character.
function scanAtomEscape() {
    Debug.assertEqual(text.charCodeAt(pos - 1), CharacterCodes.backslash);
    switch (text.charCodeAt(pos)) {
        case CharacterCodes.k:
            pos++;
            if (text.charCodeAt(pos) === CharacterCodes.lessThan) {
                pos++;
                scanGroupName(/*isReference*/ true);
                scanExpectedChar(CharacterCodes.greaterThan);
            }
            else {
                // This is actually allowed in Annex B if there are no named capturing groups in the regex,
                // but if we were going to suppress these errors, we would have to record the positions of all '\k's
                // and defer the errors until after the scanning to know if the regex has any named capturing groups.
                error(Diagnostics.k_must_be_followed_by_a_capturing_group_name_enclosed_in_angle_brackets, pos - 2, 2);
            }
            break;
        case CharacterCodes.q:
            if (unicodeSetsMode) {
                // Consume the `q` and report the two-character `\q` span.
                pos++;
                error(Diagnostics.q_is_only_available_inside_character_class, pos - 2, 2);
                break;
            }
            // Outside of unicode sets mode, `\q` is scanned like any other escape. `pos` must be left
            // untouched here because the scanners below assert that `pos - 1` is the backslash.
            // falls through
        default:
            // The scanEscapeSequence call in scanCharacterEscape must return non-empty strings
            // since there must not be line breaks in a regex literal
            Debug.assert(scanCharacterClassEscape() || scanDecimalEscape() || scanCharacterEscape(/*atomEscape*/ true));
            break;
    }
}

// DecimalEscape ::= [1-9] [0-9]*
//
// Attempts to scan a decimal escape whose backslash has already been consumed.
// Returns false, without moving `pos`, when the next character is not a digit 1-9.
// Otherwise scans the digits, appends the escape's span and numeric value to
// `decimalEscapes`, and returns true.
function scanDecimalEscape(): boolean {
    Debug.assertEqual(text.charCodeAt(pos - 1), CharacterCodes.backslash);
    const leadingDigit = text.charCodeAt(pos);
    const startsWithNonZeroDigit = leadingDigit >= CharacterCodes._1 && leadingDigit <= CharacterCodes._9;
    if (!startsWithNonZeroDigit) {
        return false;
    }

    const escapeStart = pos;
    scanDigits();
    const escapeEnd = pos;
    decimalEscapes.push({ pos: escapeStart, end: escapeEnd, value: +tokenValue });
    return true;
}

// CharacterEscape ::=
// | `c` ControlLetter
// | IdentityEscape
// | (Other sequences handled by `scanEscapeSequence`)
// IdentityEscape ::=
// | '^' | '$' | '/' | '\' | '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|'
// | [~UnicodeMode] (any other non-identifier characters)
function scanCharacterEscape(atomEscape: boolean): string {
Debug.assertEqual(text.charCodeAt(pos - 1), CharacterCodes.backslash);
let ch = text.charCodeAt(pos);
switch (ch) {
case CharacterCodes.c:
pos++;
ch = text.charCodeAt(pos);
if (isASCIILetter(ch)) {
pos++;
return String.fromCharCode(ch & 0x1f);
}
if (!annexB) {
error(Diagnostics.c_must_be_followed_by_an_ASCII_letter, pos - 2, 2);
}
else if (atomEscape) {
// Annex B treats
//
// ExtendedAtom : `\` [lookahead = `c`]
//
// as the single character `\` when `c` isn't followed by a valid control character
pos--;
return "\\";
}
return String.fromCharCode(ch);
case CharacterCodes.caret:
case CharacterCodes.$:
case CharacterCodes.slash:
case CharacterCodes.backslash:
case CharacterCodes.dot:
case CharacterCodes.asterisk:
case CharacterCodes.plus:
case CharacterCodes.question:
Expand Down Expand Up @@ -2923,7 +2754,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
// Assume that everything starting from this character is outside of the regex
return;
}
if (unicodeMode || ch === CharacterCodes.closeParen) {
if (!annexB || ch === CharacterCodes.closeParen) {
error(Diagnostics.Unexpected_0_Did_you_mean_to_escape_it_with_backslash, pos, 1, String.fromCharCode(ch));
}
pos++;
Expand Down Expand Up @@ -2980,7 +2811,10 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
scanGroupName(/*isReference*/ true);
scanExpectedChar(CharacterCodes.greaterThan);
}
else if (unicodeMode) {
else {
// This is actually allowed in Annex B if there are no named capturing groups in the regex,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we keep track of whether we encountered a (?< during reScanSlashToken and add an entry to the RegularExpressionFlags enum? The spec passes NamedCaptureGroups as a production parameter just as it does for UnicodeMode and UnicodeSetsMode, but only ever passes it as ~NamedCaptureGroups in Annex B.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, didn’t think of this clever but dirty solution.
I can implement this, but… the point is still about how many Annex B behaviors we are going to respect. If we allow \k, then should we allow \u and \x (and also \8 and \9 inside character classes) too? (#58320 (comment))

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's better to error on \u, \x, \8, \9 because in those cases you are more likely to have actually meant something different. Writing \k when there are no named capture groups is far less ambiguous.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If so, I would personally lean towards also linting \1~\7 inside character classes to prevent this kind of mistake. Or perhaps the opposite, only outside character classes. I don’t know. I understand that you don’t want the syntax checking to be too breaky, but someone must find this useful.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Every decimal escape in a character class I've ever seen has been a bug, so it makes sense to error for that case.

// but if we were going to suppress these errors, we would have to record the positions of all '\k's
// and defer the errors until after the scanning to know if the regex has any named capturing groups.
error(Diagnostics.k_must_be_followed_by_a_capturing_group_name_enclosed_in_angle_brackets, pos - 2, 2);
}
break;
Expand Down Expand Up @@ -3030,10 +2864,10 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
pos++;
return String.fromCharCode(ch & 0x1f);
}
if (unicodeMode) {
if (!annexB) {
error(Diagnostics.c_must_be_followed_by_an_ASCII_letter, pos - 2, 2);
}
else if (atomEscape && annexB) {
else if (atomEscape) {
// Annex B treats
//
// ExtendedAtom : `\` [lookahead = `c`]
Expand Down Expand Up @@ -3588,15 +3422,39 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
}
}
}
else if (annexB) {
pos--;
return false;
}
else {
error(Diagnostics._0_must_be_followed_by_a_Unicode_property_value_expression_enclosed_in_braces, pos - 2, 2, String.fromCharCode(ch));
if (pos === propertyNameOrValueStart) {
error(Diagnostics.Expected_a_Unicode_property_name_or_value);
}
else if (binaryUnicodePropertiesOfStrings.has(propertyNameOrValue)) {
if (!unicodeSetsMode) {
error(Diagnostics.Any_Unicode_property_that_would_possibly_match_more_than_a_single_character_is_only_available_when_the_Unicode_Sets_v_flag_is_set, propertyNameOrValueStart, pos - propertyNameOrValueStart);
}
else if (isCharacterComplement) {
error(Diagnostics.Anything_that_would_possibly_match_more_than_a_single_character_is_invalid_inside_a_negated_character_class, propertyNameOrValueStart, pos - propertyNameOrValueStart);
}
else {
mayContainStrings = true;
}
}
else if (!valuesOfNonBinaryUnicodeProperties.General_Category.has(propertyNameOrValue) && !binaryUnicodeProperties.has(propertyNameOrValue)) {
error(Diagnostics.Unknown_Unicode_property_name_or_value, propertyNameOrValueStart, pos - propertyNameOrValueStart);
const suggestion = getSpellingSuggestion(propertyNameOrValue, [...valuesOfNonBinaryUnicodeProperties.General_Category, ...binaryUnicodeProperties, ...binaryUnicodePropertiesOfStrings], identity);
if (suggestion) {
error(Diagnostics.Did_you_mean_0, propertyNameOrValueStart, pos - propertyNameOrValueStart, suggestion);
}
}
}
scanExpectedChar(CharacterCodes.closeBrace);
if (!unicodeMode) {
error(Diagnostics.Unicode_property_value_expressions_are_only_available_when_the_Unicode_u_flag_or_the_Unicode_Sets_v_flag_is_set, start, pos - start);
}
}
else if (unicodeMode) {
else if (annexB) {
pos--;
return false;
}
else {
error(Diagnostics._0_must_be_followed_by_a_Unicode_property_value_expression_enclosed_in_braces, pos - 2, 2, String.fromCharCode(ch));
}
return true;
Expand Down
You are viewing a condensed version of this merge commit. You can view the full changes here.