Skip to content

Commit 64707dc

Browse files
committed
Change number of capturing groups from int to uint16 where possible
1 parent c74d4dc commit 64707dc

6 files changed

Lines changed: 30 additions & 34 deletions

File tree

lib/Parser/RegexParser.cpp

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2982,14 +2982,6 @@ namespace UnifiedRegex
29822982
this->scriptContext->ProfileEnd(Js::RegexCompilePhase);
29832983
#endif
29842984

2985-
// We expect pattern->NumGroups() to be positive, because the full regexp itself always
2986-
// counts as a capturing group.
2987-
Assert(pattern->NumGroups() > 0);
2988-
if (pattern->NumGroups() > MAX_NUM_GROUPS)
2989-
{
2990-
Js::JavascriptError::ThrowSyntaxError(this->scriptContext, JSERR_RegExpTooManyCapturingGroups);
2991-
}
2992-
29932985
return pattern;
29942986
}
29952987

@@ -3020,15 +3012,17 @@ namespace UnifiedRegex
30203012
program->source[bodyChars] = 0;
30213013
program->sourceLen = bodyChars;
30223014

3023-
program->numGroups = nextGroupId;
3024-
3025-
// We expect program->numGroups to be positive, because the full regexp itself always
3015+
// We expect nextGroupId to be positive, because the full regexp itself always
30263016
// counts as a capturing group.
3027-
Assert(program->numGroups > 0);
3028-
if (program->numGroups > MAX_NUM_GROUPS)
3017+
Assert(nextGroupId > 0);
3018+
if (nextGroupId > MAX_NUM_GROUPS)
30293019
{
30303020
Js::JavascriptError::ThrowSyntaxError(this->scriptContext, JSERR_RegExpTooManyCapturingGroups);
30313021
}
3022+
else
3023+
{
3024+
program->numGroups = static_cast<uint16>(nextGroupId);
3025+
}
30323026

30333027
// Remaining to set during compilation: litbuf, litbufLen, numLoops, insts, instsLen, entryPointLabel
30343028
}

lib/Parser/RegexParser.h

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -82,13 +82,18 @@ namespace UnifiedRegex
8282
// group, plus a few additional values, to a JavaScript function without overflowing the
8383
// number of arguments. This is important, for example, in the implementation of
8484
// String.prototype.replace, where the second argument is a function.
85-
//
86-
// This should really be an unsigned, but we compare it against numGroups, so make it
87-
// an int for now to avoid a bunch of compiler warnings until we can go back and clean this up.
88-
static const int MAX_NUM_GROUPS = INT16_MAX;
85+
static const uint16 MAX_NUM_GROUPS = INT16_MAX;
86+
87+
uint16 numGroups; // determined in first parse
8988

90-
int numGroups; // determined in first parse
89+
// Keeps track of how many capturing groups we've seen during parsing. We use an int, rather than
90+
// a uint16, to be sure we don't overflow during parsing and only check it against MAX_NUM_GROUPS at
91+
// the end. (We know we can't overflow an int because strings and regex literals are limited to
92+
// 2G characters and therefore to 1G pairs of parentheses, which can fit into an int. I'd prefer
93+
// to use size_t here, but making that change would go down a serious rabbit hole changing the
94+
// interface to UnifiedRegex::Node::AccumDefineGroups.)
9195
int nextGroupId;
96+
9297
// Buffer accumulating all literals.
9398
// In compile-time allocator, must be transferred to runtime allocator when build program
9499
Char* litbuf;

lib/Parser/RegexPattern.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ namespace UnifiedRegex
7272
return rep.unified.program->flags;
7373
}
7474

75-
int RegexPattern::NumGroups() const
75+
uint16 RegexPattern::NumGroups() const
7676
{
7777
return rep.unified.program->numGroups;
7878
}

lib/Parser/RegexPattern.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ namespace UnifiedRegex
5858
Js::ScriptContext *GetScriptContext() const;
5959

6060
inline bool IsLiteral() const { return isLiteral; }
61-
int NumGroups() const;
61+
uint16 NumGroups() const;
6262
bool IsIgnoreCase() const;
6363
bool IsGlobal() const;
6464
bool IsMultiline() const;

lib/Parser/RegexRuntime.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ namespace UnifiedRegex
6060
Field(Char*) source;
6161
Field(CharCount) sourceLen; // length in char16's, NOT including terminating null
6262
// Number of capturing groups (including implicit overall group at index 0)
63-
Field(int) numGroups;
63+
Field(uint16) numGroups;
6464
Field(int) numLoops;
6565
Field(RegexFlags) flags;
6666

@@ -1847,7 +1847,7 @@ namespace UnifiedRegex
18471847
return !groupInfos[0].IsUndefined();
18481848
}
18491849

1850-
inline int NumGroups() const
1850+
inline uint16 NumGroups() const
18511851
{
18521852
return program->numGroups;
18531853
}

lib/Runtime/Library/RegexHelper.cpp

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -773,11 +773,8 @@ namespace Js
773773
{
774774
// We've found a substitution ref, like $32. In accordance with the standard (sec-getsubstitution),
775775
// we recognize at most two decimal digits after the dollar sign.
776-
777-
// This should be unsigned, but this would cause lots of compiler warnings unless we also make
778-
// numGroups unsigned, because of a comparison below.
779-
int captureIndex = (int)(currentChar - _u('0'));
780-
Assert(0 <= captureIndex && captureIndex <= 9); // numeric value of single decimal digit
776+
uint16 captureIndex = (uint16)(currentChar - _u('0'));
777+
Assert(captureIndex < 10); // numeric value of single decimal digit
781778

782779
offset = substitutionOffset + 2;
783780

@@ -786,9 +783,8 @@ namespace Js
786783
currentChar = replaceStr[substitutionOffset + 2];
787784
if (currentChar >= _u('0') && currentChar <= _u('9'))
788785
{
789-
// Should also be unsigned; see captureIndex above.
790-
int tempCaptureIndex = (10 * captureIndex) + (int)(currentChar - _u('0'));
791-
Assert(0 <= tempCaptureIndex && tempCaptureIndex < 100); // numeric value of 2-digit positive decimal number
786+
uint16 tempCaptureIndex = (10 * captureIndex) + (uint16)(currentChar - _u('0'));
787+
Assert(tempCaptureIndex < 100); // numeric value of 2-digit positive decimal number
792788
if (tempCaptureIndex < numGroups)
793789
{
794790
captureIndex = tempCaptureIndex;
@@ -797,7 +793,7 @@ namespace Js
797793
}
798794
}
799795

800-
Assert(0 <= captureIndex && captureIndex < 100); // as above, value of 2-digit positive decimal number
796+
Assert(captureIndex < 100); // as above, value of 2-digit positive decimal number
801797
if (captureIndex < numGroups && (captureIndex != 0))
802798
{
803799
Var group = getGroup(captureIndex, nonMatchValue);
@@ -1261,12 +1257,13 @@ namespace Js
12611257
JavascriptString* newString = nullptr;
12621258
const char16* inputStr = input->GetString();
12631259
CharCount inputLength = input->GetLength();
1264-
const int rawNumGroups = pattern->NumGroups();
1260+
const uint16 numGroups = pattern->NumGroups();
12651261
Var nonMatchValue = NonMatchValue(scriptContext, false);
12661262
UnifiedRegex::GroupInfo lastMatch; // initially undefined
12671263

1268-
AssertOrFailFast(0 < rawNumGroups && rawNumGroups <= INT16_MAX);
1269-
const uint16 numGroups = uint16(rawNumGroups);
1264+
// Regex parser should ensure this condition holds, but let's be doubly sure.
1265+
// numGroups is always positive because the entire regex counts as a capturing group.
1266+
AssertOrFailFast(0 < numGroups && numGroups <= INT16_MAX);
12701267

12711268
#if ENABLE_REGEX_CONFIG_OPTIONS
12721269
RegexHelperTrace(scriptContext, UnifiedRegex::RegexStats::Replace, regularExpression, input, scriptContext->GetLibrary()->CreateStringFromCppLiteral(_u("<replace function>")));

0 commit comments

Comments
 (0)