未验证 提交 55e012eb 编写于 作者: S Stephen Toub 提交者: GitHub

Add tighter bound to range check for matching Regex char classes (#67133)

When we emit a bitmap lookup for character classes containing only ASCII characters, we currently bound the check by 128, e.g.
```C#
if (ch < 128 && lookupTable[...])
```
but we can easily lower that 128 to instead be the actual exclusive upper bound based on the char set.  Doing so means we don't need to hit the lookup table for a larger set of characters.

(We could also actually shrink the size of the lookup table itself, but doing so would only save a few bytes, and it didn't seem worth the complexity right now.  We could also add a lower range check, but that's also additional checks to execute whereas this one is just improving an existing check that's also required for correctness.)
上级 6a889d23
......@@ -4006,8 +4006,8 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
// character class were [A-Za-z0-9], so since the ch is now known to be >= 128, we
// can just fail the comparison.
return negate ?
$"((ch = {chExpr}) >= 128 || ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0)" :
$"((ch = {chExpr}) < 128 && ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)";
$"((ch = {chExpr}) >= {Literal((char)analysis.UpperBoundExclusiveIfContainsOnlyAscii)} || ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0)" :
$"((ch = {chExpr}) < {Literal((char)analysis.UpperBoundExclusiveIfContainsOnlyAscii)} && ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)";
}
if (analysis.AllNonAsciiContained)
......
......@@ -921,6 +921,8 @@ internal struct CharClassAnalysisResults
public bool AllAsciiContained;
/// <summary>true if we know for sure that all non-ASCII values are in the set; otherwise, false.</summary>
public bool AllNonAsciiContained;
/// <summary>The exclusive upper bound. Only valid if <see cref="ContainsOnlyAscii"/> is true.</summary>
public int UpperBoundExclusiveIfContainsOnlyAscii;
}
/// <summary>Analyzes the set to determine some basic properties that can be used to optimize usage.</summary>
......@@ -962,6 +964,7 @@ internal static CharClassAnalysisResults Analyze(string set)
AllAsciiContained = false,
ContainsOnlyAscii = set[set.Length - 1] <= 128,
ContainsNoAscii = set[SetStartIndex] >= 128,
UpperBoundExclusiveIfContainsOnlyAscii = set[set.Length - 1],
};
}
......
......@@ -5020,7 +5020,7 @@ void EmitCharInClass()
// ch < 128 ? (bitVectorString[ch >> 4] & (1 << (ch & 0xF))) != 0 :
Ldloc(tempLocal);
Ldc(128);
Ldc(analysis.ContainsOnlyAscii ? analysis.UpperBoundExclusiveIfContainsOnlyAscii : 128);
Bge(comparisonLabel);
Ldstr(bitVectorString);
Ldloc(tempLocal);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册