From 55e012eb916a712666fb8d5a1e1fc38ae4cccb66 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Fri, 25 Mar 2022 08:40:06 -0400 Subject: [PATCH] Add tighter bound to range check for matching Regex char classes (#67133) When we emit a bitmap lookup for character classes containing only ASCII characters, we currently bound the check by 128, e.g. ```C# if (ch < 128 && lookupTable[...]) ``` but we can easily lower that 128 to instead be the actual exclusive upper bound based on the char set. Doing so means we don't need to hit the lookup table for a larger set of characters. (We could also actually shrink the size of the lookup table itself, but doing so would only save a few bytes, and it didn't seem worth the complexity right now. We could also add a lower range check, but that's also additional checks to execute whereas this one is just improving an existing check that's also required for correctness.) --- .../gen/RegexGenerator.Emitter.cs | 4 ++-- .../src/System/Text/RegularExpressions/RegexCharClass.cs | 3 +++ .../src/System/Text/RegularExpressions/RegexCompiler.cs | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 2c546be1452..0d42d4c357c 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -4006,8 +4006,8 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options // character class were [A-Za-z0-9], so since the ch is now known to be >= 128, we // can just fail the comparison. return negate ? 
- $"((ch = {chExpr}) >= 128 || ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0)" : - $"((ch = {chExpr}) < 128 && ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)"; + $"((ch = {chExpr}) >= {Literal((char)analysis.UpperBoundExclusiveIfContainsOnlyAscii)} || ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0)" : + $"((ch = {chExpr}) < {Literal((char)analysis.UpperBoundExclusiveIfContainsOnlyAscii)} && ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)"; } if (analysis.AllNonAsciiContained) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs index a85ecef2303..398f882c79c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs @@ -921,6 +921,8 @@ internal struct CharClassAnalysisResults public bool AllAsciiContained; /// true if we know for sure that all non-ASCII values are in the set; otherwise, false. public bool AllNonAsciiContained; + /// The exclusive upper bound. Only valid if <see cref="ContainsOnlyAscii"/> is true. + public int UpperBoundExclusiveIfContainsOnlyAscii; } /// Analyzes the set to determine some basic properties that can be used to optimize usage.
@@ -962,6 +964,7 @@ internal static CharClassAnalysisResults Analyze(string set) AllAsciiContained = false, ContainsOnlyAscii = set[set.Length - 1] <= 128, ContainsNoAscii = set[SetStartIndex] >= 128, + UpperBoundExclusiveIfContainsOnlyAscii = set[set.Length - 1], }; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index b3dfa8d120c..a64f098a661 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -5020,7 +5020,7 @@ void EmitCharInClass() // ch < 128 ? (bitVectorString[ch >> 4] & (1 << (ch & 0xF))) != 0 : Ldloc(tempLocal); - Ldc(128); + Ldc(analysis.ContainsOnlyAscii ? analysis.UpperBoundExclusiveIfContainsOnlyAscii : 128); Bge(comparisonLabel); Ldstr(bitVectorString); Ldloc(tempLocal); -- GitLab