未验证 提交 093bdc46 编写于 作者: S Stephen Toub 提交者: GitHub

Avoid RegexCode/RegexWriter for all engines other than RegexInterpreter (#65986)

* Avoid RegexCode/RegexWriter for all engines other than RegexInterpreter

* Address PR feedback
上级 2330b4be
......@@ -178,10 +178,10 @@ private static bool IsSemanticTargetForGeneration(SemanticModel semanticModel, M
}
// Parse the input pattern
RegexCode code;
RegexTree tree;
try
{
code = RegexWriter.Write(RegexParser.Parse(pattern, regexOptions, culture), culture);
tree = RegexParser.Parse(pattern, regexOptions, culture);
}
catch (Exception e)
{
......@@ -199,7 +199,7 @@ private static bool IsSemanticTargetForGeneration(SemanticModel semanticModel, M
pattern,
regexOptions,
matchTimeout ?? Timeout.Infinite,
code);
tree);
var regexType = new RegexType(
regexMethod,
......@@ -233,7 +233,7 @@ private static bool IsSemanticTargetForGeneration(SemanticModel semanticModel, M
}
/// <summary>A regex method.</summary>
internal sealed record RegexMethod(MethodDeclarationSyntax MethodSyntax, string MethodName, string Modifiers, string Pattern, RegexOptions Options, int MatchTimeout, RegexCode Code);
internal sealed record RegexMethod(MethodDeclarationSyntax MethodSyntax, string MethodName, string Modifiers, string Pattern, RegexOptions Options, int MatchTimeout, RegexTree Tree);
/// <summary>A type holding a regex method.</summary>
internal sealed record RegexType(RegexMethod? Method, string Keyword, string Namespace, string Name)
......
......@@ -33,7 +33,6 @@
<Compile Include="..\src\System\Threading\StackHelper.cs" Link="Production\StackHelper.cs" />
<Compile Include="..\src\System\Text\RegularExpressions\RegexCharClass.cs" Link="Production\RegexCharClass.cs" />
<Compile Include="..\src\System\Text\RegularExpressions\RegexCharClass.MappingTable.cs" Link="Production\RegexCharClass.MappingTable.cs" />
<Compile Include="..\src\System\Text\RegularExpressions\RegexCode.cs" Link="Production\RegexCode.cs" />
<Compile Include="..\src\System\Text\RegularExpressions\RegexFindOptimizations.cs" Link="Production\RegexFindOptimizations.cs" />
<Compile Include="..\src\System\Text\RegularExpressions\RegexNode.cs" Link="Production\RegexNode.cs" />
<Compile Include="..\src\System\Text\RegularExpressions\RegexNodeKind.cs" Link="Production\RegexNodeKind.cs" />
......@@ -45,7 +44,6 @@
<Compile Include="..\src\System\Text\RegularExpressions\RegexPrefixAnalyzer.cs" Link="Production\RegexPrefixAnalyzer.cs" />
<Compile Include="..\src\System\Text\RegularExpressions\RegexTree.cs" Link="Production\RegexTree.cs" />
<Compile Include="..\src\System\Text\RegularExpressions\RegexTreeAnalyzer.cs" Link="Production\RegexTreeAnalyzer.cs" />
<Compile Include="..\src\System\Text\RegularExpressions\RegexWriter.cs" Link="Production\RegexWriter.cs" />
<Compile Include="..\src\System\Collections\HashtableExtensions.cs" Link="Production\HashtableExtensions.cs" />
</ItemGroup>
......
......@@ -26,11 +26,11 @@
<Compile Include="System\Text\RegularExpressions\Regex.Timeout.cs" />
<Compile Include="System\Text\RegularExpressions\RegexCharClass.cs" />
<Compile Include="System\Text\RegularExpressions\RegexCharClass.MappingTable.cs" />
<Compile Include="System\Text\RegularExpressions\RegexCode.cs" />
<Compile Include="System\Text\RegularExpressions\RegexCompilationInfo.cs" />
<Compile Include="System\Text\RegularExpressions\RegexFindOptimizations.cs" />
<Compile Include="System\Text\RegularExpressions\RegexGeneratorAttribute.cs" />
<Compile Include="System\Text\RegularExpressions\RegexInterpreter.cs" />
<Compile Include="System\Text\RegularExpressions\RegexInterpreterCode.cs" />
<Compile Include="System\Text\RegularExpressions\RegexMatchTimeoutException.cs" />
<Compile Include="System\Text\RegularExpressions\RegexNode.cs" />
<Compile Include="System\Text\RegularExpressions\RegexNodeKind.cs" />
......
......@@ -77,7 +77,7 @@ public bool IsMatch(string input)
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input);
}
return Run(quick: true, -1, input, 0, input.Length, UseOptionR() ? input.Length : 0) is null;
return Run(quick: true, -1, input, 0, input.Length, RightToLeft ? input.Length : 0) is null;
}
/// <summary>
......@@ -87,7 +87,7 @@ public bool IsMatch(string input)
/// <returns><see langword="true"/> if the regular expression finds a match; otherwise, <see langword="false"/>.</returns>
/// <exception cref="RegexMatchTimeoutException">A time-out occurred.</exception>
public bool IsMatch(ReadOnlySpan<char> input) =>
Run(input, UseOptionR() ? input.Length : 0) is null;
Run(input, RightToLeft ? input.Length : 0) is null;
/// <summary>
/// Searches the input string for one or more matches using the previous pattern and options,
......@@ -132,7 +132,7 @@ public Match Match(string input)
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input);
}
return Run(quick: false, -1, input, 0, input.Length, UseOptionR() ? input.Length : 0)!;
return Run(quick: false, -1, input, 0, input.Length, RightToLeft ? input.Length : 0)!;
}
/// <summary>
......@@ -159,7 +159,7 @@ public Match Match(string input, int beginning, int length)
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input);
}
return Run(quick: false, -1, input, beginning, length, UseOptionR() ? beginning + length : beginning)!;
return Run(quick: false, -1, input, beginning, length, RightToLeft ? beginning + length : beginning)!;
}
/// <summary>
......@@ -187,7 +187,7 @@ public MatchCollection Matches(string input)
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input);
}
return new MatchCollection(this, input, UseOptionR() ? input.Length : 0);
return new MatchCollection(this, input, RightToLeft ? input.Length : 0);
}
/// <summary>
......
......@@ -42,7 +42,7 @@ public string Replace(string input, string replacement)
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input);
}
return Replace(input, replacement, -1, UseOptionR() ? input.Length : 0);
return Replace(input, replacement, -1, RightToLeft ? input.Length : 0);
}
/// <summary>
......@@ -57,7 +57,7 @@ public string Replace(string input, string replacement, int count)
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input);
}
return Replace(input, replacement, count, UseOptionR() ? input.Length : 0);
return Replace(input, replacement, count, RightToLeft ? input.Length : 0);
}
/// <summary>
......@@ -111,7 +111,7 @@ public string Replace(string input, MatchEvaluator evaluator)
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input);
}
return Replace(evaluator, this, input, -1, UseOptionR() ? input.Length : 0);
return Replace(evaluator, this, input, -1, RightToLeft ? input.Length : 0);
}
/// <summary>
......@@ -125,7 +125,7 @@ public string Replace(string input, MatchEvaluator evaluator, int count)
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input);
}
return Replace(evaluator, this, input, count, UseOptionR() ? input.Length : 0);
return Replace(evaluator, this, input, count, RightToLeft ? input.Length : 0);
}
/// <summary>
......
......@@ -35,7 +35,7 @@ public string[] Split(string input)
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input);
}
return Split(this, input, 0, UseOptionR() ? input.Length : 0);
return Split(this, input, 0, RightToLeft ? input.Length : 0);
}
/// <summary>
......@@ -49,7 +49,7 @@ public string[] Split(string input, int count)
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input);
}
return Split(this, input, count, UseOptionR() ? input.Length : 0);
return Split(this, input, count, RightToLeft ? input.Length : 0);
}
/// <summary>
......
......@@ -20,8 +20,6 @@ namespace System.Text.RegularExpressions
/// </summary>
public partial class Regex : ISerializable
{
internal const int MaxOptionShift = 11;
[StringSyntax(StringSyntaxAttribute.Regex)]
protected internal string? pattern; // The string pattern provided
protected internal RegexOptions roptions; // the top-level options from the options string
......@@ -33,7 +31,6 @@ public partial class Regex : ISerializable
private WeakReference<RegexReplacement?>? _replref; // cached parsed replacement pattern
private volatile RegexRunner? _runner; // cached runner
private RegexCode? _code; // if interpreted, this is the code for RegexInterpreter
protected Regex()
{
......@@ -63,64 +60,69 @@ protected Regex()
internal Regex(string pattern, CultureInfo? culture)
{
// Call Init directly rather than delegating to a Regex ctor that takes
// options to enable linking / tree shaking to remove the Regex compiler
// and NonBacktracking implementation if it's not used.
Init(pattern, RegexOptions.None, s_defaultMatchTimeout, culture ?? CultureInfo.CurrentCulture);
// Validate arguments.
ValidatePattern(pattern);
// Parse and store the argument information.
RegexTree tree = Init(pattern, RegexOptions.None, s_defaultMatchTimeout, ref culture);
// Create the interpreter factory.
factory = new RegexInterpreterFactory(tree, culture);
// NOTE: This overload _does not_ delegate to the one that takes options, in order
// to avoid unnecessarily rooting the support for RegexOptions.NonBacktracking/Compiler
// if no options are ever used.
}
internal Regex(string pattern, RegexOptions options, TimeSpan matchTimeout, CultureInfo? culture)
{
culture ??= RegexParser.GetTargetCulture(options);
Init(pattern, options, matchTimeout, culture);
// Validate arguments.
ValidatePattern(pattern);
ValidateOptions(options);
ValidateMatchTimeout(matchTimeout);
// Parse and store the argument information.
RegexTree tree = Init(pattern, options, matchTimeout, ref culture);
// Create the appropriate factory.
if ((options & RegexOptions.NonBacktracking) != 0)
{
// If we're in non-backtracking mode, create the appropriate factory.
factory = new SymbolicRegexRunnerFactory(_code, options, matchTimeout, culture);
_code = null;
factory = new SymbolicRegexRunnerFactory(tree, options, matchTimeout, culture);
}
else if (RuntimeFeature.IsDynamicCodeCompiled && UseOptionC())
else
{
// If the compile option is set and compilation is supported, then compile the code.
// If the compiler can't compile this regex, it'll return null, and we'll fall back
// to the interpreter.
factory = Compile(pattern, _code, options, matchTimeout != InfiniteMatchTimeout);
if (factory is not null)
if (RuntimeFeature.IsDynamicCodeCompiled && (options & RegexOptions.Compiled) != 0)
{
_code = null;
// If the compile option is set and compilation is supported, then compile the code.
// If the compiler can't compile this regex, it'll return null, and we'll fall back
// to the interpreter.
factory = Compile(pattern, tree, options, matchTimeout != InfiniteMatchTimeout);
}
// If no factory was created, fall back to creating one for the interpreter.
factory ??= new RegexInterpreterFactory(tree, culture);
}
}
/// <summary>Initializes the instance.</summary>
/// <remarks>
/// This is separated out of the constructor so that an app only using 'new Regex(pattern)'
/// rather than 'new Regex(pattern, options)' can avoid statically referencing the Regex
/// compiler, such that a tree shaker / linker can trim it away if it's not otherwise used.
/// </remarks>
[MemberNotNull(nameof(_code))]
private void Init(string pattern, RegexOptions options, TimeSpan matchTimeout, CultureInfo culture)
/// <summary>Stores the supplied arguments and capture information, returning the parsed expression.</summary>
private RegexTree Init(string pattern, RegexOptions options, TimeSpan matchTimeout, [NotNull] ref CultureInfo? culture)
{
ValidatePattern(pattern);
ValidateOptions(options);
ValidateMatchTimeout(matchTimeout);
this.pattern = pattern;
internalMatchTimeout = matchTimeout;
roptions = options;
internalMatchTimeout = matchTimeout;
culture ??= RegexParser.GetTargetCulture(options);
// Parse the input
RegexTree tree = RegexParser.Parse(pattern, roptions, culture);
// Parse the pattern.
RegexTree tree = RegexParser.Parse(pattern, options, culture);
// Generate the RegexCode from the node tree. This is required for interpreting,
// and is used as input into RegexOptions.Compiled and RegexOptions.NonBacktracking.
_code = RegexWriter.Write(tree, culture);
// Store the relevant capture information from the parsed tree.
capnames = tree.CaptureNameToNumberMapping;
capslist = tree.CaptureNames;
caps = tree.CaptureNumberSparseMapping;
capsize = tree.CaptureCount;
capnames = tree.CapNames;
capslist = tree.CapsList;
caps = _code.Caps;
capsize = _code.CapSize;
return tree;
}
internal static void ValidatePattern(string pattern)
......@@ -133,9 +135,9 @@ internal static void ValidatePattern(string pattern)
internal static void ValidateOptions(RegexOptions options)
{
const int MaxOptionShift = 11;
if (((((uint)options) >> MaxOptionShift) != 0) ||
((options & RegexOptions.ECMAScript) != 0 &&
(options & ~(RegexOptions.ECMAScript | RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Compiled | RegexOptions.NonBacktracking | RegexOptions.CultureInvariant)) != 0))
((options & RegexOptions.ECMAScript) != 0 && (options & ~(RegexOptions.ECMAScript | RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Compiled | RegexOptions.NonBacktracking | RegexOptions.CultureInvariant)) != 0))
{
ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.options);
}
......@@ -199,8 +201,8 @@ protected internal static void ValidateMatchTimeout(TimeSpan matchTimeout)
/// instantiating a non-compiled regex.
/// </summary>
[MethodImpl(MethodImplOptions.NoInlining)]
private static RegexRunnerFactory? Compile(string pattern, RegexCode code, RegexOptions options, bool hasTimeout) =>
RegexCompiler.Compile(pattern, code, options, hasTimeout);
private static RegexRunnerFactory? Compile(string pattern, RegexTree regexTree, RegexOptions options, bool hasTimeout) =>
RegexCompiler.Compile(pattern, regexTree, options, hasTimeout);
[Obsolete(Obsoletions.RegexCompileToAssemblyMessage, DiagnosticId = Obsoletions.RegexCompileToAssemblyDiagId, UrlFormat = Obsoletions.SharedUrlFormat)]
public static void CompileToAssembly(RegexCompilationInfo[] regexinfos, AssemblyName assemblyname) =>
......@@ -254,7 +256,7 @@ public static string Unescape(string str)
/// <summary>
/// Indicates whether the regular expression matches from right to left.
/// </summary>
public bool RightToLeft => UseOptionR();
public bool RightToLeft => (roptions & RegexOptions.RightToLeft) != 0;
/// <summary>
/// Returns the regular expression pattern passed into the constructor
......@@ -554,13 +556,14 @@ internal void Run<TState>(string input, int startat, ref TState state, MatchCall
/// <summary>Creates a new runner instance.</summary>
private RegexRunner CreateRunner() =>
factory?.CreateInstance() ??
new RegexInterpreter(_code!, RegexParser.GetTargetCulture(roptions));
// The factory needs to be set by the ctor. `factory` is a protected field, so it's possible a derived
// type nulls out the factory after we've set it, but that's the nature of the design.
factory!.CreateInstance();
/// <summary>True if the <see cref="RegexOptions.Compiled"/> option was set.</summary>
protected bool UseOptionC() => (roptions & RegexOptions.Compiled) != 0;
/// <summary>True if the <see cref="RegexOptions.RightToLeft"/> option was set.</summary>
protected internal bool UseOptionR() => (roptions & RegexOptions.RightToLeft) != 0;
protected internal bool UseOptionR() => RightToLeft;
}
}
......@@ -68,8 +68,8 @@ internal abstract class RegexCompiler
protected ILGenerator? _ilg;
/// <summary>The options for the expression.</summary>
protected RegexOptions _options;
/// <summary>The code written for the expression.</summary>
protected RegexCode? _code;
/// <summary>The <see cref="RegexTree"/> written for the expression.</summary>
protected RegexTree? _regexTree;
/// <summary>Whether this expression has a non-infinite timeout.</summary>
protected bool _hasTimeout;
......@@ -93,8 +93,8 @@ internal abstract class RegexCompiler
/// Entry point to dynamically compile a regular expression. The expression is compiled to
/// an in-memory assembly.
/// </summary>
internal static RegexRunnerFactory? Compile(string pattern, RegexCode code, RegexOptions options, bool hasTimeout) =>
new RegexLWCGCompiler().FactoryInstanceFromCode(pattern, code, options, hasTimeout);
internal static RegexRunnerFactory? Compile(string pattern, RegexTree regexTree, RegexOptions options, bool hasTimeout) =>
new RegexLWCGCompiler().FactoryInstanceFromCode(pattern, regexTree, options, hasTimeout);
/// <summary>A macro for _ilg.DefineLabel</summary>
private Label DefineLabel() => _ilg!.DefineLabel();
......@@ -366,7 +366,7 @@ private void CallToLower()
/// <summary>Generates the implementation for TryFindNextPossibleStartingPosition.</summary>
protected void EmitTryFindNextPossibleStartingPosition()
{
Debug.Assert(_code != null);
Debug.Assert(_regexTree != null);
_int32LocalsPool?.Clear();
_readOnlySpanCharLocalsPool?.Clear();
......@@ -377,13 +377,13 @@ protected void EmitTryFindNextPossibleStartingPosition()
_textInfo = null;
if ((_options & RegexOptions.CultureInvariant) == 0)
{
bool needsCulture = _code.FindOptimizations.FindMode switch
bool needsCulture = _regexTree.FindOptimizations.FindMode switch
{
FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseInsensitive or
FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or
FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive => true,
_ when _code.FindOptimizations.FixedDistanceSets is List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)> sets => sets.Exists(set => set.CaseInsensitive),
_ when _regexTree.FindOptimizations.FixedDistanceSets is List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)> sets => sets.Exists(set => set.CaseInsensitive),
_ => false,
};
......@@ -407,7 +407,7 @@ FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or
// Generate length check. If the input isn't long enough to possibly match, fail quickly.
// It's rare for min required length to be 0, so we don't bother special-casing the check,
// especially since we want the "return false" code regardless.
int minRequiredLength = _code.Tree.MinRequiredLength;
int minRequiredLength = _regexTree.FindOptimizations.MinRequiredLength;
Debug.Assert(minRequiredLength >= 0);
Label returnFalse = DefineLabel();
Label finishedLengthCheck = DefineLabel();
......@@ -442,28 +442,28 @@ FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or
}
// Either anchors weren't specified, or they don't completely root all matches to a specific location.
switch (_code.FindOptimizations.FindMode)
switch (_regexTree.FindOptimizations.FindMode)
{
case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive:
Debug.Assert(!string.IsNullOrEmpty(_code.FindOptimizations.LeadingCaseSensitivePrefix));
EmitIndexOf_LeftToRight(_code.FindOptimizations.LeadingCaseSensitivePrefix);
Debug.Assert(!string.IsNullOrEmpty(_regexTree.FindOptimizations.LeadingCaseSensitivePrefix));
EmitIndexOf_LeftToRight(_regexTree.FindOptimizations.LeadingCaseSensitivePrefix);
break;
case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive:
case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive:
case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive:
case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive:
Debug.Assert(_code.FindOptimizations.FixedDistanceSets is { Count: > 0 });
Debug.Assert(_regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 });
EmitFixedSet_LeftToRight();
break;
case FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight_CaseSensitive:
Debug.Assert(_code.FindOptimizations.LiteralAfterLoop is not null);
Debug.Assert(_regexTree.FindOptimizations.LiteralAfterLoop is not null);
EmitLiteralAfterAtomicLoop();
break;
default:
Debug.Fail($"Unexpected mode: {_code.FindOptimizations.FindMode}");
Debug.Fail($"Unexpected mode: {_regexTree.FindOptimizations.FindMode}");
goto case FindNextStartingPositionMode.NoSearch;
case FindNextStartingPositionMode.NoSearch:
......@@ -480,7 +480,7 @@ bool GenerateAnchors()
Label label;
// Anchors that fully implement TryFindNextPossibleStartingPosition, with a check that leads to immediate success or failure determination.
switch (_code.FindOptimizations.FindMode)
switch (_regexTree.FindOptimizations.FindMode)
{
case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning:
label = DefineLabel();
......@@ -538,16 +538,16 @@ bool GenerateAnchors()
case FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ:
// Jump to the end, minus the min required length, which in this case is actually the fixed length.
{
int extraNewlineBump = _code.FindOptimizations.FindMode == FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ ? 1 : 0;
int extraNewlineBump = _regexTree.FindOptimizations.FindMode == FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ ? 1 : 0;
label = DefineLabel();
Ldloc(pos);
Ldloc(end);
Ldc(_code.Tree.MinRequiredLength + extraNewlineBump);
Ldc(_regexTree.FindOptimizations.MinRequiredLength + extraNewlineBump);
Sub();
Bge(label);
Ldthis();
Ldloc(end);
Ldc(_code.Tree.MinRequiredLength + extraNewlineBump);
Ldc(_regexTree.FindOptimizations.MinRequiredLength + extraNewlineBump);
Sub();
Stfld(s_runtextposField);
MarkLabel(label);
......@@ -559,7 +559,7 @@ bool GenerateAnchors()
// Now handle anchors that boost the position but don't determine immediate success or failure.
switch (_code.FindOptimizations.LeadingAnchor)
switch (_regexTree.FindOptimizations.LeadingAnchor)
{
case RegexNodeKind.Bol:
{
......@@ -625,12 +625,12 @@ bool GenerateAnchors()
break;
}
switch (_code.FindOptimizations.TrailingAnchor)
switch (_regexTree.FindOptimizations.TrailingAnchor)
{
case RegexNodeKind.End or RegexNodeKind.EndZ when _code.FindOptimizations.MaxPossibleLength is int maxLength:
case RegexNodeKind.End or RegexNodeKind.EndZ when _regexTree.FindOptimizations.MaxPossibleLength is int maxLength:
// Jump to the end, minus the max allowed length.
{
int extraNewlineBump = _code.FindOptimizations.FindMode == FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ ? 1 : 0;
int extraNewlineBump = _regexTree.FindOptimizations.FindMode == FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ ? 1 : 0;
label = DefineLabel();
Ldloc(pos);
Ldloc(end);
......@@ -683,7 +683,7 @@ void EmitIndexOf_LeftToRight(string prefix)
void EmitFixedSet_LeftToRight()
{
List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? sets = _code.FindOptimizations.FixedDistanceSets;
List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? sets = _regexTree.FindOptimizations.FixedDistanceSets;
(char[]? Chars, string Set, int Distance, bool CaseInsensitive) primarySet = sets![0];
const int MaxSets = 4;
int setsToUse = Math.Min(sets.Count, MaxSets);
......@@ -882,8 +882,8 @@ void EmitFixedSet_LeftToRight()
// Emits a search for a literal following a leading atomic single-character loop.
void EmitLiteralAfterAtomicLoop()
{
Debug.Assert(_code.FindOptimizations.LiteralAfterLoop is not null);
(RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal) target = _code.FindOptimizations.LiteralAfterLoop.Value;
Debug.Assert(_regexTree.FindOptimizations.LiteralAfterLoop is not null);
(RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal) target = _regexTree.FindOptimizations.LiteralAfterLoop.Value;
Debug.Assert(target.LoopNode.Kind is RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic);
Debug.Assert(target.LoopNode.N == int.MaxValue);
......@@ -1048,12 +1048,12 @@ protected void EmitTryMatchAtCurrentPosition()
// "doneLabel" is simply the final return location from the TryMatchAtCurrentPosition method that will undo any captures and exit, signaling to
// the calling scan loop that nothing was matched.
Debug.Assert(_code != null);
Debug.Assert(_regexTree != null);
_int32LocalsPool?.Clear();
_readOnlySpanCharLocalsPool?.Clear();
// Get the root Capture node of the tree.
RegexNode node = _code.Tree.Root;
RegexNode node = _regexTree.Root;
Debug.Assert(node.Kind == RegexNodeKind.Capture, "Every generated tree should begin with a capture node");
Debug.Assert(node.ChildCount() == 1, "Capture nodes should have one child");
......@@ -1090,6 +1090,8 @@ protected void EmitTryMatchAtCurrentPosition()
// performance. Since that's not applicable to RegexCompiler, that code isn't mirrored here.
}
AnalysisResults analysis = RegexTreeAnalyzer.Analyze(_regexTree);
// Initialize the main locals used throughout the implementation.
LocalBuilder inputSpan = DeclareReadOnlySpanChar();
LocalBuilder originalPos = DeclareInt32();
......@@ -1104,7 +1106,7 @@ protected void EmitTryMatchAtCurrentPosition()
}
// CultureInfo culture = CultureInfo.CurrentCulture; // only if the whole expression or any subportion is ignoring case, and we're not using invariant
InitializeCultureForTryMatchAtCurrentPositionIfNecessary();
InitializeCultureForTryMatchAtCurrentPositionIfNecessary(analysis);
// ReadOnlySpan<char> inputSpan = input;
// int end = base.runtextend;
......@@ -1133,8 +1135,6 @@ protected void EmitTryMatchAtCurrentPosition()
int sliceStaticPos = 0;
SliceInputSpan();
AnalysisResults analysis = RegexTreeAnalyzer.Analyze(_code);
// Check whether there are captures anywhere in the expression. If there isn't, we can skip all
// the boilerplate logic around uncapturing, as there won't be anything to uncapture.
bool expressionHasCaptures = analysis.MayContainCapture(node);
......@@ -1470,7 +1470,7 @@ void EmitBackreference(RegexNode node)
{
Debug.Assert(node.Kind is RegexNodeKind.Backreference, $"Unexpected type: {node.Kind}");
int capnum = RegexParser.MapCaptureNumber(node.M, _code!.Caps);
int capnum = RegexParser.MapCaptureNumber(node.M, _regexTree!.CaptureNumberSparseMapping);
TransferSliceStaticPosToPos();
......@@ -1569,7 +1569,7 @@ void EmitBackreferenceConditional(RegexNode node)
TransferSliceStaticPosToPos();
// Get the capture number to test.
int capnum = RegexParser.MapCaptureNumber(node.M, _code!.Caps);
int capnum = RegexParser.MapCaptureNumber(node.M, _regexTree!.CaptureNumberSparseMapping);
// Get the "yes" branch and the "no" branch. The "no" branch is optional in syntax and is thus
// somewhat likely to be Empty.
......@@ -1889,8 +1889,8 @@ void EmitCapture(RegexNode node, RegexNode? subsequent = null)
Debug.Assert(node.Kind is RegexNodeKind.Capture, $"Unexpected type: {node.Kind}");
Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
int capnum = RegexParser.MapCaptureNumber(node.M, _code!.Caps);
int uncapnum = RegexParser.MapCaptureNumber(node.N, _code.Caps);
int capnum = RegexParser.MapCaptureNumber(node.M, _regexTree!.CaptureNumberSparseMapping);
int uncapnum = RegexParser.MapCaptureNumber(node.N, _regexTree.CaptureNumberSparseMapping);
bool isAtomic = analysis.IsAtomicByAncestor(node);
// pos += sliceStaticPos;
......@@ -4016,31 +4016,14 @@ protected void EmitScan(DynamicMethod tryFindNextStartingPositionMethod, Dynamic
Ret();
}
private void InitializeCultureForTryMatchAtCurrentPositionIfNecessary()
private void InitializeCultureForTryMatchAtCurrentPositionIfNecessary(AnalysisResults analysis)
{
_textInfo = null;
if ((_options & RegexOptions.CultureInvariant) == 0)
if (analysis.HasIgnoreCase && (_options & RegexOptions.CultureInvariant) == 0)
{
bool needsCulture = (_options & RegexOptions.IgnoreCase) != 0;
if (!needsCulture)
{
int[] codes = _code!.Codes;
for (int codepos = 0; codepos < codes.Length; codepos += RegexCode.OpcodeSize((RegexOpcode)codes[codepos]))
{
if (((RegexOpcode)codes[codepos] & RegexOpcode.CaseInsensitive) == RegexOpcode.CaseInsensitive)
{
needsCulture = true;
break;
}
}
}
if (needsCulture)
{
// cache CultureInfo in local variable which saves excessive thread local storage accesses
_textInfo = DeclareTextInfo();
InitLocalCultureInfo();
}
// cache CultureInfo in local variable which saves excessive thread local storage accesses
_textInfo = DeclareTextInfo();
InitLocalCultureInfo();
}
}
......
......@@ -10,9 +10,6 @@ namespace System.Text.RegularExpressions
/// <summary>Contains state and provides operations related to finding the next location a match could possibly begin.</summary>
internal sealed class RegexFindOptimizations
{
/// <summary>The minimum required length an input need be to match the pattern.</summary>
/// <remarks>0 is a valid minimum length. This value may also be the max (and hence fixed) length of the expression.</remarks>
private readonly int _minRequiredLength;
/// <summary>True if the input should be processed right-to-left rather than left-to-right.</summary>
private readonly bool _rightToLeft;
/// <summary>Provides the ToLower routine for lowercasing characters.</summary>
......@@ -20,15 +17,16 @@ internal sealed class RegexFindOptimizations
/// <summary>Lookup table used for optimizing ASCII when doing set queries.</summary>
private readonly uint[]?[]? _asciiLookups;
public RegexFindOptimizations(RegexTree tree, CultureInfo culture)
public RegexFindOptimizations(RegexNode root, RegexOptions options, CultureInfo culture)
{
_rightToLeft = (tree.Options & RegexOptions.RightToLeft) != 0;
_minRequiredLength = tree.MinRequiredLength;
_rightToLeft = (options & RegexOptions.RightToLeft) != 0;
_textInfo = culture.TextInfo;
MinRequiredLength = root.ComputeMinLength();
// Compute any anchor starting the expression. If there is one, we won't need to search for anything,
// as we can just match at that single location.
LeadingAnchor = RegexPrefixAnalyzer.FindLeadingAnchor(tree.Root);
LeadingAnchor = RegexPrefixAnalyzer.FindLeadingAnchor(root);
if (_rightToLeft && LeadingAnchor == RegexNodeKind.Bol)
{
// Filter out Bol for RightToLeft, as we don't currently optimize for it.
......@@ -56,15 +54,15 @@ public RegexFindOptimizations(RegexTree tree, CultureInfo culture)
{
bool triedToComputeMaxLength = false;
TrailingAnchor = RegexPrefixAnalyzer.FindTrailingAnchor(tree.Root);
TrailingAnchor = RegexPrefixAnalyzer.FindTrailingAnchor(root);
if (TrailingAnchor is RegexNodeKind.End or RegexNodeKind.EndZ)
{
triedToComputeMaxLength = true;
if (tree.Root.ComputeMaxLength() is int maxLength)
if (root.ComputeMaxLength() is int maxLength)
{
Debug.Assert(maxLength >= _minRequiredLength, $"{maxLength} should have been greater than {_minRequiredLength} minimum");
Debug.Assert(maxLength >= MinRequiredLength, $"{maxLength} should have been greater than {MinRequiredLength} minimum");
MaxPossibleLength = maxLength;
if (_minRequiredLength == maxLength)
if (MinRequiredLength == maxLength)
{
FindMode = TrailingAnchor == RegexNodeKind.End ?
FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_End :
......@@ -74,16 +72,16 @@ public RegexFindOptimizations(RegexTree tree, CultureInfo culture)
}
}
if ((tree.Options & RegexOptions.NonBacktracking) != 0 && !triedToComputeMaxLength)
if ((options & RegexOptions.NonBacktracking) != 0 && !triedToComputeMaxLength)
{
// NonBacktracking also benefits from knowing whether the pattern is a fixed length, as it can use that
// knowledge to avoid multiple match phases in some situations.
MaxPossibleLength = tree.Root.ComputeMaxLength();
MaxPossibleLength = root.ComputeMaxLength();
}
}
// If there's a leading case-sensitive substring, just use IndexOf and inherit all of its optimizations.
string caseSensitivePrefix = RegexPrefixAnalyzer.FindCaseSensitivePrefix(tree.Root);
string caseSensitivePrefix = RegexPrefixAnalyzer.FindCaseSensitivePrefix(root);
if (caseSensitivePrefix.Length > 1)
{
LeadingCaseSensitivePrefix = caseSensitivePrefix;
......@@ -98,8 +96,8 @@ public RegexFindOptimizations(RegexTree tree, CultureInfo culture)
// If we're compiling, then the compilation process already handles sets that reduce to a single literal,
// so we can simplify and just always go for the sets.
bool dfa = (tree.Options & RegexOptions.NonBacktracking) != 0;
bool compiled = (tree.Options & RegexOptions.Compiled) != 0 && !dfa; // for now, we never generate code for NonBacktracking, so treat it as non-compiled
bool dfa = (options & RegexOptions.NonBacktracking) != 0;
bool compiled = (options & RegexOptions.Compiled) != 0 && !dfa; // for now, we never generate code for NonBacktracking, so treat it as non-compiled
bool interpreter = !compiled && !dfa;
// For interpreter, we want to employ optimizations, but we don't want to make construction significantly
......@@ -109,7 +107,7 @@ public RegexFindOptimizations(RegexTree tree, CultureInfo culture)
if (_rightToLeft)
{
// Determine a set for anything that can possibly start the expression.
if (RegexPrefixAnalyzer.FindFirstCharClass(tree, culture) is (string CharClass, bool CaseInsensitive) set)
if (RegexPrefixAnalyzer.FindFirstCharClass(root, culture) is (string CharClass, bool CaseInsensitive) set)
{
// See if the set is limited to holding only a few characters.
Span<char> scratch = stackalloc char[5]; // max optimized by IndexOfAny today
......@@ -148,10 +146,10 @@ public RegexFindOptimizations(RegexTree tree, CultureInfo culture)
// As a backup, see if we can find a literal after a leading atomic loop. That might be better than whatever sets we find, so
// we want to know whether we have one in our pocket before deciding whether to use a leading set.
(RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? literalAfterLoop = RegexPrefixAnalyzer.FindLiteralFollowingLeadingLoop(tree);
(RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? literalAfterLoop = RegexPrefixAnalyzer.FindLiteralFollowingLeadingLoop(root);
// Build up a list of all of the sets that are a fixed distance from the start of the expression.
List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? fixedDistanceSets = RegexPrefixAnalyzer.FindFixedDistanceSets(tree, culture, thorough: !interpreter);
List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? fixedDistanceSets = RegexPrefixAnalyzer.FindFixedDistanceSets(root, culture, thorough: !interpreter);
Debug.Assert(fixedDistanceSets is null || fixedDistanceSets.Count != 0);
// If we got such sets, we'll likely use them. However, if the best of them is something that doesn't support a vectorized
......@@ -214,6 +212,10 @@ public RegexFindOptimizations(RegexTree tree, CultureInfo culture)
/// <summary>Gets the trailing anchor (e.g. RegexNodeKind.End) if one exists and was computed.</summary>
public RegexNodeKind TrailingAnchor { get; }
/// <summary>Gets the minimum required length an input need be to match the pattern.</summary>
/// <remarks>0 is a valid minimum length. This value may also be the max (and hence fixed) length of the expression.</remarks>
public int MinRequiredLength { get; }
/// <summary>The maximum possible length an input could be to match the pattern.</summary>
/// <remarks>
/// This is currently only set when <see cref="TrailingAnchor"/> is found to be an end anchor.
......@@ -246,7 +248,7 @@ public bool TryFindNextStartingPosition(ReadOnlySpan<char> textSpan, ref int pos
// Return early if we know there's not enough input left to match.
if (!_rightToLeft)
{
if (pos > end - _minRequiredLength)
if (pos > end - MinRequiredLength)
{
pos = end;
return false;
......@@ -254,7 +256,7 @@ public bool TryFindNextStartingPosition(ReadOnlySpan<char> textSpan, ref int pos
}
else
{
if (pos - _minRequiredLength < beginning)
if (pos - MinRequiredLength < beginning)
{
pos = beginning;
return false;
......@@ -351,16 +353,16 @@ public bool TryFindNextStartingPosition(ReadOnlySpan<char> textSpan, ref int pos
return true;
case FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ:
if (pos < end - _minRequiredLength - 1)
if (pos < end - MinRequiredLength - 1)
{
pos = end - _minRequiredLength - 1;
pos = end - MinRequiredLength - 1;
}
return true;
case FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_End:
if (pos < end - _minRequiredLength)
if (pos < end - MinRequiredLength)
{
pos = end - _minRequiredLength;
pos = end - MinRequiredLength;
}
return true;
......@@ -522,7 +524,7 @@ public bool TryFindNextStartingPosition(ReadOnlySpan<char> textSpan, ref int pos
case FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseSensitive:
{
Debug.Assert(FixedDistanceLiteral.Distance <= _minRequiredLength);
Debug.Assert(FixedDistanceLiteral.Distance <= MinRequiredLength);
int i = textSpan.Slice(pos + FixedDistanceLiteral.Distance, end - pos - FixedDistanceLiteral.Distance).IndexOf(FixedDistanceLiteral.Literal);
if (i >= 0)
......@@ -537,7 +539,7 @@ public bool TryFindNextStartingPosition(ReadOnlySpan<char> textSpan, ref int pos
case FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseInsensitive:
{
Debug.Assert(FixedDistanceLiteral.Distance <= _minRequiredLength);
Debug.Assert(FixedDistanceLiteral.Distance <= MinRequiredLength);
char ch = FixedDistanceLiteral.Literal;
TextInfo ti = _textInfo;
......@@ -562,7 +564,7 @@ public bool TryFindNextStartingPosition(ReadOnlySpan<char> textSpan, ref int pos
{
List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)> sets = FixedDistanceSets!;
(char[]? primaryChars, string primarySet, int primaryDistance, _) = sets[0];
int endMinusRequiredLength = end - Math.Max(1, _minRequiredLength);
int endMinusRequiredLength = end - Math.Max(1, MinRequiredLength);
if (primaryChars is not null)
{
......@@ -637,7 +639,7 @@ public bool TryFindNextStartingPosition(ReadOnlySpan<char> textSpan, ref int pos
List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)> sets = FixedDistanceSets!;
(_, string primarySet, int primaryDistance, _) = sets[0];
int endMinusRequiredLength = end - Math.Max(1, _minRequiredLength);
int endMinusRequiredLength = end - Math.Max(1, MinRequiredLength);
TextInfo ti = _textInfo;
ref uint[]? startingAsciiLookup = ref _asciiLookups![0];
......
......@@ -8,12 +8,26 @@
namespace System.Text.RegularExpressions
{
/// <summary>A <see cref="RegexRunnerFactory"/> that hands out <see cref="RegexInterpreter"/> instances for one parsed regular expression.</summary>
internal sealed class RegexInterpreterFactory : RegexRunnerFactory
{
    /// <summary>The interpreter code written at construction time and passed to every runner this factory creates.</summary>
    private readonly RegexInterpreterCode _code;

    /// <summary>Writes the <see cref="RegexInterpreterCode"/> for the supplied tree and culture and stores it.</summary>
    /// <param name="tree">The parsed regular expression tree to write code for.</param>
    /// <param name="culture">The culture handed to <see cref="RegexWriter.Write"/>.</param>
    public RegexInterpreterFactory(RegexTree tree, CultureInfo culture)
    {
        _code = RegexWriter.Write(tree, culture);
    }

    /// <summary>Creates a new <see cref="RegexInterpreter"/> over the stored code.</summary>
    /// <returns>A fresh interpreter instance.</returns>
    protected internal override RegexRunner CreateInstance()
    {
        // The culture is derived from the options stored with the code each time a runner is created.
        CultureInfo targetCulture = RegexParser.GetTargetCulture(_code.Options);
        return new RegexInterpreter(_code, targetCulture);
    }
}
/// <summary>Executes a block of regular expression codes while consuming input.</summary>
internal sealed class RegexInterpreter : RegexRunner
{
private const int LoopTimeoutCheckCount = 2048; // conservative value to provide reasonably-accurate timeout handling.
private readonly RegexCode _code;
private readonly RegexInterpreterCode _code;
private readonly TextInfo _textInfo;
private RegexOpcode _operator;
......@@ -21,7 +35,7 @@ internal sealed class RegexInterpreter : RegexRunner
private bool _rightToLeft;
private bool _caseInsensitive;
public RegexInterpreter(RegexCode code, CultureInfo culture)
public RegexInterpreter(RegexInterpreterCode code, CultureInfo culture)
{
Debug.Assert(code != null, "code must not be null.");
Debug.Assert(culture != null, "culture must not be null.");
......
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Collections;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Globalization;
namespace System.Text.RegularExpressions
{
/// <summary>Representation of a regular expression, written by <see cref="RegexWriter"/> and containing the code evaluated by <see cref="RegexInterpreter"/>.</summary>
/// <remarks>It currently stores some data used by engines other than the interpreter; that can be refactored out in the future.</remarks>
internal sealed class RegexCode
/// <summary>Contains the code, written by <see cref="RegexWriter"/>, for <see cref="RegexInterpreter"/> to evaluate a regular expression.</summary>
internal sealed class RegexInterpreterCode
{
/// <summary>The optimized parse tree.</summary>
public readonly RegexTree Tree;
/// <summary>Find logic to use to find the next possible location for a match.</summary>
public readonly RegexFindOptimizations FindOptimizations;
/// <summary>The options associated with the regex.</summary>
public readonly RegexOptions Options;
/// <summary>RegexOpcodes and arguments written by <see cref="RegexWriter"/>.</summary>
public readonly int[] Codes;
/// <summary>The string / set table. <see cref="Codes"/> includes offsets into this table, for string and set arguments.</summary>
......@@ -22,26 +21,15 @@ internal sealed class RegexCode
public readonly uint[]?[] StringsAsciiLookup;
/// <summary>How many instructions in <see cref="Codes"/> use backtracking.</summary>
public readonly int TrackCount;
/// <summary>Mapping of user group numbers to impl group slots.</summary>
public readonly Hashtable? Caps;
/// <summary>Number of impl group slots.</summary>
public readonly int CapSize;
/// <summary>True if right to left.</summary>
public readonly bool RightToLeft;
/// <summary>Optimization mode and supporting data to enable quickly finding the next possible match location.</summary>
public readonly RegexFindOptimizations FindOptimizations;
public RegexCode(RegexTree tree, CultureInfo culture, int[] codes, string[] strings, int trackcount, Hashtable? caps, int capsize)
public RegexInterpreterCode(RegexFindOptimizations findOptimizations, RegexOptions options, int[] codes, string[] strings, int trackcount)
{
Tree = tree;
FindOptimizations = findOptimizations;
Options = options;
Codes = codes;
Strings = strings;
StringsAsciiLookup = new uint[strings.Length][];
TrackCount = trackcount;
Caps = caps;
CapSize = capsize;
RightToLeft = (tree.Options & RegexOptions.RightToLeft) != 0;
FindOptimizations = new RegexFindOptimizations(tree, culture);
}
/// <summary>Gets whether the specified opcode may incur backtracking.</summary>
......@@ -152,8 +140,7 @@ public override string ToString()
{
var sb = new StringBuilder();
sb.AppendLine($"Direction: {(RightToLeft ? "right-to-left" : "left-to-right")}");
sb.AppendLine($"Anchor: {FindOptimizations.LeadingAnchor}");
sb.AppendLine($"Direction: {((Options & RegexOptions.RightToLeft) != 0 ? "right-to-left" : "left-to-right")}");
sb.AppendLine();
for (int i = 0; i < Codes.Length; i += OpcodeSize((RegexOpcode)Codes[i]))
{
......
......@@ -30,14 +30,14 @@ internal sealed class RegexLWCGCompiler : RegexCompiler
private static int s_regexCount;
/// <summary>The top-level driver. Initializes everything then calls the Generate* methods.</summary>
public RegexRunnerFactory? FactoryInstanceFromCode(string pattern, RegexCode code, RegexOptions options, bool hasTimeout)
public RegexRunnerFactory? FactoryInstanceFromCode(string pattern, RegexTree regexTree, RegexOptions options, bool hasTimeout)
{
if (!code.Tree.Root.SupportsCompilation())
if (!regexTree.Root.SupportsCompilation())
{
return null;
}
_code = code;
_regexTree = regexTree;
_options = options;
_hasTimeout = hasTimeout;
......
......@@ -497,8 +497,29 @@ private void EliminateEndingBacktracking()
/// <summary>
/// Removes redundant nodes from the subtree, and returns an optimized subtree.
/// </summary>
internal RegexNode Reduce() =>
Kind switch
internal RegexNode Reduce()
{
// TODO: https://github.com/dotnet/runtime/issues/61048
// As part of overhauling IgnoreCase handling, the parser shouldn't produce any nodes other than Backreference
// that ever have IgnoreCase set on them. For now, though, remove IgnoreCase from any nodes for which it
// has no behavioral effect.
switch (Kind)
{
default:
// No effect
Options &= ~RegexOptions.IgnoreCase;
break;
case RegexNodeKind.One or RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic:
case RegexNodeKind.Notone or RegexNodeKind.Notonelazy or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic:
case RegexNodeKind.Set or RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic:
case RegexNodeKind.Multi:
case RegexNodeKind.Backreference:
// Still meaningful
break;
}
return Kind switch
{
RegexNodeKind.Alternate => ReduceAlternation(),
RegexNodeKind.Atomic => ReduceAtomic(),
......@@ -512,6 +533,7 @@ Kind switch
RegexNodeKind.BackreferenceConditional => ReduceTestref(),
_ => this,
};
}
/// <summary>Remove an unnecessary Concatenation or Alternation node</summary>
/// <remarks>
......
......@@ -5,7 +5,7 @@ namespace System.Text.RegularExpressions
{
/// <summary>Opcodes written by <see cref="RegexWriter"/> and used by <see cref="RegexInterpreter"/> to process a regex.</summary>
/// <remarks>
/// <see cref="RegexCode"/> stores an int[] containing all of the codes that make up the instructions for
/// <see cref="RegexInterpreterCode"/> stores an int[] containing all of the codes that make up the instructions for
/// the interpreter to process the regular expression. The array contains a packed sequence of operations,
/// each of which is an <see cref="RegexOpcode"/> stored as an int, followed immediately by all of the operands
/// required for that operation. For example, the subexpression `a{2,7}[^b]` would be represented as the sequence
......
......@@ -76,28 +76,41 @@ private RegexParser(string pattern, RegexOptions options, CultureInfo culture, H
_ignoreNextParen = false;
}
/// <summary>
/// Convenience constructor that forwards to the full constructor, supplying a new empty
/// <see cref="Hashtable"/>, a default value, and null for the three arguments that sit between
/// <paramref name="culture"/> and <paramref name="optionSpan"/>.
/// </summary>
/// <param name="pattern">The pattern text, forwarded unchanged.</param>
/// <param name="options">The regex options, forwarded unchanged.</param>
/// <param name="culture">The culture, forwarded unchanged.</param>
/// <param name="optionSpan">Caller-provided scratch span, forwarded unchanged.</param>
private RegexParser(string pattern, RegexOptions options, CultureInfo culture, Span<int> optionSpan)
: this(pattern, options, culture, new Hashtable(), default, null, optionSpan)
{
}
/// <summary>Selects the culture implied by the specified options.</summary>
/// <param name="options">The regex options to inspect.</param>
/// <returns>
/// <see cref="CultureInfo.InvariantCulture"/> when <see cref="RegexOptions.CultureInvariant"/> is set;
/// otherwise <see cref="CultureInfo.CurrentCulture"/>.
/// </returns>
internal static CultureInfo GetTargetCulture(RegexOptions options)
{
    if ((options & RegexOptions.CultureInvariant) == 0)
    {
        return CultureInfo.CurrentCulture;
    }

    return CultureInfo.InvariantCulture;
}
public static RegexTree Parse(string pattern, RegexOptions options, CultureInfo culture)
{
var parser = new RegexParser(pattern, options, culture, stackalloc int[OptionStackDefaultSize]);
using var parser = new RegexParser(pattern, options, culture, new Hashtable(), 0, null, stackalloc int[OptionStackDefaultSize]);
parser.CountCaptures();
parser.Reset(options);
RegexNode root = parser.ScanRegex();
int minRequiredLength = root.ComputeMinLength();
string[]? capnamelist = parser._capnamelist?.ToArray();
var tree = new RegexTree(root, parser._caps, parser._capnumlist!, parser._captop, parser._capnames!, capnamelist!, options, minRequiredLength);
parser.Dispose();
return tree;
int[]? captureNumberList = parser._capnumlist;
Hashtable? sparseMapping = parser._caps;
int captop = parser._captop;
int captureCount;
if (captureNumberList == null || captop == captureNumberList.Length)
{
// The capture list isn't sparse. Null out the capture mapping as it's not necessary,
// and store the number of captures.
captureCount = captop;
sparseMapping = null;
}
else
{
// The capture list is sparse. Store the number of captures, and populate the number-to-names-list.
captureCount = captureNumberList.Length;
for (int i = 0; i < captureNumberList.Length; i++)
{
sparseMapping[captureNumberList[i]] = i;
}
}
return new RegexTree(root, captureCount, parser._capnamelist?.ToArray(), parser._capnames!, sparseMapping, options, culture);
}
/// <summary>
......@@ -106,11 +119,10 @@ public static RegexTree Parse(string pattern, RegexOptions options, CultureInfo
public static RegexReplacement ParseReplacement(string pattern, RegexOptions options, Hashtable caps, int capsize, Hashtable capnames)
{
CultureInfo culture = (options & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture;
var parser = new RegexParser(pattern, options, culture, caps, capsize, capnames, stackalloc int[OptionStackDefaultSize]);
using var parser = new RegexParser(pattern, options, culture, caps, capsize, capnames, stackalloc int[OptionStackDefaultSize]);
RegexNode root = parser.ScanReplacement();
var regexReplacement = new RegexReplacement(pattern, root, caps);
parser.Dispose();
return regexReplacement;
}
......@@ -198,7 +210,7 @@ public static string Unescape(string input)
private static string UnescapeImpl(string input, int i)
{
var parser = new RegexParser(input, RegexOptions.None, CultureInfo.InvariantCulture, stackalloc int[OptionStackDefaultSize]);
using var parser = new RegexParser(input, RegexOptions.None, CultureInfo.InvariantCulture, new Hashtable(), 0, null, stackalloc int[OptionStackDefaultSize]);
// In the worst case the escaped string has the same length.
// For small inputs we use stack allocation.
......@@ -226,8 +238,6 @@ private static string UnescapeImpl(string input, int i)
vsb.Append(input.AsSpan(lastpos, i - lastpos));
} while (i < input.Length);
parser.Dispose();
return vsb.ToString();
}
......
......@@ -191,12 +191,11 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb)
}
/// <summary>Finds sets at fixed-offsets from the beginning of the pattern.</summary>
/// <param name="tree">The RegexNode tree.</param>
/// <param name="root">The RegexNode tree root.</param>
/// <param name="culture">The culture to use for any case conversions.</param>
/// <param name="thorough">true to spend more time finding sets (e.g. through alternations); false to do a faster analysis that's potentially more incomplete.</param>
/// <returns>The array of found sets, or null if there aren't any.</returns>
public static List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? FindFixedDistanceSets(
RegexTree tree, CultureInfo culture, bool thorough)
public static List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? FindFixedDistanceSets(RegexNode root, CultureInfo culture, bool thorough)
{
const int MaxLoopExpansion = 20; // arbitrary cut-off to avoid loops adding significant overhead to processing
const int MaxFixedResults = 50; // arbitrary cut-off to avoid generating lots of sets unnecessarily
......@@ -204,13 +203,7 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb)
// Find all fixed-distance sets.
var results = new List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>();
int distance = 0;
TryFindFixedSets(tree.Root, results, ref distance, culture, thorough);
#if DEBUG
foreach ((char[]? Chars, string Set, int Distance, bool CaseInsensitive) result in results)
{
Debug.Assert(result.Distance <= tree.MinRequiredLength, $"Min: {tree.MinRequiredLength}, Distance: {result.Distance}, Tree: {tree}");
}
#endif
TryFindFixedSets(root, results, ref distance, culture, thorough);
// Remove any sets that match everything; they're not helpful. (This check exists primarily to weed
// out use of . in Singleline mode.)
......@@ -233,7 +226,7 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb)
// doesn't.
if (results.Count == 0)
{
(string CharClass, bool CaseInsensitive)? first = FindFirstCharClass(tree, culture);
(string CharClass, bool CaseInsensitive)? first = FindFirstCharClass(root, culture);
if (first is not null)
{
results.Add((null, first.Value.CharClass, 0, first.Value.CaseInsensitive));
......@@ -540,10 +533,10 @@ static bool TryFindFixedSets(RegexNode node, List<(char[]? Chars, string Set, in
/// variable position, but this will find [ab] as it's instead looking for anything that under any
/// circumstance could possibly start a match.
/// </summary>
public static (string CharClass, bool CaseInsensitive)? FindFirstCharClass(RegexTree tree, CultureInfo culture)
public static (string CharClass, bool CaseInsensitive)? FindFirstCharClass(RegexNode root, CultureInfo culture)
{
var s = new RegexPrefixAnalyzer(stackalloc int[StackBufferSize]);
RegexFC? fc = s.RegexFCFromRegexTree(tree);
RegexFC? fc = s.RegexFCFromRegexTree(root);
s.Dispose();
if (fc == null || fc._nullable)
......@@ -563,9 +556,8 @@ public static (string CharClass, bool CaseInsensitive)? FindFirstCharClass(Regex
/// Analyzes the pattern for a leading set loop followed by a non-overlapping literal. If such a pattern is found, an implementation
/// can search for the literal and then walk backward through all matches for the loop until the beginning is found.
/// </summary>
public static (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? FindLiteralFollowingLeadingLoop(RegexTree tree)
public static (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? FindLiteralFollowingLeadingLoop(RegexNode node)
{
RegexNode node = tree.Root;
if ((node.Options & RegexOptions.RightToLeft) != 0)
{
// As a simplification, ignore RightToLeft.
......@@ -788,9 +780,9 @@ private RegexFC PopFC()
/// through the tree and calls CalculateFC to emits code before
/// and after each child of an interior node, and at each leaf.
/// </summary>
private RegexFC? RegexFCFromRegexTree(RegexTree tree)
private RegexFC? RegexFCFromRegexTree(RegexNode root)
{
RegexNode? curNode = tree.Root;
RegexNode? curNode = root;
int curChild = 0;
while (true)
......
......@@ -2,31 +2,79 @@
// The .NET Foundation licenses this file to you under the MIT license.
using System.Collections;
using System.Diagnostics;
using System.Globalization;
namespace System.Text.RegularExpressions
{
/// <summary>Wrapper for a node tree with additional information attached.</summary>
/// <summary>
/// Provides the core data describing a parsed <see cref="RegexNode"/> tree, along with necessary
/// information about captures in the tree and computed optimizations about its structure.
/// </summary>
internal sealed class RegexTree
{
public readonly RegexNode Root;
public readonly Hashtable Caps;
public readonly int[] CapNumList;
public readonly int CapTop;
public readonly Hashtable CapNames;
public readonly string[] CapsList;
/// <summary>The options associated with the regular expression.</summary>
public readonly RegexOptions Options;
public readonly int MinRequiredLength;
/// <summary>The root node of the parsed <see cref="RegexNode"/> tree.</summary>
public readonly RegexNode Root;
/// <summary>The "find" optimizations computed for the regular expression to quickly find the next viable location to start looking for a match.</summary>
public readonly RegexFindOptimizations FindOptimizations;
/// <summary>The number of captures in the regex.</summary>
public readonly int CaptureCount;
/// <summary>A list of all the captures' names.</summary>
/// <remarks>
/// For numbered (implicitly or explicitly) captures, these are string representations of the numbers. This may be null if all captures were numbered
/// and dense, e.g. for `(a)(bc)(def)` and `(?&lt;1&gt;a)(?&lt;2&gt;bc)(?&lt;3&gt;def)` this will be null, but it will be non-null for
/// `(?&lt;1&gt;a)(?&lt;2&gt;bc)(?&lt;4&gt;def)` as well as for `(?&lt;2&gt;a)(?&lt;3&gt;bc)(?&lt;4&gt;def)`, as the groups now have a gap in the numbering.
/// </remarks>
public readonly string[]? CaptureNames;
/// <summary>A mapping of capture group name to capture group number.</summary>
/// <remarks>This is null iff <see cref="CaptureNames"/> is null.</remarks>
public readonly Hashtable? CaptureNameToNumberMapping;
/// <summary>A mapping of capture group number to the associated name slot in <see cref="CaptureNames"/>.</summary>
/// <remarks>
/// This is non-null if the mapping is sparse. If non-null, each key/value pair entry represents one capture group, where the key is the
/// capture group number and the value is the index into <see cref="CaptureNames"/> for that capture group.
/// </remarks>
public readonly Hashtable? CaptureNumberSparseMapping;
internal RegexTree(RegexNode root, Hashtable caps, int[] capNumList, int capTop, Hashtable capNames, string[] capsList, RegexOptions options, int minRequiredLength)
internal RegexTree(RegexNode root, int captureCount, string[]? captureNames, Hashtable? captureNameToNumberMapping, Hashtable? captureNumberSparseMapping, RegexOptions options, CultureInfo culture)
{
#if DEBUG
// Asserts to both demonstrate and validate the relationships between the various capture data structures.
Debug.Assert(captureNumberSparseMapping is null || captureNames is not null);
Debug.Assert((captureNames is null) == (captureNameToNumberMapping is null));
Debug.Assert(captureNames is null || captureCount == captureNames.Length);
Debug.Assert(captureNumberSparseMapping is null || captureCount == captureNumberSparseMapping.Count);
Debug.Assert(captureNameToNumberMapping is null || captureCount == captureNameToNumberMapping.Count);
if (captureNames is not null)
{
Debug.Assert(captureNameToNumberMapping is not null);
for (int i = 0; i < captureNames.Length; i++)
{
string captureName = captureNames[i];
int? captureNumber = captureNameToNumberMapping[captureName] as int?;
Debug.Assert(captureNumber is not null);
if (captureNumberSparseMapping is not null)
{
captureNumber = captureNumberSparseMapping[captureNumber] as int?;
Debug.Assert(captureNumber is not null);
}
Debug.Assert(captureNumber == i);
}
}
#endif
Root = root;
Caps = caps;
CapNumList = capNumList;
CapTop = capTop;
CapNames = capNames;
CapsList = capsList;
CaptureNumberSparseMapping = captureNumberSparseMapping;
CaptureCount = captureCount;
CaptureNameToNumberMapping = captureNameToNumberMapping;
CaptureNames = captureNames;
Options = options;
MinRequiredLength = minRequiredLength;
FindOptimizations = new RegexFindOptimizations(root, options, culture);
}
}
}
......@@ -9,11 +9,11 @@ namespace System.Text.RegularExpressions
/// <summary>Analyzes a <see cref="RegexTree"/> of <see cref="RegexNode"/>s to produce data on the tree structure, in particular in support of code generation.</summary>
internal static class RegexTreeAnalyzer
{
/// <summary>Analyzes a <see cref="RegexCode"/> to learn about the structure of the tree.</summary>
public static AnalysisResults Analyze(RegexCode code)
/// <summary>Analyzes a <see cref="RegexTree"/> to learn about the structure of the tree.</summary>
public static AnalysisResults Analyze(RegexTree regexTree)
{
var results = new AnalysisResults(code);
results._complete = TryAnalyze(code.Tree.Root, results, isAtomicByAncestor: true);
var results = new AnalysisResults(regexTree);
results._complete = TryAnalyze(regexTree.Root, results, isAtomicByAncestor: true);
return results;
static bool TryAnalyze(RegexNode node, AnalysisResults results, bool isAtomicByAncestor)
......@@ -23,6 +23,9 @@ static bool TryAnalyze(RegexNode node, AnalysisResults results, bool isAtomicByA
return false;
}
// Track whether we've seen any node with IgnoreCase set.
results._hasIgnoreCase |= (node.Options & RegexOptions.IgnoreCase) != 0;
if (isAtomicByAncestor)
{
// We've been told by our parent that we should be considered atomic, so add ourselves
......@@ -144,13 +147,15 @@ internal sealed class AnalysisResults
internal readonly HashSet<RegexNode> _containsCapture = new(); // the root is a capture, so this will always contain at least the root node
/// <summary>Set of nodes that directly or indirectly contain backtracking constructs that aren't hidden internally by atomic constructs.</summary>
internal HashSet<RegexNode>? _mayBacktrack;
/// <summary>Whether any node has <see cref="RegexOptions.IgnoreCase"/> set.</summary>
internal bool _hasIgnoreCase;
/// <summary>Initializes the instance.</summary>
/// <param name="code">The code being analyzed.</param>
internal AnalysisResults(RegexCode code) => Code = code;
/// <param name="regexTree">The tree being analyzed.</param>
internal AnalysisResults(RegexTree regexTree) => RegexTree = regexTree;
/// <summary>Gets the tree that was analyzed.</summary>
public RegexCode Code { get; }
public RegexTree RegexTree { get; }
/// <summary>Gets whether a node is considered atomic based on its ancestry.</summary>
public bool IsAtomicByAncestor(RegexNode node) => _isAtomicByAncestor.Contains(node);
......@@ -168,5 +173,8 @@ internal sealed class AnalysisResults
/// true for any node that requires backtracking.
/// </remarks>
public bool MayBacktrack(RegexNode node) => !_complete || (_mayBacktrack?.Contains(node) ?? false);
/// <summary>Gets whether a node might have <see cref="RegexOptions.IgnoreCase"/> set.</summary>
/// <remarks>
/// A false result means the analysis completed and no visited node had <see cref="RegexOptions.IgnoreCase"/> set.
/// A true result is conservative: if the analysis did not complete, not every node was examined, so this
/// reports true rather than claiming that no such node exists.
/// </remarks>
public bool HasIgnoreCase => !_complete || _hasIgnoreCase;
}
}
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
......@@ -21,10 +20,11 @@ namespace System.Text.RegularExpressions
private const int EmittedSize = 64;
private const int IntStackSize = 32;
private readonly RegexTree _tree;
private readonly CultureInfo _culture;
private readonly Dictionary<string, int> _stringTable;
private ValueListBuilder<int> _emitted;
private ValueListBuilder<int> _intStack;
private Hashtable? _caps;
private int _trackCount;
#if DEBUG
......@@ -35,66 +35,50 @@ static RegexWriter()
}
#endif
private RegexWriter(Span<int> emittedSpan, Span<int> intStackSpan)
private RegexWriter(RegexTree tree, CultureInfo culture, Span<int> emittedSpan, Span<int> intStackSpan)
{
_tree = tree;
_culture = culture;
_emitted = new ValueListBuilder<int>(emittedSpan);
_intStack = new ValueListBuilder<int>(intStackSpan);
_stringTable = new Dictionary<string, int>();
_caps = null;
_trackCount = 0;
}
/// <summary>
/// This is the only function that should be called from outside.
/// It takes a <see cref="RegexTree"/> and creates a corresponding <see cref="RegexCode"/>.
/// Return rented buffers.
/// </summary>
public static RegexCode Write(RegexTree tree, CultureInfo culture)
public void Dispose()
{
using var writer = new RegexWriter(stackalloc int[EmittedSize], stackalloc int[IntStackSize]);
return writer.RegexCodeFromRegexTree(tree, culture);
_emitted.Dispose();
_intStack.Dispose();
}
/// <summary>
/// Return rented buffers.
/// This is the only function that should be called from outside.
/// It takes a <see cref="RegexTree"/> and creates a corresponding <see cref="RegexInterpreterCode"/>.
/// </summary>
public void Dispose()
public static RegexInterpreterCode Write(RegexTree tree, CultureInfo culture)
{
_emitted.Dispose();
_intStack.Dispose();
using var writer = new RegexWriter(tree, culture, stackalloc int[EmittedSize], stackalloc int[IntStackSize]);
return writer.EmitCode();
}
/// <summary>
/// The top level RegexCode generator. It does a depth-first walk
/// The top level RegexInterpreterCode generator. It does a depth-first walk
/// through the tree and calls EmitFragment to emit code before
/// and after each child of an interior node and at each leaf.
/// It also computes various information about the tree, such as
/// prefix data to help with optimizations.
/// </summary>
public RegexCode RegexCodeFromRegexTree(RegexTree tree, CultureInfo culture)
private RegexInterpreterCode EmitCode()
{
// Construct sparse capnum mapping if some numbers are unused.
int capsize;
if (tree.CapNumList == null || tree.CapTop == tree.CapNumList.Length)
{
capsize = tree.CapTop;
_caps = null;
}
else
{
capsize = tree.CapNumList.Length;
_caps = tree.Caps;
for (int i = 0; i < tree.CapNumList.Length; i++)
{
_caps[tree.CapNumList[i]] = i;
}
}
// Every written code begins with a lazy branch. This will be back-patched
// to point to the ending Stop after the whole expression has been written.
Emit(RegexOpcode.Lazybranch, 0);
// Emit every node.
RegexNode curNode = tree.Root;
RegexNode curNode = _tree.Root;
int curChild = 0;
while (true)
{
......@@ -138,7 +122,7 @@ public RegexCode RegexCodeFromRegexTree(RegexTree tree, CultureInfo culture)
}
// Return all that in a RegexInterpreterCode object.
return new RegexCode(tree, culture, emitted, strings, _trackCount, _caps, capsize);
return new RegexInterpreterCode(_tree.FindOptimizations, _tree.Options, emitted, strings, _trackCount);
}
/// <summary>
......@@ -157,7 +141,7 @@ private void PatchJump(int offset, int jumpDest)
/// </summary>
private void Emit(RegexOpcode op)
{
if (RegexCode.OpcodeBacktracks(op))
if (RegexInterpreterCode.OpcodeBacktracks(op))
{
_trackCount++;
}
......@@ -168,7 +152,7 @@ private void Emit(RegexOpcode op)
/// <summary>Emits a one-argument operation.</summary>
private void Emit(RegexOpcode op, int opd1)
{
if (RegexCode.OpcodeBacktracks(op))
if (RegexInterpreterCode.OpcodeBacktracks(op))
{
_trackCount++;
}
......@@ -180,7 +164,7 @@ private void Emit(RegexOpcode op, int opd1)
/// <summary>Emits a two-argument operation.</summary>
private void Emit(RegexOpcode op, int opd1, int opd2)
{
if (RegexCode.OpcodeBacktracks(op))
if (RegexInterpreterCode.OpcodeBacktracks(op))
{
_trackCount++;
}
......@@ -270,7 +254,7 @@ private void EmitFragment(RegexNodeKind nodeType, RegexNode node, int curIndex)
Emit(RegexOpcode.Setjump);
_intStack.Append(_emitted.Length);
Emit(RegexOpcode.Lazybranch, 0);
Emit(RegexOpcode.TestBackreference, RegexParser.MapCaptureNumber(node.M, _caps));
Emit(RegexOpcode.TestBackreference, RegexParser.MapCaptureNumber(node.M, _tree.CaptureNumberSparseMapping));
Emit(RegexOpcode.Forejump);
break;
}
......@@ -368,7 +352,7 @@ private void EmitFragment(RegexNodeKind nodeType, RegexNode node, int curIndex)
break;
case RegexNodeKind.Capture | AfterChild:
Emit(RegexOpcode.Capturemark, RegexParser.MapCaptureNumber(node.M, _caps), RegexParser.MapCaptureNumber(node.N, _caps));
Emit(RegexOpcode.Capturemark, RegexParser.MapCaptureNumber(node.M, _tree.CaptureNumberSparseMapping), RegexParser.MapCaptureNumber(node.N, _tree.CaptureNumberSparseMapping));
break;
case RegexNodeKind.PositiveLookaround | BeforeChild:
......@@ -448,7 +432,7 @@ private void EmitFragment(RegexNodeKind nodeType, RegexNode node, int curIndex)
break;
case RegexNodeKind.Backreference:
Emit((RegexOpcode)node.Kind | bits, RegexParser.MapCaptureNumber(node.M, _caps));
Emit((RegexOpcode)node.Kind | bits, RegexParser.MapCaptureNumber(node.M, _tree.CaptureNumberSparseMapping));
break;
case RegexNodeKind.Nothing:
......
......@@ -17,7 +17,7 @@ internal sealed class RegexNodeConverter
/// <summary>The culture to use for IgnoreCase comparisons.</summary>
private readonly CultureInfo _culture;
/// <summary>Capture information.</summary>
private readonly Hashtable? _caps;
private readonly Hashtable? _captureSparseMapping;
/// <summary>The builder to use to create the <see cref="SymbolicRegexNode{S}"/> nodes.</summary>
internal readonly SymbolicRegexBuilder<BDD> _builder;
......@@ -26,10 +26,10 @@ internal sealed class RegexNodeConverter
private Dictionary<(bool IgnoreCase, string Set), BDD>? _setBddCache;
/// <summary>Constructs a regex to symbolic finite automata converter</summary>
public RegexNodeConverter(CultureInfo culture, Hashtable? caps)
public RegexNodeConverter(CultureInfo culture, Hashtable? captureSparseMapping)
{
_culture = culture;
_caps = caps;
_captureSparseMapping = captureSparseMapping;
_builder = new SymbolicRegexBuilder<BDD>(CharSetSolver.Instance);
}
......@@ -133,11 +133,7 @@ public SymbolicRegexNode<BDD> ConvertToSymbolicRegexNode(RegexNode node, bool tr
// Other constructs
case RegexNodeKind.Capture when node.N == -1: // N == -1 because balancing groups aren't supported
int captureNum;
if (_caps == null || !_caps.TryGetValue(node.M, out captureNum))
{
captureNum = node.M;
}
int captureNum = RegexParser.MapCaptureNumber(node.M, _captureSparseMapping);
return _builder.CreateCapture(ConvertToSymbolicRegexNode(node.Child(0), tryCreateFixedLengthMarker), captureNum);
case RegexNodeKind.Empty:
......
......@@ -147,7 +147,7 @@ private TSetType GetMinterm(int c)
}
/// <summary>Constructs matcher for given symbolic regex.</summary>
internal SymbolicRegexMatcher(SymbolicRegexNode<TSetType> sr, RegexCode code, BDD[] minterms, TimeSpan matchTimeout)
internal SymbolicRegexMatcher(SymbolicRegexNode<TSetType> sr, RegexTree regexTree, BDD[] minterms, TimeSpan matchTimeout)
{
Debug.Assert(sr._builder._solver is BV64Algebra or BVAlgebra or CharSetSolver, $"Unsupported algebra: {sr._builder._solver}");
......@@ -161,17 +161,17 @@ internal SymbolicRegexMatcher(SymbolicRegexNode<TSetType> sr, RegexCode code, BD
BVAlgebra bv => bv._classifier,
_ => new MintermClassifier((CharSetSolver)(object)_builder._solver, minterms),
};
_capsize = code.CapSize;
_capsize = regexTree.CaptureCount;
if (code.Tree.MinRequiredLength == code.FindOptimizations.MaxPossibleLength)
if (regexTree.FindOptimizations.MinRequiredLength == regexTree.FindOptimizations.MaxPossibleLength)
{
_fixedMatchLength = code.Tree.MinRequiredLength;
_fixedMatchLength = regexTree.FindOptimizations.MinRequiredLength;
}
if (code.FindOptimizations.FindMode != FindNextStartingPositionMode.NoSearch &&
code.FindOptimizations.LeadingAnchor == 0) // If there are any anchors, we're better off letting the DFA quickly do its job of determining whether there's a match.
if (regexTree.FindOptimizations.FindMode != FindNextStartingPositionMode.NoSearch &&
regexTree.FindOptimizations.LeadingAnchor == 0) // If there are any anchors, we're better off letting the DFA quickly do its job of determining whether there's a match.
{
_findOpts = code.FindOptimizations;
_findOpts = regexTree.FindOptimizations;
}
// Determine the number of initial states. If there's no anchor, only the default previous
......
......@@ -13,7 +13,7 @@ internal sealed class SymbolicRegexRunnerFactory : RegexRunnerFactory
internal readonly SymbolicRegexMatcher _matcher;
/// <summary>Initializes the factory.</summary>
public SymbolicRegexRunnerFactory(RegexCode code, RegexOptions options, TimeSpan matchTimeout, CultureInfo culture)
public SymbolicRegexRunnerFactory(RegexTree regexTree, RegexOptions options, TimeSpan matchTimeout, CultureInfo culture)
{
// RightToLeft and ECMAScript are currently not supported in conjunction with NonBacktracking.
if ((options & (RegexOptions.RightToLeft | RegexOptions.ECMAScript)) != 0)
......@@ -23,9 +23,9 @@ public SymbolicRegexRunnerFactory(RegexCode code, RegexOptions options, TimeSpan
(options & RegexOptions.RightToLeft) != 0 ? nameof(RegexOptions.RightToLeft) : nameof(RegexOptions.ECMAScript)));
}
var converter = new RegexNodeConverter(culture, code.Caps);
var converter = new RegexNodeConverter(culture, regexTree.CaptureNumberSparseMapping);
CharSetSolver solver = CharSetSolver.Instance;
SymbolicRegexNode<BDD> root = converter.ConvertToSymbolicRegexNode(code.Tree.Root, tryCreateFixedLengthMarker: true);
SymbolicRegexNode<BDD> root = converter.ConvertToSymbolicRegexNode(regexTree.Root, tryCreateFixedLengthMarker: true);
BDD[] minterms = root.ComputeMinterms();
if (minterms.Length > 64)
......@@ -42,7 +42,7 @@ public SymbolicRegexRunnerFactory(RegexCode code, RegexOptions options, TimeSpan
// Convert the BDD-based AST to BV-based AST
SymbolicRegexNode<BV> rootBV = converter._builder.Transform(root, builderBV, bdd => builderBV._solver.ConvertFromCharSet(solver, bdd));
_matcher = new SymbolicRegexMatcher<BV>(rootBV, code, minterms, matchTimeout);
_matcher = new SymbolicRegexMatcher<BV>(rootBV, regexTree, minterms, matchTimeout);
}
else
{
......@@ -58,7 +58,7 @@ public SymbolicRegexRunnerFactory(RegexCode code, RegexOptions options, TimeSpan
// Convert the BDD-based AST to ulong-based AST
SymbolicRegexNode<ulong> root64 = converter._builder.Transform(root, builder64, bdd => builder64._solver.ConvertFromCharSet(solver, bdd));
_matcher = new SymbolicRegexMatcher<ulong>(root64, code, minterms, matchTimeout);
_matcher = new SymbolicRegexMatcher<ulong>(root64, regexTree, minterms, matchTimeout);
}
}
......
......@@ -32,7 +32,6 @@
<Compile Include="RegexCultureTests.cs" />
<Compile Include="RegexMatchTimeoutExceptionTests.cs" />
<Compile Include="RegexParserTests.cs" />
<Compile Include="RegexReductionTests.cs" />
</ItemGroup>
<ItemGroup Condition="'$(TargetFramework)' == 'net48'">
<Compile Include="..\..\src\System\Text\RegularExpressions\RegexParseError.cs" Link="System\Text\RegularExpressions\RegexParseError.cs" />
......
......@@ -130,8 +130,8 @@ public void LiteralAfterLoop(string pattern, RegexOptions options, int expectedM
private static RegexFindOptimizations ComputeOptimizations(string pattern, RegexOptions options)
{
RegexCode code = RegexWriter.Write(RegexParser.Parse(pattern, options, CultureInfo.InvariantCulture), CultureInfo.InvariantCulture);
return new RegexFindOptimizations(code.Tree, CultureInfo.InvariantCulture);
RegexTree tree = RegexParser.Parse(pattern, options, CultureInfo.InvariantCulture);
return new RegexFindOptimizations(tree.Root, options, CultureInfo.InvariantCulture);
}
}
}
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Reflection;
using System.Globalization;
using Xunit;
namespace System.Text.RegularExpressions.Tests
{
[SkipOnTargetFramework(TargetFrameworkMonikers.NetFramework, "Many of these optimizations don't exist in .NET Framework.")]
[ConditionalClass(typeof(PlatformDetection), nameof(PlatformDetection.IsNotBuiltWithAggressiveTrimming))]
public class RegexReductionTests
{
// These tests depend on using reflection to access internals of Regex in order to validate
// if, when, and how various optimizations are being employed. As implementation details
// change, these tests will need to be updated as well. Note, too, that Compiled Regexes
// null out the _code field being accessed here, so this mechanism won't work to validate
// Compiled, which also means it won't work to validate optimizations only enabled
// when using Compiled, such as auto-atomicity for the last node in a regex.
private static readonly FieldInfo s_regexCode;
private static readonly FieldInfo s_regexCodeCodes;
private static readonly FieldInfo s_regexCodeTree;
private static readonly FieldInfo s_regexCodeFindOptimizations;
private static readonly PropertyInfo s_regexCodeFindOptimizationsMaxPossibleLength;
private static readonly FieldInfo s_regexCodeTreeMinRequiredLength;
static RegexReductionTests()
{
    // On .NET Framework, or when built with aggressive trimming, the reflected members
    // may be missing; the tests are skipped there, so the handles are left unset.
    bool canReflect = !PlatformDetection.IsNetFramework && !PlatformDetection.IsBuiltWithAggressiveTrimming;
    if (!canReflect)
    {
        return;
    }

    // Every lookup below targets an instance member of any visibility.
    const BindingFlags AnyInstance = BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance;

    // Regex._code is the entry point; everything else hangs off its declared field type.
    s_regexCode = typeof(Regex).GetField("_code", AnyInstance);
    Assert.NotNull(s_regexCode);
    Type codeType = s_regexCode.FieldType;

    // _code.FindOptimizations and its MaxPossibleLength property.
    s_regexCodeFindOptimizations = codeType.GetField("FindOptimizations", AnyInstance);
    Assert.NotNull(s_regexCodeFindOptimizations);
    s_regexCodeFindOptimizationsMaxPossibleLength = s_regexCodeFindOptimizations.FieldType.GetProperty("MaxPossibleLength", AnyInstance);
    Assert.NotNull(s_regexCodeFindOptimizationsMaxPossibleLength);

    // _code.Codes
    s_regexCodeCodes = codeType.GetField("Codes", AnyInstance);
    Assert.NotNull(s_regexCodeCodes);

    // _code.Tree and its MinRequiredLength field.
    s_regexCodeTree = codeType.GetField("Tree", AnyInstance);
    Assert.NotNull(s_regexCodeTree);
    s_regexCodeTreeMinRequiredLength = s_regexCodeTree.FieldType.GetField("MinRequiredLength", AnyInstance);
    Assert.NotNull(s_regexCodeTreeMinRequiredLength);
}
private static string GetRegexCodes(Regex r)
{
    // Reads the regex's _code object via reflection and renders it as text:
    // its ToString() output, a newline, then the comma-separated contents of its Codes array.
    object codeObject = s_regexCode.GetValue(r);
    Assert.NotNull(codeObject);

    string description = codeObject.ToString();

    // The ToString output is not informative in release builds, so the raw numeric
    // codes are appended as well: less comprehensive, but present in release builds too.
    int[] opcodes = s_regexCodeCodes.GetValue(codeObject) as int[];
    Assert.NotNull(opcodes);

    return description + Environment.NewLine + string.Join(", ", opcodes);
}
private static int GetMinRequiredLength(Regex r)
{
    // Walks r._code -> Tree -> MinRequiredLength via the cached reflection handles,
    // asserting at each hop that a value was actually obtained.
    object codeObject = s_regexCode.GetValue(r);
    Assert.NotNull(codeObject);

    object treeObject = s_regexCodeTree.GetValue(codeObject);
    Assert.NotNull(treeObject);

    // Assert.IsType<int> both verifies the boxed value's exact type and hands back the unboxed int.
    return Assert.IsType<int>(s_regexCodeTreeMinRequiredLength.GetValue(treeObject));
}
private static int? GetMaxPossibleLength(Regex r)
{
    // Walks r._code -> FindOptimizations -> MaxPossibleLength via the cached reflection handles.
    object codeObject = s_regexCode.GetValue(r);
    Assert.NotNull(codeObject);

    object findOptimizations = s_regexCodeFindOptimizations.GetValue(codeObject);
    Assert.NotNull(findOptimizations);

    // The property value comes back boxed: either null or a boxed int is acceptable.
    object boxedMax = s_regexCodeFindOptimizationsMaxPossibleLength.GetValue(findOptimizations);
    bool isNullOrInt = boxedMax is null || boxedMax is int;
    Assert.True(isNullOrInt);

    return boxedMax as int?;
}
[Theory]
// Two greedy one loops
[InlineData("a*a*", "a*")]
......@@ -390,7 +303,7 @@ private static int GetMinRequiredLength(Regex r)
[InlineData("abcd|abef", "ab(?>cd|ef)")]
[InlineData("abcd|aefg", "a(?>bcd|efg)")]
[InlineData("abcd|abc|ab|a", "a(?>bcd|bc|b|)")]
[InlineData("abcde|abcdef", "abcde(?>|f)")]
// [InlineData("abcde|abcdef", "abcde(?>|f)")] // TODO https://github.com/dotnet/runtime/issues/66031: Need to reorganize optimizations to avoid an extra Empty being left at the end of the tree
[InlineData("abcdef|abcde", "abcde(?>f|)")]
[InlineData("abcdef|abcdeg|abcdeh|abcdei|abcdej|abcdek|abcdel", "abcde[f-l]")]
[InlineData("(ab|ab*)bc", "(a(?:b|b*))bc")]
......@@ -441,7 +354,7 @@ private static int GetMinRequiredLength(Regex r)
[InlineData("[ab]*[^a]*", "[ab]*(?>[^a]*)")]
[InlineData("[aa]*[^a]*", "(?>a*)(?>[^a]*)")]
[InlineData("a??", "")]
[InlineData("(abc*?)", "(ab)")]
//[InlineData("(abc*?)", "(ab)")] // TODO https://github.com/dotnet/runtime/issues/66031: Need to reorganize optimizations to avoid an extra Empty being left at the end of the tree
[InlineData("a{1,3}?", "a{1,4}?")]
[InlineData("a{2,3}?", "a{2}")]
[InlineData("bc(a){1,3}?", "bc(a){1,2}?")]
......@@ -474,13 +387,15 @@ private static int GetMinRequiredLength(Regex r)
[InlineData("(?i)\\d", "\\d")]
[InlineData("(?i).", ".")]
[InlineData("(?i)\\$", "\\$")]
public void PatternsReduceIdentically(string actual, string expected)
{
    // Parses both patterns with RegexOptions.None and the invariant culture, then compares the
    // string forms of the two resulting tree roots; the test fails if they differ.
    //
    // NOTE: RegexNode.ToString is only compiled into debug builds, so DEBUG is currently set on the unit tests project.
    string actualStr = RegexParser.Parse(actual, RegexOptions.None, CultureInfo.InvariantCulture).Root.ToString();
    string expectedStr = RegexParser.Parse(expected, RegexOptions.None, CultureInfo.InvariantCulture).Root.ToString();
    if (actualStr != expectedStr)
    {
        // EqualException's constructor takes (expected, actual); passing them in that order keeps
        // the "Expected:" / "Actual:" labels in the failure message attached to the right tree.
        throw new Xunit.Sdk.EqualException(expectedStr, actualStr);
    }
}
......@@ -554,13 +469,15 @@ public void PatternsReduceIdentically(string pattern1, string pattern2)
[InlineData("a*(?(xyz)acd|efg)", "(?>a*)(?(xyz)acd|efg)")]
[InlineData("a*(?(xyz)bcd|afg)", "(?>a*)(?(xyz)bcd|afg)")]
[InlineData("a*(?(xyz)bcd)", "(?>a*)(?(xyz)bcd)")]
public void PatternsReduceDifferently(string pattern1, string pattern2)
public void PatternsReduceDifferently(string actual, string expected)
{
string result1 = GetRegexCodes(new Regex(pattern1));
string result2 = GetRegexCodes(new Regex(pattern2));
if (result1 == result2)
// NOTE: RegexNode.ToString is only compiled into debug builds, so DEBUG is currently set on the unit tests project.
string actualStr = RegexParser.Parse(actual, RegexOptions.None, CultureInfo.InvariantCulture).Root.ToString();
string expectedStr = RegexParser.Parse(expected, RegexOptions.None, CultureInfo.InvariantCulture).Root.ToString();
if (actualStr == expectedStr)
{
throw new Xunit.Sdk.EqualException(result2, result1);
throw new Xunit.Sdk.NotEqualException(actualStr, expectedStr);
}
}
......@@ -632,29 +549,33 @@ public void PatternsReduceDifferently(string pattern1, string pattern2)
[InlineData(@"abcdef", RegexOptions.RightToLeft, 6, null)]
public void MinMaxLengthIsCorrect(string pattern, RegexOptions options, int expectedMin, int? expectedMax)
{
var r = new Regex(pattern, options);
Assert.Equal(expectedMin, GetMinRequiredLength(r));
RegexTree tree = RegexParser.Parse(pattern, options, CultureInfo.InvariantCulture);
Assert.Equal(expectedMin, tree.FindOptimizations.MinRequiredLength);
if (!pattern.EndsWith("$", StringComparison.Ordinal) &&
!pattern.EndsWith(@"\Z", StringComparison.OrdinalIgnoreCase))
{
// MaxPossibleLength is currently only computed/stored if there's a trailing End{Z} anchor as the max length is otherwise unused
r = new Regex($"(?:{pattern})$", options);
tree = RegexParser.Parse($"(?:{pattern})$", options, CultureInfo.InvariantCulture);
}
Assert.Equal(expectedMax, GetMaxPossibleLength(r));
Assert.Equal(expectedMax, tree.FindOptimizations.MaxPossibleLength);
}
[Fact]
public void MinMaxLengthIsCorrect_HugeDepth()
{
const int Depth = 10_000;
var r = new Regex($"{new string('(', Depth)}a{new string(')', Depth)}$"); // too deep for analysis on some platform default stack sizes
RegexTree tree = RegexParser.Parse($"{new string('(', Depth)}a{new string(')', Depth)}$", RegexOptions.None, CultureInfo.InvariantCulture); // too deep for analysis on some platform default stack sizes
int minRequiredLength = tree.FindOptimizations.MinRequiredLength;
int minRequiredLength = GetMinRequiredLength(r);
Assert.True(
minRequiredLength == 1 /* successfully analyzed */ || minRequiredLength == 0 /* ran out of stack space to complete analysis */,
$"Expected 1 or 0, got {minRequiredLength}");
int? maxPossibleLength = GetMaxPossibleLength(r);
int? maxPossibleLength = tree.FindOptimizations.MaxPossibleLength;
Assert.True(
maxPossibleLength == 1 /* successfully analyzed */ || maxPossibleLength is null /* ran out of stack space to complete analysis */,
$"Expected 1 or null, got {maxPossibleLength}");
......
......@@ -12,18 +12,18 @@ public class RegexTreeAnalyzerTests
[Fact]
public void SimpleString()
{
(RegexCode code, AnalysisResults analysis) = Analyze("abc");
(RegexTree tree, AnalysisResults analysis) = Analyze("abc");
RegexNode rootCapture = AssertNode(analysis, code.Tree.Root, RegexNodeKind.Capture, atomicByAncestor: true, backtracks: false, captures: true);
RegexNode rootCapture = AssertNode(analysis, tree.Root, RegexNodeKind.Capture, atomicByAncestor: true, backtracks: false, captures: true);
RegexNode abc = AssertNode(analysis, rootCapture.Child(0), RegexNodeKind.Multi, atomicByAncestor: true, backtracks: false, captures: false);
}
[Fact]
public void AlternationWithCaptures()
{
(RegexCode code, AnalysisResults analysis) = Analyze("abc|d(e)f|(ghi)");
(RegexTree tree, AnalysisResults analysis) = Analyze("abc|d(e)f|(ghi)");
RegexNode rootCapture = AssertNode(analysis, code.Tree.Root, RegexNodeKind.Capture, atomicByAncestor: true, backtracks: false, captures: true);
RegexNode rootCapture = AssertNode(analysis, tree.Root, RegexNodeKind.Capture, atomicByAncestor: true, backtracks: false, captures: true);
RegexNode implicitAtomic = AssertNode(analysis, rootCapture.Child(0), RegexNodeKind.Atomic, atomicByAncestor: true, backtracks: false, captures: true);
RegexNode alternation = AssertNode(analysis, implicitAtomic.Child(0), RegexNodeKind.Alternate, atomicByAncestor: true, backtracks: false, captures: true);
......@@ -43,9 +43,9 @@ public void AlternationWithCaptures()
[Fact]
public void LoopsReducedWithAutoAtomic()
{
(RegexCode code, AnalysisResults analysis) = Analyze("a*(b*)c*");
(RegexTree tree, AnalysisResults analysis) = Analyze("a*(b*)c*");
RegexNode rootCapture = AssertNode(analysis, code.Tree.Root, RegexNodeKind.Capture, atomicByAncestor: true, backtracks: false, captures: true);
RegexNode rootCapture = AssertNode(analysis, tree.Root, RegexNodeKind.Capture, atomicByAncestor: true, backtracks: false, captures: true);
RegexNode concat = AssertNode(analysis, rootCapture.Child(0), RegexNodeKind.Concatenate, atomicByAncestor: true, backtracks: false, captures: true);
RegexNode aStar = AssertNode(analysis, concat.Child(0), RegexNodeKind.Oneloopatomic, atomicByAncestor: false, backtracks: false, captures: false);
......@@ -59,9 +59,9 @@ public void LoopsReducedWithAutoAtomic()
[Fact]
public void AtomicGroupAroundBacktracking()
{
(RegexCode code, AnalysisResults analysis) = Analyze("[ab]*(?>[bc]*[cd])[ef]");
(RegexTree tree, AnalysisResults analysis) = Analyze("[ab]*(?>[bc]*[cd])[ef]");
RegexNode rootCapture = AssertNode(analysis, code.Tree.Root, RegexNodeKind.Capture, atomicByAncestor: true, backtracks: true, captures: true);
RegexNode rootCapture = AssertNode(analysis, tree.Root, RegexNodeKind.Capture, atomicByAncestor: true, backtracks: true, captures: true);
RegexNode rootConcat = AssertNode(analysis, rootCapture.Child(0), RegexNodeKind.Concatenate, atomicByAncestor: true, backtracks: true, captures: false);
RegexNode abStar = AssertNode(analysis, rootConcat.Child(0), RegexNodeKind.Setloop, atomicByAncestor: false, backtracks: true, captures: false);
......@@ -76,10 +76,10 @@ public void AtomicGroupAroundBacktracking()
RegexNode cd = AssertNode(analysis, atomicConcat.Child(1), RegexNodeKind.Set, atomicByAncestor: true, backtracks: false, captures: false);
}
private static (RegexCode Code, AnalysisResults Analysis) Analyze(string pattern)
private static (RegexTree Tree, AnalysisResults Analysis) Analyze(string pattern)
{
RegexCode code = RegexWriter.Write(RegexParser.Parse(pattern, RegexOptions.None, CultureInfo.InvariantCulture), CultureInfo.InvariantCulture);
return (code, RegexTreeAnalyzer.Analyze(code));
RegexTree tree = RegexParser.Parse(pattern, RegexOptions.None, CultureInfo.InvariantCulture);
return (tree, RegexTreeAnalyzer.Analyze(tree));
}
private static RegexNode AssertNode(AnalysisResults analysis, RegexNode node, RegexNodeKind kind, bool atomicByAncestor, bool backtracks, bool captures)
......
......@@ -8,11 +8,14 @@
<DebuggerSupport Condition="'$(DebuggerSupport)' == '' and '$(TargetOS)' == 'Browser'">true</DebuggerSupport>
<EmitCompilerGeneratedFiles>true</EmitCompilerGeneratedFiles>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
<DefineConstants>$(DefineConstants);DEBUG</DefineConstants> <!-- always define debug, even in release builds -->
</PropertyGroup>
<ItemGroup>
<DefaultReferenceExclusion Include="System.Text.RegularExpressions" />
<Compile Include="RegexFindOptimizationsTests.cs" />
<Compile Include="RegexReductionTests.cs" />
<Compile Include="RegexTreeAnalyzerTests.cs" />
<!-- Code included from System.Text.RegularExpressions -->
......@@ -23,7 +26,7 @@
<Compile Include="..\..\src\System\Threading\StackHelper.cs" Link="Production\StackHelper.cs" />
<Compile Include="..\..\src\System\Text\RegularExpressions\RegexCharClass.cs" Link="Production\RegexCharClass.cs" />
<Compile Include="..\..\src\System\Text\RegularExpressions\RegexCharClass.MappingTable.cs" Link="Production\RegexCharClass.MappingTable.cs" />
<Compile Include="..\..\src\System\Text\RegularExpressions\RegexCode.cs" Link="Production\RegexCode.cs" />
<Compile Include="..\..\src\System\Text\RegularExpressions\RegexInterpreterCode.cs" Link="Production\RegexInterpreterCode.cs" />
<Compile Include="..\..\src\System\Text\RegularExpressions\RegexFindOptimizations.cs" Link="Production\RegexFindOptimizations.cs" />
<Compile Include="..\..\src\System\Text\RegularExpressions\RegexNode.cs" Link="Production\RegexNode.cs" />
<Compile Include="..\..\src\System\Text\RegularExpressions\RegexNodeKind.cs" Link="Production\RegexNodeKind.cs" />
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册