From 6ebcb3d67279554e1262b0b0abea16bd18c469af Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Thu, 7 Jul 2022 18:50:38 -0700 Subject: [PATCH] NonBacktracking locking fixes and cleanup (#71234) * Concurrency fixes and refactoring for clarity Removed builder reference from SymbolicRegexNode instances; builder now has to be passed in. Since the builder is not thread safe this clarifies the locking required in the matcher when using it. Moved matching specific state from the builder to the matcher. This includes state and transition arrays. Simplify character kind code by eliminating duplication of logic. * Changes from review and cleanup DfaMatchingState is now just MatchingState * Comment on NFA mode IDs --- .../src/System.Text.RegularExpressions.csproj | 3 +- .../RegularExpressions/Symbolic/CharKind.cs | 3 + .../Symbolic/DfaMatchingState.cs | 149 ----- .../Symbolic/MatchingState.cs | 118 ++++ .../Symbolic/RegexNodeConverter.cs | 4 +- .../Symbolic/SymbolicRegexBuilder.cs | 417 +------------ .../Symbolic/SymbolicRegexInfo.cs | 87 +-- .../Symbolic/SymbolicRegexKind.cs | 2 +- .../Symbolic/SymbolicRegexMatcher.Automata.cs | 441 ++++++++++++++ .../Symbolic/SymbolicRegexMatcher.Dgml.cs | 250 ++++---- .../Symbolic/SymbolicRegexMatcher.Explore.cs | 120 ++-- .../Symbolic/SymbolicRegexMatcher.Sample.cs | 207 +++---- .../Symbolic/SymbolicRegexMatcher.cs | 569 +++++++++--------- .../Symbolic/SymbolicRegexNode.cs | 478 +++++++-------- .../Symbolic/SymbolicRegexRunnerFactory.cs | 4 +- .../src/System/Threading/StackHelper.cs | 48 ++ .../tests/UnitTests/SymbolicRegexTests.cs | 29 +- ....Text.RegularExpressions.Unit.Tests.csproj | 2 +- 18 files changed, 1493 insertions(+), 1438 deletions(-) delete mode 100644 src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs create mode 100644 src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs create mode 100644 
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj index 56a71e7e812..94490e85c60 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj +++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj @@ -63,7 +63,7 @@ - + @@ -75,6 +75,7 @@ + diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/CharKind.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/CharKind.cs index aacdf067022..9f21787baae 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/CharKind.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/CharKind.cs @@ -43,5 +43,8 @@ internal static class CharKind WordLetter => @"\w", _ => string.Empty, }; + + /// Returns whether the given value is in the range of valid character kinds. + internal static bool IsValidCharKind(uint charKind) => charKind < CharKindCount; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs deleted file mode 100644 index ecbb44415ee..00000000000 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs +++ /dev/null @@ -1,149 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. 
- -using System.Collections.Generic; -using System.Diagnostics; -using System.Runtime.CompilerServices; -using System.Net; - -namespace System.Text.RegularExpressions.Symbolic -{ - /// Captures a state of a DFA explored during matching. - internal sealed class DfaMatchingState where TSet : IComparable, IEquatable - { - internal DfaMatchingState(SymbolicRegexNode node, uint prevCharKind) - { - Node = node; - PrevCharKind = prevCharKind; - } - - internal SymbolicRegexNode Node { get; } - - internal uint PrevCharKind { get; } - - internal int Id { get; set; } - - /// This is a deadend state - internal bool IsDeadend => Node.IsNothing; - - /// The node must be nullable here - internal int FixedLength(uint nextCharKind) - { - Debug.Assert(nextCharKind is 0 or CharKind.BeginningEnd or CharKind.Newline or CharKind.WordLetter or CharKind.NewLineS); - uint context = CharKind.Context(PrevCharKind, nextCharKind); - return Node.ResolveFixedLength(context); - } - - /// If true then the state is a dead-end, rejects all inputs. - internal bool IsNothing => Node.IsNothing; - - /// If true then state starts with a ^ or $ or \Z - internal bool StartsWithLineAnchor => Node._info.StartsWithLineAnchor; - - /// - /// Translates a minterm set to a character kind, which is a general categorization of characters used - /// for cheaply deciding the nullability of anchors. - /// - /// - /// An empty set is handled as a special case to indicate the very last \n. 
- /// - /// the minterm to translate - /// the character kind of the minterm - private uint GetNextCharKind(ref TSet minterm) - { - ISolver solver = Node._builder._solver; - TSet wordLetterPredicate = Node._builder._wordLetterForBoundariesSet; - TSet newLinePredicate = Node._builder._newLineSet; - - // minterm == solver.False is used to represent the very last \n - uint nextCharKind = CharKind.General; - if (solver.Empty.Equals(minterm)) - { - nextCharKind = CharKind.NewLineS; - minterm = newLinePredicate; - } - else if (newLinePredicate.Equals(minterm)) - { - // If the previous state was the start state, mark this as the very FIRST \n. - // Essentially, this looks the same as the very last \n and is used to nullify - // rev(\Z) in the conext of a reversed automaton. - nextCharKind = PrevCharKind == CharKind.BeginningEnd ? - CharKind.NewLineS : - CharKind.Newline; - } - else if (!solver.IsEmpty(solver.And(wordLetterPredicate, minterm))) - { - nextCharKind = CharKind.WordLetter; - } - return nextCharKind; - } - - /// - /// Compute the target state for the given input minterm. - /// If is False this means that this is \n and it is the last character of the input. - /// - /// minterm corresponding to some input character or False corresponding to last \n - internal DfaMatchingState Next(TSet minterm) - { - uint nextCharKind = GetNextCharKind(ref minterm); - - // Combined character context - uint context = CharKind.Context(PrevCharKind, nextCharKind); - - // Compute the derivative of the node for the given context - SymbolicRegexNode derivative = Node.CreateDerivativeWithoutEffects(minterm, context); - - // nextCharKind will be the PrevCharKind of the target state - // use an existing state instead if one exists already - // otherwise create a new new id for it - return Node._builder.CreateState(derivative, nextCharKind, capturing: false); - } - - /// - /// Compute a set of transitions for the given minterm. 
- /// - /// minterm corresponding to some input character or False corresponding to last \n - /// an enumeration of the transitions as pairs of the target state and a list of effects to be applied - internal List<(DfaMatchingState State, DerivativeEffect[] Effects)> NfaNextWithEffects(TSet minterm) - { - uint nextCharKind = GetNextCharKind(ref minterm); - - // Combined character context - uint context = CharKind.Context(PrevCharKind, nextCharKind); - - // Compute the transitions for the given context - List<(SymbolicRegexNode, DerivativeEffect[])> nodesAndEffects = Node.CreateNfaDerivativeWithEffects(minterm, context); - - var list = new List<(DfaMatchingState State, DerivativeEffect[] Effects)>(); - foreach ((SymbolicRegexNode node, DerivativeEffect[]? effects) in nodesAndEffects) - { - // nextCharKind will be the PrevCharKind of the target state - // use an existing state instead if one exists already - // otherwise create a new new id for it - DfaMatchingState state = Node._builder.CreateState(node, nextCharKind, capturing: true); - if (!state.IsDeadend) - list.Add((state, effects)); - } - return list; - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal bool IsNullableFor(uint nextCharKind) - { - Debug.Assert(nextCharKind is 0 or CharKind.BeginningEnd or CharKind.Newline or CharKind.WordLetter or CharKind.NewLineS); - uint context = CharKind.Context(PrevCharKind, nextCharKind); - return Node.IsNullableFor(context); - } - - public override bool Equals(object? obj) => - obj is DfaMatchingState s && PrevCharKind == s.PrevCharKind && Node.Equals(s.Node); - - public override int GetHashCode() => (PrevCharKind, Node).GetHashCode(); - -#if DEBUG - public override string ToString() => - PrevCharKind == 0 ? 
Node.ToString() : - $"({CharKind.DescribePrev(PrevCharKind)},{Node})"; -#endif - } -} diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs new file mode 100644 index 00000000000..38226258df4 --- /dev/null +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs @@ -0,0 +1,118 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Net; + +namespace System.Text.RegularExpressions.Symbolic +{ + /// Captures a state explored during matching. + internal sealed class MatchingState where TSet : IComparable, IEquatable + { + internal MatchingState(SymbolicRegexNode node, uint prevCharKind) + { + Node = node; + PrevCharKind = prevCharKind; + } + + /// The regular expression that labels this state and gives it its semantics. + internal SymbolicRegexNode Node { get; } + + /// + /// The kind of the previous character in the input. The is responsible + /// for ensuring that in all uses of this state this invariant holds by both selecting initial states accordingly + /// and transitioning on each character to states that match that character's kind. + /// + /// + /// Tracking this information is an optimization that allows each transition taken in the matcher to only depend + /// on the next character (and its kind). In general, the transitions from a state with anchors in its pattern + /// depend on both the previous and the next character. Creating distinct states for each kind of the previous + /// character embeds the necessary information about the previous character into the state space of the automaton. 
+ /// However, this does incur a memory overhead due to the duplication of states. For patterns with no anchors + /// this will always be set to , which can reduce the number of states created. + /// + /// The performance effect of this optimization has not been investigated. If this optimization were removed, the + /// transition logic would in turn have to become more complicated for derivatives that depend on the nullability + /// of anchors. Care should be taken to not slow down transitions without anchors involved. + /// + internal uint PrevCharKind { get; } + + /// + /// A unique identifier for this state, which is used in to index into + /// state information and transition arrays. Valid IDs are always >= 1. + /// + internal int Id { get; set; } + + /// Whether this state is known to be a dead end, i.e. no nullable states are reachable from here. + internal bool IsDeadend(ISolver solver) => Node.IsNothing(solver); + + /// + /// Returns the fixed length that any match ending with this state must have, or -1 if there is no such + /// fixed length, . The context is defined + /// by of this state and the given nextCharKind. The node must be nullable here. + /// + internal int FixedLength(uint nextCharKind) + { + Debug.Assert(IsNullableFor(nextCharKind)); + Debug.Assert(CharKind.IsValidCharKind(nextCharKind)); + uint context = CharKind.Context(PrevCharKind, nextCharKind); + return Node.ResolveFixedLength(context); + } + + /// If true then state starts with a ^ or $ or \Z + internal bool StartsWithLineAnchor => Node._info.StartsWithLineAnchor; + + /// + /// Compute the target state for the given input minterm. + /// If is False this means that this is \n and it is the last character of the input. 
+ /// + /// the builder that owns + /// minterm corresponding to some input character or False corresponding to last \n + /// + internal SymbolicRegexNode Next(SymbolicRegexBuilder builder, TSet minterm, uint nextCharKind) + { + // Combined character context + uint context = CharKind.Context(PrevCharKind, nextCharKind); + + // Compute the derivative of the node for the given context + return Node.CreateDerivativeWithoutEffects(builder, minterm, context); + } + + /// + /// Compute a set of transitions for the given minterm. + /// + /// the builder that owns + /// minterm corresponding to some input character or False corresponding to last \n + /// + /// an enumeration of the transitions as pairs of the target state and a list of effects to be applied + internal List<(SymbolicRegexNode Node, DerivativeEffect[] Effects)> NfaNextWithEffects(SymbolicRegexBuilder builder, TSet minterm, uint nextCharKind) + { + // Combined character context + uint context = CharKind.Context(PrevCharKind, nextCharKind); + + // Compute the transitions for the given context + return Node.CreateNfaDerivativeWithEffects(builder, minterm, context); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal bool IsNullableFor(uint nextCharKind) + { + Debug.Assert(CharKind.IsValidCharKind(nextCharKind)); + uint context = CharKind.Context(PrevCharKind, nextCharKind); + return Node.IsNullableFor(context); + } + + public override bool Equals(object? obj) => + obj is MatchingState s && PrevCharKind == s.PrevCharKind && Node.Equals(s.Node); + + public override int GetHashCode() => (PrevCharKind, Node).GetHashCode(); + +#if DEBUG + public override string ToString() => + PrevCharKind == 0 ? 
Node.ToString() : + $"({CharKind.DescribePrev(PrevCharKind)},{Node})"; +#endif + } +} diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs index 8c4fd992f93..857b8d51972 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs @@ -240,12 +240,12 @@ static string UnexpectedNodeType(RegexNode node) SymbolicRegexNode elem = childResult.Count == 1 ? childResult.FirstElement : _builder.CreateConcatAlreadyReversed(childResult); - if (elem.IsNothing) + if (elem.IsNothing(_builder._solver)) { continue; } - or = elem.IsAnyStar ? + or = elem.IsAnyStar(_builder._solver) ? elem : // .* is the absorbing element SymbolicRegexNode.CreateAlternate(_builder, elem, or); } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs index 3ba759e2b04..eceaadd247e 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs @@ -30,43 +30,34 @@ internal sealed class SymbolicRegexBuilder where TSet : IComparable, internal SymbolicRegexNode Epsilon => _epsilon ??= SymbolicRegexNode.CreateEpsilon(this); private SymbolicRegexNode? 
_beginningAnchor; - internal SymbolicRegexNode BeginningAnchor => _beginningAnchor ??= SymbolicRegexNode.CreateBeginEndAnchor(this, SymbolicRegexNodeKind.BeginningAnchor); + internal SymbolicRegexNode BeginningAnchor => _beginningAnchor ??= SymbolicRegexNode.CreateAnchor(this, SymbolicRegexNodeKind.BeginningAnchor); private SymbolicRegexNode? _endAnchor; - internal SymbolicRegexNode EndAnchor => _endAnchor ??= SymbolicRegexNode.CreateBeginEndAnchor(this, SymbolicRegexNodeKind.EndAnchor); + internal SymbolicRegexNode EndAnchor => _endAnchor ??= SymbolicRegexNode.CreateAnchor(this, SymbolicRegexNodeKind.EndAnchor); private SymbolicRegexNode? _endAnchorZ; - internal SymbolicRegexNode EndAnchorZ => _endAnchorZ ??= SymbolicRegexNode.CreateBeginEndAnchor(this, SymbolicRegexNodeKind.EndAnchorZ); + internal SymbolicRegexNode EndAnchorZ => _endAnchorZ ??= SymbolicRegexNode.CreateAnchor(this, SymbolicRegexNodeKind.EndAnchorZ); private SymbolicRegexNode? _endAnchorZReverse; - internal SymbolicRegexNode EndAnchorZReverse => _endAnchorZReverse ??= SymbolicRegexNode.CreateBeginEndAnchor(this, SymbolicRegexNodeKind.EndAnchorZReverse); + internal SymbolicRegexNode EndAnchorZReverse => _endAnchorZReverse ??= SymbolicRegexNode.CreateAnchor(this, SymbolicRegexNodeKind.EndAnchorZReverse); private SymbolicRegexNode? _bolAnchor; - internal SymbolicRegexNode BolAnchor => _bolAnchor ??= SymbolicRegexNode.CreateBeginEndAnchor(this, SymbolicRegexNodeKind.BOLAnchor); + internal SymbolicRegexNode BolAnchor => _bolAnchor ??= SymbolicRegexNode.CreateAnchor(this, SymbolicRegexNodeKind.BOLAnchor); private SymbolicRegexNode? _eolAnchor; - internal SymbolicRegexNode EolAnchor => _eolAnchor ??= SymbolicRegexNode.CreateBeginEndAnchor(this, SymbolicRegexNodeKind.EOLAnchor); + internal SymbolicRegexNode EolAnchor => _eolAnchor ??= SymbolicRegexNode.CreateAnchor(this, SymbolicRegexNodeKind.EOLAnchor); private SymbolicRegexNode? 
_wbAnchor; - internal SymbolicRegexNode BoundaryAnchor => _wbAnchor ??= SymbolicRegexNode.CreateBoundaryAnchor(this, SymbolicRegexNodeKind.BoundaryAnchor); + internal SymbolicRegexNode BoundaryAnchor => _wbAnchor ??= SymbolicRegexNode.CreateAnchor(this, SymbolicRegexNodeKind.BoundaryAnchor); private SymbolicRegexNode? _nwbAnchor; - internal SymbolicRegexNode NonBoundaryAnchor => _nwbAnchor ??= SymbolicRegexNode.CreateBoundaryAnchor(this, SymbolicRegexNodeKind.NonBoundaryAnchor); + internal SymbolicRegexNode NonBoundaryAnchor => _nwbAnchor ??= SymbolicRegexNode.CreateAnchor(this, SymbolicRegexNodeKind.NonBoundaryAnchor); internal TSet _wordLetterForBoundariesSet; internal TSet _newLineSet; - /// Partition of the input space of sets. - internal TSet[]? _minterms; - private readonly Dictionary> _singletonCache = new(); - // states that have been created - internal HashSet> _stateCache = new(); - - // capturing states that have been created - internal HashSet> _capturingStateCache = new(); - /// /// This cache is used in to keep all nodes associated with this builder /// unique. This ensures that reference equality can be used for syntactic equality and that all shared subexpressions @@ -84,7 +75,7 @@ internal sealed class SymbolicRegexBuilder where TSet : IComparable, // matching when simplification rules fail to eliminate the portions being walked over. /// - /// Cache for keyed by: + /// Cache for keyed by: /// -The node to derivate /// -The character or minterm to take the derivative with /// -The surrounding character context @@ -93,7 +84,7 @@ internal sealed class SymbolicRegexBuilder where TSet : IComparable, internal readonly Dictionary<(SymbolicRegexNode, TSet elem, uint context), SymbolicRegexNode> _derivativeCache = new(); /// - /// Cache for keyed by: + /// Cache for keyed by: /// -The node to prune /// -The surrounding character context /// The value is the pruned node. 
@@ -101,74 +92,13 @@ internal sealed class SymbolicRegexBuilder where TSet : IComparable, internal readonly Dictionary<(SymbolicRegexNode, uint), SymbolicRegexNode> _pruneLowerPriorityThanNullabilityCache = new(); /// - /// Cache for keyed by: + /// Cache for keyed by: /// -The node R potentially subsuming S /// -The node S potentially being subsumed by R /// The value indicates if subsumption is known to hold. /// internal readonly Dictionary<(SymbolicRegexNode, SymbolicRegexNode), bool> _subsumptionCache = new(); - /// - /// Maps state ids to states, initial capacity is 1024 states. - /// Each time more states are needed the length is increased by 1024. - /// - internal DfaMatchingState[]? _stateArray; - internal DfaMatchingState[]? _capturingStateArray; - - /// - /// Maps state IDs to context-independent information for all states in . - /// - private ContextIndependentState[] _stateInfo = Array.Empty(); - - /// Context-independent information available for every state. - [Flags] - private enum ContextIndependentState : byte - { - IsInitial = 1, - IsDeadend = 2, - IsNullable = 4, - CanBeNullable = 8, - } - - /// - /// For these "delta" arrays, technically Volatile.Read should be used to read out an element, - /// but in practice that's not needed on the runtimes in use (though that needs to be documented - /// via https://github.com/dotnet/runtime/issues/63474), and use of Volatile.Read is - /// contributing non-trivial overhead (https://github.com/dotnet/runtime/issues/65789). - /// - internal int[]? _delta; - internal List<(DfaMatchingState, DerivativeEffect[])>?[]? _capturingDelta; - private const int InitialStateLimit = 1024; - - /// 1 + Log2(_minterms.Length), the smallest k s.t. 2^k >= minterms.Length + 1 - internal int _mintermsLog; - - /// - /// Maps each NFA state id to the state id of the DfaMatchingState stored in _stateArray. 
- /// This map is used to compactly represent NFA state ids in NFA mode in order to utilize - /// the property that all NFA states are small integers in one interval. - /// The valid entries are 0 to -1. - /// - internal int[] _nfaStateArray = Array.Empty(); - - /// - /// Maps the id of a DfaMatchingState to the NFA state id that it is being identifed with in the NFA. - /// It is the inverse of used entries in _nfaStateArray. - /// The range of this map is 0 to -1. - /// - internal readonly Dictionary _nfaStateArrayInverse = new(); - - /// Gets .Count - internal int NfaStateCount => _nfaStateArrayInverse.Count; - - /// - /// Transition function for NFA transitions in NFA mode. - /// Each NFA entry maps to a list of NFA target states. - /// Each list of target states is without repetitions. - /// If the entry is null then the targets states have not been computed yet. - /// - internal int[]?[] _nfaDelta = Array.Empty(); - /// Create a new symbolic regex builder. internal SymbolicRegexBuilder(ISolver solver, CharSetSolver charSetSolver) { @@ -176,24 +106,6 @@ internal SymbolicRegexBuilder(ISolver solver, CharSetSolver charSetSolver) _charSetSolver = charSetSolver; _solver = solver; - // minterms = null if partition of the solver is undefined and returned as null - _minterms = solver.GetMinterms(); - if (_minterms == null) - { - _mintermsLog = -1; - } - else - { - _stateArray = new DfaMatchingState[InitialStateLimit]; - _capturingStateArray = new DfaMatchingState[InitialStateLimit]; - _stateInfo = new ContextIndependentState[InitialStateLimit]; - - // the extra +1 slot with id minterms.Length is reserved for \Z (last occurrence of \n) - _mintermsLog = BitOperations.Log2((uint)_minterms.Length) + 1; - _delta = new int[InitialStateLimit << _mintermsLog]; - _capturingDelta = new List<(DfaMatchingState, DerivativeEffect[])>[InitialStateLimit << _mintermsLog]; - } - // initialized to False but updated later to the actual condition ony if \b or \B occurs anywhere in the 
regex // this implies that if a regex never uses \b or \B then the character context will never // update the previous character context to distinguish word and nonword letters @@ -213,94 +125,6 @@ internal SymbolicRegexBuilder(ISolver solver, CharSetSolver charSetSolver) _singletonCache[_solver.Full] = _anyChar; } - /// Assign the context-independent information for the given state. - internal void SetStateInfo(int stateId, bool isInitial, bool isDeadend, bool isNullable, bool canBeNullable) - { - Debug.Assert(stateId > 0); - Debug.Assert(!isNullable || canBeNullable); - - ContextIndependentState info = 0; - - if (isInitial) - { - info |= ContextIndependentState.IsInitial; - } - - if (isDeadend) - { - info |= ContextIndependentState.IsDeadend; - } - - if (canBeNullable) - { - info |= ContextIndependentState.CanBeNullable; - if (isNullable) - { - info |= ContextIndependentState.IsNullable; - } - } - - _stateInfo[stateId] = info; - } - - /// Get context-independent information for the given state. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(int stateId) - { - Debug.Assert(stateId > 0); - - ContextIndependentState info = _stateInfo[stateId]; - return ((info & ContextIndependentState.IsInitial) != 0, - (info & ContextIndependentState.IsDeadend) != 0, - (info & ContextIndependentState.IsNullable) != 0, - (info & ContextIndependentState.CanBeNullable) != 0); - } - - /// Lookup the actual minterm based on its ID. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal TSet GetMinterm(int mintermId) - { - TSet[]? minterms = _minterms; - Debug.Assert(minterms is not null); - return (uint)mintermId < (uint)minterms.Length ? 
- minterms[mintermId] : - _solver.Empty; // minterm=False represents \Z - } - - /// Returns the span from that may contain transitions for the given state - internal Span GetDeltasFor(DfaMatchingState state) - { - if (_delta is null || _minterms is null) - { - return default; - } - - int numMinterms = _minterms.Length; - if (state.StartsWithLineAnchor) - { - numMinterms++; - } - - return _delta.AsSpan(state.Id << _mintermsLog, numMinterms); - } - - /// Returns the span from that may contain transitions for the given state - internal Span GetNfaDeltasFor(DfaMatchingState state) - { - if (_nfaDelta is null || _minterms is null || !_nfaStateArrayInverse.TryGetValue(state.Id, out int nfaState)) - { - return default; - } - - int numMinterms = _minterms.Length; - if (state.StartsWithLineAnchor) - { - numMinterms++; - } - - return _nfaDelta.AsSpan(nfaState << _mintermsLog, numMinterms); - } - /// /// Make an alternation of given nodes, simplify by eliminating any regex that accepts no inputs /// @@ -509,224 +333,5 @@ internal SymbolicRegexNode Transform(SymbolicRegexNode n return null; } } - - /// - /// Create a state with given node and previous character context. 
- /// - /// the pattern that this state will represent - /// the kind of the character that led to this state - /// whether to use the separate space of states with capturing transitions or not - /// whether to mark the state as an initial state or not - /// - public DfaMatchingState CreateState(SymbolicRegexNode node, uint prevCharKind, bool capturing = false, bool isInitialState = false) - { - //first prune the anchors in the node - TSet wlbSet = _wordLetterForBoundariesSet; - TSet startSet = node.GetStartSet(); - - //true if the startset of the node overlaps with some wordletter or the node can be nullable - bool contWithWL = node.CanBeNullable || !_solver.IsEmpty(_solver.And(wlbSet, startSet)); - - //true if the startset of the node overlaps with some nonwordletter or the node can be nullable - bool contWithNWL = node.CanBeNullable || !_solver.IsEmpty(_solver.And(_solver.Not(wlbSet), startSet)); - SymbolicRegexNode pruned_node = node.PruneAnchors(prevCharKind, contWithWL, contWithNWL); - var s = new DfaMatchingState(pruned_node, prevCharKind); - if (!(capturing ? _capturingStateCache : _stateCache).TryGetValue(s, out DfaMatchingState? state)) - { - state = MakeNewState(s, capturing, isInitialState); - } - - return state; - } - - private DfaMatchingState MakeNewState(DfaMatchingState state, bool capturing, bool isInitialState) - { - lock (this) - { - HashSet> cache = capturing ? 
_capturingStateCache : _stateCache; - cache.Add(state); // Add to cache first to make 1 the first state ID - state.Id = cache.Count; - - Debug.Assert(_stateArray is not null && _capturingStateArray is not null); - - const int GrowthSize = 1024; - if (capturing) - { - if (state.Id == _capturingStateArray.Length) - { - int newsize = _capturingStateArray.Length + GrowthSize; - Array.Resize(ref _capturingStateArray, newsize); - Array.Resize(ref _capturingDelta, newsize << _mintermsLog); - } - _capturingStateArray[state.Id] = state; - } - else - { - if (state.Id == _stateArray.Length) - { - int newsize = _stateArray.Length + GrowthSize; - Array.Resize(ref _stateArray, newsize); - Array.Resize(ref _delta, newsize << _mintermsLog); - Array.Resize(ref _stateInfo, newsize); - } - _stateArray[state.Id] = state; - SetStateInfo(state.Id, isInitialState, state.IsDeadend, state.Node.IsNullable, state.Node.CanBeNullable); - } - return state; - } - } - - /// - /// Make an NFA state for the given node and previous character kind. - /// - public int CreateNfaState(SymbolicRegexNode node, uint prevCharKind) - { - Debug.Assert(node.Kind != SymbolicRegexNodeKind.Alternate); - - // First make the underlying core state - DfaMatchingState coreState = CreateState(node, prevCharKind); - - if (!_nfaStateArrayInverse.TryGetValue(coreState.Id, out int nfaStateId)) - { - nfaStateId = MakeNewNfaState(coreState.Id); - } - - return nfaStateId; - } - - /// Critical region that creates a new NFA state for the underlying core state - private int MakeNewNfaState(int coreStateId) - { - lock (this) - { - if (NfaStateCount == _nfaStateArray.Length) - { - // TBD: is 1024 reasonable? 
- int newsize = _nfaStateArray.Length + 1024; - Array.Resize(ref _nfaStateArray, newsize); - Array.Resize(ref _nfaDelta, newsize << _mintermsLog); - // TBD: capturing - } - - int nfaStateId = NfaStateCount; - _nfaStateArray[nfaStateId] = coreStateId; - _nfaStateArrayInverse[coreStateId] = nfaStateId; - return nfaStateId; - } - } - - /// Gets the core state Id corresponding to the NFA state - public int GetCoreStateId(int nfaStateId) - { - Debug.Assert(_stateArray is not null); - Debug.Assert(nfaStateId < _nfaStateArray.Length); - Debug.Assert(_nfaStateArray[nfaStateId] < _stateArray.Length); - return _nfaStateArray[nfaStateId]; - } - - /// Gets the core state corresponding to the NFA state - public DfaMatchingState GetCoreState(int nfaStateId) - { - Debug.Assert(_stateArray is not null); - return _stateArray[GetCoreStateId(nfaStateId)]; - } - - /// Critical region for defining a new core transition - public DfaMatchingState CreateNewTransition(DfaMatchingState sourceState, int mintermId, int offset) - { - TryCreateNewTransition(sourceState, mintermId, offset, checkThreshold: false, out DfaMatchingState? nextState); - Debug.Assert(nextState is not null); - return nextState; - } - - /// Gets or creates a new DFA transition. - public bool TryCreateNewTransition( - DfaMatchingState sourceState, int mintermId, int offset, bool checkThreshold, [NotNullWhen(true)] out DfaMatchingState? nextState) - { - Debug.Assert(_delta is not null && _stateArray is not null); - lock (this) - { - Debug.Assert(offset < _delta.Length); - - // check if meanwhile delta[offset] has become defined possibly by another thread - DfaMatchingState? 
targetState = _stateArray[_delta[offset]]; - if (targetState is null) - { - if (checkThreshold && _stateCache.Count >= SymbolicRegexThresholds.NfaThreshold) - { - nextState = null; - return false; - } - - targetState = sourceState.Next(GetMinterm(mintermId)); - Volatile.Write(ref _delta[offset], targetState.Id); - } - - nextState = targetState; - return true; - } - } - - /// Gets or creates a new NFA transition. - public int[] CreateNewNfaTransition(int nfaStateId, int mintermId, int nfaOffset) - { - Debug.Assert(_delta is not null && _stateArray is not null); - lock (this) - { - Debug.Assert(nfaOffset < _nfaDelta.Length); - - // check if meanwhile the nfaoffset has become defined possibly by another thread - int[]? targets = _nfaDelta[nfaOffset]; - if (targets is null) - { - // Create the underlying transition from the core state corresponding to the nfa state - DfaMatchingState coreState = GetCoreState(nfaStateId); - int coreOffset = (coreState.Id << _mintermsLog) | mintermId; - int coreTargetId = _delta[coreOffset]; - DfaMatchingState? coreTarget = coreTargetId > 0 ? - _stateArray[coreTargetId] : CreateNewTransition(coreState, mintermId, coreOffset); - - SymbolicRegexNode node = coreTarget.Node.Kind == SymbolicRegexNodeKind.DisableBacktrackingSimulation ? - coreTarget.Node._left! : coreTarget.Node; - if (node.Kind == SymbolicRegexNodeKind.Alternate) - { - // Create separate NFA states for all members of a disjunction - // Here duplicate NFA states cannot arise because there are no duplicate nodes in the disjunction - List> alts = node.ToList(listKind: SymbolicRegexNodeKind.Alternate); - targets = new int[alts.Count]; - int targetIndex = 0; - foreach (SymbolicRegexNode q in alts) - { - Debug.Assert(!q.IsNothing); - // Re-wrap the element nodes in DisableBacktrackingSimulation if the top level node was too - SymbolicRegexNode targetNode = coreTarget.Node.Kind == SymbolicRegexNodeKind.DisableBacktrackingSimulation ? 
- CreateDisableBacktrackingSimulation(q) : q; - targets[targetIndex++] = CreateNfaState(targetNode, coreTarget.PrevCharKind); - } - Debug.Assert(targetIndex == targets.Length); - } - else if (coreTarget.IsDeadend) - { - // Omit deadend states from the target list of states - // target list being empty means that the NFA state itself is a deadend - targets = Array.Empty(); - } - else - { - // Add the single NFA target state correponding to the core target state - if (!_nfaStateArrayInverse.TryGetValue(coreTarget.Id, out int nfaTargetId)) - { - nfaTargetId = MakeNewNfaState(coreTarget.Id); - } - - targets = new[] { nfaTargetId }; - } - - Volatile.Write(ref _nfaDelta[nfaOffset], targets); - } - - return targets; - } - } } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs index cd333942b49..ff95195292b 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs @@ -1,6 +1,8 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using System.Diagnostics; + namespace System.Text.RegularExpressions.Symbolic { /// Misc information of structural properties of a that is computed bottom up. 
@@ -14,54 +16,34 @@ namespace System.Text.RegularExpressions.Symbolic private const uint StartsWithSomeAnchorMask = 32; private const uint IsHighPriorityNullableMask = 64; private const uint ContainsEffectMask = 128; + private const uint ContainsLineAnchorMask = 256; private readonly uint _info; private SymbolicRegexInfo(uint i) => _info = i; - internal static SymbolicRegexInfo Create( + private static SymbolicRegexInfo Create( bool isAlwaysNullable = false, bool canBeNullable = false, - bool startsWithLineAnchor = false, bool startsWithSomeAnchor = false, bool containsSomeAnchor = false, + bool startsWithLineAnchor = false, bool containsLineAnchor = false, + bool startsWithSomeAnchor = false, bool containsSomeAnchor = false, bool isHighPriorityNullable = false, bool containsEffect = false) { - uint i = 0; - - if (canBeNullable || isAlwaysNullable) - { - i |= CanBeNullableMask; - - if (isAlwaysNullable) - { - i |= IsAlwaysNullableMask; - } - } - - if (containsSomeAnchor || startsWithLineAnchor || startsWithSomeAnchor) - { - i |= ContainsSomeAnchorMask; - - if (startsWithLineAnchor) - { - i |= StartsWithLineAnchorMask; - } - - if (startsWithLineAnchor || startsWithSomeAnchor) - { - i |= StartsWithSomeAnchorMask; - } - } - - if (isHighPriorityNullable) - { - i |= IsHighPriorityNullableMask; - } - - if (containsEffect) - { - i |= ContainsEffectMask; - } - - return new SymbolicRegexInfo(i); + // Assert that the expected implications hold. For example, every node that contains a line anchor + // must also be marked as containing some anchor. + Debug.Assert(!isAlwaysNullable || canBeNullable); + Debug.Assert(!startsWithLineAnchor || containsLineAnchor); + Debug.Assert(!startsWithLineAnchor || startsWithSomeAnchor); + Debug.Assert(!containsLineAnchor || containsSomeAnchor); + Debug.Assert(!startsWithSomeAnchor || containsSomeAnchor); + return new SymbolicRegexInfo( + (isAlwaysNullable ? IsAlwaysNullableMask : 0) | + (canBeNullable ? 
CanBeNullableMask : 0) | + (startsWithLineAnchor ? StartsWithLineAnchorMask : 0) | + (containsLineAnchor ? ContainsLineAnchorMask : 0) | + (startsWithSomeAnchor ? StartsWithSomeAnchorMask : 0) | + (containsSomeAnchor ? ContainsSomeAnchorMask : 0) | + (isHighPriorityNullable ? IsHighPriorityNullableMask : 0) | + (containsEffect ? ContainsEffectMask : 0)); } public bool IsNullable => (_info & IsAlwaysNullableMask) != 0; @@ -70,6 +52,8 @@ namespace System.Text.RegularExpressions.Symbolic public bool StartsWithLineAnchor => (_info & StartsWithLineAnchorMask) != 0; + public bool ContainsLineAnchor => (_info & ContainsLineAnchorMask) != 0; + public bool StartsWithSomeAnchor => (_info & StartsWithSomeAnchorMask) != 0; public bool ContainsSomeAnchor => (_info & ContainsSomeAnchorMask) != 0; @@ -80,6 +64,27 @@ namespace System.Text.RegularExpressions.Symbolic public bool ContainsEffect => (_info & ContainsEffectMask) != 0; + /// + /// Used for any node that acts as an epsilon, i.e., something that always matches the empty string. + /// + public static SymbolicRegexInfo Epsilon() => + Create( + isAlwaysNullable: true, + canBeNullable: true, + isHighPriorityNullable: true); + + /// + /// Used for all anchors. + /// + /// whether this anchor is a line anchor + public static SymbolicRegexInfo Anchor(bool isLineAnchor) => + Create( + canBeNullable: true, + startsWithLineAnchor: isLineAnchor, + containsLineAnchor: isLineAnchor, + startsWithSomeAnchor: true, + containsSomeAnchor: true); + /// /// The alternation remains high priority nullable if the left alternative is so. 
/// All other info properties are the logical disjunction of the resepctive info properties @@ -90,6 +95,7 @@ namespace System.Text.RegularExpressions.Symbolic isAlwaysNullable: left_info.IsNullable || right_info.IsNullable, canBeNullable: left_info.CanBeNullable || right_info.CanBeNullable, startsWithLineAnchor: left_info.StartsWithLineAnchor || right_info.StartsWithLineAnchor, + containsLineAnchor: left_info.ContainsLineAnchor || right_info.ContainsLineAnchor, startsWithSomeAnchor: left_info.StartsWithSomeAnchor || right_info.StartsWithSomeAnchor, containsSomeAnchor: left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor, isHighPriorityNullable: left_info.IsHighPriorityNullable, @@ -105,6 +111,7 @@ namespace System.Text.RegularExpressions.Symbolic isAlwaysNullable: left_info.IsNullable && right_info.IsNullable, canBeNullable: left_info.CanBeNullable && right_info.CanBeNullable, startsWithLineAnchor: left_info.StartsWithLineAnchor || (left_info.CanBeNullable && right_info.StartsWithLineAnchor), + containsLineAnchor: left_info.ContainsLineAnchor || right_info.ContainsLineAnchor, startsWithSomeAnchor: left_info.StartsWithSomeAnchor || (left_info.CanBeNullable && right_info.StartsWithSomeAnchor), containsSomeAnchor: left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor, isHighPriorityNullable: left_info.IsHighPriorityNullable && right_info.IsHighPriorityNullable, diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexKind.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexKind.cs index ea1f6807500..bc01b913f7c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexKind.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexKind.cs @@ -48,7 +48,7 @@ internal enum SymbolicRegexNodeKind /// Effects to be applied when taking a 
transition. /// /// Left child is the pattern itself and the right child is a concatenation of nodes whose effects should be applied. - /// Effect nodes are created in the rule for concatenation in , + /// Effect nodes are created in the rule for concatenation in , /// where they are used to represent additional operations that should be performed in the current position if /// the pattern in the left child is used to match the input. Since these Effect nodes are relative to the current /// position in the input, the effects from the right child must be applied in the transition that the derivative is diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs new file mode 100644 index 00000000000..9912da4da8e --- /dev/null +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -0,0 +1,441 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Threading; + +namespace System.Text.RegularExpressions.Symbolic +{ + internal sealed partial class SymbolicRegexMatcher + { + /// + /// Initial capacity for DFA related arrays. + /// + private const int InitialDfaStateCapacity = 1024; + + /// + /// Minimum capacity for NFA related arrays when the matcher first enters NFA mode. The arrays start out empty, + /// but are resized to this capacity upon first use. + /// + private const int InitialNfaStateCapacity = 64; + + /// + /// Cache for the states that have been created. 
Each state is uniquely identified by its associated + /// and the kind of the previous character. + /// + private readonly Dictionary<(SymbolicRegexNode Node, uint PrevCharKind), MatchingState> _stateCache = new(); + + /// + /// Maps state ids to states, initial capacity is given by . + /// Each time more states are needed the length is doubled. + /// The first valid state is at index 1. + /// + private MatchingState?[] _stateArray; + + /// + /// Maps state IDs to context-independent information for all states in . + /// The first valid entry is at index 1. + /// + private ContextIndependentState[] _stateInfo; + + /// Context-independent information available for every state. + [Flags] + private enum ContextIndependentState : byte + { + IsInitial = 1, + IsDeadend = 2, + IsNullable = 4, + CanBeNullable = 8, + } + + /// + /// The transition function for DFA mode. + /// Each state has a range of consecutive entries for each minterm ID. A range of size 2^L, where L is + /// the number of bits required to represent the largest minterm ID , is reserved + /// for each state. This makes indexing into this array not require a multiplication + /// , but does mean some unused space may be present. + /// The first valid state ID is 1. + /// + /// + /// For these "delta" arrays, technically Volatile.Read should be used to read out an element, + /// but in practice that's not needed on the runtimes in use (though that needs to be documented + /// via https://github.com/dotnet/runtime/issues/63474), and use of Volatile.Read is + /// contributing non-trivial overhead (https://github.com/dotnet/runtime/issues/65789). + /// + private int[] _dfaDelta; + + /// + /// Maps each NFA state id to the state id of the MatchingState stored in _stateArray. + /// This map is used to compactly represent NFA state ids in NFA mode in order to utilize + /// the property that all NFA states are small integers in one interval. + /// The valid entries are 0 to the size of - 1. 
+ /// + private int[] _nfaCoreIdArray = Array.Empty(); + + /// + /// Maps the id of a MatchingState to the NFA state id that it is being identified with in the NFA. + /// It is the inverse of used entries in _nfaCoreIdArray. + /// The range of this map is 0 to its size - 1. + /// + private readonly Dictionary _nfaIdByCoreId = new(); + + /// + /// Transition function for NFA transitions in NFA mode. + /// Each NFA entry maps to a list of NFA target states. + /// Each list of target states is without repetitions. + /// If the entry is null then the target states have not been computed yet. + /// + private int[]?[] _nfaDelta = Array.Empty(); + + /// + /// The transition function for , + /// which is an NFA mode with additional state to track capture start and end positions. + /// Each entry is an array of pairs of target state and effects to be applied when taking the transition. + /// If the entry is null then the transition has not been computed yet. + /// + private (int, DerivativeEffect[])[]?[] _capturingNfaDelta = Array.Empty<(int, DerivativeEffect[])[]?>(); + + /// + /// Implements a version of that is guaranteed to not publish an array before values + /// have been copied over. + /// + /// + /// This may not be strictly necessary for arrays of primitive or reference types (which have atomic + /// reads/writes), as when, e.g., is found to not have an entry the array is checked again + /// after a lock on the matcher has been acquired. However, in a highly threaded use case it still seems better + /// to avoid unnecessarily causing other threads to acquire the lock.
+ /// + private static void ArrayResizeAndVolatilePublish(ref T[] array, int newSize) + { + Debug.Assert(newSize >= array.Length); + T[] newArray = new T[newSize]; + Array.Copy(array, newArray, array.Length); + Volatile.Write(ref array, newArray); + } + + private int DeltaOffset(int stateId, int mintermId) => (stateId << _mintermsLog) | mintermId; + + /// Returns the span from that may contain transitions for the given state + private Span GetDeltasFor(MatchingState state) + { + Debug.Assert(Monitor.IsEntered(this)); + + int numMinterms = _minterms.Length; + if (state.StartsWithLineAnchor) + { + numMinterms++; + } + + return _dfaDelta.AsSpan(state.Id << _mintermsLog, numMinterms); + } + + /// Returns the span from that may contain transitions for the given state + private Span GetNfaDeltasFor(MatchingState state) + { + Debug.Assert(Monitor.IsEntered(this)); + + if (!_nfaIdByCoreId.TryGetValue(state.Id, out int nfaState)) + { + return default; + } + + int numMinterms = _minterms.Length; + if (state.StartsWithLineAnchor) + { + numMinterms++; + } + + return _nfaDelta.AsSpan(nfaState << _mintermsLog, numMinterms); + } + + /// Get context-independent information for the given state. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(int stateId) + { + Debug.Assert(stateId > 0); + + ContextIndependentState info = _stateInfo[stateId]; + return ((info & ContextIndependentState.IsInitial) != 0, + (info & ContextIndependentState.IsDeadend) != 0, + (info & ContextIndependentState.IsNullable) != 0, + (info & ContextIndependentState.CanBeNullable) != 0); + } + + /// + /// Create a state with given node and previous character context. 
+ /// + /// the pattern that this state will represent + /// the kind of the character that led to this state + /// + private MatchingState GetOrCreateState(SymbolicRegexNode node, uint prevCharKind) + { + Debug.Assert(Monitor.IsEntered(this)); + return GetOrCreateState_NoLock(node, prevCharKind); + } + + /// + /// Create a state with given node and previous character context. + /// + /// the pattern that this state will represent + /// the kind of the character that led to this state + /// whether to mark the state as an initial state or not + /// + private MatchingState GetOrCreateState_NoLock(SymbolicRegexNode node, uint prevCharKind, bool isInitialState = false) + { + SymbolicRegexNode prunedNode = node.PruneAnchors(_builder, prevCharKind); + (SymbolicRegexNode Node, uint PrevCharKind) key = (prunedNode, prevCharKind); + if (!_stateCache.TryGetValue(key, out MatchingState? state)) + { + state = new MatchingState(key.Node, key.PrevCharKind); + _stateCache.Add(key, state); // Add to cache first to make 1 the first state ID + state.Id = _stateCache.Count; + + Debug.Assert(_stateArray is not null); + + if (state.Id == _stateArray.Length) + { + // The growth factor 2 matches that of List + int newsize = _stateArray.Length * 2; + ArrayResizeAndVolatilePublish(ref _stateArray, newsize); + ArrayResizeAndVolatilePublish(ref _dfaDelta, newsize << _mintermsLog); + ArrayResizeAndVolatilePublish(ref _stateInfo, newsize); + } + _stateArray[state.Id] = state; + _stateInfo[state.Id] = BuildStateInfo(state.Id, isInitialState, state.IsDeadend(Solver), state.Node.IsNullable, state.Node.CanBeNullable); + } + + return state; + + // Assign the context-independent information for the given state + static ContextIndependentState BuildStateInfo(int stateId, bool isInitial, bool isDeadend, bool isNullable, bool canBeNullable) + { + Debug.Assert(stateId > 0); + Debug.Assert(!isNullable || canBeNullable); + + ContextIndependentState info = 0; + + if (isInitial) + { + info |= 
ContextIndependentState.IsInitial; + } + + if (isDeadend) + { + info |= ContextIndependentState.IsDeadend; + } + + if (canBeNullable) + { + info |= ContextIndependentState.CanBeNullable; + if (isNullable) + { + info |= ContextIndependentState.IsNullable; + } + } + + return info; + } + } + + /// + /// Make an NFA state for the given node and previous character kind. NFA states include a "core state" of a + /// allocated with , + /// which stores the pattern and previous character kind and can be used for creating further NFA transitions. + /// In addition to the ID of the core state, NFA states are allocated a new NFA mode specific ID, which is + /// used to index into NFA mode transition arrays (e.g. ). + /// + /// + /// Using an ID numbering for NFA mode that is separate from DFA mode allows the IDs to be smaller, which saves + /// space both in the NFA mode arrays and in the instances used during matching for + /// sets of NFA states. + /// The core state ID can be looked up by the NFA ID with . + /// + /// the NFA ID of the new state, or null if the state is a dead end + private int? CreateNfaState(SymbolicRegexNode node, uint prevCharKind) + { + Debug.Assert(Monitor.IsEntered(this)); + Debug.Assert(node.Kind != SymbolicRegexNodeKind.Alternate); + + // First make the core state for the node, which is used for creating further transitions out of this state + MatchingState coreState = GetOrCreateState(node, prevCharKind); + + // If the state is a dead end then don't create an NFA state, as dead ends in NFA mode are represented + // as empty lists of states. + if (coreState.IsDeadend(Solver)) + { + return null; + } + + // The NFA state itself is an ID that can be mapped back to the ID of the MatchingState. These NFA states are + // allocated separately from the IDs used in DFA mode to avoid large values, which helps save memory in the + // SparseIntMap data structures used in NFA matching modes. 
+ if (!_nfaIdByCoreId.TryGetValue(coreState.Id, out int nfaStateId)) + { + // No NFA state already exists, so make a new one. NFA state IDs are allocated sequentially from zero by + // giving each new state an ID equal to the number of existing NFA states. + nfaStateId = _nfaIdByCoreId.Count; + + // If the next ID is past the end of the NFA state array, increase the sizes of the NFA arrays + if (nfaStateId == _nfaCoreIdArray.Length) + { + // The growth factor 2 matches that of List + int newsize = Math.Max(_nfaCoreIdArray.Length * 2, InitialNfaStateCapacity); + ArrayResizeAndVolatilePublish(ref _nfaCoreIdArray, newsize); + ArrayResizeAndVolatilePublish(ref _nfaDelta, newsize << _mintermsLog); + ArrayResizeAndVolatilePublish(ref _capturingNfaDelta, newsize << _mintermsLog); + } + + // Store the mapping from NFA state ID to core state ID + Debug.Assert(nfaStateId < _nfaCoreIdArray.Length); + _nfaCoreIdArray[nfaStateId] = coreState.Id; + + // Store the mapping from core state ID to NFA state ID + // Adding an entry here increments the ID that will be given to the next NFA state + _nfaIdByCoreId.Add(coreState.Id, nfaStateId); + } + + return nfaStateId; + } + + /// Gets the corresponding to the given state ID. + private MatchingState GetState(int stateId) + { + Debug.Assert(stateId > 0); + MatchingState? state = _stateArray[stateId]; + Debug.Assert(state is not null); + return state; + } + + /// Gets the core state Id corresponding to the NFA state + private int GetCoreStateId(int nfaStateId) + { + Debug.Assert(nfaStateId < _nfaCoreIdArray.Length); + Debug.Assert(_nfaCoreIdArray[nfaStateId] < _stateArray.Length); + return _nfaCoreIdArray[nfaStateId]; + } + + /// Gets or creates a new DFA transition. + /// This function locks the matcher for safe concurrent use of the + private bool TryCreateNewTransition( + MatchingState sourceState, int mintermId, int offset, bool checkThreshold, [NotNullWhen(true)] out MatchingState? 
nextState) + { + Debug.Assert(offset < _dfaDelta.Length); + + lock (this) + { + // check if meanwhile delta[offset] has become defined possibly by another thread + MatchingState? targetState = _stateArray[_dfaDelta[offset]]; + if (targetState is null) + { + if (checkThreshold && _stateCache.Count >= SymbolicRegexThresholds.NfaThreshold) + { + nextState = null; + return false; + } + + TSet minterm = GetMintermFromId(mintermId); + uint nextCharKind = GetPositionKind(mintermId); + targetState = GetOrCreateState(sourceState.Next(_builder, minterm, nextCharKind), nextCharKind); + Volatile.Write(ref _dfaDelta[offset], targetState.Id); + } + + nextState = targetState; + return true; + } + } + + /// Gets or creates a new NFA transition. + /// This function locks the matcher for safe concurrent use of the + private int[] CreateNewNfaTransition(int nfaStateId, int mintermId, int nfaOffset) + { + Debug.Assert(nfaOffset < _nfaDelta.Length); + + lock (this) + { + // check if meanwhile the nfaoffset has become defined possibly by another thread + int[]? targets = _nfaDelta[nfaOffset]; + if (targets is null) + { + // Create the underlying transition from the core state corresponding to the nfa state + int coreId = GetCoreStateId(nfaStateId); + int coreOffset = (coreId << _mintermsLog) | mintermId; + int coreTargetId = _dfaDelta[coreOffset]; + MatchingState coreState = GetState(coreId); + TSet minterm = GetMintermFromId(mintermId); + uint nextCharKind = GetPositionKind(mintermId); + SymbolicRegexNode? targetNode = coreTargetId > 0 ? + GetState(coreTargetId).Node : coreState.Next(_builder, minterm, nextCharKind); + + List targetsList = new(); + ForEachNfaState(targetNode, nextCharKind, targetsList, static (int nfaId, List targetsList) => + targetsList.Add(nfaId)); + + targets = targetsList.ToArray(); + Volatile.Write(ref _nfaDelta[nfaOffset], targets); + } + + return targets; + } + } + + /// Gets or creates a new capturing NFA transition. 
+ /// This function locks the matcher for safe concurrent use of the + private (int, DerivativeEffect[])[] CreateNewCapturingTransition(int nfaStateId, int mintermId, int offset) + { + lock (this) + { + // Get the next state if it exists. The caller should have already tried and found it null (not yet created), + // but in the interim another thread could have created it. + (int, DerivativeEffect[])[]? targets = _capturingNfaDelta[offset]; + if (targets is null) + { + MatchingState coreState = GetState(GetCoreStateId(nfaStateId)); + TSet minterm = GetMintermFromId(mintermId); + uint nextCharKind = GetPositionKind(mintermId); + List<(SymbolicRegexNode Node, DerivativeEffect[] Effects)>? transition = coreState.NfaNextWithEffects(_builder, minterm, nextCharKind); + // Build the new state and store it into the array. + List<(int, DerivativeEffect[])> targetsList = new(); + foreach ((SymbolicRegexNode Node, DerivativeEffect[] Effects) entry in transition) + { + ForEachNfaState(entry.Node, nextCharKind, (targetsList, entry.Effects), + static (int nfaId, (List<(int, DerivativeEffect[])> Targets, DerivativeEffect[] Effects) args) => + args.Targets.Add((nfaId, args.Effects))); + } + targets = targetsList.ToArray(); + Volatile.Write(ref _capturingNfaDelta[offset], targets); + } + + return targets; + } + } + + /// + /// Iterates through the alternation branches + /// and tries to create NFA states for each. The supplied action is called for each created NFA state. These never + /// include dead ends as will filter those out. 
+ /// + /// This function locks the matcher for safe concurrent use of the + /// the type of the additional argument passed through to the action + /// the node to break up into NFA states + /// the previous character kind for each created NFA state + /// an additional argument passed through to each call to the action + /// action to call for each NFA state + private void ForEachNfaState(SymbolicRegexNode node, uint prevCharKind, T arg, Action action) + { + lock (this) + { + foreach (SymbolicRegexNode nfaNode in node.EnumerateAlternationBranches(_builder)) + { + if (CreateNfaState(nfaNode, prevCharKind) is int nfaId) + { + action(nfaId, arg); + } + } + } + } + } +} diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Dgml.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Dgml.cs index 1225c4748e4..157fd7d332d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Dgml.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Dgml.cs @@ -16,140 +16,140 @@ internal sealed partial class SymbolicRegexMatcher [ExcludeFromCodeCoverage(Justification = "Currently only used for testing")] public override void SaveDGML(TextWriter writer, int maxLabelLength) { - if (maxLabelLength < 0) - maxLabelLength = int.MaxValue; - - Dictionary<(int Source, int Target), (TSet Rule, List NfaTargets)> transitions = GatherTransitions(_builder); - - writer.WriteLine(""); - writer.WriteLine(""); - writer.WriteLine(" "); - writer.WriteLine(" ", FormatInfo(_builder, transitions.Count)); - writer.WriteLine(" ", FormatInfo(_builder, transitions.Count)); - foreach (DfaMatchingState state in _builder._stateCache) + lock (this) { - string info = CharKind.DescribePrev(state.PrevCharKind); - string deriv = 
WebUtility.HtmlEncode(state.Node.ToString()); - string nodeDgmlView = $"{(info == string.Empty ? info : $"Previous: {info} ")}{(deriv == string.Empty ? "()" : deriv)}"; + if (maxLabelLength < 0) + maxLabelLength = int.MaxValue; + + Dictionary<(int Source, int Target), (TSet Rule, List NfaTargets)> transitions = GatherTransitions(this); - writer.WriteLine(" ", state.Id, nodeDgmlView); - if (_builder.GetStateInfo(state.Id).IsInitial) + writer.WriteLine(""); + writer.WriteLine(""); + writer.WriteLine(" "); + writer.WriteLine(" ", FormatInfo(this, transitions.Count)); + writer.WriteLine(" ", FormatInfo(this, transitions.Count)); + foreach (MatchingState state in _stateCache.Values) { - writer.WriteLine(" "); + string info = CharKind.DescribePrev(state.PrevCharKind); + string deriv = WebUtility.HtmlEncode(state.Node.ToString()); + string nodeDgmlView = $"{(info == string.Empty ? info : $"Previous: {info} ")}{(deriv == string.Empty ? "()" : deriv)}"; + + writer.WriteLine(" ", state.Id, nodeDgmlView); + if (GetStateInfo(state.Id).IsInitial) + { + writer.WriteLine(" "); + } + if (state.Node.CanBeNullable) + { + writer.WriteLine(" "); + } + writer.WriteLine(" "); + writer.WriteLine(" ", state.Id, nodeDgmlView); } - if (state.Node.CanBeNullable) + writer.WriteLine(" "); + writer.WriteLine(" "); + foreach (MatchingState initialState in GetInitialStates(this)) { - writer.WriteLine(" "); + writer.WriteLine(" ", initialState.Id); } - writer.WriteLine(" "); - writer.WriteLine(" ", state.Id, nodeDgmlView); - } - writer.WriteLine(" "); - writer.WriteLine(" "); - foreach (DfaMatchingState initialState in GetInitialStates(this)) - { - Debug.Assert(_builder._stateCache.Contains(initialState)); - writer.WriteLine(" ", initialState.Id); - } - writer.WriteLine(" "); + writer.WriteLine(" "); - foreach (KeyValuePair<(int Source, int Target), (TSet Rule, List NfaTargets)> transition in transitions) - { - string label = DescribeLabel(transition.Value.Rule, _builder); - string info = ""; - if 
(label.Length > maxLabelLength) + foreach (KeyValuePair<(int Source, int Target), (TSet Rule, List NfaTargets)> transition in transitions) { - info = $"FullLabel = \"{label}\" "; - label = string.Concat(label.AsSpan(0, maxLabelLength), ".."); + string label = DescribeLabel(transition.Value.Rule, _builder); + string info = ""; + if (label.Length > maxLabelLength) + { + info = $"FullLabel = \"{label}\" "; + label = string.Concat(label.AsSpan(0, maxLabelLength), ".."); + } + + writer.WriteLine($" "); + // Render NFA transitions as labelless "epsilon" transitions (i.e. ones that don't consume a character) + // from the target of the DFA transition. + foreach (int nfaTarget in transition.Value.NfaTargets) + { + writer.WriteLine($" "); + } } - writer.WriteLine($" "); - // Render NFA transitions as labelless "epsilon" transitions (i.e. ones that don't consume a character) - // from the target of the DFA transition. - foreach (int nfaTarget in transition.Value.NfaTargets) + foreach (MatchingState state in _stateCache.Values) { - writer.WriteLine($" "); + writer.WriteLine(" ", state.Id); } - } - foreach (DfaMatchingState state in _builder._stateCache) - { - writer.WriteLine(" ", state.Id); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(""); } - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" 
"); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(""); - // This function gathers all transitions in the given builder and groups them by (source,destination) state ID - static Dictionary<(int Source, int Target), (TSet Rule, List NfaTargets)> GatherTransitions(SymbolicRegexBuilder builder) + static Dictionary<(int Source, int Target), (TSet Rule, List NfaTargets)> GatherTransitions(SymbolicRegexMatcher matcher) { - Debug.Assert(builder._delta is not null); - Debug.Assert(builder._minterms is not null); Dictionary<(int Source, int Target), (TSet Rule, List NfaTargets)> result = new(); - foreach (DfaMatchingState source in builder._stateCache) + foreach (MatchingState source in matcher._stateCache.Values) { // Get the span of entries in delta that gives the transitions for the different minterms - Span deltas = builder.GetDeltasFor(source); - Span nfaDeltas = builder.GetNfaDeltasFor(source); - Debug.Assert(deltas.Length == builder._minterms.Length); + Span deltas = matcher.GetDeltasFor(source); + Span nfaDeltas = matcher.GetNfaDeltasFor(source); + Debug.Assert(deltas.Length == matcher._minterms.Length); for (int i = 0; i < deltas.Length; ++i) { // negative entries are transitions not explored yet, so skip them @@ -160,7 +160,7 @@ static Dictionary<(int Source, int Target), (TSet Rule, List NfaTargets)> G (int Source, int Target) key = (source.Id, targetId); if (!result.TryGetValue(key, out (TSet Rule, List NfaTargets) entry)) { - entry = (builder._solver.Empty, new List()); + entry = (matcher.Solver.Empty, new List()); } // If this state has an NFA transition for the same 
minterm, then associate // those with the transition. @@ -168,24 +168,24 @@ static Dictionary<(int Source, int Target), (TSet Rule, List NfaTargets)> G { foreach (int nfaTarget in nfaTargets) { - entry.NfaTargets.Add(builder._nfaStateArray[nfaTarget]); + entry.NfaTargets.Add(matcher._nfaCoreIdArray[nfaTarget]); } } // Expand the rule for this minterm - result[key] = (builder._solver.Or(entry.Rule, builder._minterms[i]), entry.NfaTargets); + result[key] = (matcher.Solver.Or(entry.Rule, matcher._minterms[i]), entry.NfaTargets); } } } return result; } - static string FormatInfo(SymbolicRegexBuilder builder, int transitionCount) + static string FormatInfo(SymbolicRegexMatcher matcher, int transitionCount) { StringBuilder sb = new(); - sb.Append($"States = {builder._stateCache.Count} "); + sb.Append($"States = {matcher._stateCache.Count} "); sb.Append($"Transitions = {transitionCount} "); - sb.Append($"Min Terms ({builder._solver.GetMinterms()!.Length}) = ").AppendJoin(',', - DescribeLabels(builder._solver.GetMinterms()!, builder)); + sb.Append($"Min Terms ({matcher.Solver.GetMinterms()!.Length}) = ").AppendJoin(',', + DescribeLabels(matcher.Solver.GetMinterms()!, matcher._builder)); return sb.ToString(); } @@ -200,13 +200,13 @@ static IEnumerable DescribeLabels(IEnumerable labels, SymbolicRege static string DescribeLabel(TSet label, SymbolicRegexBuilder builder) => WebUtility.HtmlEncode(builder._solver.PrettyPrint(label, builder._charSetSolver)); - static IEnumerable> GetInitialStates(SymbolicRegexMatcher matcher) + static IEnumerable> GetInitialStates(SymbolicRegexMatcher matcher) { - foreach (DfaMatchingState state in matcher._dotstarredInitialStates) + foreach (MatchingState state in matcher._dotstarredInitialStates) yield return state; - foreach (DfaMatchingState state in matcher._initialStates) + foreach (MatchingState state in matcher._initialStates) yield return state; - foreach (DfaMatchingState state in matcher._reverseInitialStates) + foreach (MatchingState 
state in matcher._reverseInitialStates) yield return state; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Explore.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Explore.cs index 6808434ef98..09880c1ad44 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Explore.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Explore.cs @@ -16,89 +16,91 @@ internal sealed partial class SymbolicRegexMatcher [ExcludeFromCodeCoverage(Justification = "Currently only used for testing")] public override void Explore(bool includeDotStarred, bool includeReverse, bool includeOriginal, bool exploreDfa, bool exploreNfa) { - Debug.Assert(_builder._minterms is not null); - - // Track seen states to avoid exploring twice - HashSet> seen = new(); - // Use a queue for unexplored states - // This results in a breadth-first exploration - Queue> toExplore = new(); + lock (this) + { + // Track seen states to avoid exploring twice + HashSet> seen = new(); + // Use a queue for unexplored states + // This results in a breadth-first exploration + Queue> toExplore = new(); - // Explore all initial states as requested - if (includeDotStarred) - EnqueueAll(_dotstarredInitialStates, seen, toExplore); - if (includeReverse) - EnqueueAll(_reverseInitialStates, seen, toExplore); - if (includeOriginal) - EnqueueAll(_initialStates, seen, toExplore); + // Explore all initial states as requested + if (includeDotStarred) + EnqueueAll(_dotstarredInitialStates, seen, toExplore); + if (includeReverse) + EnqueueAll(_reverseInitialStates, seen, toExplore); + if (includeOriginal) + EnqueueAll(_initialStates, seen, toExplore); - if (exploreDfa) - { - while (toExplore.Count > 0) + if (exploreDfa) { - // Don't dequeue yet, because a 
transition might fail - DfaMatchingState state = toExplore.Peek(); - // Include the special minterm for the last end-of-line if the state is sensitive to it - int maxMinterm = state.StartsWithLineAnchor ? _builder._minterms.Length : _builder._minterms.Length - 1; - // Explore successor states for each minterm - for (int mintermId = 0; mintermId <= maxMinterm; ++mintermId) + while (toExplore.Count > 0) { - int offset = (state.Id << _builder._mintermsLog) | mintermId; - if (!_builder.TryCreateNewTransition(state, mintermId, offset, true, out DfaMatchingState? nextState)) - goto DfaLimitReached; - EnqueueIfUnseen(nextState, seen, toExplore); + // Don't dequeue yet, because a transition might fail + MatchingState state = toExplore.Peek(); + // Include the special minterm for the last end-of-line if the state is sensitive to it + int maxMinterm = state.StartsWithLineAnchor ? _minterms!.Length : _minterms!.Length - 1; + // Explore successor states for each minterm + for (int mintermId = 0; mintermId <= maxMinterm; ++mintermId) + { + int offset = DeltaOffset(state.Id, mintermId); + if (!TryCreateNewTransition(state, mintermId, offset, true, out MatchingState? 
nextState)) + goto DfaLimitReached; + EnqueueIfUnseen(nextState, seen, toExplore); + } + // Safe to dequeue now that the state has been completely handled + toExplore.Dequeue(); } - // Safe to dequeue now that the state has been completely handled - toExplore.Dequeue(); } - } - DfaLimitReached: - if (exploreNfa && toExplore.Count > 0) - { - // DFA states are broken up into NFA states when they are alternations - DfaMatchingState[] toBreakUp = toExplore.ToArray(); - toExplore.Clear(); - foreach (DfaMatchingState dfaState in toBreakUp) + DfaLimitReached: + if (exploreNfa && toExplore.Count > 0) { - // Remove state from seen so that it can be added back in if necessary - seen.Remove(dfaState); - // Enqueue all elements of a top level alternation or the state itself - foreach (var element in dfaState.Node.EnumerateAlternationBranches()) + // DFA states are broken up into NFA states when they are alternations + MatchingState[] toBreakUp = toExplore.ToArray(); + toExplore.Clear(); + foreach (MatchingState dfaState in toBreakUp) { - int nfaState = _builder.CreateNfaState(element, dfaState.PrevCharKind); - EnqueueIfUnseen(_builder.GetCoreState(nfaState), seen, toExplore); + // Remove state from seen so that it can be added back in if necessary + seen.Remove(dfaState); + // Enqueue all elements of a top level alternation or the state itself + ForEachNfaState(dfaState.Node, dfaState.PrevCharKind, (this, seen, toExplore), + static (int nfaId, (SymbolicRegexMatcher Matcher, HashSet> Seen, Queue> ToExplore) args) => + { + MatchingState? coreState = args.Matcher.GetState(args.Matcher.GetCoreStateId(nfaId)); + EnqueueIfUnseen(coreState, args.Seen, args.ToExplore); + }); } - } - while (toExplore.Count > 0) - { - // NFA transitions can't fail, so its safe to dequeue here - DfaMatchingState state = toExplore.Dequeue(); - // Include the special minterm for the last end-of-line if the state is sensitive to it - int maxMinterm = state.StartsWithLineAnchor ? 
_builder._minterms.Length : _builder._minterms.Length - 1; - // Explore successor states for each minterm - for (int mintermId = 0; mintermId <= maxMinterm; ++mintermId) + while (toExplore.Count > 0) { - int nfaOffset = (_builder._nfaStateArrayInverse[state.Id] << _builder._mintermsLog) | mintermId; - int[] nextNfaStates = _builder.CreateNewNfaTransition(_builder._nfaStateArrayInverse[state.Id], mintermId, nfaOffset); - foreach (int nextNfaState in nextNfaStates) + // NFA transitions can't fail, so its safe to dequeue here + MatchingState state = toExplore.Dequeue(); + // Include the special minterm for the last end-of-line if the state is sensitive to it + int maxMinterm = state.StartsWithLineAnchor ? _minterms.Length : _minterms.Length - 1; + // Explore successor states for each minterm + for (int mintermId = 0; mintermId <= maxMinterm; ++mintermId) { - EnqueueIfUnseen(_builder.GetCoreState(nextNfaState), seen, toExplore); + int nfaOffset = DeltaOffset(_nfaIdByCoreId[state.Id], mintermId); + int[] nextNfaStates = CreateNewNfaTransition(_nfaIdByCoreId[state.Id], mintermId, nfaOffset); + foreach (int nextNfaState in nextNfaStates) + { + EnqueueIfUnseen(GetState(GetCoreStateId(nextNfaState)), seen, toExplore); + } } } } } - static void EnqueueAll(DfaMatchingState[] states, HashSet> seen, Queue> toExplore) + static void EnqueueAll(MatchingState[] states, HashSet> seen, Queue> toExplore) { - foreach (DfaMatchingState state in states) + foreach (MatchingState state in states) { EnqueueIfUnseen(state, seen, toExplore); } } - static void EnqueueIfUnseen(DfaMatchingState state, HashSet> seen, Queue> queue) + static void EnqueueIfUnseen(MatchingState state, HashSet> seen, Queue> queue) { if (seen.Add(state)) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Sample.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Sample.cs index 
e5040d7b121..dc62647080b 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Sample.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Sample.cs @@ -30,133 +30,134 @@ internal sealed partial class SymbolicRegexMatcher [ExcludeFromCodeCoverage(Justification = "Currently only used for testing")] public override IEnumerable SampleMatches(int k, int randomseed) { - // Zero is treated as no seed, instead using a system provided one - Random random = randomseed != 0 ? new Random(randomseed) : new Random(); - - ISolver solver = _builder._solver; - CharSetSolver charSetSolver = _builder._charSetSolver; + lock (this) + { + // Zero is treated as no seed, instead using a system provided one + Random random = randomseed != 0 ? new Random(randomseed) : new Random(); + CharSetSolver charSetSolver = _builder._charSetSolver; - // Create helper BDDs for handling anchors and preferentially generating ASCII inputs - BDD asciiWordCharacters = charSetSolver.Or(new BDD[] { + // Create helper BDDs for handling anchors and preferentially generating ASCII inputs + BDD asciiWordCharacters = charSetSolver.Or(new BDD[] { charSetSolver.CreateBDDFromRange('A', 'Z'), charSetSolver.CreateBDDFromRange('a', 'z'), charSetSolver.CreateBDDFromChar('_'), charSetSolver.CreateBDDFromRange('0', '9')}); - // Visible ASCII range for input character generation - BDD ascii = charSetSolver.CreateBDDFromRange('\x20', '\x7E'); - BDD asciiNonWordCharacters = charSetSolver.And(ascii, charSetSolver.Not(asciiWordCharacters)); - - // Set up two sets of minterms, one with the additional special minterm for the last end-of-line - Debug.Assert(_builder._minterms is not null); - int[] mintermIdsWithoutZ = new int[_builder._minterms.Length]; - int[] mintermIdsWithZ = new int[_builder._minterms.Length + 1]; - for (int i = 0; i < _builder._minterms.Length; ++i) - { - 
mintermIdsWithoutZ[i] = i; - mintermIdsWithZ[i] = i; - } - mintermIdsWithZ[_builder._minterms.Length] = _builder._minterms.Length; - - for (int i = 0; i < k; i++) - { - // Holds the generated input so far - StringBuilder inputSoFar = new(); - StringBuilder? latestCandidate = null; + // Visible ASCII range for input character generation + BDD ascii = charSetSolver.CreateBDDFromRange('\x20', '\x7E'); + BDD asciiNonWordCharacters = charSetSolver.And(ascii, charSetSolver.Not(asciiWordCharacters)); + + // Set up two sets of minterms, one with the additional special minterm for the last end-of-line + Debug.Assert(_minterms is not null); + int[] mintermIdsWithoutZ = new int[_minterms.Length]; + int[] mintermIdsWithZ = new int[_minterms.Length + 1]; + for (int i = 0; i < _minterms.Length; ++i) + { + mintermIdsWithoutZ[i] = i; + mintermIdsWithZ[i] = i; + } + mintermIdsWithZ[_minterms.Length] = _minterms.Length; - // Current set of states reached initially contains just the root - NfaMatchingState states = new(_builder); - // Here one could also consider previous characters for example for \b, \B, and ^ anchors - // and initialize inputSoFar accordingly - states.InitializeFrom(_initialStates[GetCharKind(ReadOnlySpan.Empty, -1)]); - CurrentState statesWrapper = new(states); + for (int i = 0; i < k; i++) + { + // Holds the generated input so far + StringBuilder inputSoFar = new(); + StringBuilder? 
latestCandidate = null; - // Used for end suffixes - List possibleEndings = new(); + // Current set of states reached initially contains just the root + NfaMatchingState states = new(); + // Here one could also consider previous characters for example for \b, \B, and ^ anchors + // and initialize inputSoFar accordingly + states.InitializeFrom(this, _initialStates[GetCharKind(ReadOnlySpan.Empty, -1)]); + CurrentState statesWrapper = new(states); - while (true) - { - Debug.Assert(states.NfaStateSet.Count > 0); + // Used for end suffixes + List possibleEndings = new(); - // Gather the possible endings for satisfying nullability - possibleEndings.Clear(); - if (NfaStateHandler.CanBeNullable(ref statesWrapper)) + while (true) { - // Unconditionally final state or end of the input due to \Z anchor for example - if (NfaStateHandler.IsNullable(ref statesWrapper) || - NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.BeginningEnd)) - { - possibleEndings.Add(""); - } + Debug.Assert(states.NfaStateSet.Count > 0); - // End of line due to end-of-line anchor - if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.Newline)) + // Gather the possible endings for satisfying nullability + possibleEndings.Clear(); + if (SymbolicRegexMatcher.NfaStateHandler.CanBeNullable(this, in statesWrapper)) { - possibleEndings.Add("\n"); + // Unconditionally final state or end of the input due to \Z anchor for example + if (SymbolicRegexMatcher.NfaStateHandler.IsNullable(this, in statesWrapper) || + SymbolicRegexMatcher.NfaStateHandler.IsNullableFor(this, in statesWrapper, CharKind.BeginningEnd)) + { + possibleEndings.Add(""); + } + + // End of line due to end-of-line anchor + if (SymbolicRegexMatcher.NfaStateHandler.IsNullableFor(this, in statesWrapper, CharKind.Newline)) + { + possibleEndings.Add("\n"); + } + + // Related to wordborder due to \b or \B + if (SymbolicRegexMatcher.NfaStateHandler.IsNullableFor(this, in statesWrapper, CharKind.WordLetter)) + { + 
possibleEndings.Add(ChooseChar(random, asciiWordCharacters, ascii, charSetSolver).ToString()); + } + + // Related to wordborder due to \b or \B + if (SymbolicRegexMatcher.NfaStateHandler.IsNullableFor(this, in statesWrapper, CharKind.General)) + { + possibleEndings.Add(ChooseChar(random, asciiNonWordCharacters, ascii, charSetSolver).ToString()); + } } - // Related to wordborder due to \b or \B - if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.WordLetter)) + // If we have a possible ending, then store a candidate input + if (possibleEndings.Count > 0) { - possibleEndings.Add(ChooseChar(random, asciiWordCharacters, ascii, charSetSolver).ToString()); + latestCandidate ??= new(); + latestCandidate.Clear(); + latestCandidate.Append(inputSoFar); + //Choose some suffix that allows some anchor (if any) to be nullable + latestCandidate.Append(Choose(random, possibleEndings)); + + // Choose to stop here based on a coin-toss + if (FlipBiasedCoin(random, SampleMatchesStoppingProbability)) + { + yield return latestCandidate.ToString(); + break; + } } - // Related to wordborder due to \b or \B - if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.General)) + // Shuffle the minterms, including the last end-of-line marker if appropriate + int[] mintermIds = SymbolicRegexMatcher.NfaStateHandler.StartsWithLineAnchor(this, in statesWrapper) ? 
+ Shuffle(random, mintermIdsWithZ) : + Shuffle(random, mintermIdsWithoutZ); + foreach (int mintermId in mintermIds) { - possibleEndings.Add(ChooseChar(random, asciiNonWordCharacters, ascii, charSetSolver).ToString()); + bool success = SymbolicRegexMatcher.NfaStateHandler.TryTakeTransition(this, ref statesWrapper, mintermId); + Debug.Assert(success); + if (states.NfaStateSet.Count > 0) + { + TSet minterm = GetMintermFromId(mintermId); + // Append a random member of the minterm + inputSoFar.Append(ChooseChar(random, ToBDD(minterm, Solver, charSetSolver), ascii, charSetSolver)); + break; + } + else + { + // The transition was a dead end, undo and continue to try another minterm + NfaStateHandler.UndoTransition(ref statesWrapper); + } } - } - // If we have a possible ending, then store a candidate input - if (possibleEndings.Count > 0) - { - latestCandidate ??= new(); - latestCandidate.Clear(); - latestCandidate.Append(inputSoFar); - //Choose some suffix that allows some anchor (if any) to be nullable - latestCandidate.Append(Choose(random, possibleEndings)); - - // Choose to stop here based on a coin-toss - if (FlipBiasedCoin(random, SampleMatchesStoppingProbability)) + // In the case that there are no next states or input has become too large: stop here + if (states.NfaStateSet.Count == 0 || inputSoFar.Length > SampleMatchesMaxInputLength) { - yield return latestCandidate.ToString(); + // Ending up here without an ending is unlikely but possible for example for infeasible patterns + // such as @"no\bway" or due to poor choice of c -- no anchor is enabled -- so this is a deadend. + if (latestCandidate != null) + { + yield return latestCandidate.ToString(); + } break; } } - - // Shuffle the minterms, including the last end-of-line marker if appropriate - int[] mintermIds = NfaStateHandler.StartsWithLineAnchor(_builder, ref statesWrapper) ? 
- Shuffle(random, mintermIdsWithZ) : - Shuffle(random, mintermIdsWithoutZ); - foreach (int mintermId in mintermIds) - { - bool success = NfaStateHandler.TakeTransition(_builder, ref statesWrapper, mintermId); - Debug.Assert(success); - if (states.NfaStateSet.Count > 0) - { - TSet minterm = _builder.GetMinterm(mintermId); - // Append a random member of the minterm - inputSoFar.Append(ChooseChar(random, ToBDD(minterm, solver, charSetSolver), ascii, charSetSolver)); - break; - } - else - { - // The transition was a dead end, undo and continue to try another minterm - NfaStateHandler.UndoTransition(ref statesWrapper); - } - } - - // In the case that there are no next states or input has become too large: stop here - if (states.NfaStateSet.Count == 0 || inputSoFar.Length > SampleMatchesMaxInputLength) - { - // Ending up here without an ending is unlikely but possible for example for infeasible patterns - // such as @"no\bway" or due to poor choice of c -- no anchor is enabled -- so this is a deadend. 
- if (latestCandidate != null) - { - yield return latestCandidate.ToString(); - } - break; - } } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index beec486dce6..b84df67463f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -5,6 +5,7 @@ using System.Diagnostics; using System.Globalization; using System.IO; +using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Threading; @@ -84,19 +85,31 @@ internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher /// The initial states for the original pattern, keyed off of the previous character kind. /// If the pattern doesn't contain any anchors, there will only be a single initial state. - private readonly DfaMatchingState[] _initialStates; + private readonly MatchingState[] _initialStates; /// The initial states for the dot-star pattern, keyed off of the previous character kind. /// If the pattern doesn't contain any anchors, there will only be a single initial state. - private readonly DfaMatchingState[] _dotstarredInitialStates; + private readonly MatchingState[] _dotstarredInitialStates; /// The initial states for the reverse pattern, keyed off of the previous character kind. /// If the pattern doesn't contain any anchors, there will only be a single initial state. - private readonly DfaMatchingState[] _reverseInitialStates; + private readonly MatchingState[] _reverseInitialStates; - /// Lookup table to quickly determine the character kind for ASCII characters. - /// Non-null iff the pattern contains anchors; otherwise, it's unused. - private readonly uint[]? 
_asciiCharKinds; + /// Partition of the input space of sets. + private readonly TSet[] _minterms; + + /// + /// Character kinds for all minterms in as well as two special + /// cases: character positions outside the input bounds and an end-of-line as the last input character. + /// + private readonly uint[] _positionKinds; + + /// + /// The smallest k s.t. 2^k >= minterms.Length + 1. The "delta arrays", e.g., allocate 2^k + /// consecutive slots for each state ID to represent the transitions for each minterm. The extra slot at index + /// _minterms.Length is used to represent an \n occurring at the very end of input, for supporting the \Z anchor. + /// + private readonly int _mintermsLog; /// Number of capture groups. private readonly int _capsize; @@ -105,14 +118,10 @@ internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher /// This determines whether the matcher uses the special capturing NFA simulation mode. internal bool HasSubcaptures => _capsize > 1; - /// Get the minterm of . - /// character code - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private TSet GetMinterm(int c) - { - Debug.Assert(_builder._minterms is not null); - return _builder._minterms[_mintermClassifier.GetMintermID(c)]; - } + /// + /// Both solvers supported here, and are thread safe. + /// + private ISolver Solver => _builder._solver; /// Creates a new . /// The number of captures in the regular expression. 
@@ -136,25 +145,46 @@ private TSet GetMinterm(int c) _newLineSet = solver.ConvertFromBDD(bddBuilder._newLineSet, charSetSolver) }; - // Convert the BDD-based AST to TSetType-based AST + // Convert the BDD-based AST to TSet-based AST SymbolicRegexNode rootNode = bddBuilder.Transform(rootBddNode, builder, (builder, bdd) => builder._solver.ConvertFromBDD(bdd, charSetSolver)); - return new SymbolicRegexMatcher(rootNode, captureCount, findOptimizations, matchTimeout); + return new SymbolicRegexMatcher(builder, rootNode, captureCount, findOptimizations, matchTimeout); } /// Constructs matcher for given symbolic regex. - private SymbolicRegexMatcher(SymbolicRegexNode rootNode, int captureCount, RegexFindOptimizations findOptimizations, TimeSpan matchTimeout) + private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNode rootNode, int captureCount, RegexFindOptimizations findOptimizations, TimeSpan matchTimeout) { - Debug.Assert(rootNode._builder._solver is UInt64Solver or BitVectorSolver, $"Unsupported solver: {rootNode._builder._solver}"); + Debug.Assert(builder._solver is UInt64Solver or BitVectorSolver, $"Unsupported solver: {builder._solver}"); _pattern = rootNode; - _builder = rootNode._builder; + _builder = builder; _checkTimeout = Regex.InfiniteMatchTimeout != matchTimeout; _timeout = (int)(matchTimeout.TotalMilliseconds + 0.5); // Round up, so it will be at least 1ms - _mintermClassifier = _builder._solver is UInt64Solver bv64 ? + TSet[]? solverMinterms = builder._solver.GetMinterms(); + Debug.Assert(solverMinterms is not null); + _minterms = solverMinterms; + // BitOperations.Log2 gives the integer floor of the log, so the +1 below either rounds up with non-power-of-two + // minterms or adds an extra bit with power-of-two minterms. The extra slot at index _minterms.Length is used to + // represent an \n occurring at the very end of input, for supporting the \Z anchor. 
+ _mintermsLog = BitOperations.Log2((uint)_minterms.Length) + 1; + _mintermClassifier = builder._solver is UInt64Solver bv64 ? bv64._classifier : - ((BitVectorSolver)(object)_builder._solver)._classifier; + ((BitVectorSolver)(object)builder._solver)._classifier; _capsize = captureCount; + // Initialization for fields in SymbolicRegexMatcher.Automata.cs + _stateArray = new MatchingState[InitialDfaStateCapacity]; + _stateInfo = new ContextIndependentState[InitialDfaStateCapacity]; + _dfaDelta = new int[InitialDfaStateCapacity << _mintermsLog]; + + // Initialize a lookup array for the character kinds of each minterm ID. This includes one "special" minterm + // ID _minterms.Length, which is used to represent a \n at the very end of input, and another ID -1, + // which is used to represent any position outside the bounds of the input. + _positionKinds = new uint[_minterms.Length + 2]; + for (int mintermId = -1; mintermId < _positionKinds.Length - 1; mintermId++) + { + _positionKinds[mintermId + 1] = CalculateMintermIdKind(mintermId); + } + // Store the find optimizations that can be used to jump ahead to the next possible starting location. // If there's a leading beginning anchor, the find optimizations are unnecessary on top of the DFA's // handling for beginning anchors. @@ -168,26 +198,28 @@ private SymbolicRegexMatcher(SymbolicRegexNode rootNode, int captureCount, // character kind 0 is ever going to be used for all initial states. int statesCount = _pattern._info.ContainsSomeAnchor ? CharKind.CharKindCount : 1; + // The loops below and how character kinds are calculated assume that the "general" character kind is zero + Debug.Assert(CharKind.General == 0); + // Create the initial states for the original pattern. 
- var initialStates = new DfaMatchingState[statesCount]; - for (uint i = 0; i < initialStates.Length; i++) + var initialStates = new MatchingState[statesCount]; + for (uint charKind = 0; charKind < initialStates.Length; charKind++) { - initialStates[i] = _builder.CreateState(_pattern, i, capturing: HasSubcaptures); + initialStates[charKind] = GetOrCreateState_NoLock(_pattern, charKind); } _initialStates = initialStates; // Create the dot-star pattern (a concatenation of any* with the original pattern) // and all of its initial states. - _dotStarredPattern = _builder.CreateConcat(_builder._anyStarLazy, _pattern); - var dotstarredInitialStates = new DfaMatchingState[statesCount]; - for (uint i = 0; i < dotstarredInitialStates.Length; i++) + _dotStarredPattern = builder.CreateConcat(builder._anyStarLazy, _pattern); + var dotstarredInitialStates = new MatchingState[statesCount]; + for (uint charKind = 0; charKind < dotstarredInitialStates.Length; charKind++) { // Used to detect if initial state was reentered, // but observe that the behavior from the state may ultimately depend on the previous // input char e.g. possibly causing nullability of \b or \B or of a start-of-line anchor, // in that sense there can be several "versions" (not more than StateCount) of the initial state. - DfaMatchingState state = _builder.CreateState(_dotStarredPattern, i, capturing: false, isInitialState: true); - dotstarredInitialStates[i] = state; + dotstarredInitialStates[charKind] = GetOrCreateState_NoLock(_dotStarredPattern, charKind, isInitialState: true); } _dotstarredInitialStates = dotstarredInitialStates; @@ -195,84 +227,91 @@ private SymbolicRegexMatcher(SymbolicRegexNode rootNode, int captureCount, // initial states. Also disable backtracking simulation to ensure the reverse path from // the final state that was found is followed. Not doing so might cause the earliest // starting point to not be found. 
- _reversePattern = _builder.CreateDisableBacktrackingSimulation(_pattern.Reverse()); - var reverseInitialStates = new DfaMatchingState[statesCount]; - for (uint i = 0; i < reverseInitialStates.Length; i++) + _reversePattern = builder.CreateDisableBacktrackingSimulation(_pattern.Reverse(builder)); + var reverseInitialStates = new MatchingState[statesCount]; + for (uint charKind = 0; charKind < reverseInitialStates.Length; charKind++) { - reverseInitialStates[i] = _builder.CreateState(_reversePattern, i, capturing: false); + reverseInitialStates[charKind] = GetOrCreateState_NoLock(_reversePattern, charKind); } _reverseInitialStates = reverseInitialStates; - // Initialize our fast-lookup for determining the character kind of ASCII characters. - // This is only required when the pattern contains anchors, as otherwise there's only - // ever a single kind used. - if (_pattern._info.ContainsSomeAnchor) + // Maps a minterm ID to a character kind + uint CalculateMintermIdKind(int mintermId) { - var asciiCharKinds = new uint[128]; - for (int i = 0; i < asciiCharKinds.Length; i++) + // Only patterns with anchors use anything except the general kind + if (_pattern._info.ContainsSomeAnchor) { - TSet set; - uint charKind; - - if (i == '\n') + // A minterm ID of -1 represents the positions before the first and after the last character + // in the input. + if (mintermId == -1) { - set = _builder._newLineSet; - charKind = CharKind.Newline; + return CharKind.BeginningEnd; } - else + + // A minterm ID of minterms.Length represents a \n at the very end of input, which is matched + // by the \Z anchor. + if ((uint)mintermId == (uint)_minterms.Length) { - set = _builder._wordLetterForBoundariesSet; - charKind = CharKind.WordLetter; + return CharKind.NewLineS; } - asciiCharKinds[i] = _builder._solver.And(GetMinterm(i), set).Equals(_builder._solver.Empty) ? 
0 : charKind; + TSet minterm = _minterms[mintermId]; + + // Examine the minterm to figure out its character kind + if (_builder._newLineSet.Equals(minterm)) + { + // The minterm is a new line character + return CharKind.Newline; + } + else if (!Solver.IsEmpty(Solver.And(_builder._wordLetterForBoundariesSet, minterm))) + { + Debug.Assert(Solver.IsEmpty(Solver.And(Solver.Not(_builder._wordLetterForBoundariesSet), minterm))); + // The minterm is a subset of word letters as considered by \b and \B + return CharKind.WordLetter; + } } - _asciiCharKinds = asciiCharKinds; + + // All other minterms belong to the general kind + return CharKind.General; } } /// /// Create a PerThreadData with the appropriate parts initialized for this matcher's pattern. /// - internal PerThreadData CreatePerThreadData() => new PerThreadData(_builder, _capsize); + internal PerThreadData CreatePerThreadData() => new PerThreadData(_capsize); - /// Compute the target state for the source state and input[i] character and transition to it. - /// The associated builder. - /// The input text. - /// The index into at which the target character lives. - /// The current state being transitioned from. Upon return it's the new state if the transition succeeded. + /// Look up what is the character kind given a position ID [MethodImpl(MethodImplOptions.AggressiveInlining)] - private bool TryTakeTransition(SymbolicRegexBuilder builder, ReadOnlySpan input, int i, ref CurrentState state) - where TStateHandler : struct, IStateHandler + private uint GetPositionKind(int positionId) => _positionKinds[positionId + 1]; + + /// + /// Lookup the actual minterm based on its ID. Also get its character kind, which is a general categorization of + /// characters used for cheaply deciding the nullability of anchors. 
+ /// + internal TSet GetMintermFromId(int mintermId) { - int c = input[i]; + TSet[] minterms = _minterms; - // Find the minterm, handling the special case for the last \n for states that start with a relevant anchor - int mintermId = c == '\n' && i == input.Length - 1 && TStateHandler.StartsWithLineAnchor(builder, ref state) ? - builder._minterms!.Length : // mintermId = minterms.Length represents an \n at the very end of input - _mintermClassifier.GetMintermID(c); + // A minterm ID of minterms.Length represents a \n at the very end of input, which is matched + // by the \Z anchor. + if ((uint)mintermId >= (uint)minterms.Length) + { + return _builder._newLineSet; + } - return TStateHandler.TakeTransition(builder, ref state, mintermId); + // Otherwise look up the minterm from the array + return minterms[mintermId]; } - private List<(DfaMatchingState, DerivativeEffect[])> CreateNewCapturingTransitions(DfaMatchingState state, TSet minterm, int offset) - { - Debug.Assert(_builder._capturingDelta is not null); - lock (this) - { - // Get the next state if it exists. The caller should have already tried and found it null (not yet created), - // but in the interim another thread could have created it. - List<(DfaMatchingState, DerivativeEffect[])>? p = _builder._capturingDelta[offset]; - if (p is null) - { - // Build the new state and store it into the array. - p = state.NfaNextWithEffects(minterm); - Volatile.Write(ref _builder._capturingDelta[offset], p); - } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private uint GetCharKind(ReadOnlySpan input, int i) + where TInputReader : struct, IInputReader => !_pattern._info.ContainsSomeAnchor ? + CharKind.General : // The previous character kind is irrelevant when anchors are not used. 
+ GetPositionKind(TInputReader.GetPositionId(this, input, i)); - return p; - } - } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsMintermId(int positionId) => positionId >= 0; private void CheckTimeout(long timeoutOccursAt) { @@ -309,12 +348,16 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // the position of the last b: aacaaaabbbc. It additionally records the position of the first a after // the c as the low boundary for the starting position. int matchStartLowBoundary, matchStartLengthMarker; - int matchEnd = (_findOpts is not null, _pattern._info.ContainsSomeAnchor) switch + int matchEnd = (_pattern._info.ContainsLineAnchor, _findOpts is not null, _pattern._info.ContainsSomeAnchor) switch { - (true, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), - (true, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), - (false, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), - (false, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), + (true, true, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), + (true, true, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), + (true, false, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), + (true, false, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), + (false, true, true) => FindEndPosition(input, 
startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), + (false, true, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), + (false, false, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), + (false, false, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), }; // If there wasn't a match, we're done. @@ -345,9 +388,13 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i { Debug.Assert(matchEnd >= startat - 1); matchStart = matchEnd < startat ? - startat : _pattern._info.ContainsSomeAnchor ? - FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData) : - FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData); + startat : (_pattern._info.ContainsLineAnchor, _pattern._info.ContainsSomeAnchor) switch + { + (true, true) => FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData), + (true, false) => FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData), + (false, true) => FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData), + (false, false) => FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData), + }; } // Phase 3: @@ -361,7 +408,9 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i } else { - Registers endRegisters = FindSubcaptures(input, matchStart, matchEnd, perThreadData); + Registers endRegisters = _pattern._info.ContainsLineAnchor ? 
+ FindSubcaptures(input, matchStart, matchEnd, perThreadData) : + FindSubcaptures(input, matchStart, matchEnd, perThreadData); return new SymbolicMatch(matchStart, matchEnd - matchStart, endRegisters.CaptureStarts, endRegisters.CaptureEnds); } } @@ -377,15 +426,15 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i /// /// A one-past-the-end index into input for the preferred match, or first final state position if isMatch is true, or NoMatchExists if no match exists. /// - private int FindEndPosition(ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, out int initialStatePos, out int matchLength, PerThreadData perThreadData) + private int FindEndPosition(ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, out int initialStatePos, out int matchLength, PerThreadData perThreadData) + where TInputReader : struct, IInputReader where TFindOptimizationsHandler : struct, IInitialStateHandler where TNullabilityHandler : struct, INullabilityHandler { initialStatePos = pos; int initialStatePosCandidate = pos; - var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, pos - 1)]); - SymbolicRegexBuilder builder = _builder; + var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, pos - 1)]); int endPos = NoMatchExists; int endStateId = -1; @@ -404,8 +453,8 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i input; bool done = currentState.NfaState is not null ? 
- FindEndPositionDeltas(builder, input, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) : - FindEndPositionDeltas(builder, input, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate); + FindEndPositionDeltas(input, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) : + FindEndPositionDeltas(input, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate); // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or // there is no more input available, then the whole search is done. @@ -421,10 +470,8 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i { // Because there was still more input available, a failure to transition in DFA mode must be the cause // of the early exit. Upgrade to NFA mode. - DfaMatchingState? dfaState = currentState.DfaState(_builder); - Debug.Assert(dfaState is not null); NfaMatchingState nfaState = perThreadData.NfaState; - nfaState.InitializeFrom(dfaState); + nfaState.InitializeFrom(this, GetState(currentState.DfaStateId)); currentState = new CurrentState(nfaState); } @@ -437,7 +484,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // Check whether there's a fixed-length marker for the current state. If there is, we can // use that length to optimize subsequent matching phases. - matchLength = endStateId > 0 ? _builder._stateArray![endStateId].FixedLength(GetCharKind(input, endPos)) : -1; + matchLength = endStateId > 0 ? 
GetState(endStateId).FixedLength(GetCharKind(input, endPos)) : -1; return endPos; } @@ -448,8 +495,8 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i /// /// /// The supplies the actual transitioning logic, controlling whether processing is - /// performed in DFA mode or in NFA mode. However, it expects to be configured to match, - /// so for example if is a , it expects the 's + /// performed in DFA mode or in NFA mode. However, it expects to be configured to match, + /// so for example if is a , it expects the 's /// to be non-negative and its to be null; vice versa for /// . /// @@ -458,15 +505,15 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i /// 0 if iteration completed because we reached an initial state. /// A negative value if iteration completed because we ran out of input or we failed to transition. /// - private bool FindEndPositionDeltas(SymbolicRegexBuilder builder, ReadOnlySpan input, RegexRunnerMode mode, - ref int posRef, ref CurrentState stateRef, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) + private bool FindEndPositionDeltas(ReadOnlySpan input, RegexRunnerMode mode, + ref int posRef, ref CurrentState state, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) where TStateHandler : struct, IStateHandler + where TInputReader : struct, IInputReader where TFindOptimizationsHandler : struct, IInitialStateHandler where TNullabilityHandler : struct, INullabilityHandler { // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning. 
int pos = posRef; - CurrentState state = stateRef; int endPos = endPosRef; int endStateId = endStateIdRef; int initialStatePos = initialStatePosRef; @@ -476,13 +523,13 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // Loop through each character in the input, transitioning from state to state for each. while (true) { - (bool isInitial, bool isDeadend, bool isNullable, bool canBeNullable) = TStateHandler.GetStateInfo(builder, ref state); + (bool isInitial, bool isDeadend, bool isNullable, bool canBeNullable) = TStateHandler.GetStateInfo(this, in state); // Check if currentState represents an initial state. If it does, call into any possible find optimizations // to hopefully more quickly find the next possible starting location. if (isInitial) { - if (!TFindOptimizationsHandler.TryFindNextStartingPosition(this, input, ref state, ref pos)) + if (!TFindOptimizationsHandler.TryFindNextStartingPosition(this, input, ref state, ref pos)) { return true; } @@ -496,12 +543,14 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i return true; } + int positionId = TInputReader.GetPositionId(this, input, pos); + // If the state is nullable for the next character, meaning it accepts the empty string, // we found a potential end state. - if (TNullabilityHandler.IsNullableAt(this, ref state, input, pos, isNullable, canBeNullable)) + if (TNullabilityHandler.IsNullableAt(this, in state, positionId, isNullable, canBeNullable)) { endPos = pos; - endStateId = TStateHandler.ExtractNullableCoreStateId(this, ref state, input, pos); + endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos); initialStatePos = initialStatePosCandidate; // A match is known to exist. If that's all we need to know, we're done. @@ -512,7 +561,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i } // If there is more input available try to transition with the next character. 
- if ((uint)pos >= (uint)input.Length || !TryTakeTransition(builder, input, pos, ref state)) + if (!IsMintermId(positionId) || !TStateHandler.TryTakeTransition(this, ref state, positionId)) { return false; } @@ -525,7 +574,6 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i { // Write back the local copies of the ref values. posRef = pos; - stateRef = state; endPosRef = endPos; endStateIdRef = endStateId; initialStatePosRef = initialStatePos; @@ -546,7 +594,8 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i /// The initial starting location discovered in phase 1, a point we must not walk earlier than. /// Per thread data reused between calls. /// The found starting position for the match. - private int FindStartPosition(ReadOnlySpan input, int i, int matchStartBoundary, PerThreadData perThreadData) + private int FindStartPosition(ReadOnlySpan input, int i, int matchStartBoundary, PerThreadData perThreadData) + where TInputReader : struct, IInputReader where TNullabilityHandler : struct, INullabilityHandler { Debug.Assert(i >= 0, $"{nameof(i)} == {i}"); @@ -555,18 +604,17 @@ private int FindStartPosition(ReadOnlySpan input, int // Get the starting state for the reverse pattern. This depends on previous character (which, because we're // going backwards, is character number i). - var currentState = new CurrentState(_reverseInitialStates[GetCharKind(input, i)]); + var currentState = new CurrentState(_reverseInitialStates[GetCharKind(input, i)]); int lastStart = -1; // invalid sentinel value // Walk backwards to the furthest accepting state of the reverse pattern but no earlier than matchStartBoundary. - SymbolicRegexBuilder builder = _builder; while (true) { // Run the DFA or NFA traversal backwards from the current point using the current state. bool done = currentState.NfaState is not null ? 
- FindStartPositionDeltas(builder, input, ref i, matchStartBoundary, ref currentState, ref lastStart) : - FindStartPositionDeltas(builder, input, ref i, matchStartBoundary, ref currentState, ref lastStart); + FindStartPositionDeltas(input, ref i, matchStartBoundary, ref currentState, ref lastStart) : + FindStartPositionDeltas(input, ref i, matchStartBoundary, ref currentState, ref lastStart); // If we found the starting position, we're done. if (done) @@ -578,10 +626,8 @@ private int FindStartPosition(ReadOnlySpan input, int // if we were unable to transition, which should only happen if we were in DFA mode and exceeded our graph size. // Upgrade to NFA mode and continue. Debug.Assert(i >= matchStartBoundary); - DfaMatchingState? dfaState = currentState.DfaState(_builder); - Debug.Assert(dfaState is not null); NfaMatchingState nfaState = perThreadData.NfaState; - nfaState.InitializeFrom(dfaState); + nfaState.InitializeFrom(this, GetState(currentState.DfaStateId)); currentState = new CurrentState(nfaState); } @@ -594,23 +640,25 @@ private int FindStartPosition(ReadOnlySpan input, int /// starting at , for each character transitioning from one state in the DFA or NFA graph to the next state, /// lazily building out the graph as needed. /// - private bool FindStartPositionDeltas(SymbolicRegexBuilder builder, ReadOnlySpan input, ref int i, int startThreshold, ref CurrentState currentState, ref int lastStart) + private bool FindStartPositionDeltas(ReadOnlySpan input, ref int i, int startThreshold, ref CurrentState state, ref int lastStart) where TStateHandler : struct, IStateHandler + where TInputReader : struct, IInputReader where TNullabilityHandler : struct, INullabilityHandler { // To avoid frequent reads/writes to ref values, make and operate on local copies, which we then copy back once before returning. int pos = i; - CurrentState state = currentState; try { // Loop backwards through each character in the input, transitioning from state to state for each. 
while (true) { - (bool isInitial, bool isDeadend, bool isNullable, bool canBeNullable) = TStateHandler.GetStateInfo(builder, ref state); + (bool isInitial, bool isDeadend, bool isNullable, bool canBeNullable) = TStateHandler.GetStateInfo(this, in state); + + int positionId = TInputReader.GetPositionId(this, input, pos - 1); // If the state accepts the empty string, we found a valid starting position. Record it and keep going, // since we're looking for the earliest one to occur within bounds. - if (TNullabilityHandler.IsNullableAt(this, ref state, input, pos - 1, isNullable, canBeNullable)) + if (TNullabilityHandler.IsNullableAt(this, in state, positionId, isNullable, canBeNullable)) { lastStart = pos; } @@ -624,7 +672,7 @@ private int FindStartPosition(ReadOnlySpan input, int } // Try to transition with the next character, the one before the current position. - if (!TryTakeTransition(builder, input, pos - 1, ref state)) + if (!TStateHandler.TryTakeTransition(this, ref state, positionId)) { // Return false to indicate the search didn't finish. return false; @@ -637,7 +685,6 @@ private int FindStartPosition(ReadOnlySpan input, int finally { // Write back the local copies of the ref values. - currentState = state; i = pos; } } @@ -649,10 +696,11 @@ private int FindStartPosition(ReadOnlySpan input, int /// exclusive end position /// Per thread data reused between calls. /// the final register values, which indicate capture starts and ends - private Registers FindSubcaptures(ReadOnlySpan input, int i, int iEnd, PerThreadData perThreadData) + private Registers FindSubcaptures(ReadOnlySpan input, int i, int iEnd, PerThreadData perThreadData) + where TInputReader : struct, IInputReader { // Pick the correct start state based on previous character kind. 
- DfaMatchingState initialState = _initialStates[GetCharKind(input, i - 1)]; + MatchingState initialState = _initialStates[GetCharKind(input, i - 1)]; Registers initialRegisters = perThreadData.InitialRegisters; @@ -667,52 +715,45 @@ private Registers FindSubcaptures(ReadOnlySpan input, int i, int iEnd, Per SparseIntMap current = perThreadData.Current, next = perThreadData.Next; current.Clear(); next.Clear(); - current.Add(initialState.Id, initialRegisters); - SymbolicRegexBuilder builder = _builder; + ForEachNfaState(initialState.Node, initialState.PrevCharKind, (current, initialRegisters), + static (int nfaId, (SparseIntMap Current, Registers InitialRegisters) args) => + args.Current.Add(nfaId, args.InitialRegisters.Clone())); while ((uint)i < (uint)iEnd) { Debug.Assert(next.Count == 0); - // Read the next character and find its minterm - int c = input[i]; - int normalMintermId = _mintermClassifier.GetMintermID(c); + // i is guaranteed to be within bounds, so the position ID is a minterm ID + int mintermId = TInputReader.GetPositionId(this, input, i); foreach ((int sourceId, Registers sourceRegisters) in current.Values) { - Debug.Assert(builder._capturingStateArray is not null); - DfaMatchingState sourceState = builder._capturingStateArray[sourceId]; - - // Handle the special case for the last \n for states that start with a relevant anchor - int mintermId = c == '\n' && i == input.Length - 1 && sourceState.StartsWithLineAnchor ? - builder._minterms!.Length : // mintermId = minterms.Length represents an \n at the very end of input - normalMintermId; - TSet minterm = builder.GetMinterm(mintermId); - // Get or create the transitions - int offset = (sourceId << builder._mintermsLog) | mintermId; - Debug.Assert(builder._capturingDelta is not null); - List<(DfaMatchingState, DerivativeEffect[])>? transitions = - builder._capturingDelta[offset] ?? 
- CreateNewCapturingTransitions(sourceState, minterm, offset); + int offset = DeltaOffset(sourceId, mintermId); + (int, DerivativeEffect[])[] transitions = _capturingNfaDelta[offset] ?? + CreateNewCapturingTransition(sourceId, mintermId, offset); // Take the transitions in their prioritized order - for (int j = 0; j < transitions.Count; ++j) + for (int j = 0; j < transitions.Length; ++j) { - (DfaMatchingState targetState, DerivativeEffect[] effects) = transitions[j]; - Debug.Assert(!targetState.IsDeadend, "Transitions should not include dead ends."); + (int targetStateId, DerivativeEffect[] effects) = transitions[j]; // Try to add the state and handle the case where it didn't exist before. If the state already // exists, then the transition can be safely ignored, as the existing state was generated by a // higher priority transition. - if (next.Add(targetState.Id, out int index)) + if (next.Add(targetStateId, out int index)) { // Avoid copying the registers on the last transition from this state, reusing the registers instead - Registers newRegisters = j != transitions.Count - 1 ? sourceRegisters.Clone() : sourceRegisters; + Registers newRegisters = j != transitions.Length - 1 ? sourceRegisters.Clone() : sourceRegisters; newRegisters.ApplyEffects(effects, i); - next.Update(index, targetState.Id, newRegisters); - if (targetState.IsNullableFor(GetCharKind(input, i + 1))) + next.Update(index, targetStateId, newRegisters); + + int coreStateId = GetCoreStateId(targetStateId); + (bool isInitial, bool isDeadend, bool isNullable, bool canBeNullable) = GetStateInfo(coreStateId); + Debug.Assert(!isDeadend); + + if (isNullable || (canBeNullable && GetState(coreStateId).IsNullableFor(GetCharKind(input, i + 1)))) { // No lower priority transitions from this or other source states are taken because the // backtracking engines would return the match ending here. 
@@ -732,15 +773,14 @@ private Registers FindSubcaptures(ReadOnlySpan input, int i, int iEnd, Per } Debug.Assert(current.Count > 0); - Debug.Assert(_builder._capturingStateArray is not null); foreach (var (endStateId, endRegisters) in current.Values) { - DfaMatchingState endState = _builder._capturingStateArray[endStateId]; - if (endState.IsNullableFor(GetCharKind(input, iEnd))) + MatchingState endState = GetState(GetCoreStateId(endStateId)); + if (endState.IsNullableFor(GetCharKind(input, iEnd))) { // Apply effects for finishing at the stored end state endState.Node.ApplyEffects((effect, args) => args.Registers.ApplyEffect(effect, args.Pos), - CharKind.Context(endState.PrevCharKind, GetCharKind(input, iEnd)), (Registers: endRegisters, Pos: iEnd)); + CharKind.Context(endState.PrevCharKind, GetCharKind(input, iEnd)), (Registers: endRegisters, Pos: iEnd)); return endRegisters; } } @@ -749,39 +789,6 @@ private Registers FindSubcaptures(ReadOnlySpan input, int i, int iEnd, Per return default; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private uint GetCharKind(ReadOnlySpan input, int i) - { - return !_pattern._info.ContainsSomeAnchor ? - CharKind.General : // The previous character kind is irrelevant when anchors are not used. - GetCharKindWithAnchor(input, i); - - uint GetCharKindWithAnchor(ReadOnlySpan input, int i) - { - Debug.Assert(_asciiCharKinds is not null); - - if ((uint)i >= (uint)input.Length) - { - return CharKind.BeginningEnd; - } - - char nextChar = input[i]; - if (nextChar == '\n') - { - return - _builder._newLineSet.Equals(_builder._solver.Empty) ? 0 : // ignore \n - i == 0 || i == input.Length - 1 ? CharKind.NewLineS : // very first or very last \n. Detection of very first \n is needed for rev(\Z). - CharKind.Newline; - } - - uint[] asciiCharKinds = _asciiCharKinds; - return - nextChar < (uint)asciiCharKinds.Length ? 
asciiCharKinds[nextChar] : - _builder._solver.And(GetMinterm(nextChar), _builder._wordLetterForBoundariesSet).Equals(_builder._solver.Empty) ? 0 : // intersect with the wordletter set to compute the kind of the next character - CharKind.WordLetter; - } - } - /// Stores additional data for tracking capture start and end positions. /// The NFA simulation based third phase has one of these for each current state in the current set of live states. internal struct Registers @@ -867,9 +874,9 @@ internal sealed class PerThreadData /// Registers used for the capturing third phase. public readonly Registers InitialRegisters; - public PerThreadData(SymbolicRegexBuilder builder, int capsize) + public PerThreadData(int capsize) { - NfaState = new NfaMatchingState(builder); + NfaState = new NfaMatchingState(); // Only create data used for capturing mode if there are subcaptures if (capsize > 1) @@ -883,11 +890,9 @@ public PerThreadData(SymbolicRegexBuilder builder, int capsize) /// Stores the state that represents a current state in NFA mode. /// The entire state is composed of a list of individual states. + /// New instances should only be created once per runner. internal sealed class NfaMatchingState { - /// The associated builder used to lazily add new DFA or NFA nodes to the graph. - public readonly SymbolicRegexBuilder Builder; - /// Ordered set used to store the current NFA states. /// The value is unused. The type is used purely for its keys. public SparseIntMap NfaStateSet = new(); @@ -899,24 +904,17 @@ internal sealed class NfaMatchingState /// public SparseIntMap NfaStateSetScratch = new(); - /// Create the instance. - /// New instances should only be created once per runner. - public NfaMatchingState(SymbolicRegexBuilder builder) => Builder = builder; - /// Resets this NFA state to represent the supplied DFA state. + /// /// The DFA state to use to initialize the NFA state. 
- public void InitializeFrom(DfaMatchingState dfaMatchingState) + public void InitializeFrom(SymbolicRegexMatcher matcher, MatchingState dfaMatchingState) { NfaStateSet.Clear(); // If the DFA state is a union of multiple DFA states, loop through all of them // adding an NFA state for each. - foreach (SymbolicRegexNode element in dfaMatchingState.Node.EnumerateAlternationBranches()) - { - // Create (possibly new) NFA states for all the members. - // Add their IDs to the current set of NFA states and into the list. - NfaStateSet.Add(Builder.CreateNfaState(element, dfaMatchingState.PrevCharKind), out _); - } + matcher.ForEachNfaState(dfaMatchingState.Node, dfaMatchingState.PrevCharKind, NfaStateSet, + static (int nfaId, SparseIntMap nfaStateSet) => nfaStateSet.Add(nfaId, out _)); } } @@ -925,7 +923,7 @@ public void InitializeFrom(DfaMatchingState dfaMatchingState) private struct CurrentState { /// Initializes the state as a DFA state. - public CurrentState(DfaMatchingState dfaState) + public CurrentState(MatchingState dfaState) { DfaStateId = dfaState.Id; NfaState = null; @@ -942,51 +940,48 @@ public CurrentState(NfaMatchingState nfaState) public int DfaStateId; /// The NFA state. public NfaMatchingState? NfaState; - - public DfaMatchingState? DfaState(SymbolicRegexBuilder builder) => DfaStateId > 0 ? builder._stateArray![DfaStateId] : null; } /// Represents a set of routines for operating over a . 
private interface IStateHandler { - public static abstract bool StartsWithLineAnchor(SymbolicRegexBuilder builder, ref CurrentState state); - public static abstract bool IsNullableFor(SymbolicRegexBuilder builder, ref CurrentState state, uint nextCharKind); - public static abstract int ExtractNullableCoreStateId(SymbolicRegexMatcher matcher, ref CurrentState state, ReadOnlySpan input, int pos); - public static abstract int FixedLength(SymbolicRegexBuilder builder, ref CurrentState state, uint nextCharKind); - public static abstract bool TakeTransition(SymbolicRegexBuilder builder, ref CurrentState state, int mintermId); - public static abstract (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(SymbolicRegexBuilder builder, ref CurrentState state); + public static abstract bool StartsWithLineAnchor(SymbolicRegexMatcher matcher, in CurrentState state); + public static abstract bool IsNullableFor(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind); + public static abstract int ExtractNullableCoreStateId(SymbolicRegexMatcher matcher, in CurrentState state, ReadOnlySpan input, int pos); + public static abstract int FixedLength(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind); + public static abstract bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId); + public static abstract (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(SymbolicRegexMatcher matcher, in CurrentState state); } /// An for operating over instances configured as DFA states. 
private readonly struct DfaStateHandler : IStateHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool StartsWithLineAnchor(SymbolicRegexBuilder builder, ref CurrentState state) => state.DfaState(builder)!.StartsWithLineAnchor; + public static bool StartsWithLineAnchor(SymbolicRegexMatcher matcher, in CurrentState state) => matcher.GetState(state.DfaStateId).StartsWithLineAnchor; [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool IsNullableFor(SymbolicRegexBuilder builder, ref CurrentState state, uint nextCharKind) => state.DfaState(builder)!.IsNullableFor(nextCharKind); + public static bool IsNullableFor(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind) => matcher.GetState(state.DfaStateId).IsNullableFor(nextCharKind); /// Gets the preferred DFA state for nullability. In DFA mode this is just the state itself. [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int ExtractNullableCoreStateId(SymbolicRegexMatcher matcher, ref CurrentState state, ReadOnlySpan input, int pos) => state.DfaStateId; + public static int ExtractNullableCoreStateId(SymbolicRegexMatcher matcher, in CurrentState state, ReadOnlySpan input, int pos) => state.DfaStateId; /// Gets the length of any fixed-length marker that exists for this state, or -1 if there is none. [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int FixedLength(SymbolicRegexBuilder builder, ref CurrentState state, uint nextCharKind) => state.DfaState(builder)!.FixedLength(nextCharKind); + public static int FixedLength(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind) => matcher.GetState(state.DfaStateId).FixedLength(nextCharKind); /// Take the transition to the next DFA state. 
[MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool TakeTransition(SymbolicRegexBuilder builder, ref CurrentState state, int mintermId) + public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId) { Debug.Assert(state.DfaStateId > 0, $"Expected non-zero {nameof(state.DfaStateId)}."); Debug.Assert(state.NfaState is null, $"Expected null {nameof(state.NfaState)}."); - Debug.Assert(builder._delta is not null); // Use the mintermId for the character being read to look up which state to transition to. // If that state has already been materialized, move to it, and we're done. If that state // hasn't been materialized, try to create it; if we can, move to it, and we're done. - int dfaOffset = (state.DfaStateId << builder._mintermsLog) | mintermId; - int nextStateId = builder._delta[dfaOffset]; + int dfaOffset = matcher.DeltaOffset(state.DfaStateId, mintermId); + int nextStateId = matcher._dfaDelta[dfaOffset]; if (nextStateId > 0) { // There was an existing DFA transition to some state. Move to it and @@ -995,7 +990,7 @@ public static bool TakeTransition(SymbolicRegexBuilder builder, ref Curren return true; } - if (builder.TryCreateNewTransition(state.DfaState(builder)!, mintermId, dfaOffset, checkThreshold: true, out DfaMatchingState? nextState)) + if (matcher.TryCreateNewTransition(matcher.GetState(state.DfaStateId), mintermId, dfaOffset, checkThreshold: true, out MatchingState? nextState)) { // We were able to create a new DFA transition to some state. Move to it and // return that we're still operating as a DFA and can keep going. 
@@ -1014,22 +1009,19 @@ public static bool TakeTransition(SymbolicRegexBuilder builder, ref Curren /// - whether this state may be contextually nullable /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(SymbolicRegexBuilder builder, ref CurrentState state) - { - Debug.Assert(state.DfaStateId > 0); - return builder.GetStateInfo(state.DfaStateId); - } + public static (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(SymbolicRegexMatcher matcher, in CurrentState state) + => matcher.GetStateInfo(state.DfaStateId); } /// An for operating over instances configured as NFA states. private readonly struct NfaStateHandler : IStateHandler { /// Check if any underlying core state starts with a line anchor. - public static bool StartsWithLineAnchor(SymbolicRegexBuilder builder, ref CurrentState state) + public static bool StartsWithLineAnchor(SymbolicRegexMatcher matcher, in CurrentState state) { foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values)) { - if (builder.GetCoreState(nfaState.Key).StartsWithLineAnchor) + if (matcher.GetState(matcher.GetCoreStateId(nfaState.Key)).StartsWithLineAnchor) { return true; } @@ -1039,11 +1031,11 @@ public static bool StartsWithLineAnchor(SymbolicRegexBuilder builder, ref } /// Check if any underlying core state is nullable in the context of the next character kind. 
- public static bool IsNullableFor(SymbolicRegexBuilder builder, ref CurrentState state, uint nextCharKind) + public static bool IsNullableFor(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind) { foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values)) { - if (builder.GetCoreState(nfaState.Key).IsNullableFor(nextCharKind)) + if (matcher.GetState(matcher.GetCoreStateId(nfaState.Key)).IsNullableFor(nextCharKind)) { return true; } @@ -1053,12 +1045,12 @@ public static bool IsNullableFor(SymbolicRegexBuilder builder, ref Current } /// Gets the preferred DFA state for nullability. In DFA mode this is just the state itself. - public static int ExtractNullableCoreStateId(SymbolicRegexMatcher matcher, ref CurrentState state, ReadOnlySpan input, int pos) + public static int ExtractNullableCoreStateId(SymbolicRegexMatcher matcher, in CurrentState state, ReadOnlySpan input, int pos) { - uint nextCharKind = matcher.GetCharKind(input, pos); + uint nextCharKind = matcher.GetCharKind(input, pos); foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values)) { - DfaMatchingState coreState = matcher._builder.GetCoreState(nfaState.Key); + MatchingState coreState = matcher.GetState(matcher.GetCoreStateId(nfaState.Key)); if (coreState.IsNullableFor(nextCharKind)) { return coreState.Id; @@ -1070,11 +1062,11 @@ public static int ExtractNullableCoreStateId(SymbolicRegexMatcher matcher, } /// Gets the length of any fixed-length marker that exists for this state, or -1 if there is none. 
- public static int FixedLength(SymbolicRegexBuilder builder, ref CurrentState state, uint nextCharKind) + public static int FixedLength(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind) { foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values)) { - DfaMatchingState coreState = builder.GetCoreState(nfaState.Key); + MatchingState coreState = matcher.GetState(matcher.GetCoreStateId(nfaState.Key)); if (coreState.IsNullableFor(nextCharKind)) { return coreState.FixedLength(nextCharKind); @@ -1086,7 +1078,7 @@ public static int FixedLength(SymbolicRegexBuilder builder, ref CurrentSta } /// Take the transition to the next NFA state. - public static bool TakeTransition(SymbolicRegexBuilder builder, ref CurrentState state, int mintermId) + public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId) { Debug.Assert(state.DfaStateId < 0, $"Expected negative {nameof(state.DfaStateId)}."); Debug.Assert(state.NfaState is not null, $"Expected non-null {nameof(state.NfaState)}."); @@ -1105,7 +1097,7 @@ public static bool TakeTransition(SymbolicRegexBuilder builder, ref Curren { // We have a single source state. We know its next states are already deduped, // so we can just add them directly to the destination states list. - foreach (int nextState in GetNextStates(sourceStates.Values[0].Key, mintermId, builder)) + foreach (int nextState in GetNextStates(sourceStates.Values[0].Key, mintermId, matcher)) { nextStates.Add(nextState, out _); } @@ -1118,7 +1110,7 @@ public static bool TakeTransition(SymbolicRegexBuilder builder, ref Curren // to the set, then add the known-unique state to the destination list. 
foreach (ref KeyValuePair sourceState in CollectionsMarshal.AsSpan(sourceStates.Values)) { - foreach (int nextState in GetNextStates(sourceState.Key, mintermId, builder)) + foreach (int nextState in GetNextStates(sourceState.Key, mintermId, matcher)) { nextStates.Add(nextState, out _); } @@ -1128,13 +1120,13 @@ public static bool TakeTransition(SymbolicRegexBuilder builder, ref Curren return true; [MethodImpl(MethodImplOptions.AggressiveInlining)] - static int[] GetNextStates(int sourceState, int mintermId, SymbolicRegexBuilder builder) + static int[] GetNextStates(int sourceState, int mintermId, SymbolicRegexMatcher matcher) { // Calculate the offset into the NFA transition table. - int nfaOffset = (sourceState << builder._mintermsLog) | mintermId; + int nfaOffset = matcher.DeltaOffset(sourceState, mintermId); // Get the next NFA state. - return builder._nfaDelta[nfaOffset] ?? builder.CreateNewNfaTransition(sourceState, mintermId, nfaOffset); + return matcher._nfaDelta[nfaOffset] ?? matcher.CreateNewNfaTransition(sourceState, mintermId, nfaOffset); } } @@ -1153,15 +1145,15 @@ static int[] GetNextStates(int sourceState, int mintermId, SymbolicRegexBuilder< /// can transition back to a DFA state. /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(SymbolicRegexBuilder builder, ref CurrentState state) => - (false, state.NfaState!.NfaStateSet.Count == 0, IsNullable(builder, ref state), CanBeNullable(builder, ref state)); + public static (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(SymbolicRegexMatcher matcher, in CurrentState state) => + (false, state.NfaState!.NfaStateSet.Count == 0, IsNullable(matcher, in state), CanBeNullable(matcher, in state)); /// Check if any underlying core state is unconditionally nullable. 
- private static bool IsNullable(SymbolicRegexBuilder builder, ref CurrentState state) + public static bool IsNullable(SymbolicRegexMatcher matcher, in CurrentState state) { foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values)) { - if (builder.GetStateInfo(builder.GetCoreStateId(nfaState.Key)).IsNullable) + if (matcher.GetStateInfo(matcher.GetCoreStateId(nfaState.Key)).IsNullable) { return true; } @@ -1171,11 +1163,11 @@ private static bool IsNullable(SymbolicRegexBuilder builder, ref CurrentSt } /// Check if any underlying core state can be nullable in some context. - private static bool CanBeNullable(SymbolicRegexBuilder builder, ref CurrentState state) + public static bool CanBeNullable(SymbolicRegexMatcher matcher, in CurrentState state) { foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values)) { - if (builder.GetStateInfo(builder.GetCoreStateId(nfaState.Key)).CanBeNullable) + if (matcher.GetStateInfo(matcher.GetCoreStateId(nfaState.Key)).CanBeNullable) { return true; } @@ -1185,10 +1177,10 @@ private static bool CanBeNullable(SymbolicRegexBuilder builder, ref Curren } #if DEBUG - /// Undo a previous call to . + /// Undo a previous call to . public static void UndoTransition(ref CurrentState state) { - Debug.Assert(state.DfaStateId < 0, $"Expected negative {nameof(state.DfaState)}."); + Debug.Assert(state.DfaStateId < 0, $"Expected negative {nameof(state.DfaStateId)}."); Debug.Assert(state.NfaState is not null, $"Expected non-null {nameof(state.NfaState)}."); NfaMatchingState nfaState = state.NfaState!; @@ -1202,37 +1194,43 @@ public static void UndoTransition(ref CurrentState state) // Sanity check: if there are any next states, then there must have been some source states. Debug.Assert(nextStates.Count == 0 || sourceStates.Count > 0); } +#endif + } - /// Check if any underlying core state is unconditionally nullable. 
- public static bool IsNullable(ref CurrentState state) - { - SymbolicRegexBuilder builder = state.NfaState!.Builder; - foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values)) - { - if (builder.GetCoreState(nfaState.Key).Node.IsNullable) - { - return true; - } - } + /// + /// Interface for mapping positions in the input to position IDs, which capture all the information necessary to + /// both take transitions and decide nullability. For positions of valid characters that are handled normally, + /// these IDs coincide with minterm IDs (i.e. indices to ). Positions outside the bounds + /// of the input are mapped to -1. Optionally, an end-of-line as the very last character in the input may be + /// mapped to _minterms.Length for supporting the \Z anchor. + /// + private interface IInputReader + { + public static abstract int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan input, int pos); + } - return false; - } + /// This reader omits the special handling of \n for the \Z anchor. + private readonly struct NoZAnchorInputReader : IInputReader + { + public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan input, int pos) => + (uint)pos >= (uint)input.Length ? -1 : matcher._mintermClassifier.GetMintermID(input[pos]); + } - /// Check if any underlying core state can be nullable. - public static bool CanBeNullable(ref CurrentState state) + /// This reader includes full handling of an \n as the last character of input for the \Z anchor. 
+ private readonly struct FullInputReader : IInputReader + { + public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan input, int pos) { - SymbolicRegexBuilder builder = state.NfaState!.Builder; - foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values)) - { - if (builder.GetCoreState(nfaState.Key).Node.CanBeNullable) - { - return true; - } - } + if ((uint)pos >= (uint)input.Length) + return -1; - return false; + int c = input[pos]; + + // Find the minterm, handling the special case for the last \n for states that start with a relevant anchor + return c == '\n' && pos == input.Length - 1 ? + matcher._minterms.Length : // mintermId = minterms.Length represents an \n at the very end of input + matcher._mintermClassifier.GetMintermID(c); } -#endif } /// @@ -1240,7 +1238,8 @@ public static bool CanBeNullable(ref CurrentState state) /// private interface IInitialStateHandler { - public static abstract bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos); + public static abstract bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) + where TInputReader : struct, IInputReader; } /// @@ -1249,7 +1248,8 @@ private interface IInitialStateHandler private readonly struct NoOptimizationsInitialStateHandler : IInitialStateHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) + public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) + where TInputReader : struct, IInputReader { // return true to indicate that the current position is a possible starting position return true; @@ -1262,7 +1262,8 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matche 
private readonly struct InitialStateFindOptimizationsHandler : IInitialStateHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) + public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) + where TInputReader : struct, IInputReader { // Find the first position that matches with some likely character. if (!matcher._findOpts!.TryFindNextStartingPosition(input, ref pos, 0)) @@ -1273,7 +1274,7 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matche // Update the starting state based on where TryFindNextStartingPosition moved us to. // As with the initial starting state, if it's a dead end, no match exists. - state = new CurrentState(matcher._dotstarredInitialStates[matcher.GetCharKind(input, pos - 1)]); + state = new CurrentState(matcher._dotstarredInitialStates[matcher.GetCharKind(input, pos - 1)]); return true; } } @@ -1283,7 +1284,7 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matche /// private interface INullabilityHandler { - public static abstract bool IsNullableAt(SymbolicRegexMatcher matcher, ref CurrentState state, ReadOnlySpan input, int pos, bool isNullable, bool canBeNullable) + public static abstract bool IsNullableAt(SymbolicRegexMatcher matcher, in CurrentState state, int positionId, bool isNullable, bool canBeNullable) where TStateHandler : struct, IStateHandler; } @@ -1293,7 +1294,7 @@ private interface INullabilityHandler private readonly struct NoAnchorsNullabilityHandler : INullabilityHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool IsNullableAt(SymbolicRegexMatcher matcher, ref CurrentState state, ReadOnlySpan input, int pos, bool isNullable, bool canBeNullable) + public static bool IsNullableAt(SymbolicRegexMatcher matcher, in CurrentState state, int 
positionId, bool isNullable, bool canBeNullable) where TStateHandler : struct, IStateHandler { Debug.Assert(!matcher._pattern._info.ContainsSomeAnchor); @@ -1307,10 +1308,10 @@ public static bool IsNullableAt(SymbolicRegexMatcher matche private readonly struct FullNullabilityHandler : INullabilityHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool IsNullableAt(SymbolicRegexMatcher matcher, ref CurrentState state, ReadOnlySpan input, int pos, bool isNullable, bool canBeNullable) + public static bool IsNullableAt(SymbolicRegexMatcher matcher, in CurrentState state, int positionId, bool isNullable, bool canBeNullable) where TStateHandler : struct, IStateHandler { - return isNullable || (canBeNullable && TStateHandler.IsNullableFor(matcher._builder, ref state, matcher.GetCharKind(input, pos))); + return isNullable || (canBeNullable && TStateHandler.IsNullableFor(matcher, in state, matcher.GetPositionKind(positionId))); } } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs index 849fad5f82e..f3fcc33a2a2 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs @@ -33,24 +33,25 @@ internal sealed class SymbolicRegexNode where TSet : IComparable, IE /// internal const int SubsumptionCheckDepthLimit = 50; - internal readonly SymbolicRegexBuilder _builder; internal readonly SymbolicRegexNodeKind _kind; internal readonly int _lower; internal readonly int _upper; internal readonly TSet? _set; internal readonly SymbolicRegexNode? _left; internal readonly SymbolicRegexNode? 
_right; + internal readonly SymbolicRegexInfo _info; /// /// Caches nullability of this node for any given context (0 <= context < ContextLimit) /// when _info.StartsWithSomeAnchor and _info.CanBeNullable are true. Otherwise the cache is null. /// - private byte[]? _nullabilityCache; + private readonly byte[]? _nullabilityCache; - private TSet _startSet; +#if DEBUG + internal SymbolicRegexBuilder? _debugBuilder; +#endif /// AST node of a symbolic regex - /// the builder /// what kind of node /// left child /// right child @@ -58,9 +59,8 @@ internal sealed class SymbolicRegexNode where TSet : IComparable, IE /// upper boubd of a loop /// singelton set /// misc flags including laziness - private SymbolicRegexNode(SymbolicRegexBuilder builder, SymbolicRegexNodeKind kind, SymbolicRegexNode? left, SymbolicRegexNode? right, int lower, int upper, TSet? set, SymbolicRegexInfo info) + private SymbolicRegexNode(SymbolicRegexNodeKind kind, SymbolicRegexNode? left, SymbolicRegexNode? right, int lower, int upper, TSet? set, SymbolicRegexInfo info) { - _builder = builder; _kind = kind; _left = left; _right = right; @@ -68,7 +68,6 @@ private SymbolicRegexNode(SymbolicRegexBuilder builder, SymbolicRegexNodeK _upper = upper; _set = set; _info = info; - _startSet = ComputeStartSet(); _nullabilityCache = info.StartsWithSomeAnchor && info.CanBeNullable ? new byte[CharKind.ContextLimit] : null; } @@ -78,7 +77,10 @@ private static SymbolicRegexNode Create(SymbolicRegexBuilder builder var key = (kind, left, right, lower, upper, set, info); if (!builder._nodeCache.TryGetValue(key, out SymbolicRegexNode? 
node)) { - node = new SymbolicRegexNode(builder, kind, left, right, lower, upper, set, info); + node = new SymbolicRegexNode(kind, left, right, lower, upper, set, info); +#if DEBUG + node._debugBuilder = builder; +#endif builder._nodeCache[key] = node; } return node; @@ -172,9 +174,6 @@ internal bool CanBeNullable } } - internal SymbolicRegexInfo _info; - - /// /// Converts a list of a given kind, e.g. Concat or Alternate, into an array, /// returns anything else in a singleton array. @@ -331,71 +330,31 @@ bool WithCache(uint context) } /// Returns true if this is equivalent to .* (the node must be eager also) - public bool IsAnyStar + public bool IsAnyStar(ISolver solver) { - get + if (IsStar) { - if (IsStar) - { - Debug.Assert(_left is not null); - if (_left._kind == SymbolicRegexNodeKind.Singleton) - { - Debug.Assert(_left._set is not null); - return !IsLazy && _builder._solver.Full.Equals(_left._set); - } - } - - return false; - } - } - - /// Returns true if this is equivalent to .+ (the node must be eager also) - public bool IsAnyPlus - { - get - { - if (IsPlus) + Debug.Assert(_left is not null); + if (_left._kind == SymbolicRegexNodeKind.Singleton) { - Debug.Assert(_left is not null); - if (_left._kind == SymbolicRegexNodeKind.Singleton) - { - Debug.Assert(_left._set is not null); - return !IsLazy && _builder._solver.Full.Equals(_left._set); - } + Debug.Assert(_left._set is not null); + return !IsLazy && solver.Full.Equals(_left._set); } - - return false; } - } - - /// Returns true if this is equivalent to [\0-\xFFFF] - public bool IsAnyChar - { - get - { - if (_kind == SymbolicRegexNodeKind.Singleton) - { - Debug.Assert(_set is not null); - return _builder._solver.IsFull(_set); - } - return false; - } + return false; } /// Returns true if this is equivalent to [0-[0]] - public bool IsNothing + public bool IsNothing(ISolver solver) { - get + if (_kind == SymbolicRegexNodeKind.Singleton) { - if (_kind == SymbolicRegexNodeKind.Singleton) - { - Debug.Assert(_set 
is not null); - return _builder._solver.IsEmpty(_set); - } - - return false; + Debug.Assert(_set is not null); + return solver.IsEmpty(_set); } + + return false; } /// Returns true iff this is a loop whose lower bound is 0 and upper bound is max @@ -415,39 +374,33 @@ public bool IsNothing #region called only once, in the constructor of SymbolicRegexBuilder internal static SymbolicRegexNode CreateFalse(SymbolicRegexBuilder builder) => - Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Empty, SymbolicRegexInfo.Create()); + Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Empty, default); internal static SymbolicRegexNode CreateTrue(SymbolicRegexBuilder builder) => - Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Full, SymbolicRegexInfo.Create()); + Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Full, default); internal static SymbolicRegexNode CreateFixedLengthMarker(SymbolicRegexBuilder builder, int length) => - Create(builder, SymbolicRegexNodeKind.FixedLengthMarker, null, null, length, -1, default, SymbolicRegexInfo.Create(isAlwaysNullable: true, isHighPriorityNullable: true)); + Create(builder, SymbolicRegexNodeKind.FixedLengthMarker, null, null, length, -1, default, SymbolicRegexInfo.Epsilon()); internal static SymbolicRegexNode CreateEpsilon(SymbolicRegexBuilder builder) => - Create(builder, SymbolicRegexNodeKind.Epsilon, null, null, -1, -1, default, SymbolicRegexInfo.Create(isAlwaysNullable: true, isHighPriorityNullable: true)); + Create(builder, SymbolicRegexNodeKind.Epsilon, null, null, -1, -1, default, SymbolicRegexInfo.Epsilon()); - internal static SymbolicRegexNode CreateBeginEndAnchor(SymbolicRegexBuilder builder, SymbolicRegexNodeKind kind) + internal static SymbolicRegexNode CreateAnchor(SymbolicRegexBuilder builder, SymbolicRegexNodeKind kind) { Debug.Assert(kind is + SymbolicRegexNodeKind.BoundaryAnchor 
or SymbolicRegexNodeKind.NonBoundaryAnchor or SymbolicRegexNodeKind.BeginningAnchor or SymbolicRegexNodeKind.EndAnchor or SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor); - return Create(builder, kind, null, null, -1, -1, default, SymbolicRegexInfo.Create(startsWithSomeAnchor: true, canBeNullable: true, - startsWithLineAnchor: kind is + return Create(builder, kind, null, null, -1, -1, default, SymbolicRegexInfo.Anchor(isLineAnchor: kind is SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor)); } - internal static SymbolicRegexNode CreateBoundaryAnchor(SymbolicRegexBuilder builder, SymbolicRegexNodeKind kind) - { - Debug.Assert(kind is SymbolicRegexNodeKind.BoundaryAnchor or SymbolicRegexNodeKind.NonBoundaryAnchor); - return Create(builder, kind, null, null, -1, -1, default, SymbolicRegexInfo.Create(startsWithSomeAnchor: true, canBeNullable: true)); - } - #endregion internal static SymbolicRegexNode CreateSingleton(SymbolicRegexBuilder builder, TSet set) => - Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, set, SymbolicRegexInfo.Create()); + Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, set, default); internal static SymbolicRegexNode CreateLoop(SymbolicRegexBuilder builder, SymbolicRegexNode body, int lower, int upper, bool isLazy) { @@ -480,10 +433,10 @@ internal static SymbolicRegexNode CreateEffect(SymbolicRegexBuilder } internal static SymbolicRegexNode CreateCaptureStart(SymbolicRegexBuilder builder, int captureNum) => - Create(builder, SymbolicRegexNodeKind.CaptureStart, null, null, captureNum, -1, default, SymbolicRegexInfo.Create(isAlwaysNullable: true, isHighPriorityNullable: true)); + Create(builder, SymbolicRegexNodeKind.CaptureStart, null, null, captureNum, -1, default, SymbolicRegexInfo.Epsilon()); internal static 
SymbolicRegexNode CreateCaptureEnd(SymbolicRegexBuilder builder, int captureNum) => - Create(builder, SymbolicRegexNodeKind.CaptureEnd, null, null, captureNum, -1, default, SymbolicRegexInfo.Create(isAlwaysNullable: true, isHighPriorityNullable: true)); + Create(builder, SymbolicRegexNodeKind.CaptureEnd, null, null, captureNum, -1, default, SymbolicRegexInfo.Epsilon()); internal static SymbolicRegexNode CreateDisableBacktrackingSimulation(SymbolicRegexBuilder builder, SymbolicRegexNode child) => Create(builder, SymbolicRegexNodeKind.DisableBacktrackingSimulation, child, null, -1, -1, default, child._info); @@ -530,7 +483,7 @@ internal static SymbolicRegexNode CreateConcat(SymbolicRegexBuilder /// internal static SymbolicRegexNode CreateAlternate(SymbolicRegexBuilder builder, SymbolicRegexNode left, SymbolicRegexNode right, bool deduplicated = false, bool hintRightLikelySubsumes = false) { - if (left.IsAnyStar || right == builder._nothing || left == right || (left.IsNullable && right.IsEpsilon)) + if (left.IsAnyStar(builder._solver) || right.IsNothing(builder._solver) || left == right || (left.IsNullable && right.IsEpsilon)) return left; if (left == builder._nothing) return right; @@ -541,16 +494,16 @@ internal static SymbolicRegexNode CreateAlternate(SymbolicRegexBuilder tail = right._kind == SymbolicRegexNodeKind.Alternate ? right._right! : builder._nothing; // Simplify away right side if left side subsumes it. For example X?Y|Y|Z would simplify to just X?Y|Z. - if (!hintRightLikelySubsumes && left.Subsumes(head)) + if (!hintRightLikelySubsumes && left.Subsumes(builder, head)) return CreateAlternate(builder, left, tail); // Simplify by folding right side into left side if right side subsumes the left side. For example Y|X?Y|Z // would simplify to X??Y|Z. - if (head.Subsumes(left) && TryFoldAlternation(left, head, out SymbolicRegexNode? result)) + if (head.Subsumes(builder, left) && TryFoldAlternation(builder, left, head, out SymbolicRegexNode? 
result)) return CreateAlternate(builder, result, tail); // This is a repeat of a rule above, but for the case when the hint tells us to try reverse subsumption first. - if (hintRightLikelySubsumes && left.Subsumes(head)) + if (hintRightLikelySubsumes && left.Subsumes(builder, head)) return CreateAlternate(builder, left, tail); // If left is not an Alternate, try to avoid allocation by checking if deduplication is necessary @@ -647,53 +600,54 @@ internal static SymbolicRegexNode CreateAlternate(SymbolicRegexBuilder + /// the builder that owns this node /// the node to check for being subsumed /// the current recursion depth /// - internal bool Subsumes(SymbolicRegexNode other, int depth = 0) + internal bool Subsumes(SymbolicRegexBuilder builder, SymbolicRegexNode other, int depth = 0) { // A node subsumes itself if (this == other) return true; // Nothing has an empty language, which is subsumed by anything - if (other == _builder._nothing) + if (other.IsNothing(builder._solver)) return true; // Early exit if we've gone too deep if (depth >= SubsumptionCheckDepthLimit) return false; - if (_builder._subsumptionCache.TryGetValue((this, other), out bool cached)) + if (builder._subsumptionCache.TryGetValue((this, other), out bool cached)) { return cached; } if (!StackHelper.TryEnsureSufficientExecutionStack()) { - return StackHelper.CallOnEmptyStack(Subsumes, other, depth); + return StackHelper.CallOnEmptyStack(Subsumes, builder, other, depth); } // Try to apply all subsumption rules - bool? subsumes = ApplySubsumptionRules(this, other, depth + 1); + bool? subsumes = ApplySubsumptionRules(builder, this, other, depth + 1); // Cache and return the result if any rule applied if (subsumes.HasValue) { - return (_builder._subsumptionCache[(this, other)] = subsumes.Value); + return (builder._subsumptionCache[(this, other)] = subsumes.Value); } // Assume false if no rule applied return false; - static bool? 
ApplySubsumptionRules(SymbolicRegexNode left, SymbolicRegexNode right, int depth) + static bool? ApplySubsumptionRules(SymbolicRegexBuilder builder, SymbolicRegexNode left, SymbolicRegexNode right, int depth) { // Rule: Effect(X,E) subsumes Y iff X subsumes Y // Effectively this ignores any effects if (left._kind == SymbolicRegexNodeKind.Effect) { Debug.Assert(left._left is not null && left._right is not null); - return left._left.Subsumes(right, depth); + return left._left.Subsumes(builder, right, depth); } // Rule: X subsumes Effect(Y,E) iff X subsumes Y @@ -701,7 +655,7 @@ internal bool Subsumes(SymbolicRegexNode other, int depth = 0) if (right._kind == SymbolicRegexNodeKind.Effect) { Debug.Assert(right._left is not null && right._right is not null); - return left.Subsumes(right._left, depth); + return left.Subsumes(builder, right._left, depth); } // Rule: XY subsumes (X')??Y' if X equals X' and Y subsumes Y' @@ -714,7 +668,7 @@ internal bool Subsumes(SymbolicRegexNode other, int depth = 0) { Debug.Assert(rl._left is not null); if (TrySkipPrefix(left, rl._left, out SymbolicRegexNode? tail)) - return tail.Subsumes(right._right, depth); + return tail.Subsumes(builder, right._right, depth); } } @@ -728,7 +682,7 @@ internal bool Subsumes(SymbolicRegexNode other, int depth = 0) { Debug.Assert(ll._left is not null); if (TrySkipPrefix(right, ll._left, out SymbolicRegexNode? tail)) - return left._right.Subsumes(tail, depth); + return left._right.Subsumes(builder, tail, depth); } } @@ -738,7 +692,7 @@ internal bool Subsumes(SymbolicRegexNode other, int depth = 0) Debug.Assert(left._left is not null && left._right is not null); if (left._left.IsNullable) { - return left._right.Subsumes(right, depth); + return left._right.Subsumes(builder, right, depth); } } @@ -804,18 +758,19 @@ private SymbolicRegexNode UnwrapEffects() /// eliminate the alternation by simplifying to (xyz){0,3}?abc. 
Note that the transformation preserves the priority /// of the shorter "abc" match by making the prefix lazy. /// + /// the builder that owns this node /// the lower priority alternative /// the higher priority alternative /// the folded regex that eliminates alternation, or null if the operation fails /// accumulated effects from the right side /// whether folding was successful - private static bool TryFoldAlternation(SymbolicRegexNode left, SymbolicRegexNode right, [NotNullWhen(true)] out SymbolicRegexNode? result, + private static bool TryFoldAlternation(SymbolicRegexBuilder builder, SymbolicRegexNode left, SymbolicRegexNode right, [NotNullWhen(true)] out SymbolicRegexNode? result, SymbolicRegexNode? rightEffects = null) { // The rules below assume that the right side subsumes the left side - Debug.Assert(right.Subsumes(left)); + Debug.Assert(right.Subsumes(builder, left)); - rightEffects ??= left._builder.Epsilon; + rightEffects ??= builder.Epsilon; // If the sides are equal (ignoring effects) then just return the higher priority left side if (left.UnwrapEffects() == right.UnwrapEffects()) @@ -830,20 +785,20 @@ private SymbolicRegexNode UnwrapEffects() if (left._kind == SymbolicRegexNodeKind.Effect) { Debug.Assert(left._left is not null && left._right is not null); - Debug.Assert(right.Subsumes(left._left)); + Debug.Assert(right.Subsumes(builder, left._left)); // If there are any accumulated effects we don't know how to handle them here. // This shouldn't normally happen because this rule has priority over the rule // for effects on the right side. - if (rightEffects != left._builder.Epsilon) + if (rightEffects != builder.Epsilon) { result = null; return false; } - if (TryFoldAlternation(left._left, right, out SymbolicRegexNode? innerResult, rightEffects)) + if (TryFoldAlternation(builder, left._left, right, out SymbolicRegexNode? 
innerResult, rightEffects)) { - result = CreateEffect(left._builder, innerResult, left._right); + result = CreateEffect(builder, innerResult, left._right); return true; } } @@ -853,19 +808,19 @@ private SymbolicRegexNode UnwrapEffects() if (right._kind == SymbolicRegexNodeKind.Effect) { Debug.Assert(right._left is not null && right._right is not null); - Debug.Assert(right._left.Subsumes(left)); - rightEffects = CreateConcat(left._builder, right._right, rightEffects); - return TryFoldAlternation(left, right._left, out result, rightEffects); + Debug.Assert(right._left.Subsumes(builder, left)); + rightEffects = CreateConcat(builder, right._right, rightEffects); + return TryFoldAlternation(builder, left, right._left, out result, rightEffects); } // If we have Y | XY then this rule will find X and fold to X??Y. if (right._kind == SymbolicRegexNodeKind.Concat) { Debug.Assert(right._left is not null && right._right is not null); - if (right._left.IsNullable && TrySplitConcatSubsumption(left, right, out SymbolicRegexNode? prefix)) + if (right._left.IsNullable && TrySplitConcatSubsumption(builder, left, right, out SymbolicRegexNode? prefix)) { - prefix = CreateEffect(left._builder, prefix, rightEffects); - result = left._builder.CreateConcat(CreateLoop(left._builder, prefix, 0, 1, true), left); + prefix = CreateEffect(builder, prefix, rightEffects); + result = builder.CreateConcat(CreateLoop(builder, prefix, 0, 1, true), left); return true; } } @@ -875,7 +830,7 @@ private SymbolicRegexNode UnwrapEffects() return false; // This rule tries to find a prefix P that the right side has such that right is PR and left is equivalent to R - static bool TrySplitConcatSubsumption(SymbolicRegexNode left, SymbolicRegexNode right, + static bool TrySplitConcatSubsumption(SymbolicRegexBuilder builder, SymbolicRegexNode left, SymbolicRegexNode right, [NotNullWhen(true)] out SymbolicRegexNode? 
prefix) { List> prefixElements = new(); @@ -884,25 +839,25 @@ private SymbolicRegexNode UnwrapEffects() { Debug.Assert(suffix._left is not null && suffix._right is not null); // We maintain a loop invariant that the suffix subsumes the left hand side - Debug.Assert(suffix.Subsumes(left)); + Debug.Assert(suffix.Subsumes(builder, left)); if (suffix == left) { // We found a split, so store the prefix and return success prefixElements.Reverse(); - prefix = left._builder.CreateConcatAlreadyReversed(prefixElements); + prefix = builder.CreateConcatAlreadyReversed(prefixElements); return true; } - else if (suffix._right.Subsumes(left)) + else if (suffix._right.Subsumes(builder, left)) { // The tail of the suffix still subsumes left, so we can extend the prefix prefixElements.Add(suffix._left); suffix = suffix._right; } - else if (left.Subsumes(suffix)) + else if (left.Subsumes(builder, suffix)) { // If left subsumes the suffix, then due to the loop invariant we have equivalence prefixElements.Reverse(); - prefix = left._builder.CreateConcatAlreadyReversed(prefixElements); + prefix = builder.CreateConcatAlreadyReversed(prefixElements); return true; } else @@ -1015,9 +970,10 @@ public int GetFixedLength() /// This function will rebuild concatenations because it pushes the FixedLengthMarker into the rightmost element. /// Due to this this function should not be called on every character. 
/// + /// the builder that owns this node /// accumulater used in the recursion for lengths of paths /// the node with fixed length markers added - public SymbolicRegexNode AddFixedLengthMarkers(int lengthSoFar = 0) + public SymbolicRegexNode AddFixedLengthMarkers(SymbolicRegexBuilder builder, int lengthSoFar = 0) { if (!StackHelper.TryEnsureSufficientExecutionStack()) { @@ -1029,9 +985,9 @@ public SymbolicRegexNode AddFixedLengthMarkers(int lengthSoFar = 0) case SymbolicRegexNodeKind.Alternate: Debug.Assert(_left is not null && _right is not null); // For an Alternate attempt to add markers separately for each element - return CreateAlternate(_builder, - _left.AddFixedLengthMarkers(lengthSoFar), - _right.AddFixedLengthMarkers(lengthSoFar), deduplicated: true); + return CreateAlternate(builder, + _left.AddFixedLengthMarkers(builder, lengthSoFar), + _right.AddFixedLengthMarkers(builder, lengthSoFar), deduplicated: true); case SymbolicRegexNodeKind.Concat: Debug.Assert(_left is not null && _right is not null); @@ -1039,13 +995,13 @@ public SymbolicRegexNode AddFixedLengthMarkers(int lengthSoFar = 0) int leftLength = _left.GetFixedLength(); if (leftLength >= 0) { - return CreateConcat(_builder, _left, _right.AddFixedLengthMarkers(lengthSoFar + leftLength)); + return CreateConcat(builder, _left, _right.AddFixedLengthMarkers(builder, lengthSoFar + leftLength)); } // If the right side is always zero length, then just recurse to the left side int rightLength = _right.GetFixedLength(); if (rightLength == 0) { - return CreateConcat(_builder, _left.AddFixedLengthMarkers(lengthSoFar), _right); + return CreateConcat(builder, _left.AddFixedLengthMarkers(builder, lengthSoFar), _right); } break; @@ -1058,80 +1014,82 @@ public SymbolicRegexNode AddFixedLengthMarkers(int lengthSoFar = 0) // if there is one. int thisLength = GetFixedLength(); return thisLength < 0 ? 
this : - CreateConcat(_builder, this, CreateFixedLengthMarker(_builder, lengthSoFar + thisLength)); + CreateConcat(builder, this, CreateFixedLengthMarker(builder, lengthSoFar + thisLength)); } /// - /// Create a derivative ( and ) and then strip + /// Create a derivative ( and ) and then strip /// effects with . /// This derivative simulates backtracking, i.e. it only considers paths that backtracking would /// take before accepting the empty string for this pattern and returns the pattern ordered in the order backtracking /// would explore paths. For example the derivative of a*ab places a*ab before b, while for a*?ab the order is reversed. /// + /// the builder that owns this node /// given element wrt which the derivative is taken /// immediately surrounding character context that affects nullability of anchors /// the derivative - internal SymbolicRegexNode CreateDerivativeWithoutEffects(TSet elem, uint context) => CreateDerivativeWrapper(elem, context).StripEffects(); + internal SymbolicRegexNode CreateDerivativeWithoutEffects(SymbolicRegexBuilder builder, TSet elem, uint context) => CreateDerivativeWrapper(builder, elem, context).StripEffects(builder); /// - /// Create a derivative ( and ) and then strip + /// Create a derivative ( and ) and then strip /// and map effects for use in NFA simulation with . /// This derivative simulates backtracking, i.e. it only considers paths that backtracking would /// take before accepting the empty string for this pattern and returns the pattern ordered in the order backtracking /// would explore paths. For example the derivative of a*ab places a*ab before b, while for a*?ab the order is reversed. /// /// - /// The differences of this to are that (1) effects (e.g. capture starts and ends) + /// The differences of this to are that (1) effects (e.g. 
capture starts and ends) /// are considered and (2) the different elements that would form a top level union are instead returned as separate /// nodes (paired with their associated effects). This function is meant to be used for NFA simulation, where top level /// unions would be broken up into separate states. /// + /// the builder that owns this node /// given element wrt which the derivative is taken /// immediately surrounding character context that affects nullability of anchors /// the derivative - internal List<(SymbolicRegexNode, DerivativeEffect[])> CreateNfaDerivativeWithEffects(TSet elem, uint context) + internal List<(SymbolicRegexNode, DerivativeEffect[])> CreateNfaDerivativeWithEffects(SymbolicRegexBuilder builder, TSet elem, uint context) { List<(SymbolicRegexNode, DerivativeEffect[])> transitions = new(); - CreateDerivativeWrapper(elem, context).StripAndMapEffects(context, transitions); + CreateDerivativeWrapper(builder, elem, context).StripAndMapEffects(builder, context, transitions); return transitions; } // This wrapper handles the shared top-level concerns of constructing derivatives. 
Namely: // -Unwrapping and rewrapping nodes in DisableBacktrackingSimulation // -When backtracking is being simulated calling into PruneLowerPriorityThanNullability - private SymbolicRegexNode CreateDerivativeWrapper(TSet elem, uint context) + private SymbolicRegexNode CreateDerivativeWrapper(SymbolicRegexBuilder builder, TSet elem, uint context) { if (this._kind == SymbolicRegexNodeKind.DisableBacktrackingSimulation) { // This node kind can only occur at the top level and indicates that backtracking simulation is turned off Debug.Assert(_left is not null); - SymbolicRegexNode derivative = _left.CreateDerivative(elem, context); + SymbolicRegexNode derivative = _left.CreateDerivative(builder, elem, context); // Reinsert the marker that maintains the non-backtracking semantics - return _builder.CreateDisableBacktrackingSimulation(derivative); + return builder.CreateDisableBacktrackingSimulation(derivative); } else { // If this node is nullable for the given context then prune any branches that are less preferred than // just the empty match. This is done in order to maintain backtracking semantics. - SymbolicRegexNode node = IsNullableFor(context) ? PruneLowerPriorityThanNullability(context) : this; - return node.CreateDerivative(elem, context); + SymbolicRegexNode node = IsNullableFor(context) ? PruneLowerPriorityThanNullability(builder, context) : this; + return node.CreateDerivative(builder, elem, context); } } /// Prune this node wrt the given context in order to maintain backtracking semantics. Mimics how backtracking chooses a path. - private SymbolicRegexNode PruneLowerPriorityThanNullability(uint context) + private SymbolicRegexNode PruneLowerPriorityThanNullability(SymbolicRegexBuilder builder, uint context) { //caching pruning to avoid otherwise potential quadratic worst case behavior SymbolicRegexNode? 
prunedNode; (SymbolicRegexNode, uint) key = (this, context); - if (_builder._pruneLowerPriorityThanNullabilityCache.TryGetValue(key, out prunedNode)) + if (builder._pruneLowerPriorityThanNullabilityCache.TryGetValue(key, out prunedNode)) { return prunedNode; } if (!StackHelper.TryEnsureSufficientExecutionStack()) { - return StackHelper.CallOnEmptyStack(PruneLowerPriorityThanNullability, context); + return StackHelper.CallOnEmptyStack(PruneLowerPriorityThanNullability, builder, context); } switch (_kind) @@ -1143,8 +1101,8 @@ private SymbolicRegexNode PruneLowerPriorityThanNullability(uint context) // In a alternation (X|Y) where X is nullable (in the given context), Y must be eliminated. // Thus, taking the higher-priority branch in backtracking that is known to lead to a match // at which point the other branches become irrelevant and must no longer be used. - prunedNode = _left.IsNullableFor(context) ? _left.PruneLowerPriorityThanNullability(context) : - CreateAlternate(_builder, _left, _right.PruneLowerPriorityThanNullability(context), deduplicated: true); + prunedNode = _left.IsNullableFor(context) ? _left.PruneLowerPriorityThanNullability(builder, context) : + CreateAlternate(builder, _left, _right.PruneLowerPriorityThanNullability(builder, context), deduplicated: true); break; case SymbolicRegexNodeKind.Concat: @@ -1159,20 +1117,20 @@ private SymbolicRegexNode PruneLowerPriorityThanNullability(uint context) //e.g. a{0,5}?b{0,5}? reduces to () prunedNode = _left._kind == SymbolicRegexNodeKind.Alternate ? (_left._left!.IsNullableFor(context) ? 
- CreateConcat(_builder, _left._left, _right).PruneLowerPriorityThanNullability(context) : - CreateAlternate(_builder, CreateConcat(_builder, _left._left, _right), CreateConcat(_builder, _left._right!, _right).PruneLowerPriorityThanNullability(context))) : - CreateConcat(_builder, _left.PruneLowerPriorityThanNullability(context), _right.PruneLowerPriorityThanNullability(context)); + CreateConcat(builder, _left._left, _right).PruneLowerPriorityThanNullability(builder, context) : + CreateAlternate(builder, CreateConcat(builder, _left._left, _right), CreateConcat(builder, _left._right!, _right).PruneLowerPriorityThanNullability(builder, context))) : + CreateConcat(builder, _left.PruneLowerPriorityThanNullability(builder, context), _right.PruneLowerPriorityThanNullability(builder, context)); break; case SymbolicRegexNodeKind.Loop when _info.IsLazyLoop && _lower == 0: //lazy nullable loop reduces to (), i.e., the loop body is just forgotten - prunedNode = _builder.Epsilon; + prunedNode = builder.Epsilon; break; case SymbolicRegexNodeKind.Effect: //Effects are maintained and the pruning is propagated to the body of the effect Debug.Assert(_left is not null && _right is not null); - prunedNode = CreateEffect(_builder, _left.PruneLowerPriorityThanNullability(context), _right); + prunedNode = CreateEffect(builder, _left.PruneLowerPriorityThanNullability(builder, context), _right); break; default: @@ -1181,7 +1139,7 @@ private SymbolicRegexNode PruneLowerPriorityThanNullability(uint context) break; } - _builder._pruneLowerPriorityThanNullabilityCache[key] = prunedNode; + builder._pruneLowerPriorityThanNullabilityCache[key] = prunedNode; return prunedNode; } @@ -1205,19 +1163,20 @@ private SymbolicRegexNode PruneLowerPriorityThanNullability(uint context) /// positions for capture starts and ends. 
For example, given a DerivativeEffect for CaptureStart of capture number 0 /// and an input position 5, applying it to a Registers instance is simply assigning the relevant value to 5. /// + /// the builder that owns this node /// given element wrt which the derivative is taken /// immediately surrounding character context that affects nullability of anchors /// the derivative - private SymbolicRegexNode CreateDerivative(TSet elem, uint context) + private SymbolicRegexNode CreateDerivative(SymbolicRegexBuilder builder, TSet elem, uint context) { if (!StackHelper.TryEnsureSufficientExecutionStack()) { - return StackHelper.CallOnEmptyStack(CreateDerivative, elem, context); + return StackHelper.CallOnEmptyStack(CreateDerivative, builder, elem, context); } SymbolicRegexNode? derivative; (SymbolicRegexNode, TSet, uint) key = (this, elem, context); - if (_builder._derivativeCache.TryGetValue(key, out derivative)) + if (builder._derivativeCache.TryGetValue(key, out derivative)) { return derivative; } @@ -1230,14 +1189,14 @@ private SymbolicRegexNode CreateDerivative(TSet elem, uint context) // The following check assumes that either (1) the element and set are minterms, in which case // the element is exactly the set if the intersection is non-empty (satisfiable), or (2) the element is a singleton // set in which case it is fully contained in the set if the intersection is non-empty. - if (!_builder._solver.IsEmpty(_builder._solver.And(elem, _set))) + if (!builder._solver.IsEmpty(builder._solver.And(elem, _set))) { // the sigleton is consumed so the derivative is epsilon - derivative = _builder.Epsilon; + derivative = builder.Epsilon; } else { - derivative = _builder._nothing; + derivative = builder._nothing; } break; } @@ -1250,12 +1209,12 @@ private SymbolicRegexNode CreateDerivative(TSet elem, uint context) { // If the left side is not nullable then the character must be consumed there. // For example, Da(ab) = Da(a)b = b. 
- derivative = _builder.CreateConcat(_left.CreateDerivative(elem, context), _right); + derivative = builder.CreateConcat(_left.CreateDerivative(builder, elem, context), _right); } else { - SymbolicRegexNode leftDerivative = _builder.CreateConcat(_left.CreateDerivative(elem, context), _right); - SymbolicRegexNode rightDerivative = _builder.CreateEffect(_right.CreateDerivative(elem, context), _left); + SymbolicRegexNode leftDerivative = builder.CreateConcat(_left.CreateDerivative(builder, elem, context), _right); + SymbolicRegexNode rightDerivative = builder.CreateEffect(_right.CreateDerivative(builder, elem, context), _left); // If the left alternative is high-priority-nullable then // the priority is to skip left and prioritize rderiv over lderivR // Two examples: suppose elem = a @@ -1268,8 +1227,8 @@ private SymbolicRegexNode CreateDerivative(TSet elem, uint context) // In the second case backtracking would try to continue to follow (ab)* after reading b // This backtracking semantics is effectively being recorded into the order of the alternatives derivative = _left.IsHighPriorityNullableFor(context) ? 
- CreateAlternate(_builder, rightDerivative, leftDerivative, hintRightLikelySubsumes: true) : - CreateAlternate(_builder, leftDerivative, rightDerivative); + CreateAlternate(builder, rightDerivative, leftDerivative, hintRightLikelySubsumes: true) : + CreateAlternate(builder, leftDerivative, rightDerivative); } break; } @@ -1279,10 +1238,10 @@ private SymbolicRegexNode CreateDerivative(TSet elem, uint context) Debug.Assert(_left is not null); Debug.Assert(_upper > 0); - SymbolicRegexNode bodyDerivative = _left.CreateDerivative(elem, context); - if (bodyDerivative.IsNothing) + SymbolicRegexNode bodyDerivative = _left.CreateDerivative(builder, elem, context); + if (bodyDerivative.IsNothing(builder._solver)) { - derivative = _builder._nothing; + derivative = builder._nothing; } else { @@ -1294,7 +1253,7 @@ private SymbolicRegexNode CreateDerivative(TSet elem, uint context) int newlower = _lower == 0 || _lower == int.MaxValue ? _lower : _lower - 1; // the continued loop becomes epsilon when newlower == newupper == 0 // in which case the returned concatenation will be just bodyDerivative - derivative = _builder.CreateConcat(bodyDerivative, _builder.CreateLoop(_left, IsLazy, newlower, newupper)); + derivative = builder.CreateConcat(bodyDerivative, builder.CreateLoop(_left, IsLazy, newlower, newupper)); } break; } @@ -1302,7 +1261,7 @@ private SymbolicRegexNode CreateDerivative(TSet elem, uint context) case SymbolicRegexNodeKind.Alternate: { Debug.Assert(_left is not null && _right is not null); - derivative = CreateAlternate(_builder, _left.CreateDerivative(elem, context), _right.CreateDerivative(elem, context)); + derivative = CreateAlternate(builder, _left.CreateDerivative(builder, elem, context), _right.CreateDerivative(builder, elem, context)); break; } @@ -1314,11 +1273,11 @@ private SymbolicRegexNode CreateDerivative(TSet elem, uint context) default: // The derivative of any other case is nothing // e.g. 
taking the derivative of () (epsilon) is [] (nothing) - derivative = _builder._nothing; + derivative = builder._nothing; break; } - _builder._derivativeCache[key] = derivative; + builder._derivativeCache[key] = derivative; return derivative; } @@ -1327,11 +1286,11 @@ private SymbolicRegexNode CreateDerivative(TSet elem, uint context) /// So Effect(R,E) would be simplified to just R. /// /// the node with all Effect nodes stripped away - internal SymbolicRegexNode StripEffects() + internal SymbolicRegexNode StripEffects(SymbolicRegexBuilder builder) { if (!StackHelper.TryEnsureSufficientExecutionStack()) { - return StackHelper.CallOnEmptyStack(StripEffects); + return StackHelper.CallOnEmptyStack(StripEffects, builder); } // If the node doesn't contain any Effect nodes under it we are done @@ -1344,12 +1303,12 @@ internal SymbolicRegexNode StripEffects() case SymbolicRegexNodeKind.Effect: Debug.Assert(_left is not null && _right is not null); // This is the place where the effect (the right child) is getting ignored - return _left.StripEffects(); + return _left.StripEffects(builder); case SymbolicRegexNodeKind.Concat: Debug.Assert(_left is not null && _right is not null); Debug.Assert(_left._info.ContainsEffect && !_right._info.ContainsEffect); - return _builder.CreateConcat(_left.StripEffects(), _right); + return builder.CreateConcat(_left.StripEffects(builder), _right); case SymbolicRegexNodeKind.Alternate: Debug.Assert(_left is not null && _right is not null); @@ -1357,16 +1316,16 @@ internal SymbolicRegexNode StripEffects() // the elements. We don't want to omit deduplication here, since he stripping may make nodes equal. 
List> elems = ToList(listKind: SymbolicRegexNodeKind.Alternate); for (int i = 0; i < elems.Count; i++) - elems[i] = elems[i].StripEffects(); - return _builder.Alternate(elems); + elems[i] = elems[i].StripEffects(builder); + return builder.Alternate(elems); case SymbolicRegexNodeKind.DisableBacktrackingSimulation: Debug.Assert(_left is not null); - return _builder.CreateDisableBacktrackingSimulation(_left.StripEffects()); + return builder.CreateDisableBacktrackingSimulation(_left.StripEffects(builder)); case SymbolicRegexNodeKind.Loop: Debug.Assert(_left is not null); - return _builder.CreateLoop(_left.StripEffects(), IsLazy, _lower, _upper); + return builder.CreateLoop(_left.StripEffects(builder), IsLazy, _lower, _upper); default: Debug.Fail($"{nameof(StripEffects)}:{_kind}"); @@ -1386,15 +1345,16 @@ internal SymbolicRegexNode StripEffects() /// Here both include the CaptureStart_0 effect, since both are nested inside the outer Effect node, /// while only R includes the CaptureStart_1 effect. /// + /// the builder that owns this node /// immediately surrounding character context that affects nullability of anchors /// the list to insert the pairs of nodes and their effects into in priority order /// a helper list this function uses to accumulate effects in recursive calls - internal void StripAndMapEffects(uint context, List<(SymbolicRegexNode, DerivativeEffect[])> alternativesAndEffects, + internal void StripAndMapEffects(SymbolicRegexBuilder builder, uint context, List<(SymbolicRegexNode, DerivativeEffect[])> alternativesAndEffects, List? 
currentEffects = null) { if (!StackHelper.TryEnsureSufficientExecutionStack()) { - StackHelper.CallOnEmptyStack(StripAndMapEffects, context, alternativesAndEffects, currentEffects); + StackHelper.CallOnEmptyStack(StripAndMapEffects, builder, context, alternativesAndEffects, currentEffects); return; } @@ -1418,7 +1378,7 @@ internal SymbolicRegexNode StripEffects() int oldEffectCount = currentEffects.Count; _right.ApplyEffects((e, s) => s.Add(e), context, currentEffects); // Recurse into the main child - _left.StripAndMapEffects(context, alternativesAndEffects, currentEffects); + _left.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects); // Pop all the effects that were pushed above currentEffects.RemoveRange(oldEffectCount, currentEffects.Count - oldEffectCount); return; @@ -1430,19 +1390,19 @@ internal SymbolicRegexNode StripEffects() // For concat the nodes for the left hand side are added first and then fixed up by concatenating // the right side to each of them. 
int oldAlternativesCount = alternativesAndEffects.Count; - _left.StripAndMapEffects(context, alternativesAndEffects, currentEffects); + _left.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects); for (int i = oldAlternativesCount; i < alternativesAndEffects.Count; i++) { var (node, effects) = alternativesAndEffects[i]; - alternativesAndEffects[i] = (_builder.CreateConcat(node, _right), effects); + alternativesAndEffects[i] = (builder.CreateConcat(node, _right), effects); } break; } case SymbolicRegexNodeKind.Alternate: Debug.Assert(_left is not null && _right is not null); - _left.StripAndMapEffects(context, alternativesAndEffects, currentEffects); - _right.StripAndMapEffects(context, alternativesAndEffects, currentEffects); + _left.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects); + _right.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects); break; case SymbolicRegexNodeKind.Loop when _lower == 0 && _upper == 1: @@ -1452,14 +1412,14 @@ internal SymbolicRegexNode StripEffects() Debug.Assert(_left is not null); // For lazy loops skipping is preferred, so output the epsilon first if (IsLazy) - alternativesAndEffects.Add((_builder.Epsilon, currentEffects.Count > 0 ? + alternativesAndEffects.Add((builder.Epsilon, currentEffects.Count > 0 ? currentEffects.ToArray() : Array.Empty())); // Recurse into the body - _left.StripAndMapEffects(context, alternativesAndEffects, currentEffects); + _left.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects); // For eager loops the body is preferred, so output the epsilon last if (!IsLazy) - alternativesAndEffects.Add((_builder.Epsilon, currentEffects.Count > 0 ? + alternativesAndEffects.Add((builder.Epsilon, currentEffects.Count > 0 ? 
currentEffects.ToArray() : Array.Empty())); break; @@ -1468,11 +1428,11 @@ internal SymbolicRegexNode StripEffects() { Debug.Assert(_left is not null); int oldAlternativesCount = alternativesAndEffects.Count; - _left.StripAndMapEffects(context, alternativesAndEffects, currentEffects); + _left.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects); for (int i = oldAlternativesCount; i < alternativesAndEffects.Count; i++) { var (node, effects) = alternativesAndEffects[i]; - alternativesAndEffects[i] = (_builder.CreateDisableBacktrackingSimulation(node), effects); + alternativesAndEffects[i] = (builder.CreateDisableBacktrackingSimulation(node), effects); } break; } @@ -1632,12 +1592,12 @@ internal void ToStringHelper(StringBuilder sb) case SymbolicRegexNodeKind.Singleton: Debug.Assert(_set is not null); - sb.Append(_builder._solver.PrettyPrint(_set, _builder._charSetSolver)); + sb.Append(_debugBuilder!._solver.PrettyPrint(_set, _debugBuilder._charSetSolver)); return; case SymbolicRegexNodeKind.Loop: Debug.Assert(_left is not null); - if (IsAnyStar) + if (IsAnyStar(_debugBuilder!._solver)) { sb.Append(".*"); } @@ -1782,19 +1742,19 @@ static void AppendNumberSuperscript(StringBuilder sb, int value) /// /// Returns all sets that occur in the regex or the full set if there are no sets in the regex (e.g. the regex is "^"). /// - public HashSet GetSets() + public HashSet GetSets(SymbolicRegexBuilder builder) { var sets = new HashSet(); - CollectSets(sets); + CollectSets(builder, sets); return sets; } /// Collects all sets that occur in the regex into the specified collection. 
- private void CollectSets(HashSet sets) + private void CollectSets(SymbolicRegexBuilder builder, HashSet sets) { if (!StackHelper.TryEnsureSufficientExecutionStack()) { - StackHelper.CallOnEmptyStack(CollectSets, sets); + StackHelper.CallOnEmptyStack(CollectSets, builder, sets); return; } @@ -1804,7 +1764,7 @@ private void CollectSets(HashSet sets) case SymbolicRegexNodeKind.EOLAnchor: case SymbolicRegexNodeKind.EndAnchorZ: case SymbolicRegexNodeKind.EndAnchorZReverse: - sets.Add(_builder._newLineSet); + sets.Add(builder._newLineSet); return; case SymbolicRegexNodeKind.BeginningAnchor: @@ -1822,13 +1782,13 @@ private void CollectSets(HashSet sets) case SymbolicRegexNodeKind.Loop: Debug.Assert(_left is not null); - _left.CollectSets(sets); + _left.CollectSets(builder, sets); return; case SymbolicRegexNodeKind.Alternate: Debug.Assert(_left is not null && _right is not null); - _left.CollectSets(sets); - _right.CollectSets(sets); + _left.CollectSets(builder, sets); + _right.CollectSets(builder, sets); return; case SymbolicRegexNodeKind.Concat: @@ -1837,20 +1797,20 @@ private void CollectSets(HashSet sets) while (conc._kind == SymbolicRegexNodeKind.Concat) { Debug.Assert(conc._left is not null && conc._right is not null); - conc._left.CollectSets(sets); + conc._left.CollectSets(builder, sets); conc = conc._right; } - conc.CollectSets(sets); + conc.CollectSets(builder, sets); return; case SymbolicRegexNodeKind.DisableBacktrackingSimulation: Debug.Assert(_left is not null); - _left.CollectSets(sets); + _left.CollectSets(builder, sets); return; case SymbolicRegexNodeKind.NonBoundaryAnchor: case SymbolicRegexNodeKind.BoundaryAnchor: - sets.Add(_builder._wordLetterForBoundariesSet); + sets.Add(builder._wordLetterForBoundariesSet); return; default: @@ -1860,10 +1820,10 @@ private void CollectSets(HashSet sets) } /// Compute and sort all the minterms from the sets in this regex. 
- public TSet[] ComputeMinterms() + public TSet[] ComputeMinterms(SymbolicRegexBuilder builder) { - HashSet sets = GetSets(); - List minterms = MintermGenerator.GenerateMinterms(_builder._solver, sets); + HashSet sets = GetSets(builder); + List minterms = MintermGenerator.GenerateMinterms(builder._solver, sets); minterms.Sort(); return minterms.ToArray(); } @@ -1871,69 +1831,69 @@ public TSet[] ComputeMinterms() /// /// Create the reverse of this regex /// - public SymbolicRegexNode Reverse() + public SymbolicRegexNode Reverse(SymbolicRegexBuilder builder) { if (!StackHelper.TryEnsureSufficientExecutionStack()) { - return StackHelper.CallOnEmptyStack(Reverse); + return StackHelper.CallOnEmptyStack(Reverse, builder); } switch (_kind) { case SymbolicRegexNodeKind.Loop: Debug.Assert(_left is not null); - return _builder.CreateLoop(_left.Reverse(), IsLazy, _lower, _upper); + return builder.CreateLoop(_left.Reverse(builder), IsLazy, _lower, _upper); case SymbolicRegexNodeKind.Concat: { Debug.Assert(_left is not null && _right is not null); - SymbolicRegexNode rev = _left.Reverse(); + SymbolicRegexNode rev = _left.Reverse(builder); SymbolicRegexNode rest = _right; while (rest._kind == SymbolicRegexNodeKind.Concat) { Debug.Assert(rest._left is not null && rest._right is not null); - SymbolicRegexNode rev1 = rest._left.Reverse(); - rev = _builder.CreateConcat(rev1, rev); + SymbolicRegexNode rev1 = rest._left.Reverse(builder); + rev = builder.CreateConcat(rev1, rev); rest = rest._right; } - SymbolicRegexNode restr = rest.Reverse(); - rev = _builder.CreateConcat(restr, rev); + SymbolicRegexNode restr = rest.Reverse(builder); + rev = builder.CreateConcat(restr, rev); return rev; } case SymbolicRegexNodeKind.Alternate: Debug.Assert(_left is not null && _right is not null); - return CreateAlternate(_builder, _left.Reverse(), _right.Reverse()); + return CreateAlternate(builder, _left.Reverse(builder), _right.Reverse(builder)); case SymbolicRegexNodeKind.FixedLengthMarker: // 
Fixed length markers are omitted in reverse - return _builder.Epsilon; + return builder.Epsilon; case SymbolicRegexNodeKind.BeginningAnchor: // The reverse of BeginningAnchor is EndAnchor - return _builder.EndAnchor; + return builder.EndAnchor; case SymbolicRegexNodeKind.EndAnchor: - return _builder.BeginningAnchor; + return builder.BeginningAnchor; case SymbolicRegexNodeKind.BOLAnchor: // The reverse of BOLanchor is EOLanchor - return _builder.EolAnchor; + return builder.EolAnchor; case SymbolicRegexNodeKind.EOLAnchor: - return _builder.BolAnchor; + return builder.BolAnchor; case SymbolicRegexNodeKind.EndAnchorZ: // The reversal of the \Z anchor - return _builder.EndAnchorZReverse; + return builder.EndAnchorZReverse; case SymbolicRegexNodeKind.EndAnchorZReverse: Debug.Fail("Should only happen if a reversed regex is reversed again, which isn't expected"); - return _builder.EndAnchorZ; + return builder.EndAnchorZ; case SymbolicRegexNodeKind.DisableBacktrackingSimulation: Debug.Assert(_left is not null); - return _builder.CreateDisableBacktrackingSimulation(_left.Reverse()); + return builder.CreateDisableBacktrackingSimulation(_left.Reverse(builder)); // Remaining cases map to themselves: case SymbolicRegexNodeKind.Epsilon: @@ -1974,12 +1934,8 @@ internal bool StartsWithLoop(int upperBoundLowestValue = 1) }; } - - /// Gets the set that includes all elements that can start a match. - internal TSet GetStartSet() => _startSet; - /// Computes the set that includes all elements that can start a match. 
- private TSet ComputeStartSet() + public TSet GetStartSet(SymbolicRegexBuilder builder) { switch (_kind) { @@ -1996,7 +1952,7 @@ private TSet ComputeStartSet() case SymbolicRegexNodeKind.BOLAnchor: case SymbolicRegexNodeKind.CaptureStart: case SymbolicRegexNodeKind.CaptureEnd: - return _builder._solver.Empty; + return builder._solver.Empty; case SymbolicRegexNodeKind.Singleton: Debug.Assert(_set is not null); @@ -2004,44 +1960,64 @@ private TSet ComputeStartSet() case SymbolicRegexNodeKind.Loop: Debug.Assert(_left is not null); - return _left._startSet; + return _left.GetStartSet(builder); case SymbolicRegexNodeKind.Concat: { Debug.Assert(_left is not null && _right is not null); - TSet startSet = _left.CanBeNullable ? _builder._solver.Or(_left._startSet, _right._startSet) : _left._startSet; + TSet startSet = _left.CanBeNullable ? builder._solver.Or(_left.GetStartSet(builder), _right.GetStartSet(builder)) : _left.GetStartSet(builder); return startSet; } case SymbolicRegexNodeKind.Alternate: { Debug.Assert(_left is not null && _right is not null); - return _builder._solver.Or(_left._startSet, _right._startSet); + return builder._solver.Or(_left.GetStartSet(builder), _right.GetStartSet(builder)); } case SymbolicRegexNodeKind.DisableBacktrackingSimulation: case SymbolicRegexNodeKind.Effect: Debug.Assert(_left is not null); - return _left._startSet; + return _left.GetStartSet(builder); default: - Debug.Fail($"{nameof(ComputeStartSet)}:{_kind}"); - return _builder._solver.Full; + Debug.Fail($"{nameof(GetStartSet)}:{_kind}"); + return builder._solver.Full; } } /// /// Replace anchors that are infeasible by [] wrt the given previous character kind and what continuation is possible. /// + /// + /// This helps the matcher detect deadend states that have no viable matches in situations where the pattern's + /// language is empty due to interactions between anchors and the rest of the pattern. For example, a*\ba would + /// be simplified to [] when prevKind is a word letter. 
This allows the matcher to avoid spurious work and return + /// early. + /// + /// the builder that owns this node /// previous character kind - /// if true the continuation can start with wordletter or stop - /// if true the continuation can start with nonwordletter or stop - internal SymbolicRegexNode PruneAnchors(uint prevKind, bool contWithWL, bool contWithNWL) + internal SymbolicRegexNode PruneAnchors(SymbolicRegexBuilder builder, uint prevKind) + { + //first prune the anchors in the node + TSet wlbSet = builder._wordLetterForBoundariesSet; + TSet startSet = GetStartSet(builder); + + //true if the startset of the node overlaps with some wordletter or the node can be nullable + bool contWithWL = CanBeNullable || !builder._solver.IsEmpty(builder._solver.And(wlbSet, startSet)); + + //true if the startset of the node overlaps with some nonwordletter or the node can be nullable + bool contWithNWL = CanBeNullable || !builder._solver.IsEmpty(builder._solver.And(builder._solver.Not(wlbSet), startSet)); + + return PruneAnchorsImpl(builder, prevKind, contWithWL, contWithNWL); + } + + private SymbolicRegexNode PruneAnchorsImpl(SymbolicRegexBuilder builder, uint prevKind, bool contWithWL, bool contWithNWL) { // Guard against stack overflow due to deep recursion if (!StackHelper.TryEnsureSufficientExecutionStack()) { - return StackHelper.CallOnEmptyStack(PruneAnchors, prevKind, contWithWL, contWithNWL); + return StackHelper.CallOnEmptyStack(PruneAnchorsImpl, builder, prevKind, contWithWL, contWithNWL); } if (!_info.StartsWithSomeAnchor) @@ -2052,73 +2028,73 @@ internal SymbolicRegexNode PruneAnchors(uint prevKind, bool contWithWL, bo case SymbolicRegexNodeKind.BeginningAnchor: return prevKind == CharKind.BeginningEnd ? 
this : - _builder._nothing; //start anchor is only nullable if the previous character is Start + builder._nothing; //start anchor is only nullable if the previous character is Start case SymbolicRegexNodeKind.EndAnchorZReverse: return ((prevKind & CharKind.BeginningEnd) != 0) ? this : - _builder._nothing; //rev(\Z) is only nullable if the previous characters is Start or the very first \n + builder._nothing; //rev(\Z) is only nullable if the previous characters is Start or the very first \n case SymbolicRegexNodeKind.BoundaryAnchor: return (prevKind == CharKind.WordLetter ? contWithNWL : contWithWL) ? this : // \b is impossible when the previous character is \w but no continuation matches \W // or the previous character is \W but no continuation matches \w - _builder._nothing; + builder._nothing; case SymbolicRegexNodeKind.NonBoundaryAnchor: return (prevKind == CharKind.WordLetter ? contWithWL : contWithNWL) ? this : // \B is impossible when the previous character is \w but no continuation matches \w // or the previous character is \W but no continuation matches \W - _builder._nothing; + builder._nothing; case SymbolicRegexNodeKind.Loop: Debug.Assert(_left is not null); - SymbolicRegexNode body = _left.PruneAnchors(prevKind, contWithWL, contWithNWL); + SymbolicRegexNode body = _left.PruneAnchorsImpl(builder, prevKind, contWithWL, contWithNWL); return body == _left ? this : - CreateLoop(_builder, body, _lower, _upper, IsLazy); + CreateLoop(builder, body, _lower, _upper, IsLazy); case SymbolicRegexNodeKind.Concat: { Debug.Assert(_left is not null && _right is not null); - SymbolicRegexNode left1 = _left.PruneAnchors(prevKind, contWithWL, contWithNWL); - SymbolicRegexNode right1 = _left.IsNullable ? _right.PruneAnchors(prevKind, contWithWL, contWithNWL) : _right; + SymbolicRegexNode left1 = _left.PruneAnchorsImpl(builder, prevKind, contWithWL, contWithNWL); + SymbolicRegexNode right1 = _left.IsNullable ? 
_right.PruneAnchorsImpl(builder, prevKind, contWithWL, contWithNWL) : _right; Debug.Assert(left1 is not null && right1 is not null); return left1 == _left && right1 == _right ? this : - CreateConcat(_builder, left1, right1); + CreateConcat(builder, left1, right1); } case SymbolicRegexNodeKind.Alternate: { Debug.Assert(_left is not null && _right is not null); - SymbolicRegexNode left1 = _left.PruneAnchors(prevKind, contWithWL, contWithNWL); - SymbolicRegexNode right1 = _right.PruneAnchors(prevKind, contWithWL, contWithNWL); + SymbolicRegexNode left1 = _left.PruneAnchorsImpl(builder, prevKind, contWithWL, contWithNWL); + SymbolicRegexNode right1 = _right.PruneAnchorsImpl(builder, prevKind, contWithWL, contWithNWL); Debug.Assert(left1 is not null && right1 is not null); return left1 == _left && right1 == _right ? this : - CreateAlternate(_builder, left1, right1); + CreateAlternate(builder, left1, right1); } case SymbolicRegexNodeKind.Effect: { Debug.Assert(_left is not null && _right is not null); - SymbolicRegexNode left1 = _left.PruneAnchors(prevKind, contWithWL, contWithNWL); + SymbolicRegexNode left1 = _left.PruneAnchorsImpl(builder, prevKind, contWithWL, contWithNWL); return left1 == _left ? this : - CreateEffect(_builder, left1, _right); + CreateEffect(builder, left1, _right); } case SymbolicRegexNodeKind.DisableBacktrackingSimulation: Debug.Assert(_left is not null); - SymbolicRegexNode child = _left.PruneAnchors(prevKind, contWithWL, contWithNWL); + SymbolicRegexNode child = _left.PruneAnchorsImpl(builder, prevKind, contWithWL, contWithNWL); return child == _left ? this : - _builder.CreateDisableBacktrackingSimulation(child); + builder.CreateDisableBacktrackingSimulation(child); default: return this; @@ -2175,7 +2151,7 @@ internal int ResolveFixedLength(uint context) /// and the resulting elements re-wrapped to maintain the metadata. 
/// /// an enumeration of the elements of the alternation, or just the node itself if there is no alternation - internal IEnumerable> EnumerateAlternationBranches() + internal IEnumerable> EnumerateAlternationBranches(SymbolicRegexBuilder builder) { switch (_kind) { @@ -2183,10 +2159,10 @@ internal IEnumerable> EnumerateAlternationBranches() Debug.Assert(_left is not null); // This call should never recurse more than one level Debug.Assert(_left._kind is not SymbolicRegexNodeKind.DisableBacktrackingSimulation); - foreach (SymbolicRegexNode element in _left.EnumerateAlternationBranches()) + foreach (SymbolicRegexNode element in _left.EnumerateAlternationBranches(builder)) { // Re-wrap the element nodes in DisableBacktrackingSimulation if the top level node was too - yield return _builder.CreateDisableBacktrackingSimulation(element); + yield return builder.CreateDisableBacktrackingSimulation(element); } break; case SymbolicRegexNodeKind.Alternate: diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs index 431a85590c7..079b42e523a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs @@ -37,8 +37,8 @@ public SymbolicRegexRunnerFactory(RegexTree regexTree, RegexOptions options, Tim } } - rootNode = rootNode.AddFixedLengthMarkers(); - BDD[] minterms = rootNode.ComputeMinterms(); + rootNode = rootNode.AddFixedLengthMarkers(bddBuilder); + BDD[] minterms = rootNode.ComputeMinterms(bddBuilder); _matcher = minterms.Length > 64 ? 
SymbolicRegexMatcher.Create(regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new BitVectorSolver(minterms, charSetSolver), matchTimeout) : diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs b/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs index ee39405583e..48d94c51037 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs @@ -64,6 +64,38 @@ public static bool TryEnsureSufficientExecutionStack() .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) .GetAwaiter().GetResult(); + /// Calls the provided function on the stack of a different thread pool thread. + /// The type of the first argument to pass to the function. + /// The type of the second argument to pass to the function. + /// The type of the third argument to pass to the function. + /// The type of the fourth argument to pass to the function. + /// The action to invoke. + /// The first argument to pass to the action. + /// The second argument to pass to the action. + /// The third argument to pass to the action. + /// The fourth argument to pass to the action. + public static void CallOnEmptyStack(Action action, TArg1 arg1, TArg2 arg2, TArg3 arg3, TArg4 arg4) => + Task.Run(() => action(arg1, arg2, arg3, arg4)) + .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) + .GetAwaiter().GetResult(); + + /// Calls the provided function on the stack of a different thread pool thread. + /// The type of the first argument to pass to the function. + /// The type of the second argument to pass to the function. + /// The type of the third argument to pass to the function. 
+ /// The type of the fourth argument to pass to the function. + /// The type of the fifth argument to pass to the function. + /// The action to invoke. + /// The first argument to pass to the action. + /// The second argument to pass to the action. + /// The third argument to pass to the action. + /// The fourth argument to pass to the action. + /// The fifth argument to pass to the action. + public static void CallOnEmptyStack(Action action, TArg1 arg1, TArg2 arg2, TArg3 arg3, TArg4 arg4, TArg5 arg5) => + Task.Run(() => action(arg1, arg2, arg3, arg4, arg5)) + .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) + .GetAwaiter().GetResult(); + /// Calls the provided function on the stack of a different thread pool thread. /// The type of the first argument to pass to the function. /// The type of the second argument to pass to the function. @@ -126,5 +158,21 @@ public static bool TryEnsureSufficientExecutionStack() Task.Run(() => func(arg1, arg2, arg3)) .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) .GetAwaiter().GetResult(); + + /// Calls the provided function on the stack of a different thread pool thread. + /// The type of the first argument to pass to the function. + /// The type of the second argument to pass to the function. + /// The type of the third argument to pass to the function. + /// The type of the fourth argument to pass to the function. + /// The return type of the function. + /// The function to invoke. + /// The first argument to pass to the function. + /// The second argument to pass to the function. + /// The third argument to pass to the function. + /// The fourth argument to pass to the function. 
+ public static TResult CallOnEmptyStack(Func func, TArg1 arg1, TArg2 arg2, TArg3 arg3, TArg4 arg4) => + Task.Run(() => func(arg1, arg2, arg3, arg4)) + .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) + .GetAwaiter().GetResult(); } } diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs index db9a4fd61eb..56ce038cb09 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs @@ -86,7 +86,7 @@ public static IEnumerable SafeThresholdTests_MemberData() { RegexNode tree = RegexParser.Parse(Pattern, options | RegexOptions.ExplicitCapture, CultureInfo.CurrentCulture).Root; SymbolicRegexNode rootNode = converter.ConvertToSymbolicRegexNode(tree); - yield return new object[] { rootNode, ExpectedSafeSize }; + yield return new object[] { bddBuilder, rootNode, ExpectedSafeSize }; } // add .*? in front of the pattern, this adds 1 more NFA state @@ -94,7 +94,7 @@ public static IEnumerable SafeThresholdTests_MemberData() { RegexNode tree = RegexParser.Parse(".*?" 
+ Pattern, options | RegexOptions.ExplicitCapture, CultureInfo.CurrentCulture).Root; SymbolicRegexNode rootNode = converter.ConvertToSymbolicRegexNode(tree); - yield return new object[] { rootNode, 1 + ExpectedSafeSize}; + yield return new object[] { bddBuilder, rootNode, 1 + ExpectedSafeSize}; } // use of anchors increases the estimate by 5x in general but in reality much less, at most 3x @@ -102,7 +102,7 @@ public static IEnumerable SafeThresholdTests_MemberData() { RegexNode tree = RegexParser.Parse(Pattern + "$", options | RegexOptions.ExplicitCapture, CultureInfo.CurrentCulture).Root; SymbolicRegexNode rootNode = converter.ConvertToSymbolicRegexNode(tree); - yield return new object[] { rootNode, 5 * ExpectedSafeSize }; + yield return new object[] { bddBuilder, rootNode, 5 * ExpectedSafeSize }; } // use of captures has no effect on the estimations @@ -110,31 +110,32 @@ public static IEnumerable SafeThresholdTests_MemberData() { RegexNode tree = RegexParser.Parse(Pattern, options, CultureInfo.CurrentCulture).Root; SymbolicRegexNode rootNode = converter.ConvertToSymbolicRegexNode(tree); - yield return new object[] { rootNode, ExpectedSafeSize }; + yield return new object[] { bddBuilder, rootNode, ExpectedSafeSize }; } } [Theory] [MemberData(nameof(SafeThresholdTests_MemberData))] - public void SafeThresholdTests(object obj, int expectedSafeSize) + public void SafeThresholdTests(object builderObj, object nodeObj, int expectedSafeSize) { - SymbolicRegexNode node = (SymbolicRegexNode)obj; + SymbolicRegexBuilder builder = (SymbolicRegexBuilder)builderObj; + SymbolicRegexNode node = (SymbolicRegexNode)nodeObj; int safeSize = node.EstimateNfaSize(); Assert.Equal(expectedSafeSize, safeSize); - int nfaStateCount = CalculateNfaStateCount(node); + int nfaStateCount = CalculateNfaStateCount(builder, node); Assert.True(nfaStateCount <= expectedSafeSize); } /// /// Compute the closure of all NFA states from root and return the size of the resulting state space. 
/// - private static int CalculateNfaStateCount(SymbolicRegexNode root) + private static int CalculateNfaStateCount(SymbolicRegexBuilder builder, SymbolicRegexNode root) { // Here we are actually using the original BDD algebra (not converting to the BV or Uint64 algebra) // because it does not matter which algebra we use here (this matters only for performance) HashSet<(uint, SymbolicRegexNode)> states = new(); Stack<(uint, SymbolicRegexNode)> frontier = new(); - List minterms = MintermGenerator.GenerateMinterms(root._builder._solver, root.GetSets()); + List minterms = MintermGenerator.GenerateMinterms(builder._solver, root.GetSets(builder)); // Start from the initial state that has kind 'General' when no anchors are being used, else kind 'BeginningEnd' (uint, SymbolicRegexNode) initialState = (root._info.ContainsSomeAnchor ? CharKind.BeginningEnd : CharKind.General, root); @@ -150,7 +151,7 @@ private static int CalculateNfaStateCount(SymbolicRegexNode root) foreach (BDD minterm in minterms) { uint kind = GetCharKind(minterm); - SymbolicRegexNode target = source.Node.CreateDerivativeWithoutEffects(minterm, source.Kind); + SymbolicRegexNode target = source.Node.CreateDerivativeWithoutEffects(builder, minterm, source.Kind); //In the case of an NFA all the different alternatives in the DFA state become individual states themselves foreach (SymbolicRegexNode node in GetAlternatives(target)) @@ -169,7 +170,7 @@ private static int CalculateNfaStateCount(SymbolicRegexNode root) return states.Count; // Enumerates the alternatives from a node, for eaxmple (ab|(bc|cd)) has three alternatives - static IEnumerable> GetAlternatives(SymbolicRegexNode node) + IEnumerable> GetAlternatives(SymbolicRegexNode node) { if (node._kind == SymbolicRegexNodeKind.Alternate) { @@ -178,7 +179,7 @@ static IEnumerable> GetAlternatives(SymbolicRegexNode elem in GetAlternatives(node._right!)) yield return elem; } - else if (!node.IsNothing) // omit deadend states + else if 
(!node.IsNothing(builder._solver)) // omit deadend states { yield return node; } @@ -187,8 +188,8 @@ static IEnumerable> GetAlternatives(SymbolicRegexNode - minterm.Equals(root._builder._newLineSet) ? CharKind.Newline : // is \n - (!root._builder._solver.IsEmpty(root._builder._solver.And(root._builder._wordLetterForBoundariesSet, minterm)) ? + minterm.Equals(builder._newLineSet) ? CharKind.Newline : // is \n + (!builder._solver.IsEmpty(builder._solver.And(builder._wordLetterForBoundariesSet, minterm)) ? CharKind.WordLetter : // in \w CharKind.General); // anything else, thus in particular in \W } diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/System.Text.RegularExpressions.Unit.Tests.csproj b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/System.Text.RegularExpressions.Unit.Tests.csproj index 62948390630..8c45a0c5adb 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/System.Text.RegularExpressions.Unit.Tests.csproj +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/System.Text.RegularExpressions.Unit.Tests.csproj @@ -51,7 +51,7 @@ - + -- GitLab