diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj index 56a71e7e8126f69861a8a518ebe8083fc1318209..94490e85c60c9bfbea8ca83952265271356f42e9 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj +++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj @@ -63,7 +63,7 @@ - + @@ -75,6 +75,7 @@ + diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/CharKind.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/CharKind.cs index aacdf06702290b58804a88c413e8902afd7bd32a..9f21787baae2ed2a0af6ecb8c061f7045b013334 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/CharKind.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/CharKind.cs @@ -43,5 +43,8 @@ internal static class CharKind WordLetter => @"\w", _ => string.Empty, }; + + /// Returns whether the given value is in the range of valid character kinds. + internal static bool IsValidCharKind(uint charKind) => charKind < CharKindCount; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs deleted file mode 100644 index ecbb44415eea6a761be6094230c6d696c1135292..0000000000000000000000000000000000000000 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs +++ /dev/null @@ -1,149 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. 
- -using System.Collections.Generic; -using System.Diagnostics; -using System.Runtime.CompilerServices; -using System.Net; - -namespace System.Text.RegularExpressions.Symbolic -{ - /// Captures a state of a DFA explored during matching. - internal sealed class DfaMatchingState where TSet : IComparable, IEquatable - { - internal DfaMatchingState(SymbolicRegexNode node, uint prevCharKind) - { - Node = node; - PrevCharKind = prevCharKind; - } - - internal SymbolicRegexNode Node { get; } - - internal uint PrevCharKind { get; } - - internal int Id { get; set; } - - /// This is a deadend state - internal bool IsDeadend => Node.IsNothing; - - /// The node must be nullable here - internal int FixedLength(uint nextCharKind) - { - Debug.Assert(nextCharKind is 0 or CharKind.BeginningEnd or CharKind.Newline or CharKind.WordLetter or CharKind.NewLineS); - uint context = CharKind.Context(PrevCharKind, nextCharKind); - return Node.ResolveFixedLength(context); - } - - /// If true then the state is a dead-end, rejects all inputs. - internal bool IsNothing => Node.IsNothing; - - /// If true then state starts with a ^ or $ or \Z - internal bool StartsWithLineAnchor => Node._info.StartsWithLineAnchor; - - /// - /// Translates a minterm set to a character kind, which is a general categorization of characters used - /// for cheaply deciding the nullability of anchors. - /// - /// - /// An empty set is handled as a special case to indicate the very last \n. 
- /// - /// the minterm to translate - /// the character kind of the minterm - private uint GetNextCharKind(ref TSet minterm) - { - ISolver solver = Node._builder._solver; - TSet wordLetterPredicate = Node._builder._wordLetterForBoundariesSet; - TSet newLinePredicate = Node._builder._newLineSet; - - // minterm == solver.False is used to represent the very last \n - uint nextCharKind = CharKind.General; - if (solver.Empty.Equals(minterm)) - { - nextCharKind = CharKind.NewLineS; - minterm = newLinePredicate; - } - else if (newLinePredicate.Equals(minterm)) - { - // If the previous state was the start state, mark this as the very FIRST \n. - // Essentially, this looks the same as the very last \n and is used to nullify - // rev(\Z) in the conext of a reversed automaton. - nextCharKind = PrevCharKind == CharKind.BeginningEnd ? - CharKind.NewLineS : - CharKind.Newline; - } - else if (!solver.IsEmpty(solver.And(wordLetterPredicate, minterm))) - { - nextCharKind = CharKind.WordLetter; - } - return nextCharKind; - } - - /// - /// Compute the target state for the given input minterm. - /// If is False this means that this is \n and it is the last character of the input. - /// - /// minterm corresponding to some input character or False corresponding to last \n - internal DfaMatchingState Next(TSet minterm) - { - uint nextCharKind = GetNextCharKind(ref minterm); - - // Combined character context - uint context = CharKind.Context(PrevCharKind, nextCharKind); - - // Compute the derivative of the node for the given context - SymbolicRegexNode derivative = Node.CreateDerivativeWithoutEffects(minterm, context); - - // nextCharKind will be the PrevCharKind of the target state - // use an existing state instead if one exists already - // otherwise create a new new id for it - return Node._builder.CreateState(derivative, nextCharKind, capturing: false); - } - - /// - /// Compute a set of transitions for the given minterm. 
- /// - /// minterm corresponding to some input character or False corresponding to last \n - /// an enumeration of the transitions as pairs of the target state and a list of effects to be applied - internal List<(DfaMatchingState State, DerivativeEffect[] Effects)> NfaNextWithEffects(TSet minterm) - { - uint nextCharKind = GetNextCharKind(ref minterm); - - // Combined character context - uint context = CharKind.Context(PrevCharKind, nextCharKind); - - // Compute the transitions for the given context - List<(SymbolicRegexNode, DerivativeEffect[])> nodesAndEffects = Node.CreateNfaDerivativeWithEffects(minterm, context); - - var list = new List<(DfaMatchingState State, DerivativeEffect[] Effects)>(); - foreach ((SymbolicRegexNode node, DerivativeEffect[]? effects) in nodesAndEffects) - { - // nextCharKind will be the PrevCharKind of the target state - // use an existing state instead if one exists already - // otherwise create a new new id for it - DfaMatchingState state = Node._builder.CreateState(node, nextCharKind, capturing: true); - if (!state.IsDeadend) - list.Add((state, effects)); - } - return list; - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal bool IsNullableFor(uint nextCharKind) - { - Debug.Assert(nextCharKind is 0 or CharKind.BeginningEnd or CharKind.Newline or CharKind.WordLetter or CharKind.NewLineS); - uint context = CharKind.Context(PrevCharKind, nextCharKind); - return Node.IsNullableFor(context); - } - - public override bool Equals(object? obj) => - obj is DfaMatchingState s && PrevCharKind == s.PrevCharKind && Node.Equals(s.Node); - - public override int GetHashCode() => (PrevCharKind, Node).GetHashCode(); - -#if DEBUG - public override string ToString() => - PrevCharKind == 0 ? 
Node.ToString() : - $"({CharKind.DescribePrev(PrevCharKind)},{Node})"; -#endif - } -} diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs new file mode 100644 index 0000000000000000000000000000000000000000..38226258df4a2a7fe326c623d2340951aec36344 --- /dev/null +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs @@ -0,0 +1,118 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Net; + +namespace System.Text.RegularExpressions.Symbolic +{ + /// Captures a state explored during matching. + internal sealed class MatchingState where TSet : IComparable, IEquatable + { + internal MatchingState(SymbolicRegexNode node, uint prevCharKind) + { + Node = node; + PrevCharKind = prevCharKind; + } + + /// The regular expression that labels this state and gives it its semantics. + internal SymbolicRegexNode Node { get; } + + /// + /// The kind of the previous character in the input. The is responsible + /// for ensuring that in all uses of this state this invariant holds by both selecting initial states accordingly + /// and transitioning on each character to states that match that character's kind. + /// + /// + /// Tracking this information is an optimization that allows each transition taken in the matcher to only depend + /// on the next character (and its kind). In general, the transitions from a state with anchors in its pattern + /// depend on both the previous and the next character. 
Creating distinct states for each kind of the previous + /// character embeds the necessary information about the previous character into the state space of the automaton. + /// However, this does incur a memory overhead due to the duplication of states. For patterns with no anchors + /// this will always be set to , which can reduce the number of states created. + /// + /// The performance effect of this optimization has not been investigated. If this optimization were removed, the + /// transition logic would in turn have to become more complicated for derivatives that depend on the nullability + /// of anchors. Care should be taken to not slow down transitions without anchors involved. + /// + internal uint PrevCharKind { get; } + + /// + /// A unique identifier for this state, which is used in to index into + /// state information and transition arrays. Valid IDs are always >= 1. + /// + internal int Id { get; set; } + + /// Whether this state is known to be a dead end, i.e. no nullable states are reachable from here. + internal bool IsDeadend(ISolver solver) => Node.IsNothing(solver); + + /// + /// Returns the fixed length that any match ending with this state must have, or -1 if there is no such + /// fixed length, . The context is defined + /// by of this state and the given nextCharKind. The node must be nullable here. + /// + internal int FixedLength(uint nextCharKind) + { + Debug.Assert(IsNullableFor(nextCharKind)); + Debug.Assert(CharKind.IsValidCharKind(nextCharKind)); + uint context = CharKind.Context(PrevCharKind, nextCharKind); + return Node.ResolveFixedLength(context); + } + + /// If true then state starts with a ^ or $ or \Z + internal bool StartsWithLineAnchor => Node._info.StartsWithLineAnchor; + + /// + /// Compute the target state for the given input minterm. + /// If is False this means that this is \n and it is the last character of the input. 
+ /// + /// the builder that owns + /// minterm corresponding to some input character or False corresponding to last \n + /// + internal SymbolicRegexNode Next(SymbolicRegexBuilder builder, TSet minterm, uint nextCharKind) + { + // Combined character context + uint context = CharKind.Context(PrevCharKind, nextCharKind); + + // Compute the derivative of the node for the given context + return Node.CreateDerivativeWithoutEffects(builder, minterm, context); + } + + /// + /// Compute a set of transitions for the given minterm. + /// + /// the builder that owns + /// minterm corresponding to some input character or False corresponding to last \n + /// + /// an enumeration of the transitions as pairs of the target state and a list of effects to be applied + internal List<(SymbolicRegexNode Node, DerivativeEffect[] Effects)> NfaNextWithEffects(SymbolicRegexBuilder builder, TSet minterm, uint nextCharKind) + { + // Combined character context + uint context = CharKind.Context(PrevCharKind, nextCharKind); + + // Compute the transitions for the given context + return Node.CreateNfaDerivativeWithEffects(builder, minterm, context); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal bool IsNullableFor(uint nextCharKind) + { + Debug.Assert(CharKind.IsValidCharKind(nextCharKind)); + uint context = CharKind.Context(PrevCharKind, nextCharKind); + return Node.IsNullableFor(context); + } + + public override bool Equals(object? obj) => + obj is MatchingState s && PrevCharKind == s.PrevCharKind && Node.Equals(s.Node); + + public override int GetHashCode() => (PrevCharKind, Node).GetHashCode(); + +#if DEBUG + public override string ToString() => + PrevCharKind == 0 ? 
Node.ToString() : + $"({CharKind.DescribePrev(PrevCharKind)},{Node})"; +#endif + } +} diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs index 8c4fd992f9352de10da7ce3d3580042981fd59b6..857b8d51972645782a2fc39dbf7995c48ac26906 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs @@ -240,12 +240,12 @@ static string UnexpectedNodeType(RegexNode node) SymbolicRegexNode elem = childResult.Count == 1 ? childResult.FirstElement : _builder.CreateConcatAlreadyReversed(childResult); - if (elem.IsNothing) + if (elem.IsNothing(_builder._solver)) { continue; } - or = elem.IsAnyStar ? + or = elem.IsAnyStar(_builder._solver) ? elem : // .* is the absorbing element SymbolicRegexNode.CreateAlternate(_builder, elem, or); } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs index 3ba759e2b04fd7dcb72b50acd89954ec4ab16563..eceaadd247eaab80098dd61f27700459c5839e1f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs @@ -30,43 +30,34 @@ internal sealed class SymbolicRegexBuilder where TSet : IComparable, internal SymbolicRegexNode Epsilon => _epsilon ??= SymbolicRegexNode.CreateEpsilon(this); private SymbolicRegexNode? 
_beginningAnchor; - internal SymbolicRegexNode BeginningAnchor => _beginningAnchor ??= SymbolicRegexNode.CreateBeginEndAnchor(this, SymbolicRegexNodeKind.BeginningAnchor); + internal SymbolicRegexNode BeginningAnchor => _beginningAnchor ??= SymbolicRegexNode.CreateAnchor(this, SymbolicRegexNodeKind.BeginningAnchor); private SymbolicRegexNode? _endAnchor; - internal SymbolicRegexNode EndAnchor => _endAnchor ??= SymbolicRegexNode.CreateBeginEndAnchor(this, SymbolicRegexNodeKind.EndAnchor); + internal SymbolicRegexNode EndAnchor => _endAnchor ??= SymbolicRegexNode.CreateAnchor(this, SymbolicRegexNodeKind.EndAnchor); private SymbolicRegexNode? _endAnchorZ; - internal SymbolicRegexNode EndAnchorZ => _endAnchorZ ??= SymbolicRegexNode.CreateBeginEndAnchor(this, SymbolicRegexNodeKind.EndAnchorZ); + internal SymbolicRegexNode EndAnchorZ => _endAnchorZ ??= SymbolicRegexNode.CreateAnchor(this, SymbolicRegexNodeKind.EndAnchorZ); private SymbolicRegexNode? _endAnchorZReverse; - internal SymbolicRegexNode EndAnchorZReverse => _endAnchorZReverse ??= SymbolicRegexNode.CreateBeginEndAnchor(this, SymbolicRegexNodeKind.EndAnchorZReverse); + internal SymbolicRegexNode EndAnchorZReverse => _endAnchorZReverse ??= SymbolicRegexNode.CreateAnchor(this, SymbolicRegexNodeKind.EndAnchorZReverse); private SymbolicRegexNode? _bolAnchor; - internal SymbolicRegexNode BolAnchor => _bolAnchor ??= SymbolicRegexNode.CreateBeginEndAnchor(this, SymbolicRegexNodeKind.BOLAnchor); + internal SymbolicRegexNode BolAnchor => _bolAnchor ??= SymbolicRegexNode.CreateAnchor(this, SymbolicRegexNodeKind.BOLAnchor); private SymbolicRegexNode? _eolAnchor; - internal SymbolicRegexNode EolAnchor => _eolAnchor ??= SymbolicRegexNode.CreateBeginEndAnchor(this, SymbolicRegexNodeKind.EOLAnchor); + internal SymbolicRegexNode EolAnchor => _eolAnchor ??= SymbolicRegexNode.CreateAnchor(this, SymbolicRegexNodeKind.EOLAnchor); private SymbolicRegexNode? 
_wbAnchor; - internal SymbolicRegexNode BoundaryAnchor => _wbAnchor ??= SymbolicRegexNode.CreateBoundaryAnchor(this, SymbolicRegexNodeKind.BoundaryAnchor); + internal SymbolicRegexNode BoundaryAnchor => _wbAnchor ??= SymbolicRegexNode.CreateAnchor(this, SymbolicRegexNodeKind.BoundaryAnchor); private SymbolicRegexNode? _nwbAnchor; - internal SymbolicRegexNode NonBoundaryAnchor => _nwbAnchor ??= SymbolicRegexNode.CreateBoundaryAnchor(this, SymbolicRegexNodeKind.NonBoundaryAnchor); + internal SymbolicRegexNode NonBoundaryAnchor => _nwbAnchor ??= SymbolicRegexNode.CreateAnchor(this, SymbolicRegexNodeKind.NonBoundaryAnchor); internal TSet _wordLetterForBoundariesSet; internal TSet _newLineSet; - /// Partition of the input space of sets. - internal TSet[]? _minterms; - private readonly Dictionary> _singletonCache = new(); - // states that have been created - internal HashSet> _stateCache = new(); - - // capturing states that have been created - internal HashSet> _capturingStateCache = new(); - /// /// This cache is used in to keep all nodes associated with this builder /// unique. This ensures that reference equality can be used for syntactic equality and that all shared subexpressions @@ -84,7 +75,7 @@ internal sealed class SymbolicRegexBuilder where TSet : IComparable, // matching when simplification rules fail to eliminate the portions being walked over. /// - /// Cache for keyed by: + /// Cache for keyed by: /// -The node to derivate /// -The character or minterm to take the derivative with /// -The surrounding character context @@ -93,7 +84,7 @@ internal sealed class SymbolicRegexBuilder where TSet : IComparable, internal readonly Dictionary<(SymbolicRegexNode, TSet elem, uint context), SymbolicRegexNode> _derivativeCache = new(); /// - /// Cache for keyed by: + /// Cache for keyed by: /// -The node to prune /// -The surrounding character context /// The value is the pruned node. 
@@ -101,74 +92,13 @@ internal sealed class SymbolicRegexBuilder where TSet : IComparable, internal readonly Dictionary<(SymbolicRegexNode, uint), SymbolicRegexNode> _pruneLowerPriorityThanNullabilityCache = new(); /// - /// Cache for keyed by: + /// Cache for keyed by: /// -The node R potentially subsuming S /// -The node S potentially being subsumed by R /// The value indicates if subsumption is known to hold. /// internal readonly Dictionary<(SymbolicRegexNode, SymbolicRegexNode), bool> _subsumptionCache = new(); - /// - /// Maps state ids to states, initial capacity is 1024 states. - /// Each time more states are needed the length is increased by 1024. - /// - internal DfaMatchingState[]? _stateArray; - internal DfaMatchingState[]? _capturingStateArray; - - /// - /// Maps state IDs to context-independent information for all states in . - /// - private ContextIndependentState[] _stateInfo = Array.Empty(); - - /// Context-independent information available for every state. - [Flags] - private enum ContextIndependentState : byte - { - IsInitial = 1, - IsDeadend = 2, - IsNullable = 4, - CanBeNullable = 8, - } - - /// - /// For these "delta" arrays, technically Volatile.Read should be used to read out an element, - /// but in practice that's not needed on the runtimes in use (though that needs to be documented - /// via https://github.com/dotnet/runtime/issues/63474), and use of Volatile.Read is - /// contributing non-trivial overhead (https://github.com/dotnet/runtime/issues/65789). - /// - internal int[]? _delta; - internal List<(DfaMatchingState, DerivativeEffect[])>?[]? _capturingDelta; - private const int InitialStateLimit = 1024; - - /// 1 + Log2(_minterms.Length), the smallest k s.t. 2^k >= minterms.Length + 1 - internal int _mintermsLog; - - /// - /// Maps each NFA state id to the state id of the DfaMatchingState stored in _stateArray. 
- /// This map is used to compactly represent NFA state ids in NFA mode in order to utilize - /// the property that all NFA states are small integers in one interval. - /// The valid entries are 0 to -1. - /// - internal int[] _nfaStateArray = Array.Empty(); - - /// - /// Maps the id of a DfaMatchingState to the NFA state id that it is being identifed with in the NFA. - /// It is the inverse of used entries in _nfaStateArray. - /// The range of this map is 0 to -1. - /// - internal readonly Dictionary _nfaStateArrayInverse = new(); - - /// Gets .Count - internal int NfaStateCount => _nfaStateArrayInverse.Count; - - /// - /// Transition function for NFA transitions in NFA mode. - /// Each NFA entry maps to a list of NFA target states. - /// Each list of target states is without repetitions. - /// If the entry is null then the targets states have not been computed yet. - /// - internal int[]?[] _nfaDelta = Array.Empty(); - /// Create a new symbolic regex builder. internal SymbolicRegexBuilder(ISolver solver, CharSetSolver charSetSolver) { @@ -176,24 +106,6 @@ internal SymbolicRegexBuilder(ISolver solver, CharSetSolver charSetSolver) _charSetSolver = charSetSolver; _solver = solver; - // minterms = null if partition of the solver is undefined and returned as null - _minterms = solver.GetMinterms(); - if (_minterms == null) - { - _mintermsLog = -1; - } - else - { - _stateArray = new DfaMatchingState[InitialStateLimit]; - _capturingStateArray = new DfaMatchingState[InitialStateLimit]; - _stateInfo = new ContextIndependentState[InitialStateLimit]; - - // the extra +1 slot with id minterms.Length is reserved for \Z (last occurrence of \n) - _mintermsLog = BitOperations.Log2((uint)_minterms.Length) + 1; - _delta = new int[InitialStateLimit << _mintermsLog]; - _capturingDelta = new List<(DfaMatchingState, DerivativeEffect[])>[InitialStateLimit << _mintermsLog]; - } - // initialized to False but updated later to the actual condition ony if \b or \B occurs anywhere in the 
regex // this implies that if a regex never uses \b or \B then the character context will never // update the previous character context to distinguish word and nonword letters @@ -213,94 +125,6 @@ internal SymbolicRegexBuilder(ISolver solver, CharSetSolver charSetSolver) _singletonCache[_solver.Full] = _anyChar; } - /// Assign the context-independent information for the given state. - internal void SetStateInfo(int stateId, bool isInitial, bool isDeadend, bool isNullable, bool canBeNullable) - { - Debug.Assert(stateId > 0); - Debug.Assert(!isNullable || canBeNullable); - - ContextIndependentState info = 0; - - if (isInitial) - { - info |= ContextIndependentState.IsInitial; - } - - if (isDeadend) - { - info |= ContextIndependentState.IsDeadend; - } - - if (canBeNullable) - { - info |= ContextIndependentState.CanBeNullable; - if (isNullable) - { - info |= ContextIndependentState.IsNullable; - } - } - - _stateInfo[stateId] = info; - } - - /// Get context-independent information for the given state. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(int stateId) - { - Debug.Assert(stateId > 0); - - ContextIndependentState info = _stateInfo[stateId]; - return ((info & ContextIndependentState.IsInitial) != 0, - (info & ContextIndependentState.IsDeadend) != 0, - (info & ContextIndependentState.IsNullable) != 0, - (info & ContextIndependentState.CanBeNullable) != 0); - } - - /// Lookup the actual minterm based on its ID. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal TSet GetMinterm(int mintermId) - { - TSet[]? minterms = _minterms; - Debug.Assert(minterms is not null); - return (uint)mintermId < (uint)minterms.Length ? 
- minterms[mintermId] : - _solver.Empty; // minterm=False represents \Z - } - - /// Returns the span from that may contain transitions for the given state - internal Span GetDeltasFor(DfaMatchingState state) - { - if (_delta is null || _minterms is null) - { - return default; - } - - int numMinterms = _minterms.Length; - if (state.StartsWithLineAnchor) - { - numMinterms++; - } - - return _delta.AsSpan(state.Id << _mintermsLog, numMinterms); - } - - /// Returns the span from that may contain transitions for the given state - internal Span GetNfaDeltasFor(DfaMatchingState state) - { - if (_nfaDelta is null || _minterms is null || !_nfaStateArrayInverse.TryGetValue(state.Id, out int nfaState)) - { - return default; - } - - int numMinterms = _minterms.Length; - if (state.StartsWithLineAnchor) - { - numMinterms++; - } - - return _nfaDelta.AsSpan(nfaState << _mintermsLog, numMinterms); - } - /// /// Make an alternation of given nodes, simplify by eliminating any regex that accepts no inputs /// @@ -509,224 +333,5 @@ internal SymbolicRegexNode Transform(SymbolicRegexNode n return null; } } - - /// - /// Create a state with given node and previous character context. 
- /// - /// the pattern that this state will represent - /// the kind of the character that led to this state - /// whether to use the separate space of states with capturing transitions or not - /// whether to mark the state as an initial state or not - /// - public DfaMatchingState CreateState(SymbolicRegexNode node, uint prevCharKind, bool capturing = false, bool isInitialState = false) - { - //first prune the anchors in the node - TSet wlbSet = _wordLetterForBoundariesSet; - TSet startSet = node.GetStartSet(); - - //true if the startset of the node overlaps with some wordletter or the node can be nullable - bool contWithWL = node.CanBeNullable || !_solver.IsEmpty(_solver.And(wlbSet, startSet)); - - //true if the startset of the node overlaps with some nonwordletter or the node can be nullable - bool contWithNWL = node.CanBeNullable || !_solver.IsEmpty(_solver.And(_solver.Not(wlbSet), startSet)); - SymbolicRegexNode pruned_node = node.PruneAnchors(prevCharKind, contWithWL, contWithNWL); - var s = new DfaMatchingState(pruned_node, prevCharKind); - if (!(capturing ? _capturingStateCache : _stateCache).TryGetValue(s, out DfaMatchingState? state)) - { - state = MakeNewState(s, capturing, isInitialState); - } - - return state; - } - - private DfaMatchingState MakeNewState(DfaMatchingState state, bool capturing, bool isInitialState) - { - lock (this) - { - HashSet> cache = capturing ? 
_capturingStateCache : _stateCache; - cache.Add(state); // Add to cache first to make 1 the first state ID - state.Id = cache.Count; - - Debug.Assert(_stateArray is not null && _capturingStateArray is not null); - - const int GrowthSize = 1024; - if (capturing) - { - if (state.Id == _capturingStateArray.Length) - { - int newsize = _capturingStateArray.Length + GrowthSize; - Array.Resize(ref _capturingStateArray, newsize); - Array.Resize(ref _capturingDelta, newsize << _mintermsLog); - } - _capturingStateArray[state.Id] = state; - } - else - { - if (state.Id == _stateArray.Length) - { - int newsize = _stateArray.Length + GrowthSize; - Array.Resize(ref _stateArray, newsize); - Array.Resize(ref _delta, newsize << _mintermsLog); - Array.Resize(ref _stateInfo, newsize); - } - _stateArray[state.Id] = state; - SetStateInfo(state.Id, isInitialState, state.IsDeadend, state.Node.IsNullable, state.Node.CanBeNullable); - } - return state; - } - } - - /// - /// Make an NFA state for the given node and previous character kind. - /// - public int CreateNfaState(SymbolicRegexNode node, uint prevCharKind) - { - Debug.Assert(node.Kind != SymbolicRegexNodeKind.Alternate); - - // First make the underlying core state - DfaMatchingState coreState = CreateState(node, prevCharKind); - - if (!_nfaStateArrayInverse.TryGetValue(coreState.Id, out int nfaStateId)) - { - nfaStateId = MakeNewNfaState(coreState.Id); - } - - return nfaStateId; - } - - /// Critical region that creates a new NFA state for the underlying core state - private int MakeNewNfaState(int coreStateId) - { - lock (this) - { - if (NfaStateCount == _nfaStateArray.Length) - { - // TBD: is 1024 reasonable? 
- int newsize = _nfaStateArray.Length + 1024; - Array.Resize(ref _nfaStateArray, newsize); - Array.Resize(ref _nfaDelta, newsize << _mintermsLog); - // TBD: capturing - } - - int nfaStateId = NfaStateCount; - _nfaStateArray[nfaStateId] = coreStateId; - _nfaStateArrayInverse[coreStateId] = nfaStateId; - return nfaStateId; - } - } - - /// Gets the core state Id corresponding to the NFA state - public int GetCoreStateId(int nfaStateId) - { - Debug.Assert(_stateArray is not null); - Debug.Assert(nfaStateId < _nfaStateArray.Length); - Debug.Assert(_nfaStateArray[nfaStateId] < _stateArray.Length); - return _nfaStateArray[nfaStateId]; - } - - /// Gets the core state corresponding to the NFA state - public DfaMatchingState GetCoreState(int nfaStateId) - { - Debug.Assert(_stateArray is not null); - return _stateArray[GetCoreStateId(nfaStateId)]; - } - - /// Critical region for defining a new core transition - public DfaMatchingState CreateNewTransition(DfaMatchingState sourceState, int mintermId, int offset) - { - TryCreateNewTransition(sourceState, mintermId, offset, checkThreshold: false, out DfaMatchingState? nextState); - Debug.Assert(nextState is not null); - return nextState; - } - - /// Gets or creates a new DFA transition. - public bool TryCreateNewTransition( - DfaMatchingState sourceState, int mintermId, int offset, bool checkThreshold, [NotNullWhen(true)] out DfaMatchingState? nextState) - { - Debug.Assert(_delta is not null && _stateArray is not null); - lock (this) - { - Debug.Assert(offset < _delta.Length); - - // check if meanwhile delta[offset] has become defined possibly by another thread - DfaMatchingState? 
targetState = _stateArray[_delta[offset]]; - if (targetState is null) - { - if (checkThreshold && _stateCache.Count >= SymbolicRegexThresholds.NfaThreshold) - { - nextState = null; - return false; - } - - targetState = sourceState.Next(GetMinterm(mintermId)); - Volatile.Write(ref _delta[offset], targetState.Id); - } - - nextState = targetState; - return true; - } - } - - /// Gets or creates a new NFA transition. - public int[] CreateNewNfaTransition(int nfaStateId, int mintermId, int nfaOffset) - { - Debug.Assert(_delta is not null && _stateArray is not null); - lock (this) - { - Debug.Assert(nfaOffset < _nfaDelta.Length); - - // check if meanwhile the nfaoffset has become defined possibly by another thread - int[]? targets = _nfaDelta[nfaOffset]; - if (targets is null) - { - // Create the underlying transition from the core state corresponding to the nfa state - DfaMatchingState coreState = GetCoreState(nfaStateId); - int coreOffset = (coreState.Id << _mintermsLog) | mintermId; - int coreTargetId = _delta[coreOffset]; - DfaMatchingState? coreTarget = coreTargetId > 0 ? - _stateArray[coreTargetId] : CreateNewTransition(coreState, mintermId, coreOffset); - - SymbolicRegexNode node = coreTarget.Node.Kind == SymbolicRegexNodeKind.DisableBacktrackingSimulation ? - coreTarget.Node._left! : coreTarget.Node; - if (node.Kind == SymbolicRegexNodeKind.Alternate) - { - // Create separate NFA states for all members of a disjunction - // Here duplicate NFA states cannot arise because there are no duplicate nodes in the disjunction - List> alts = node.ToList(listKind: SymbolicRegexNodeKind.Alternate); - targets = new int[alts.Count]; - int targetIndex = 0; - foreach (SymbolicRegexNode q in alts) - { - Debug.Assert(!q.IsNothing); - // Re-wrap the element nodes in DisableBacktrackingSimulation if the top level node was too - SymbolicRegexNode targetNode = coreTarget.Node.Kind == SymbolicRegexNodeKind.DisableBacktrackingSimulation ? 
- CreateDisableBacktrackingSimulation(q) : q; - targets[targetIndex++] = CreateNfaState(targetNode, coreTarget.PrevCharKind); - } - Debug.Assert(targetIndex == targets.Length); - } - else if (coreTarget.IsDeadend) - { - // Omit deadend states from the target list of states - // target list being empty means that the NFA state itself is a deadend - targets = Array.Empty(); - } - else - { - // Add the single NFA target state correponding to the core target state - if (!_nfaStateArrayInverse.TryGetValue(coreTarget.Id, out int nfaTargetId)) - { - nfaTargetId = MakeNewNfaState(coreTarget.Id); - } - - targets = new[] { nfaTargetId }; - } - - Volatile.Write(ref _nfaDelta[nfaOffset], targets); - } - - return targets; - } - } } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs index cd333942b491a4d3259c57c7008a1e259b1bd2cb..ff95195292bfa476c695058df51074017f8f5255 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs @@ -1,6 +1,8 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using System.Diagnostics; + namespace System.Text.RegularExpressions.Symbolic { /// Misc information of structural properties of a that is computed bottom up. 
@@ -14,54 +16,34 @@ namespace System.Text.RegularExpressions.Symbolic private const uint StartsWithSomeAnchorMask = 32; private const uint IsHighPriorityNullableMask = 64; private const uint ContainsEffectMask = 128; + private const uint ContainsLineAnchorMask = 256; private readonly uint _info; private SymbolicRegexInfo(uint i) => _info = i; - internal static SymbolicRegexInfo Create( + private static SymbolicRegexInfo Create( bool isAlwaysNullable = false, bool canBeNullable = false, - bool startsWithLineAnchor = false, bool startsWithSomeAnchor = false, bool containsSomeAnchor = false, + bool startsWithLineAnchor = false, bool containsLineAnchor = false, + bool startsWithSomeAnchor = false, bool containsSomeAnchor = false, bool isHighPriorityNullable = false, bool containsEffect = false) { - uint i = 0; - - if (canBeNullable || isAlwaysNullable) - { - i |= CanBeNullableMask; - - if (isAlwaysNullable) - { - i |= IsAlwaysNullableMask; - } - } - - if (containsSomeAnchor || startsWithLineAnchor || startsWithSomeAnchor) - { - i |= ContainsSomeAnchorMask; - - if (startsWithLineAnchor) - { - i |= StartsWithLineAnchorMask; - } - - if (startsWithLineAnchor || startsWithSomeAnchor) - { - i |= StartsWithSomeAnchorMask; - } - } - - if (isHighPriorityNullable) - { - i |= IsHighPriorityNullableMask; - } - - if (containsEffect) - { - i |= ContainsEffectMask; - } - - return new SymbolicRegexInfo(i); + // Assert that the expected implications hold. For example, every node that contains a line anchor + // must also be marked as containing some anchor. + Debug.Assert(!isAlwaysNullable || canBeNullable); + Debug.Assert(!startsWithLineAnchor || containsLineAnchor); + Debug.Assert(!startsWithLineAnchor || startsWithSomeAnchor); + Debug.Assert(!containsLineAnchor || containsSomeAnchor); + Debug.Assert(!startsWithSomeAnchor || containsSomeAnchor); + return new SymbolicRegexInfo( + (isAlwaysNullable ? IsAlwaysNullableMask : 0) | + (canBeNullable ? 
CanBeNullableMask : 0) | + (startsWithLineAnchor ? StartsWithLineAnchorMask : 0) | + (containsLineAnchor ? ContainsLineAnchorMask : 0) | + (startsWithSomeAnchor ? StartsWithSomeAnchorMask : 0) | + (containsSomeAnchor ? ContainsSomeAnchorMask : 0) | + (isHighPriorityNullable ? IsHighPriorityNullableMask : 0) | + (containsEffect ? ContainsEffectMask : 0)); } public bool IsNullable => (_info & IsAlwaysNullableMask) != 0; @@ -70,6 +52,8 @@ namespace System.Text.RegularExpressions.Symbolic public bool StartsWithLineAnchor => (_info & StartsWithLineAnchorMask) != 0; + public bool ContainsLineAnchor => (_info & ContainsLineAnchorMask) != 0; + public bool StartsWithSomeAnchor => (_info & StartsWithSomeAnchorMask) != 0; public bool ContainsSomeAnchor => (_info & ContainsSomeAnchorMask) != 0; @@ -80,6 +64,27 @@ namespace System.Text.RegularExpressions.Symbolic public bool ContainsEffect => (_info & ContainsEffectMask) != 0; + /// + /// Used for any node that acts as an epsilon, i.e., something that always matches the empty string. + /// + public static SymbolicRegexInfo Epsilon() => + Create( + isAlwaysNullable: true, + canBeNullable: true, + isHighPriorityNullable: true); + + /// + /// Used for all anchors. + /// + /// whether this anchor is a line anchor + public static SymbolicRegexInfo Anchor(bool isLineAnchor) => + Create( + canBeNullable: true, + startsWithLineAnchor: isLineAnchor, + containsLineAnchor: isLineAnchor, + startsWithSomeAnchor: true, + containsSomeAnchor: true); + /// /// The alternation remains high priority nullable if the left alternative is so. 
/// All other info properties are the logical disjunction of the resepctive info properties @@ -90,6 +95,7 @@ namespace System.Text.RegularExpressions.Symbolic isAlwaysNullable: left_info.IsNullable || right_info.IsNullable, canBeNullable: left_info.CanBeNullable || right_info.CanBeNullable, startsWithLineAnchor: left_info.StartsWithLineAnchor || right_info.StartsWithLineAnchor, + containsLineAnchor: left_info.ContainsLineAnchor || right_info.ContainsLineAnchor, startsWithSomeAnchor: left_info.StartsWithSomeAnchor || right_info.StartsWithSomeAnchor, containsSomeAnchor: left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor, isHighPriorityNullable: left_info.IsHighPriorityNullable, @@ -105,6 +111,7 @@ namespace System.Text.RegularExpressions.Symbolic isAlwaysNullable: left_info.IsNullable && right_info.IsNullable, canBeNullable: left_info.CanBeNullable && right_info.CanBeNullable, startsWithLineAnchor: left_info.StartsWithLineAnchor || (left_info.CanBeNullable && right_info.StartsWithLineAnchor), + containsLineAnchor: left_info.ContainsLineAnchor || right_info.ContainsLineAnchor, startsWithSomeAnchor: left_info.StartsWithSomeAnchor || (left_info.CanBeNullable && right_info.StartsWithSomeAnchor), containsSomeAnchor: left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor, isHighPriorityNullable: left_info.IsHighPriorityNullable && right_info.IsHighPriorityNullable, diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexKind.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexKind.cs index ea1f68075000b64c44dd496b705560d3b269be82..bc01b913f7ce63045fed4ff9f2fb6316fcf0c56d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexKind.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexKind.cs @@ -48,7 +48,7 @@ internal enum 
SymbolicRegexNodeKind /// Effects to be applied when taking a transition. /// /// Left child is the pattern itself and the right child is a concatenation of nodes whose effects should be applied. - /// Effect nodes are created in the rule for concatenation in , + /// Effect nodes are created in the rule for concatenation in , /// where they are used to represent additional operations that should be performed in the current position if /// the pattern in the left child is used to match the input. Since these Effect nodes are relative to the current /// position in the input, the effects from the right child must be applied in the transition that the derivative is diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs new file mode 100644 index 0000000000000000000000000000000000000000..9912da4da8ef39c801cf288710b839072ccc65a0 --- /dev/null +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -0,0 +1,441 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Threading; + +namespace System.Text.RegularExpressions.Symbolic +{ + internal sealed partial class SymbolicRegexMatcher + { + /// + /// Initial capacity for DFA related arrays. + /// + private const int InitialDfaStateCapacity = 1024; + + /// + /// Minimum capacity for NFA related arrays when the matcher first enters NFA mode. The arrays start out empty, + /// but are resized to this capacity upon first use. 
+ /// + private const int InitialNfaStateCapacity = 64; + + /// + /// Cache for the states that have been created. Each state is uniquely identified by its associated + /// and the kind of the previous character. + /// + private readonly Dictionary<(SymbolicRegexNode Node, uint PrevCharKind), MatchingState> _stateCache = new(); + + /// + /// Maps state ids to states, initial capacity is given by . + /// Each time more states are needed the length is doubled. + /// The first valid state is at index 1. + /// + private MatchingState?[] _stateArray; + + /// + /// Maps state IDs to context-independent information for all states in . + /// The first valid entry is at index 1. + /// + private ContextIndependentState[] _stateInfo; + + /// Context-independent information available for every state. + [Flags] + private enum ContextIndependentState : byte + { + IsInitial = 1, + IsDeadend = 2, + IsNullable = 4, + CanBeNullable = 8, + } + + /// + /// The transition function for DFA mode. + /// Each state has a range of consecutive entries for each minterm ID. A range of size 2^L, where L is + /// the number of bits required to represent the largest minterm ID , is reserved + /// for each state. This makes indexing into this array not require a multiplication + /// , but does mean some unused space may be present. + /// The first valid state ID is 1. + /// + /// + /// For these "delta" arrays, technically Volatile.Read should be used to read out an element, + /// but in practice that's not needed on the runtimes in use (though that needs to be documented + /// via https://github.com/dotnet/runtime/issues/63474), and use of Volatile.Read is + /// contributing non-trivial overhead (https://github.com/dotnet/runtime/issues/65789). + /// + private int[] _dfaDelta; + + /// + /// Maps each NFA state id to the state id of the MatchingState stored in _stateArray. 
+ /// This map is used to compactly represent NFA state ids in NFA mode in order to utilize + /// the property that all NFA states are small integers in one interval. + /// The valid entries are 0 to the size of - 1. + /// + private int[] _nfaCoreIdArray = Array.Empty(); + + /// + /// Maps the id of a MatchingState to the NFA state id that it is being identified with in the NFA. + /// It is the inverse of used entries in _nfaCoreIdArray. + /// The range of this map is 0 to its size - 1. + /// + private readonly Dictionary _nfaIdByCoreId = new(); + + /// + /// Transition function for NFA transitions in NFA mode. + /// Each NFA entry maps to a list of NFA target states. + /// Each list of target states is without repetitions. + /// If the entry is null then the target states have not been computed yet. + /// + private int[]?[] _nfaDelta = Array.Empty(); + + /// + /// The transition function for , + /// which is an NFA mode with additional state to track capture start and end positions. + /// Each entry is an array of pairs of target state and effects to be applied when taking the transition. + /// If the entry is null then the transition has not been computed yet. + /// + private (int, DerivativeEffect[])[]?[] _capturingNfaDelta = Array.Empty<(int, DerivativeEffect[])[]?>(); + + /// + /// Implements a version of Array.Resize that is guaranteed to not publish an array before values + /// have been copied over. + /// + /// + /// This may not be strictly necessary for arrays of primitive or reference types (which have atomic + /// reads/writes), as when, e.g., is found to not have an entry the array is checked again + /// after a lock on the matcher has been acquired. However, in a highly threaded use case it still seems better + /// to avoid unnecessarily causing other threads to acquire the lock. 
+ /// + private static void ArrayResizeAndVolatilePublish(ref T[] array, int newSize) + { + Debug.Assert(newSize >= array.Length); + T[] newArray = new T[newSize]; + Array.Copy(array, newArray, array.Length); + Volatile.Write(ref array, newArray); + } + + private int DeltaOffset(int stateId, int mintermId) => (stateId << _mintermsLog) | mintermId; + + /// Returns the span from that may contain transitions for the given state + private Span GetDeltasFor(MatchingState state) + { + Debug.Assert(Monitor.IsEntered(this)); + + int numMinterms = _minterms.Length; + if (state.StartsWithLineAnchor) + { + numMinterms++; + } + + return _dfaDelta.AsSpan(state.Id << _mintermsLog, numMinterms); + } + + /// Returns the span from that may contain transitions for the given state + private Span GetNfaDeltasFor(MatchingState state) + { + Debug.Assert(Monitor.IsEntered(this)); + + if (!_nfaIdByCoreId.TryGetValue(state.Id, out int nfaState)) + { + return default; + } + + int numMinterms = _minterms.Length; + if (state.StartsWithLineAnchor) + { + numMinterms++; + } + + return _nfaDelta.AsSpan(nfaState << _mintermsLog, numMinterms); + } + + /// Get context-independent information for the given state. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(int stateId) + { + Debug.Assert(stateId > 0); + + ContextIndependentState info = _stateInfo[stateId]; + return ((info & ContextIndependentState.IsInitial) != 0, + (info & ContextIndependentState.IsDeadend) != 0, + (info & ContextIndependentState.IsNullable) != 0, + (info & ContextIndependentState.CanBeNullable) != 0); + } + + /// + /// Create a state with given node and previous character context. 
+ /// + /// the pattern that this state will represent + /// the kind of the character that led to this state + /// + private MatchingState GetOrCreateState(SymbolicRegexNode node, uint prevCharKind) + { + Debug.Assert(Monitor.IsEntered(this)); + return GetOrCreateState_NoLock(node, prevCharKind); + } + + /// + /// Create a state with given node and previous character context. + /// + /// the pattern that this state will represent + /// the kind of the character that led to this state + /// whether to mark the state as an initial state or not + /// + private MatchingState GetOrCreateState_NoLock(SymbolicRegexNode node, uint prevCharKind, bool isInitialState = false) + { + SymbolicRegexNode prunedNode = node.PruneAnchors(_builder, prevCharKind); + (SymbolicRegexNode Node, uint PrevCharKind) key = (prunedNode, prevCharKind); + if (!_stateCache.TryGetValue(key, out MatchingState? state)) + { + state = new MatchingState(key.Node, key.PrevCharKind); + _stateCache.Add(key, state); // Add to cache first to make 1 the first state ID + state.Id = _stateCache.Count; + + Debug.Assert(_stateArray is not null); + + if (state.Id == _stateArray.Length) + { + // The growth factor 2 matches that of List + int newsize = _stateArray.Length * 2; + ArrayResizeAndVolatilePublish(ref _stateArray, newsize); + ArrayResizeAndVolatilePublish(ref _dfaDelta, newsize << _mintermsLog); + ArrayResizeAndVolatilePublish(ref _stateInfo, newsize); + } + _stateArray[state.Id] = state; + _stateInfo[state.Id] = BuildStateInfo(state.Id, isInitialState, state.IsDeadend(Solver), state.Node.IsNullable, state.Node.CanBeNullable); + } + + return state; + + // Assign the context-independent information for the given state + static ContextIndependentState BuildStateInfo(int stateId, bool isInitial, bool isDeadend, bool isNullable, bool canBeNullable) + { + Debug.Assert(stateId > 0); + Debug.Assert(!isNullable || canBeNullable); + + ContextIndependentState info = 0; + + if (isInitial) + { + info |= 
ContextIndependentState.IsInitial; + } + + if (isDeadend) + { + info |= ContextIndependentState.IsDeadend; + } + + if (canBeNullable) + { + info |= ContextIndependentState.CanBeNullable; + if (isNullable) + { + info |= ContextIndependentState.IsNullable; + } + } + + return info; + } + } + + /// + /// Make an NFA state for the given node and previous character kind. NFA states include a "core state" of a + /// allocated with , + /// which stores the pattern and previous character kind and can be used for creating further NFA transitions. + /// In addition to the ID of the core state, NFA states are allocated a new NFA mode specific ID, which is + /// used to index into NFA mode transition arrays (e.g. ). + /// + /// + /// Using an ID numbering for NFA mode that is separate from DFA mode allows the IDs to be smaller, which saves + /// space both in the NFA mode arrays and in the instances used during matching for + /// sets of NFA states. + /// The core state ID can be looked up by the NFA ID with . + /// + /// the NFA ID of the new state, or null if the state is a dead end + private int? CreateNfaState(SymbolicRegexNode node, uint prevCharKind) + { + Debug.Assert(Monitor.IsEntered(this)); + Debug.Assert(node.Kind != SymbolicRegexNodeKind.Alternate); + + // First make the core state for the node, which is used for creating further transitions out of this state + MatchingState coreState = GetOrCreateState(node, prevCharKind); + + // If the state is a dead end then don't create an NFA state, as dead ends in NFA mode are represented + // as empty lists of states. + if (coreState.IsDeadend(Solver)) + { + return null; + } + + // The NFA state itself is an ID that can be mapped back to the ID of the MatchingState. These NFA states are + // allocated separately from the IDs used in DFA mode to avoid large values, which helps save memory in the + // SparseIntMap data structures used in NFA matching modes. 
+ if (!_nfaIdByCoreId.TryGetValue(coreState.Id, out int nfaStateId)) + { + // No NFA state already exists, so make a new one. NFA state IDs are allocated sequentially from zero by + // giving each new state an ID equal to the number of existing NFA states. + nfaStateId = _nfaIdByCoreId.Count; + + // If the next ID is past the end of the NFA state array, increase the sizes of the NFA arrays + if (nfaStateId == _nfaCoreIdArray.Length) + { + // The growth factor 2 matches that of List + int newsize = Math.Max(_nfaCoreIdArray.Length * 2, InitialNfaStateCapacity); + ArrayResizeAndVolatilePublish(ref _nfaCoreIdArray, newsize); + ArrayResizeAndVolatilePublish(ref _nfaDelta, newsize << _mintermsLog); + ArrayResizeAndVolatilePublish(ref _capturingNfaDelta, newsize << _mintermsLog); + } + + // Store the mapping from NFA state ID to core state ID + Debug.Assert(nfaStateId < _nfaCoreIdArray.Length); + _nfaCoreIdArray[nfaStateId] = coreState.Id; + + // Store the mapping from core state ID to NFA state ID + // Adding an entry here increments the ID that will be given to the next NFA state + _nfaIdByCoreId.Add(coreState.Id, nfaStateId); + } + + return nfaStateId; + } + + /// Gets the corresponding to the given state ID. + private MatchingState GetState(int stateId) + { + Debug.Assert(stateId > 0); + MatchingState? state = _stateArray[stateId]; + Debug.Assert(state is not null); + return state; + } + + /// Gets the core state Id corresponding to the NFA state + private int GetCoreStateId(int nfaStateId) + { + Debug.Assert(nfaStateId < _nfaCoreIdArray.Length); + Debug.Assert(_nfaCoreIdArray[nfaStateId] < _stateArray.Length); + return _nfaCoreIdArray[nfaStateId]; + } + + /// Gets or creates a new DFA transition. + /// This function locks the matcher for safe concurrent use of the + private bool TryCreateNewTransition( + MatchingState sourceState, int mintermId, int offset, bool checkThreshold, [NotNullWhen(true)] out MatchingState? 
nextState) + { + Debug.Assert(offset < _dfaDelta.Length); + + lock (this) + { + // check if meanwhile delta[offset] has become defined possibly by another thread + MatchingState? targetState = _stateArray[_dfaDelta[offset]]; + if (targetState is null) + { + if (checkThreshold && _stateCache.Count >= SymbolicRegexThresholds.NfaThreshold) + { + nextState = null; + return false; + } + + TSet minterm = GetMintermFromId(mintermId); + uint nextCharKind = GetPositionKind(mintermId); + targetState = GetOrCreateState(sourceState.Next(_builder, minterm, nextCharKind), nextCharKind); + Volatile.Write(ref _dfaDelta[offset], targetState.Id); + } + + nextState = targetState; + return true; + } + } + + /// Gets or creates a new NFA transition. + /// This function locks the matcher for safe concurrent use of the + private int[] CreateNewNfaTransition(int nfaStateId, int mintermId, int nfaOffset) + { + Debug.Assert(nfaOffset < _nfaDelta.Length); + + lock (this) + { + // check if meanwhile the nfaoffset has become defined possibly by another thread + int[]? targets = _nfaDelta[nfaOffset]; + if (targets is null) + { + // Create the underlying transition from the core state corresponding to the nfa state + int coreId = GetCoreStateId(nfaStateId); + int coreOffset = (coreId << _mintermsLog) | mintermId; + int coreTargetId = _dfaDelta[coreOffset]; + MatchingState coreState = GetState(coreId); + TSet minterm = GetMintermFromId(mintermId); + uint nextCharKind = GetPositionKind(mintermId); + SymbolicRegexNode? targetNode = coreTargetId > 0 ? + GetState(coreTargetId).Node : coreState.Next(_builder, minterm, nextCharKind); + + List targetsList = new(); + ForEachNfaState(targetNode, nextCharKind, targetsList, static (int nfaId, List targetsList) => + targetsList.Add(nfaId)); + + targets = targetsList.ToArray(); + Volatile.Write(ref _nfaDelta[nfaOffset], targets); + } + + return targets; + } + } + + /// Gets or creates a new capturing NFA transition. 
+ /// This function locks the matcher for safe concurrent use of the + private (int, DerivativeEffect[])[] CreateNewCapturingTransition(int nfaStateId, int mintermId, int offset) + { + lock (this) + { + // Get the next state if it exists. The caller should have already tried and found it null (not yet created), + // but in the interim another thread could have created it. + (int, DerivativeEffect[])[]? targets = _capturingNfaDelta[offset]; + if (targets is null) + { + MatchingState coreState = GetState(GetCoreStateId(nfaStateId)); + TSet minterm = GetMintermFromId(mintermId); + uint nextCharKind = GetPositionKind(mintermId); + List<(SymbolicRegexNode Node, DerivativeEffect[] Effects)>? transition = coreState.NfaNextWithEffects(_builder, minterm, nextCharKind); + // Build the new state and store it into the array. + List<(int, DerivativeEffect[])> targetsList = new(); + foreach ((SymbolicRegexNode Node, DerivativeEffect[] Effects) entry in transition) + { + ForEachNfaState(entry.Node, nextCharKind, (targetsList, entry.Effects), + static (int nfaId, (List<(int, DerivativeEffect[])> Targets, DerivativeEffect[] Effects) args) => + args.Targets.Add((nfaId, args.Effects))); + } + targets = targetsList.ToArray(); + Volatile.Write(ref _capturingNfaDelta[offset], targets); + } + + return targets; + } + } + + /// + /// Iterates through the alternation branches + /// and tries to create NFA states for each. The supplied action is called for each created NFA state. These never + /// include dead ends as will filter those out. 
+ /// + /// This function locks the matcher for safe concurrent use of the + /// the type of the additional argument passed through to the action + /// the node to break up into NFA states + /// the previous character kind for each created NFA state + /// an additional argument passed through to each call to the action + /// action to call for each NFA state + private void ForEachNfaState(SymbolicRegexNode node, uint prevCharKind, T arg, Action action) + { + lock (this) + { + foreach (SymbolicRegexNode nfaNode in node.EnumerateAlternationBranches(_builder)) + { + if (CreateNfaState(nfaNode, prevCharKind) is int nfaId) + { + action(nfaId, arg); + } + } + } + } + } +} diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Dgml.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Dgml.cs index 1225c4748e4c57bd4f6bb3ca0f91550da2389efb..157fd7d332db92d877894096d82dfb9d7cf642cd 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Dgml.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Dgml.cs @@ -16,140 +16,140 @@ internal sealed partial class SymbolicRegexMatcher [ExcludeFromCodeCoverage(Justification = "Currently only used for testing")] public override void SaveDGML(TextWriter writer, int maxLabelLength) { - if (maxLabelLength < 0) - maxLabelLength = int.MaxValue; - - Dictionary<(int Source, int Target), (TSet Rule, List NfaTargets)> transitions = GatherTransitions(_builder); - - writer.WriteLine(""); - writer.WriteLine(""); - writer.WriteLine(" "); - writer.WriteLine(" ", FormatInfo(_builder, transitions.Count)); - writer.WriteLine(" ", FormatInfo(_builder, transitions.Count)); - foreach (DfaMatchingState state in _builder._stateCache) + lock (this) { - string info = 
CharKind.DescribePrev(state.PrevCharKind); - string deriv = WebUtility.HtmlEncode(state.Node.ToString()); - string nodeDgmlView = $"{(info == string.Empty ? info : $"Previous: {info} ")}{(deriv == string.Empty ? "()" : deriv)}"; + if (maxLabelLength < 0) + maxLabelLength = int.MaxValue; + + Dictionary<(int Source, int Target), (TSet Rule, List NfaTargets)> transitions = GatherTransitions(this); - writer.WriteLine(" ", state.Id, nodeDgmlView); - if (_builder.GetStateInfo(state.Id).IsInitial) + writer.WriteLine(""); + writer.WriteLine(""); + writer.WriteLine(" "); + writer.WriteLine(" ", FormatInfo(this, transitions.Count)); + writer.WriteLine(" ", FormatInfo(this, transitions.Count)); + foreach (MatchingState state in _stateCache.Values) { - writer.WriteLine(" "); + string info = CharKind.DescribePrev(state.PrevCharKind); + string deriv = WebUtility.HtmlEncode(state.Node.ToString()); + string nodeDgmlView = $"{(info == string.Empty ? info : $"Previous: {info} ")}{(deriv == string.Empty ? 
"()" : deriv)}"; + + writer.WriteLine(" ", state.Id, nodeDgmlView); + if (GetStateInfo(state.Id).IsInitial) + { + writer.WriteLine(" "); + } + if (state.Node.CanBeNullable) + { + writer.WriteLine(" "); + } + writer.WriteLine(" "); + writer.WriteLine(" ", state.Id, nodeDgmlView); } - if (state.Node.CanBeNullable) + writer.WriteLine(" "); + writer.WriteLine(" "); + foreach (MatchingState initialState in GetInitialStates(this)) { - writer.WriteLine(" "); + writer.WriteLine(" ", initialState.Id); } - writer.WriteLine(" "); - writer.WriteLine(" ", state.Id, nodeDgmlView); - } - writer.WriteLine(" "); - writer.WriteLine(" "); - foreach (DfaMatchingState initialState in GetInitialStates(this)) - { - Debug.Assert(_builder._stateCache.Contains(initialState)); - writer.WriteLine(" ", initialState.Id); - } - writer.WriteLine(" "); + writer.WriteLine(" "); - foreach (KeyValuePair<(int Source, int Target), (TSet Rule, List NfaTargets)> transition in transitions) - { - string label = DescribeLabel(transition.Value.Rule, _builder); - string info = ""; - if (label.Length > maxLabelLength) + foreach (KeyValuePair<(int Source, int Target), (TSet Rule, List NfaTargets)> transition in transitions) { - info = $"FullLabel = \"{label}\" "; - label = string.Concat(label.AsSpan(0, maxLabelLength), ".."); + string label = DescribeLabel(transition.Value.Rule, _builder); + string info = ""; + if (label.Length > maxLabelLength) + { + info = $"FullLabel = \"{label}\" "; + label = string.Concat(label.AsSpan(0, maxLabelLength), ".."); + } + + writer.WriteLine($" "); + // Render NFA transitions as labelless "epsilon" transitions (i.e. ones that don't consume a character) + // from the target of the DFA transition. + foreach (int nfaTarget in transition.Value.NfaTargets) + { + writer.WriteLine($" "); + } } - writer.WriteLine($" "); - // Render NFA transitions as labelless "epsilon" transitions (i.e. ones that don't consume a character) - // from the target of the DFA transition. 
- foreach (int nfaTarget in transition.Value.NfaTargets) + foreach (MatchingState state in _stateCache.Values) { - writer.WriteLine($" "); + writer.WriteLine(" ", state.Id); } - } - foreach (DfaMatchingState state in _builder._stateCache) - { - writer.WriteLine(" ", state.Id); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(""); } - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(""); - // This function gathers all transitions in the given builder and groups them by (source,destination) state ID - static Dictionary<(int Source, int Target), (TSet Rule, List NfaTargets)> GatherTransitions(SymbolicRegexBuilder builder) + static Dictionary<(int Source, int Target), (TSet Rule, List NfaTargets)> GatherTransitions(SymbolicRegexMatcher matcher) { - Debug.Assert(builder._delta is not null); - Debug.Assert(builder._minterms is not null); Dictionary<(int 
Source, int Target), (TSet Rule, List NfaTargets)> result = new(); - foreach (DfaMatchingState source in builder._stateCache) + foreach (MatchingState source in matcher._stateCache.Values) { // Get the span of entries in delta that gives the transitions for the different minterms - Span deltas = builder.GetDeltasFor(source); - Span nfaDeltas = builder.GetNfaDeltasFor(source); - Debug.Assert(deltas.Length == builder._minterms.Length); + Span deltas = matcher.GetDeltasFor(source); + Span nfaDeltas = matcher.GetNfaDeltasFor(source); + Debug.Assert(deltas.Length == matcher._minterms.Length); for (int i = 0; i < deltas.Length; ++i) { // negative entries are transitions not explored yet, so skip them @@ -160,7 +160,7 @@ static Dictionary<(int Source, int Target), (TSet Rule, List NfaTargets)> G (int Source, int Target) key = (source.Id, targetId); if (!result.TryGetValue(key, out (TSet Rule, List NfaTargets) entry)) { - entry = (builder._solver.Empty, new List()); + entry = (matcher.Solver.Empty, new List()); } // If this state has an NFA transition for the same minterm, then associate // those with the transition. 
@@ -168,24 +168,24 @@ static Dictionary<(int Source, int Target), (TSet Rule, List NfaTargets)> G { foreach (int nfaTarget in nfaTargets) { - entry.NfaTargets.Add(builder._nfaStateArray[nfaTarget]); + entry.NfaTargets.Add(matcher._nfaCoreIdArray[nfaTarget]); } } // Expand the rule for this minterm - result[key] = (builder._solver.Or(entry.Rule, builder._minterms[i]), entry.NfaTargets); + result[key] = (matcher.Solver.Or(entry.Rule, matcher._minterms[i]), entry.NfaTargets); } } } return result; } - static string FormatInfo(SymbolicRegexBuilder builder, int transitionCount) + static string FormatInfo(SymbolicRegexMatcher matcher, int transitionCount) { StringBuilder sb = new(); - sb.Append($"States = {builder._stateCache.Count} "); + sb.Append($"States = {matcher._stateCache.Count} "); sb.Append($"Transitions = {transitionCount} "); - sb.Append($"Min Terms ({builder._solver.GetMinterms()!.Length}) = ").AppendJoin(',', - DescribeLabels(builder._solver.GetMinterms()!, builder)); + sb.Append($"Min Terms ({matcher.Solver.GetMinterms()!.Length}) = ").AppendJoin(',', + DescribeLabels(matcher.Solver.GetMinterms()!, matcher._builder)); return sb.ToString(); } @@ -200,13 +200,13 @@ static IEnumerable DescribeLabels(IEnumerable labels, SymbolicRege static string DescribeLabel(TSet label, SymbolicRegexBuilder builder) => WebUtility.HtmlEncode(builder._solver.PrettyPrint(label, builder._charSetSolver)); - static IEnumerable> GetInitialStates(SymbolicRegexMatcher matcher) + static IEnumerable> GetInitialStates(SymbolicRegexMatcher matcher) { - foreach (DfaMatchingState state in matcher._dotstarredInitialStates) + foreach (MatchingState state in matcher._dotstarredInitialStates) yield return state; - foreach (DfaMatchingState state in matcher._initialStates) + foreach (MatchingState state in matcher._initialStates) yield return state; - foreach (DfaMatchingState state in matcher._reverseInitialStates) + foreach (MatchingState state in matcher._reverseInitialStates) yield return 
state; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Explore.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Explore.cs index 6808434ef984197a841d8290a5f86974d00afb6b..09880c1ad448afcc8d4839b73f106d4f6a14926d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Explore.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Explore.cs @@ -16,89 +16,91 @@ internal sealed partial class SymbolicRegexMatcher [ExcludeFromCodeCoverage(Justification = "Currently only used for testing")] public override void Explore(bool includeDotStarred, bool includeReverse, bool includeOriginal, bool exploreDfa, bool exploreNfa) { - Debug.Assert(_builder._minterms is not null); - - // Track seen states to avoid exploring twice - HashSet> seen = new(); - // Use a queue for unexplored states - // This results in a breadth-first exploration - Queue> toExplore = new(); + lock (this) + { + // Track seen states to avoid exploring twice + HashSet> seen = new(); + // Use a queue for unexplored states + // This results in a breadth-first exploration + Queue> toExplore = new(); - // Explore all initial states as requested - if (includeDotStarred) - EnqueueAll(_dotstarredInitialStates, seen, toExplore); - if (includeReverse) - EnqueueAll(_reverseInitialStates, seen, toExplore); - if (includeOriginal) - EnqueueAll(_initialStates, seen, toExplore); + // Explore all initial states as requested + if (includeDotStarred) + EnqueueAll(_dotstarredInitialStates, seen, toExplore); + if (includeReverse) + EnqueueAll(_reverseInitialStates, seen, toExplore); + if (includeOriginal) + EnqueueAll(_initialStates, seen, toExplore); - if (exploreDfa) - { - while (toExplore.Count > 0) + if (exploreDfa) { - // Don't dequeue yet, because a 
transition might fail - DfaMatchingState state = toExplore.Peek(); - // Include the special minterm for the last end-of-line if the state is sensitive to it - int maxMinterm = state.StartsWithLineAnchor ? _builder._minterms.Length : _builder._minterms.Length - 1; - // Explore successor states for each minterm - for (int mintermId = 0; mintermId <= maxMinterm; ++mintermId) + while (toExplore.Count > 0) { - int offset = (state.Id << _builder._mintermsLog) | mintermId; - if (!_builder.TryCreateNewTransition(state, mintermId, offset, true, out DfaMatchingState? nextState)) - goto DfaLimitReached; - EnqueueIfUnseen(nextState, seen, toExplore); + // Don't dequeue yet, because a transition might fail + MatchingState state = toExplore.Peek(); + // Include the special minterm for the last end-of-line if the state is sensitive to it + int maxMinterm = state.StartsWithLineAnchor ? _minterms!.Length : _minterms!.Length - 1; + // Explore successor states for each minterm + for (int mintermId = 0; mintermId <= maxMinterm; ++mintermId) + { + int offset = DeltaOffset(state.Id, mintermId); + if (!TryCreateNewTransition(state, mintermId, offset, true, out MatchingState? 
nextState)) + goto DfaLimitReached; + EnqueueIfUnseen(nextState, seen, toExplore); + } + // Safe to dequeue now that the state has been completely handled + toExplore.Dequeue(); } - // Safe to dequeue now that the state has been completely handled - toExplore.Dequeue(); } - } - DfaLimitReached: - if (exploreNfa && toExplore.Count > 0) - { - // DFA states are broken up into NFA states when they are alternations - DfaMatchingState[] toBreakUp = toExplore.ToArray(); - toExplore.Clear(); - foreach (DfaMatchingState dfaState in toBreakUp) + DfaLimitReached: + if (exploreNfa && toExplore.Count > 0) { - // Remove state from seen so that it can be added back in if necessary - seen.Remove(dfaState); - // Enqueue all elements of a top level alternation or the state itself - foreach (var element in dfaState.Node.EnumerateAlternationBranches()) + // DFA states are broken up into NFA states when they are alternations + MatchingState[] toBreakUp = toExplore.ToArray(); + toExplore.Clear(); + foreach (MatchingState dfaState in toBreakUp) { - int nfaState = _builder.CreateNfaState(element, dfaState.PrevCharKind); - EnqueueIfUnseen(_builder.GetCoreState(nfaState), seen, toExplore); + // Remove state from seen so that it can be added back in if necessary + seen.Remove(dfaState); + // Enqueue all elements of a top level alternation or the state itself + ForEachNfaState(dfaState.Node, dfaState.PrevCharKind, (this, seen, toExplore), + static (int nfaId, (SymbolicRegexMatcher Matcher, HashSet> Seen, Queue> ToExplore) args) => + { + MatchingState? coreState = args.Matcher.GetState(args.Matcher.GetCoreStateId(nfaId)); + EnqueueIfUnseen(coreState, args.Seen, args.ToExplore); + }); } - } - while (toExplore.Count > 0) - { - // NFA transitions can't fail, so its safe to dequeue here - DfaMatchingState state = toExplore.Dequeue(); - // Include the special minterm for the last end-of-line if the state is sensitive to it - int maxMinterm = state.StartsWithLineAnchor ? 
_builder._minterms.Length : _builder._minterms.Length - 1; - // Explore successor states for each minterm - for (int mintermId = 0; mintermId <= maxMinterm; ++mintermId) + while (toExplore.Count > 0) { - int nfaOffset = (_builder._nfaStateArrayInverse[state.Id] << _builder._mintermsLog) | mintermId; - int[] nextNfaStates = _builder.CreateNewNfaTransition(_builder._nfaStateArrayInverse[state.Id], mintermId, nfaOffset); - foreach (int nextNfaState in nextNfaStates) + // NFA transitions can't fail, so its safe to dequeue here + MatchingState state = toExplore.Dequeue(); + // Include the special minterm for the last end-of-line if the state is sensitive to it + int maxMinterm = state.StartsWithLineAnchor ? _minterms.Length : _minterms.Length - 1; + // Explore successor states for each minterm + for (int mintermId = 0; mintermId <= maxMinterm; ++mintermId) { - EnqueueIfUnseen(_builder.GetCoreState(nextNfaState), seen, toExplore); + int nfaOffset = DeltaOffset(_nfaIdByCoreId[state.Id], mintermId); + int[] nextNfaStates = CreateNewNfaTransition(_nfaIdByCoreId[state.Id], mintermId, nfaOffset); + foreach (int nextNfaState in nextNfaStates) + { + EnqueueIfUnseen(GetState(GetCoreStateId(nextNfaState)), seen, toExplore); + } } } } } - static void EnqueueAll(DfaMatchingState[] states, HashSet> seen, Queue> toExplore) + static void EnqueueAll(MatchingState[] states, HashSet> seen, Queue> toExplore) { - foreach (DfaMatchingState state in states) + foreach (MatchingState state in states) { EnqueueIfUnseen(state, seen, toExplore); } } - static void EnqueueIfUnseen(DfaMatchingState state, HashSet> seen, Queue> queue) + static void EnqueueIfUnseen(MatchingState state, HashSet> seen, Queue> queue) { if (seen.Add(state)) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Sample.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Sample.cs index 
e5040d7b121894ac2f662077c7e6aeeebd8cc953..dc62647080b0e9f997186af7fba3495bfc5ca458 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Sample.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Sample.cs @@ -30,133 +30,134 @@ internal sealed partial class SymbolicRegexMatcher [ExcludeFromCodeCoverage(Justification = "Currently only used for testing")] public override IEnumerable SampleMatches(int k, int randomseed) { - // Zero is treated as no seed, instead using a system provided one - Random random = randomseed != 0 ? new Random(randomseed) : new Random(); - - ISolver solver = _builder._solver; - CharSetSolver charSetSolver = _builder._charSetSolver; + lock (this) + { + // Zero is treated as no seed, instead using a system provided one + Random random = randomseed != 0 ? new Random(randomseed) : new Random(); + CharSetSolver charSetSolver = _builder._charSetSolver; - // Create helper BDDs for handling anchors and preferentially generating ASCII inputs - BDD asciiWordCharacters = charSetSolver.Or(new BDD[] { + // Create helper BDDs for handling anchors and preferentially generating ASCII inputs + BDD asciiWordCharacters = charSetSolver.Or(new BDD[] { charSetSolver.CreateBDDFromRange('A', 'Z'), charSetSolver.CreateBDDFromRange('a', 'z'), charSetSolver.CreateBDDFromChar('_'), charSetSolver.CreateBDDFromRange('0', '9')}); - // Visible ASCII range for input character generation - BDD ascii = charSetSolver.CreateBDDFromRange('\x20', '\x7E'); - BDD asciiNonWordCharacters = charSetSolver.And(ascii, charSetSolver.Not(asciiWordCharacters)); - - // Set up two sets of minterms, one with the additional special minterm for the last end-of-line - Debug.Assert(_builder._minterms is not null); - int[] mintermIdsWithoutZ = new int[_builder._minterms.Length]; - int[] mintermIdsWithZ = new int[_builder._minterms.Length + 1]; - for (int i = 0; i < 
_builder._minterms.Length; ++i) - { - mintermIdsWithoutZ[i] = i; - mintermIdsWithZ[i] = i; - } - mintermIdsWithZ[_builder._minterms.Length] = _builder._minterms.Length; - - for (int i = 0; i < k; i++) - { - // Holds the generated input so far - StringBuilder inputSoFar = new(); - StringBuilder? latestCandidate = null; + // Visible ASCII range for input character generation + BDD ascii = charSetSolver.CreateBDDFromRange('\x20', '\x7E'); + BDD asciiNonWordCharacters = charSetSolver.And(ascii, charSetSolver.Not(asciiWordCharacters)); + + // Set up two sets of minterms, one with the additional special minterm for the last end-of-line + Debug.Assert(_minterms is not null); + int[] mintermIdsWithoutZ = new int[_minterms.Length]; + int[] mintermIdsWithZ = new int[_minterms.Length + 1]; + for (int i = 0; i < _minterms.Length; ++i) + { + mintermIdsWithoutZ[i] = i; + mintermIdsWithZ[i] = i; + } + mintermIdsWithZ[_minterms.Length] = _minterms.Length; - // Current set of states reached initially contains just the root - NfaMatchingState states = new(_builder); - // Here one could also consider previous characters for example for \b, \B, and ^ anchors - // and initialize inputSoFar accordingly - states.InitializeFrom(_initialStates[GetCharKind(ReadOnlySpan.Empty, -1)]); - CurrentState statesWrapper = new(states); + for (int i = 0; i < k; i++) + { + // Holds the generated input so far + StringBuilder inputSoFar = new(); + StringBuilder? 
latestCandidate = null; - // Used for end suffixes - List possibleEndings = new(); + // Current set of states reached initially contains just the root + NfaMatchingState states = new(); + // Here one could also consider previous characters for example for \b, \B, and ^ anchors + // and initialize inputSoFar accordingly + states.InitializeFrom(this, _initialStates[GetCharKind(ReadOnlySpan.Empty, -1)]); + CurrentState statesWrapper = new(states); - while (true) - { - Debug.Assert(states.NfaStateSet.Count > 0); + // Used for end suffixes + List possibleEndings = new(); - // Gather the possible endings for satisfying nullability - possibleEndings.Clear(); - if (NfaStateHandler.CanBeNullable(ref statesWrapper)) + while (true) { - // Unconditionally final state or end of the input due to \Z anchor for example - if (NfaStateHandler.IsNullable(ref statesWrapper) || - NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.BeginningEnd)) - { - possibleEndings.Add(""); - } + Debug.Assert(states.NfaStateSet.Count > 0); - // End of line due to end-of-line anchor - if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.Newline)) + // Gather the possible endings for satisfying nullability + possibleEndings.Clear(); + if (SymbolicRegexMatcher.NfaStateHandler.CanBeNullable(this, in statesWrapper)) { - possibleEndings.Add("\n"); + // Unconditionally final state or end of the input due to \Z anchor for example + if (SymbolicRegexMatcher.NfaStateHandler.IsNullable(this, in statesWrapper) || + SymbolicRegexMatcher.NfaStateHandler.IsNullableFor(this, in statesWrapper, CharKind.BeginningEnd)) + { + possibleEndings.Add(""); + } + + // End of line due to end-of-line anchor + if (SymbolicRegexMatcher.NfaStateHandler.IsNullableFor(this, in statesWrapper, CharKind.Newline)) + { + possibleEndings.Add("\n"); + } + + // Related to wordborder due to \b or \B + if (SymbolicRegexMatcher.NfaStateHandler.IsNullableFor(this, in statesWrapper, CharKind.WordLetter)) + { + 
possibleEndings.Add(ChooseChar(random, asciiWordCharacters, ascii, charSetSolver).ToString()); + } + + // Related to wordborder due to \b or \B + if (SymbolicRegexMatcher.NfaStateHandler.IsNullableFor(this, in statesWrapper, CharKind.General)) + { + possibleEndings.Add(ChooseChar(random, asciiNonWordCharacters, ascii, charSetSolver).ToString()); + } } - // Related to wordborder due to \b or \B - if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.WordLetter)) + // If we have a possible ending, then store a candidate input + if (possibleEndings.Count > 0) { - possibleEndings.Add(ChooseChar(random, asciiWordCharacters, ascii, charSetSolver).ToString()); + latestCandidate ??= new(); + latestCandidate.Clear(); + latestCandidate.Append(inputSoFar); + //Choose some suffix that allows some anchor (if any) to be nullable + latestCandidate.Append(Choose(random, possibleEndings)); + + // Choose to stop here based on a coin-toss + if (FlipBiasedCoin(random, SampleMatchesStoppingProbability)) + { + yield return latestCandidate.ToString(); + break; + } } - // Related to wordborder due to \b or \B - if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.General)) + // Shuffle the minterms, including the last end-of-line marker if appropriate + int[] mintermIds = SymbolicRegexMatcher.NfaStateHandler.StartsWithLineAnchor(this, in statesWrapper) ? 
+ Shuffle(random, mintermIdsWithZ) : + Shuffle(random, mintermIdsWithoutZ); + foreach (int mintermId in mintermIds) { - possibleEndings.Add(ChooseChar(random, asciiNonWordCharacters, ascii, charSetSolver).ToString()); + bool success = SymbolicRegexMatcher.NfaStateHandler.TryTakeTransition(this, ref statesWrapper, mintermId); + Debug.Assert(success); + if (states.NfaStateSet.Count > 0) + { + TSet minterm = GetMintermFromId(mintermId); + // Append a random member of the minterm + inputSoFar.Append(ChooseChar(random, ToBDD(minterm, Solver, charSetSolver), ascii, charSetSolver)); + break; + } + else + { + // The transition was a dead end, undo and continue to try another minterm + NfaStateHandler.UndoTransition(ref statesWrapper); + } } - } - // If we have a possible ending, then store a candidate input - if (possibleEndings.Count > 0) - { - latestCandidate ??= new(); - latestCandidate.Clear(); - latestCandidate.Append(inputSoFar); - //Choose some suffix that allows some anchor (if any) to be nullable - latestCandidate.Append(Choose(random, possibleEndings)); - - // Choose to stop here based on a coin-toss - if (FlipBiasedCoin(random, SampleMatchesStoppingProbability)) + // In the case that there are no next states or input has become too large: stop here + if (states.NfaStateSet.Count == 0 || inputSoFar.Length > SampleMatchesMaxInputLength) { - yield return latestCandidate.ToString(); + // Ending up here without an ending is unlikely but possible for example for infeasible patterns + // such as @"no\bway" or due to poor choice of c -- no anchor is enabled -- so this is a deadend. + if (latestCandidate != null) + { + yield return latestCandidate.ToString(); + } break; } } - - // Shuffle the minterms, including the last end-of-line marker if appropriate - int[] mintermIds = NfaStateHandler.StartsWithLineAnchor(_builder, ref statesWrapper) ? 
- Shuffle(random, mintermIdsWithZ) : - Shuffle(random, mintermIdsWithoutZ); - foreach (int mintermId in mintermIds) - { - bool success = NfaStateHandler.TakeTransition(_builder, ref statesWrapper, mintermId); - Debug.Assert(success); - if (states.NfaStateSet.Count > 0) - { - TSet minterm = _builder.GetMinterm(mintermId); - // Append a random member of the minterm - inputSoFar.Append(ChooseChar(random, ToBDD(minterm, solver, charSetSolver), ascii, charSetSolver)); - break; - } - else - { - // The transition was a dead end, undo and continue to try another minterm - NfaStateHandler.UndoTransition(ref statesWrapper); - } - } - - // In the case that there are no next states or input has become too large: stop here - if (states.NfaStateSet.Count == 0 || inputSoFar.Length > SampleMatchesMaxInputLength) - { - // Ending up here without an ending is unlikely but possible for example for infeasible patterns - // such as @"no\bway" or due to poor choice of c -- no anchor is enabled -- so this is a deadend. 
- if (latestCandidate != null) - { - yield return latestCandidate.ToString(); - } - break; - } } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index beec486dce6e544c6f03f1b548a0cd14a2e4170c..b84df67463f6807d8a7289d276eacbde6c678cbe 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -5,6 +5,7 @@ using System.Diagnostics; using System.Globalization; using System.IO; +using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Threading; @@ -84,19 +85,31 @@ internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher /// The initial states for the original pattern, keyed off of the previous character kind. /// If the pattern doesn't contain any anchors, there will only be a single initial state. - private readonly DfaMatchingState[] _initialStates; + private readonly MatchingState[] _initialStates; /// The initial states for the dot-star pattern, keyed off of the previous character kind. /// If the pattern doesn't contain any anchors, there will only be a single initial state. - private readonly DfaMatchingState[] _dotstarredInitialStates; + private readonly MatchingState[] _dotstarredInitialStates; /// The initial states for the reverse pattern, keyed off of the previous character kind. /// If the pattern doesn't contain any anchors, there will only be a single initial state. - private readonly DfaMatchingState[] _reverseInitialStates; + private readonly MatchingState[] _reverseInitialStates; - /// Lookup table to quickly determine the character kind for ASCII characters. 
- /// Non-null iff the pattern contains anchors; otherwise, it's unused. - private readonly uint[]? _asciiCharKinds; + /// Partition of the input space of sets. + private readonly TSet[] _minterms; + + /// + /// Character kinds for all minterms in as well as two special + /// cases: character positions outside the input bounds and an end-of-line as the last input character. + /// + private readonly uint[] _positionKinds; + + /// + /// The smallest k s.t. 2^k >= minterms.Length + 1. The "delta arrays", e.g., allocate 2^k + /// consecutive slots for each state ID to represent the transitions for each minterm. The extra slot at index + /// _minterms.Length is used to represent an \n occurring at the very end of input, for supporting the \Z anchor. + /// + private readonly int _mintermsLog; /// Number of capture groups. private readonly int _capsize; @@ -105,14 +118,10 @@ internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher /// This determines whether the matcher uses the special capturing NFA simulation mode. internal bool HasSubcaptures => _capsize > 1; - /// Get the minterm of . - /// character code - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private TSet GetMinterm(int c) - { - Debug.Assert(_builder._minterms is not null); - return _builder._minterms[_mintermClassifier.GetMintermID(c)]; - } + /// + /// Both solvers supported here, and are thread safe. + /// + private ISolver Solver => _builder._solver; /// Creates a new . /// The number of captures in the regular expression. 
@@ -136,25 +145,46 @@ private TSet GetMinterm(int c) _newLineSet = solver.ConvertFromBDD(bddBuilder._newLineSet, charSetSolver) }; - // Convert the BDD-based AST to TSetType-based AST + // Convert the BDD-based AST to TSet-based AST SymbolicRegexNode rootNode = bddBuilder.Transform(rootBddNode, builder, (builder, bdd) => builder._solver.ConvertFromBDD(bdd, charSetSolver)); - return new SymbolicRegexMatcher(rootNode, captureCount, findOptimizations, matchTimeout); + return new SymbolicRegexMatcher(builder, rootNode, captureCount, findOptimizations, matchTimeout); } /// Constructs matcher for given symbolic regex. - private SymbolicRegexMatcher(SymbolicRegexNode rootNode, int captureCount, RegexFindOptimizations findOptimizations, TimeSpan matchTimeout) + private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNode rootNode, int captureCount, RegexFindOptimizations findOptimizations, TimeSpan matchTimeout) { - Debug.Assert(rootNode._builder._solver is UInt64Solver or BitVectorSolver, $"Unsupported solver: {rootNode._builder._solver}"); + Debug.Assert(builder._solver is UInt64Solver or BitVectorSolver, $"Unsupported solver: {builder._solver}"); _pattern = rootNode; - _builder = rootNode._builder; + _builder = builder; _checkTimeout = Regex.InfiniteMatchTimeout != matchTimeout; _timeout = (int)(matchTimeout.TotalMilliseconds + 0.5); // Round up, so it will be at least 1ms - _mintermClassifier = _builder._solver is UInt64Solver bv64 ? + TSet[]? solverMinterms = builder._solver.GetMinterms(); + Debug.Assert(solverMinterms is not null); + _minterms = solverMinterms; + // BitOperations.Log2 gives the integer floor of the log, so the +1 below either rounds up with non-power-of-two + // minterms or adds an extra bit with power-of-two minterms. The extra slot at index _minterms.Length is used to + // represent an \n occurring at the very end of input, for supporting the \Z anchor. 
+ _mintermsLog = BitOperations.Log2((uint)_minterms.Length) + 1; + _mintermClassifier = builder._solver is UInt64Solver bv64 ? bv64._classifier : - ((BitVectorSolver)(object)_builder._solver)._classifier; + ((BitVectorSolver)(object)builder._solver)._classifier; _capsize = captureCount; + // Initialization for fields in SymbolicRegexMatcher.Automata.cs + _stateArray = new MatchingState[InitialDfaStateCapacity]; + _stateInfo = new ContextIndependentState[InitialDfaStateCapacity]; + _dfaDelta = new int[InitialDfaStateCapacity << _mintermsLog]; + + // Initialize a lookup array for the character kinds of each minterm ID. This includes one "special" minterm + // ID _minterms.Length, which is used to represent a \n at the very end of input, and another ID -1, + // which is used to represent any position outside the bounds of the input. + _positionKinds = new uint[_minterms.Length + 2]; + for (int mintermId = -1; mintermId < _positionKinds.Length - 1; mintermId++) + { + _positionKinds[mintermId + 1] = CalculateMintermIdKind(mintermId); + } + // Store the find optimizations that can be used to jump ahead to the next possible starting location. // If there's a leading beginning anchor, the find optimizations are unnecessary on top of the DFA's // handling for beginning anchors. @@ -168,26 +198,28 @@ private SymbolicRegexMatcher(SymbolicRegexNode rootNode, int captureCount, // character kind 0 is ever going to be used for all initial states. int statesCount = _pattern._info.ContainsSomeAnchor ? CharKind.CharKindCount : 1; + // The loops below and how character kinds are calculated assume that the "general" character kind is zero + Debug.Assert(CharKind.General == 0); + // Create the initial states for the original pattern. 
- var initialStates = new DfaMatchingState[statesCount]; - for (uint i = 0; i < initialStates.Length; i++) + var initialStates = new MatchingState[statesCount]; + for (uint charKind = 0; charKind < initialStates.Length; charKind++) { - initialStates[i] = _builder.CreateState(_pattern, i, capturing: HasSubcaptures); + initialStates[charKind] = GetOrCreateState_NoLock(_pattern, charKind); } _initialStates = initialStates; // Create the dot-star pattern (a concatenation of any* with the original pattern) // and all of its initial states. - _dotStarredPattern = _builder.CreateConcat(_builder._anyStarLazy, _pattern); - var dotstarredInitialStates = new DfaMatchingState[statesCount]; - for (uint i = 0; i < dotstarredInitialStates.Length; i++) + _dotStarredPattern = builder.CreateConcat(builder._anyStarLazy, _pattern); + var dotstarredInitialStates = new MatchingState[statesCount]; + for (uint charKind = 0; charKind < dotstarredInitialStates.Length; charKind++) { // Used to detect if initial state was reentered, // but observe that the behavior from the state may ultimately depend on the previous // input char e.g. possibly causing nullability of \b or \B or of a start-of-line anchor, // in that sense there can be several "versions" (not more than StateCount) of the initial state. - DfaMatchingState state = _builder.CreateState(_dotStarredPattern, i, capturing: false, isInitialState: true); - dotstarredInitialStates[i] = state; + dotstarredInitialStates[charKind] = GetOrCreateState_NoLock(_dotStarredPattern, charKind, isInitialState: true); } _dotstarredInitialStates = dotstarredInitialStates; @@ -195,84 +227,91 @@ private SymbolicRegexMatcher(SymbolicRegexNode rootNode, int captureCount, // initial states. Also disable backtracking simulation to ensure the reverse path from // the final state that was found is followed. Not doing so might cause the earliest // starting point to not be found. 
- _reversePattern = _builder.CreateDisableBacktrackingSimulation(_pattern.Reverse()); - var reverseInitialStates = new DfaMatchingState[statesCount]; - for (uint i = 0; i < reverseInitialStates.Length; i++) + _reversePattern = builder.CreateDisableBacktrackingSimulation(_pattern.Reverse(builder)); + var reverseInitialStates = new MatchingState[statesCount]; + for (uint charKind = 0; charKind < reverseInitialStates.Length; charKind++) { - reverseInitialStates[i] = _builder.CreateState(_reversePattern, i, capturing: false); + reverseInitialStates[charKind] = GetOrCreateState_NoLock(_reversePattern, charKind); } _reverseInitialStates = reverseInitialStates; - // Initialize our fast-lookup for determining the character kind of ASCII characters. - // This is only required when the pattern contains anchors, as otherwise there's only - // ever a single kind used. - if (_pattern._info.ContainsSomeAnchor) + // Maps a minterm ID to a character kind + uint CalculateMintermIdKind(int mintermId) { - var asciiCharKinds = new uint[128]; - for (int i = 0; i < asciiCharKinds.Length; i++) + // Only patterns with anchors use anything except the general kind + if (_pattern._info.ContainsSomeAnchor) { - TSet set; - uint charKind; - - if (i == '\n') + // A minterm ID of -1 represents the positions before the first and after the last character + // in the input. + if (mintermId == -1) { - set = _builder._newLineSet; - charKind = CharKind.Newline; + return CharKind.BeginningEnd; } - else + + // A minterm ID of minterms.Length represents a \n at the very end of input, which is matched + // by the \Z anchor. + if ((uint)mintermId == (uint)_minterms.Length) { - set = _builder._wordLetterForBoundariesSet; - charKind = CharKind.WordLetter; + return CharKind.NewLineS; } - asciiCharKinds[i] = _builder._solver.And(GetMinterm(i), set).Equals(_builder._solver.Empty) ? 
0 : charKind; + TSet minterm = _minterms[mintermId]; + + // Examine the minterm to figure out its character kind + if (_builder._newLineSet.Equals(minterm)) + { + // The minterm is a new line character + return CharKind.Newline; + } + else if (!Solver.IsEmpty(Solver.And(_builder._wordLetterForBoundariesSet, minterm))) + { + Debug.Assert(Solver.IsEmpty(Solver.And(Solver.Not(_builder._wordLetterForBoundariesSet), minterm))); + // The minterm is a subset of word letters as considered by \b and \B + return CharKind.WordLetter; + } } - _asciiCharKinds = asciiCharKinds; + + // All other minterms belong to the general kind + return CharKind.General; } } /// /// Create a PerThreadData with the appropriate parts initialized for this matcher's pattern. /// - internal PerThreadData CreatePerThreadData() => new PerThreadData(_builder, _capsize); + internal PerThreadData CreatePerThreadData() => new PerThreadData(_capsize); - /// Compute the target state for the source state and input[i] character and transition to it. - /// The associated builder. - /// The input text. - /// The index into at which the target character lives. - /// The current state being transitioned from. Upon return it's the new state if the transition succeeded. + /// Look up what is the character kind given a position ID [MethodImpl(MethodImplOptions.AggressiveInlining)] - private bool TryTakeTransition(SymbolicRegexBuilder builder, ReadOnlySpan input, int i, ref CurrentState state) - where TStateHandler : struct, IStateHandler + private uint GetPositionKind(int positionId) => _positionKinds[positionId + 1]; + + /// + /// Lookup the actual minterm based on its ID. Also get its character kind, which is a general categorization of + /// characters used for cheaply deciding the nullability of anchors. 
+ /// + internal TSet GetMintermFromId(int mintermId) { - int c = input[i]; + TSet[] minterms = _minterms; - // Find the minterm, handling the special case for the last \n for states that start with a relevant anchor - int mintermId = c == '\n' && i == input.Length - 1 && TStateHandler.StartsWithLineAnchor(builder, ref state) ? - builder._minterms!.Length : // mintermId = minterms.Length represents an \n at the very end of input - _mintermClassifier.GetMintermID(c); + // A minterm ID of minterms.Length represents a \n at the very end of input, which is matched + // by the \Z anchor. + if ((uint)mintermId >= (uint)minterms.Length) + { + return _builder._newLineSet; + } - return TStateHandler.TakeTransition(builder, ref state, mintermId); + // Otherwise look up the minterm from the array + return minterms[mintermId]; } - private List<(DfaMatchingState, DerivativeEffect[])> CreateNewCapturingTransitions(DfaMatchingState state, TSet minterm, int offset) - { - Debug.Assert(_builder._capturingDelta is not null); - lock (this) - { - // Get the next state if it exists. The caller should have already tried and found it null (not yet created), - // but in the interim another thread could have created it. - List<(DfaMatchingState, DerivativeEffect[])>? p = _builder._capturingDelta[offset]; - if (p is null) - { - // Build the new state and store it into the array. - p = state.NfaNextWithEffects(minterm); - Volatile.Write(ref _builder._capturingDelta[offset], p); - } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private uint GetCharKind(ReadOnlySpan input, int i) + where TInputReader : struct, IInputReader => !_pattern._info.ContainsSomeAnchor ? + CharKind.General : // The previous character kind is irrelevant when anchors are not used. 
+ GetPositionKind(TInputReader.GetPositionId(this, input, i)); - return p; - } - } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsMintermId(int positionId) => positionId >= 0; private void CheckTimeout(long timeoutOccursAt) { @@ -309,12 +348,16 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // the position of the last b: aacaaaabbbc. It additionally records the position of the first a after // the c as the low boundary for the starting position. int matchStartLowBoundary, matchStartLengthMarker; - int matchEnd = (_findOpts is not null, _pattern._info.ContainsSomeAnchor) switch + int matchEnd = (_pattern._info.ContainsLineAnchor, _findOpts is not null, _pattern._info.ContainsSomeAnchor) switch { - (true, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), - (true, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), - (false, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), - (false, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), + (true, true, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), + (true, true, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), + (true, false, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), + (true, false, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), + (false, true, true) => FindEndPosition(input, 
startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), + (false, true, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), + (false, false, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), + (false, false, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), }; // If there wasn't a match, we're done. @@ -345,9 +388,13 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i { Debug.Assert(matchEnd >= startat - 1); matchStart = matchEnd < startat ? - startat : _pattern._info.ContainsSomeAnchor ? - FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData) : - FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData); + startat : (_pattern._info.ContainsLineAnchor, _pattern._info.ContainsSomeAnchor) switch + { + (true, true) => FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData), + (true, false) => FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData), + (false, true) => FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData), + (false, false) => FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData), + }; } // Phase 3: @@ -361,7 +408,9 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i } else { - Registers endRegisters = FindSubcaptures(input, matchStart, matchEnd, perThreadData); + Registers endRegisters = _pattern._info.ContainsLineAnchor ? 
+ FindSubcaptures(input, matchStart, matchEnd, perThreadData) : + FindSubcaptures(input, matchStart, matchEnd, perThreadData); return new SymbolicMatch(matchStart, matchEnd - matchStart, endRegisters.CaptureStarts, endRegisters.CaptureEnds); } } @@ -377,15 +426,15 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i /// /// A one-past-the-end index into input for the preferred match, or first final state position if isMatch is true, or NoMatchExists if no match exists. /// - private int FindEndPosition(ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, out int initialStatePos, out int matchLength, PerThreadData perThreadData) + private int FindEndPosition(ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, out int initialStatePos, out int matchLength, PerThreadData perThreadData) + where TInputReader : struct, IInputReader where TFindOptimizationsHandler : struct, IInitialStateHandler where TNullabilityHandler : struct, INullabilityHandler { initialStatePos = pos; int initialStatePosCandidate = pos; - var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, pos - 1)]); - SymbolicRegexBuilder builder = _builder; + var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, pos - 1)]); int endPos = NoMatchExists; int endStateId = -1; @@ -404,8 +453,8 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i input; bool done = currentState.NfaState is not null ? 
- FindEndPositionDeltas(builder, input, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) : - FindEndPositionDeltas(builder, input, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate); + FindEndPositionDeltas(input, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) : + FindEndPositionDeltas(input, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate); // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or // there is no more input available, then the whole search is done. @@ -421,10 +470,8 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i { // Because there was still more input available, a failure to transition in DFA mode must be the cause // of the early exit. Upgrade to NFA mode. - DfaMatchingState? dfaState = currentState.DfaState(_builder); - Debug.Assert(dfaState is not null); NfaMatchingState nfaState = perThreadData.NfaState; - nfaState.InitializeFrom(dfaState); + nfaState.InitializeFrom(this, GetState(currentState.DfaStateId)); currentState = new CurrentState(nfaState); } @@ -437,7 +484,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // Check whether there's a fixed-length marker for the current state. If there is, we can // use that length to optimize subsequent matching phases. - matchLength = endStateId > 0 ? _builder._stateArray![endStateId].FixedLength(GetCharKind(input, endPos)) : -1; + matchLength = endStateId > 0 ? 
GetState(endStateId).FixedLength(GetCharKind(input, endPos)) : -1; return endPos; } @@ -448,8 +495,8 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i /// /// /// The supplies the actual transitioning logic, controlling whether processing is - /// performed in DFA mode or in NFA mode. However, it expects to be configured to match, - /// so for example if is a , it expects the 's + /// performed in DFA mode or in NFA mode. However, it expects to be configured to match, + /// so for example if is a , it expects the 's /// to be non-negative and its to be null; vice versa for /// . /// @@ -458,15 +505,15 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i /// 0 if iteration completed because we reached an initial state. /// A negative value if iteration completed because we ran out of input or we failed to transition. /// - private bool FindEndPositionDeltas(SymbolicRegexBuilder builder, ReadOnlySpan input, RegexRunnerMode mode, - ref int posRef, ref CurrentState stateRef, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) + private bool FindEndPositionDeltas(ReadOnlySpan input, RegexRunnerMode mode, + ref int posRef, ref CurrentState state, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) where TStateHandler : struct, IStateHandler + where TInputReader : struct, IInputReader where TFindOptimizationsHandler : struct, IInitialStateHandler where TNullabilityHandler : struct, INullabilityHandler { // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning. 
int pos = posRef; - CurrentState state = stateRef; int endPos = endPosRef; int endStateId = endStateIdRef; int initialStatePos = initialStatePosRef; @@ -476,13 +523,13 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // Loop through each character in the input, transitioning from state to state for each. while (true) { - (bool isInitial, bool isDeadend, bool isNullable, bool canBeNullable) = TStateHandler.GetStateInfo(builder, ref state); + (bool isInitial, bool isDeadend, bool isNullable, bool canBeNullable) = TStateHandler.GetStateInfo(this, in state); // Check if currentState represents an initial state. If it does, call into any possible find optimizations // to hopefully more quickly find the next possible starting location. if (isInitial) { - if (!TFindOptimizationsHandler.TryFindNextStartingPosition(this, input, ref state, ref pos)) + if (!TFindOptimizationsHandler.TryFindNextStartingPosition(this, input, ref state, ref pos)) { return true; } @@ -496,12 +543,14 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i return true; } + int positionId = TInputReader.GetPositionId(this, input, pos); + // If the state is nullable for the next character, meaning it accepts the empty string, // we found a potential end state. - if (TNullabilityHandler.IsNullableAt(this, ref state, input, pos, isNullable, canBeNullable)) + if (TNullabilityHandler.IsNullableAt(this, in state, positionId, isNullable, canBeNullable)) { endPos = pos; - endStateId = TStateHandler.ExtractNullableCoreStateId(this, ref state, input, pos); + endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos); initialStatePos = initialStatePosCandidate; // A match is known to exist. If that's all we need to know, we're done. @@ -512,7 +561,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i } // If there is more input available try to transition with the next character. 
- if ((uint)pos >= (uint)input.Length || !TryTakeTransition(builder, input, pos, ref state)) + if (!IsMintermId(positionId) || !TStateHandler.TryTakeTransition(this, ref state, positionId)) { return false; } @@ -525,7 +574,6 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i { // Write back the local copies of the ref values. posRef = pos; - stateRef = state; endPosRef = endPos; endStateIdRef = endStateId; initialStatePosRef = initialStatePos; @@ -546,7 +594,8 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i /// The initial starting location discovered in phase 1, a point we must not walk earlier than. /// Per thread data reused between calls. /// The found starting position for the match. - private int FindStartPosition(ReadOnlySpan input, int i, int matchStartBoundary, PerThreadData perThreadData) + private int FindStartPosition(ReadOnlySpan input, int i, int matchStartBoundary, PerThreadData perThreadData) + where TInputReader : struct, IInputReader where TNullabilityHandler : struct, INullabilityHandler { Debug.Assert(i >= 0, $"{nameof(i)} == {i}"); @@ -555,18 +604,17 @@ private int FindStartPosition(ReadOnlySpan input, int // Get the starting state for the reverse pattern. This depends on previous character (which, because we're // going backwards, is character number i). - var currentState = new CurrentState(_reverseInitialStates[GetCharKind(input, i)]); + var currentState = new CurrentState(_reverseInitialStates[GetCharKind(input, i)]); int lastStart = -1; // invalid sentinel value // Walk backwards to the furthest accepting state of the reverse pattern but no earlier than matchStartBoundary. - SymbolicRegexBuilder builder = _builder; while (true) { // Run the DFA or NFA traversal backwards from the current point using the current state. bool done = currentState.NfaState is not null ? 
- FindStartPositionDeltas(builder, input, ref i, matchStartBoundary, ref currentState, ref lastStart) : - FindStartPositionDeltas(builder, input, ref i, matchStartBoundary, ref currentState, ref lastStart); + FindStartPositionDeltas(input, ref i, matchStartBoundary, ref currentState, ref lastStart) : + FindStartPositionDeltas(input, ref i, matchStartBoundary, ref currentState, ref lastStart); // If we found the starting position, we're done. if (done) @@ -578,10 +626,8 @@ private int FindStartPosition(ReadOnlySpan input, int // if we were unable to transition, which should only happen if we were in DFA mode and exceeded our graph size. // Upgrade to NFA mode and continue. Debug.Assert(i >= matchStartBoundary); - DfaMatchingState? dfaState = currentState.DfaState(_builder); - Debug.Assert(dfaState is not null); NfaMatchingState nfaState = perThreadData.NfaState; - nfaState.InitializeFrom(dfaState); + nfaState.InitializeFrom(this, GetState(currentState.DfaStateId)); currentState = new CurrentState(nfaState); } @@ -594,23 +640,25 @@ private int FindStartPosition(ReadOnlySpan input, int /// starting at , for each character transitioning from one state in the DFA or NFA graph to the next state, /// lazily building out the graph as needed. /// - private bool FindStartPositionDeltas(SymbolicRegexBuilder builder, ReadOnlySpan input, ref int i, int startThreshold, ref CurrentState currentState, ref int lastStart) + private bool FindStartPositionDeltas(ReadOnlySpan input, ref int i, int startThreshold, ref CurrentState state, ref int lastStart) where TStateHandler : struct, IStateHandler + where TInputReader : struct, IInputReader where TNullabilityHandler : struct, INullabilityHandler { // To avoid frequent reads/writes to ref values, make and operate on local copies, which we then copy back once before returning. int pos = i; - CurrentState state = currentState; try { // Loop backwards through each character in the input, transitioning from state to state for each. 
while (true) { - (bool isInitial, bool isDeadend, bool isNullable, bool canBeNullable) = TStateHandler.GetStateInfo(builder, ref state); + (bool isInitial, bool isDeadend, bool isNullable, bool canBeNullable) = TStateHandler.GetStateInfo(this, in state); + + int positionId = TInputReader.GetPositionId(this, input, pos - 1); // If the state accepts the empty string, we found a valid starting position. Record it and keep going, // since we're looking for the earliest one to occur within bounds. - if (TNullabilityHandler.IsNullableAt(this, ref state, input, pos - 1, isNullable, canBeNullable)) + if (TNullabilityHandler.IsNullableAt(this, in state, positionId, isNullable, canBeNullable)) { lastStart = pos; } @@ -624,7 +672,7 @@ private int FindStartPosition(ReadOnlySpan input, int } // Try to transition with the next character, the one before the current position. - if (!TryTakeTransition(builder, input, pos - 1, ref state)) + if (!TStateHandler.TryTakeTransition(this, ref state, positionId)) { // Return false to indicate the search didn't finish. return false; @@ -637,7 +685,6 @@ private int FindStartPosition(ReadOnlySpan input, int finally { // Write back the local copies of the ref values. - currentState = state; i = pos; } } @@ -649,10 +696,11 @@ private int FindStartPosition(ReadOnlySpan input, int /// exclusive end position /// Per thread data reused between calls. /// the final register values, which indicate capture starts and ends - private Registers FindSubcaptures(ReadOnlySpan input, int i, int iEnd, PerThreadData perThreadData) + private Registers FindSubcaptures(ReadOnlySpan input, int i, int iEnd, PerThreadData perThreadData) + where TInputReader : struct, IInputReader { // Pick the correct start state based on previous character kind. 
- DfaMatchingState initialState = _initialStates[GetCharKind(input, i - 1)]; + MatchingState initialState = _initialStates[GetCharKind(input, i - 1)]; Registers initialRegisters = perThreadData.InitialRegisters; @@ -667,52 +715,45 @@ private Registers FindSubcaptures(ReadOnlySpan input, int i, int iEnd, Per SparseIntMap current = perThreadData.Current, next = perThreadData.Next; current.Clear(); next.Clear(); - current.Add(initialState.Id, initialRegisters); - SymbolicRegexBuilder builder = _builder; + ForEachNfaState(initialState.Node, initialState.PrevCharKind, (current, initialRegisters), + static (int nfaId, (SparseIntMap Current, Registers InitialRegisters) args) => + args.Current.Add(nfaId, args.InitialRegisters.Clone())); while ((uint)i < (uint)iEnd) { Debug.Assert(next.Count == 0); - // Read the next character and find its minterm - int c = input[i]; - int normalMintermId = _mintermClassifier.GetMintermID(c); + // i is guaranteed to be within bounds, so the position ID is a minterm ID + int mintermId = TInputReader.GetPositionId(this, input, i); foreach ((int sourceId, Registers sourceRegisters) in current.Values) { - Debug.Assert(builder._capturingStateArray is not null); - DfaMatchingState sourceState = builder._capturingStateArray[sourceId]; - - // Handle the special case for the last \n for states that start with a relevant anchor - int mintermId = c == '\n' && i == input.Length - 1 && sourceState.StartsWithLineAnchor ? - builder._minterms!.Length : // mintermId = minterms.Length represents an \n at the very end of input - normalMintermId; - TSet minterm = builder.GetMinterm(mintermId); - // Get or create the transitions - int offset = (sourceId << builder._mintermsLog) | mintermId; - Debug.Assert(builder._capturingDelta is not null); - List<(DfaMatchingState, DerivativeEffect[])>? transitions = - builder._capturingDelta[offset] ?? 
- CreateNewCapturingTransitions(sourceState, minterm, offset); + int offset = DeltaOffset(sourceId, mintermId); + (int, DerivativeEffect[])[] transitions = _capturingNfaDelta[offset] ?? + CreateNewCapturingTransition(sourceId, mintermId, offset); // Take the transitions in their prioritized order - for (int j = 0; j < transitions.Count; ++j) + for (int j = 0; j < transitions.Length; ++j) { - (DfaMatchingState targetState, DerivativeEffect[] effects) = transitions[j]; - Debug.Assert(!targetState.IsDeadend, "Transitions should not include dead ends."); + (int targetStateId, DerivativeEffect[] effects) = transitions[j]; // Try to add the state and handle the case where it didn't exist before. If the state already // exists, then the transition can be safely ignored, as the existing state was generated by a // higher priority transition. - if (next.Add(targetState.Id, out int index)) + if (next.Add(targetStateId, out int index)) { // Avoid copying the registers on the last transition from this state, reusing the registers instead - Registers newRegisters = j != transitions.Count - 1 ? sourceRegisters.Clone() : sourceRegisters; + Registers newRegisters = j != transitions.Length - 1 ? sourceRegisters.Clone() : sourceRegisters; newRegisters.ApplyEffects(effects, i); - next.Update(index, targetState.Id, newRegisters); - if (targetState.IsNullableFor(GetCharKind(input, i + 1))) + next.Update(index, targetStateId, newRegisters); + + int coreStateId = GetCoreStateId(targetStateId); + (bool isInitial, bool isDeadend, bool isNullable, bool canBeNullable) = GetStateInfo(coreStateId); + Debug.Assert(!isDeadend); + + if (isNullable || (canBeNullable && GetState(coreStateId).IsNullableFor(GetCharKind(input, i + 1)))) { // No lower priority transitions from this or other source states are taken because the // backtracking engines would return the match ending here. 
@@ -732,15 +773,14 @@ private Registers FindSubcaptures(ReadOnlySpan input, int i, int iEnd, Per } Debug.Assert(current.Count > 0); - Debug.Assert(_builder._capturingStateArray is not null); foreach (var (endStateId, endRegisters) in current.Values) { - DfaMatchingState endState = _builder._capturingStateArray[endStateId]; - if (endState.IsNullableFor(GetCharKind(input, iEnd))) + MatchingState endState = GetState(GetCoreStateId(endStateId)); + if (endState.IsNullableFor(GetCharKind(input, iEnd))) { // Apply effects for finishing at the stored end state endState.Node.ApplyEffects((effect, args) => args.Registers.ApplyEffect(effect, args.Pos), - CharKind.Context(endState.PrevCharKind, GetCharKind(input, iEnd)), (Registers: endRegisters, Pos: iEnd)); + CharKind.Context(endState.PrevCharKind, GetCharKind(input, iEnd)), (Registers: endRegisters, Pos: iEnd)); return endRegisters; } } @@ -749,39 +789,6 @@ private Registers FindSubcaptures(ReadOnlySpan input, int i, int iEnd, Per return default; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private uint GetCharKind(ReadOnlySpan input, int i) - { - return !_pattern._info.ContainsSomeAnchor ? - CharKind.General : // The previous character kind is irrelevant when anchors are not used. - GetCharKindWithAnchor(input, i); - - uint GetCharKindWithAnchor(ReadOnlySpan input, int i) - { - Debug.Assert(_asciiCharKinds is not null); - - if ((uint)i >= (uint)input.Length) - { - return CharKind.BeginningEnd; - } - - char nextChar = input[i]; - if (nextChar == '\n') - { - return - _builder._newLineSet.Equals(_builder._solver.Empty) ? 0 : // ignore \n - i == 0 || i == input.Length - 1 ? CharKind.NewLineS : // very first or very last \n. Detection of very first \n is needed for rev(\Z). - CharKind.Newline; - } - - uint[] asciiCharKinds = _asciiCharKinds; - return - nextChar < (uint)asciiCharKinds.Length ? 
asciiCharKinds[nextChar] : - _builder._solver.And(GetMinterm(nextChar), _builder._wordLetterForBoundariesSet).Equals(_builder._solver.Empty) ? 0 : // intersect with the wordletter set to compute the kind of the next character - CharKind.WordLetter; - } - } - /// Stores additional data for tracking capture start and end positions. /// The NFA simulation based third phase has one of these for each current state in the current set of live states. internal struct Registers @@ -867,9 +874,9 @@ internal sealed class PerThreadData /// Registers used for the capturing third phase. public readonly Registers InitialRegisters; - public PerThreadData(SymbolicRegexBuilder builder, int capsize) + public PerThreadData(int capsize) { - NfaState = new NfaMatchingState(builder); + NfaState = new NfaMatchingState(); // Only create data used for capturing mode if there are subcaptures if (capsize > 1) @@ -883,11 +890,9 @@ public PerThreadData(SymbolicRegexBuilder builder, int capsize) /// Stores the state that represents a current state in NFA mode. /// The entire state is composed of a list of individual states. + /// New instances should only be created once per runner. internal sealed class NfaMatchingState { - /// The associated builder used to lazily add new DFA or NFA nodes to the graph. - public readonly SymbolicRegexBuilder Builder; - /// Ordered set used to store the current NFA states. /// The value is unused. The type is used purely for its keys. public SparseIntMap NfaStateSet = new(); @@ -899,24 +904,17 @@ internal sealed class NfaMatchingState /// public SparseIntMap NfaStateSetScratch = new(); - /// Create the instance. - /// New instances should only be created once per runner. - public NfaMatchingState(SymbolicRegexBuilder builder) => Builder = builder; - /// Resets this NFA state to represent the supplied DFA state. + /// /// The DFA state to use to initialize the NFA state. 
- public void InitializeFrom(DfaMatchingState dfaMatchingState) + public void InitializeFrom(SymbolicRegexMatcher matcher, MatchingState dfaMatchingState) { NfaStateSet.Clear(); // If the DFA state is a union of multiple DFA states, loop through all of them // adding an NFA state for each. - foreach (SymbolicRegexNode element in dfaMatchingState.Node.EnumerateAlternationBranches()) - { - // Create (possibly new) NFA states for all the members. - // Add their IDs to the current set of NFA states and into the list. - NfaStateSet.Add(Builder.CreateNfaState(element, dfaMatchingState.PrevCharKind), out _); - } + matcher.ForEachNfaState(dfaMatchingState.Node, dfaMatchingState.PrevCharKind, NfaStateSet, + static (int nfaId, SparseIntMap nfaStateSet) => nfaStateSet.Add(nfaId, out _)); } } @@ -925,7 +923,7 @@ public void InitializeFrom(DfaMatchingState dfaMatchingState) private struct CurrentState { /// Initializes the state as a DFA state. - public CurrentState(DfaMatchingState dfaState) + public CurrentState(MatchingState dfaState) { DfaStateId = dfaState.Id; NfaState = null; @@ -942,51 +940,48 @@ public CurrentState(NfaMatchingState nfaState) public int DfaStateId; /// The NFA state. public NfaMatchingState? NfaState; - - public DfaMatchingState? DfaState(SymbolicRegexBuilder builder) => DfaStateId > 0 ? builder._stateArray![DfaStateId] : null; } /// Represents a set of routines for operating over a . 
private interface IStateHandler { - public static abstract bool StartsWithLineAnchor(SymbolicRegexBuilder builder, ref CurrentState state); - public static abstract bool IsNullableFor(SymbolicRegexBuilder builder, ref CurrentState state, uint nextCharKind); - public static abstract int ExtractNullableCoreStateId(SymbolicRegexMatcher matcher, ref CurrentState state, ReadOnlySpan input, int pos); - public static abstract int FixedLength(SymbolicRegexBuilder builder, ref CurrentState state, uint nextCharKind); - public static abstract bool TakeTransition(SymbolicRegexBuilder builder, ref CurrentState state, int mintermId); - public static abstract (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(SymbolicRegexBuilder builder, ref CurrentState state); + public static abstract bool StartsWithLineAnchor(SymbolicRegexMatcher matcher, in CurrentState state); + public static abstract bool IsNullableFor(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind); + public static abstract int ExtractNullableCoreStateId(SymbolicRegexMatcher matcher, in CurrentState state, ReadOnlySpan input, int pos); + public static abstract int FixedLength(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind); + public static abstract bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId); + public static abstract (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(SymbolicRegexMatcher matcher, in CurrentState state); } /// An for operating over instances configured as DFA states. 
private readonly struct DfaStateHandler : IStateHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool StartsWithLineAnchor(SymbolicRegexBuilder builder, ref CurrentState state) => state.DfaState(builder)!.StartsWithLineAnchor; + public static bool StartsWithLineAnchor(SymbolicRegexMatcher matcher, in CurrentState state) => matcher.GetState(state.DfaStateId).StartsWithLineAnchor; [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool IsNullableFor(SymbolicRegexBuilder builder, ref CurrentState state, uint nextCharKind) => state.DfaState(builder)!.IsNullableFor(nextCharKind); + public static bool IsNullableFor(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind) => matcher.GetState(state.DfaStateId).IsNullableFor(nextCharKind); /// Gets the preferred DFA state for nullability. In DFA mode this is just the state itself. [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int ExtractNullableCoreStateId(SymbolicRegexMatcher matcher, ref CurrentState state, ReadOnlySpan input, int pos) => state.DfaStateId; + public static int ExtractNullableCoreStateId(SymbolicRegexMatcher matcher, in CurrentState state, ReadOnlySpan input, int pos) => state.DfaStateId; /// Gets the length of any fixed-length marker that exists for this state, or -1 if there is none. [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int FixedLength(SymbolicRegexBuilder builder, ref CurrentState state, uint nextCharKind) => state.DfaState(builder)!.FixedLength(nextCharKind); + public static int FixedLength(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind) => matcher.GetState(state.DfaStateId).FixedLength(nextCharKind); /// Take the transition to the next DFA state. 
[MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool TakeTransition(SymbolicRegexBuilder builder, ref CurrentState state, int mintermId) + public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId) { Debug.Assert(state.DfaStateId > 0, $"Expected non-zero {nameof(state.DfaStateId)}."); Debug.Assert(state.NfaState is null, $"Expected null {nameof(state.NfaState)}."); - Debug.Assert(builder._delta is not null); // Use the mintermId for the character being read to look up which state to transition to. // If that state has already been materialized, move to it, and we're done. If that state // hasn't been materialized, try to create it; if we can, move to it, and we're done. - int dfaOffset = (state.DfaStateId << builder._mintermsLog) | mintermId; - int nextStateId = builder._delta[dfaOffset]; + int dfaOffset = matcher.DeltaOffset(state.DfaStateId, mintermId); + int nextStateId = matcher._dfaDelta[dfaOffset]; if (nextStateId > 0) { // There was an existing DFA transition to some state. Move to it and @@ -995,7 +990,7 @@ public static bool TakeTransition(SymbolicRegexBuilder builder, ref Curren return true; } - if (builder.TryCreateNewTransition(state.DfaState(builder)!, mintermId, dfaOffset, checkThreshold: true, out DfaMatchingState? nextState)) + if (matcher.TryCreateNewTransition(matcher.GetState(state.DfaStateId), mintermId, dfaOffset, checkThreshold: true, out MatchingState? nextState)) { // We were able to create a new DFA transition to some state. Move to it and // return that we're still operating as a DFA and can keep going. 
@@ -1014,22 +1009,19 @@ public static bool TakeTransition(SymbolicRegexBuilder builder, ref Curren /// - whether this state may be contextually nullable /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(SymbolicRegexBuilder builder, ref CurrentState state) - { - Debug.Assert(state.DfaStateId > 0); - return builder.GetStateInfo(state.DfaStateId); - } + public static (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(SymbolicRegexMatcher matcher, in CurrentState state) + => matcher.GetStateInfo(state.DfaStateId); } /// An for operating over instances configured as NFA states. private readonly struct NfaStateHandler : IStateHandler { /// Check if any underlying core state starts with a line anchor. - public static bool StartsWithLineAnchor(SymbolicRegexBuilder builder, ref CurrentState state) + public static bool StartsWithLineAnchor(SymbolicRegexMatcher matcher, in CurrentState state) { foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values)) { - if (builder.GetCoreState(nfaState.Key).StartsWithLineAnchor) + if (matcher.GetState(matcher.GetCoreStateId(nfaState.Key)).StartsWithLineAnchor) { return true; } @@ -1039,11 +1031,11 @@ public static bool StartsWithLineAnchor(SymbolicRegexBuilder builder, ref } /// Check if any underlying core state is nullable in the context of the next character kind. 
- public static bool IsNullableFor(SymbolicRegexBuilder builder, ref CurrentState state, uint nextCharKind) + public static bool IsNullableFor(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind) { foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values)) { - if (builder.GetCoreState(nfaState.Key).IsNullableFor(nextCharKind)) + if (matcher.GetState(matcher.GetCoreStateId(nfaState.Key)).IsNullableFor(nextCharKind)) { return true; } @@ -1053,12 +1045,12 @@ public static bool IsNullableFor(SymbolicRegexBuilder builder, ref Current } /// Gets the preferred DFA state for nullability. In DFA mode this is just the state itself. - public static int ExtractNullableCoreStateId(SymbolicRegexMatcher matcher, ref CurrentState state, ReadOnlySpan input, int pos) + public static int ExtractNullableCoreStateId(SymbolicRegexMatcher matcher, in CurrentState state, ReadOnlySpan input, int pos) { - uint nextCharKind = matcher.GetCharKind(input, pos); + uint nextCharKind = matcher.GetCharKind(input, pos); foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values)) { - DfaMatchingState coreState = matcher._builder.GetCoreState(nfaState.Key); + MatchingState coreState = matcher.GetState(matcher.GetCoreStateId(nfaState.Key)); if (coreState.IsNullableFor(nextCharKind)) { return coreState.Id; @@ -1070,11 +1062,11 @@ public static int ExtractNullableCoreStateId(SymbolicRegexMatcher matcher, } /// Gets the length of any fixed-length marker that exists for this state, or -1 if there is none. 
- public static int FixedLength(SymbolicRegexBuilder builder, ref CurrentState state, uint nextCharKind) + public static int FixedLength(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind) { foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values)) { - DfaMatchingState coreState = builder.GetCoreState(nfaState.Key); + MatchingState coreState = matcher.GetState(matcher.GetCoreStateId(nfaState.Key)); if (coreState.IsNullableFor(nextCharKind)) { return coreState.FixedLength(nextCharKind); @@ -1086,7 +1078,7 @@ public static int FixedLength(SymbolicRegexBuilder builder, ref CurrentSta } /// Take the transition to the next NFA state. - public static bool TakeTransition(SymbolicRegexBuilder builder, ref CurrentState state, int mintermId) + public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId) { Debug.Assert(state.DfaStateId < 0, $"Expected negative {nameof(state.DfaStateId)}."); Debug.Assert(state.NfaState is not null, $"Expected non-null {nameof(state.NfaState)}."); @@ -1105,7 +1097,7 @@ public static bool TakeTransition(SymbolicRegexBuilder builder, ref Curren { // We have a single source state. We know its next states are already deduped, // so we can just add them directly to the destination states list. - foreach (int nextState in GetNextStates(sourceStates.Values[0].Key, mintermId, builder)) + foreach (int nextState in GetNextStates(sourceStates.Values[0].Key, mintermId, matcher)) { nextStates.Add(nextState, out _); } @@ -1118,7 +1110,7 @@ public static bool TakeTransition(SymbolicRegexBuilder builder, ref Curren // to the set, then add the known-unique state to the destination list. 
foreach (ref KeyValuePair sourceState in CollectionsMarshal.AsSpan(sourceStates.Values)) { - foreach (int nextState in GetNextStates(sourceState.Key, mintermId, builder)) + foreach (int nextState in GetNextStates(sourceState.Key, mintermId, matcher)) { nextStates.Add(nextState, out _); } @@ -1128,13 +1120,13 @@ public static bool TakeTransition(SymbolicRegexBuilder builder, ref Curren return true; [MethodImpl(MethodImplOptions.AggressiveInlining)] - static int[] GetNextStates(int sourceState, int mintermId, SymbolicRegexBuilder builder) + static int[] GetNextStates(int sourceState, int mintermId, SymbolicRegexMatcher matcher) { // Calculate the offset into the NFA transition table. - int nfaOffset = (sourceState << builder._mintermsLog) | mintermId; + int nfaOffset = matcher.DeltaOffset(sourceState, mintermId); // Get the next NFA state. - return builder._nfaDelta[nfaOffset] ?? builder.CreateNewNfaTransition(sourceState, mintermId, nfaOffset); + return matcher._nfaDelta[nfaOffset] ?? matcher.CreateNewNfaTransition(sourceState, mintermId, nfaOffset); } } @@ -1153,15 +1145,15 @@ static int[] GetNextStates(int sourceState, int mintermId, SymbolicRegexBuilder< /// can transition back to a DFA state. /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(SymbolicRegexBuilder builder, ref CurrentState state) => - (false, state.NfaState!.NfaStateSet.Count == 0, IsNullable(builder, ref state), CanBeNullable(builder, ref state)); + public static (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(SymbolicRegexMatcher matcher, in CurrentState state) => + (false, state.NfaState!.NfaStateSet.Count == 0, IsNullable(matcher, in state), CanBeNullable(matcher, in state)); /// Check if any underlying core state is unconditionally nullable. 
- private static bool IsNullable(SymbolicRegexBuilder builder, ref CurrentState state) + public static bool IsNullable(SymbolicRegexMatcher matcher, in CurrentState state) { foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values)) { - if (builder.GetStateInfo(builder.GetCoreStateId(nfaState.Key)).IsNullable) + if (matcher.GetStateInfo(matcher.GetCoreStateId(nfaState.Key)).IsNullable) { return true; } @@ -1171,11 +1163,11 @@ private static bool IsNullable(SymbolicRegexBuilder builder, ref CurrentSt } /// Check if any underlying core state can be nullable in some context. - private static bool CanBeNullable(SymbolicRegexBuilder builder, ref CurrentState state) + public static bool CanBeNullable(SymbolicRegexMatcher matcher, in CurrentState state) { foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values)) { - if (builder.GetStateInfo(builder.GetCoreStateId(nfaState.Key)).CanBeNullable) + if (matcher.GetStateInfo(matcher.GetCoreStateId(nfaState.Key)).CanBeNullable) { return true; } @@ -1185,10 +1177,10 @@ private static bool CanBeNullable(SymbolicRegexBuilder builder, ref Curren } #if DEBUG - /// Undo a previous call to . + /// Undo a previous call to . public static void UndoTransition(ref CurrentState state) { - Debug.Assert(state.DfaStateId < 0, $"Expected negative {nameof(state.DfaState)}."); + Debug.Assert(state.DfaStateId < 0, $"Expected negative {nameof(state.DfaStateId)}."); Debug.Assert(state.NfaState is not null, $"Expected non-null {nameof(state.NfaState)}."); NfaMatchingState nfaState = state.NfaState!; @@ -1202,37 +1194,43 @@ public static void UndoTransition(ref CurrentState state) // Sanity check: if there are any next states, then there must have been some source states. Debug.Assert(nextStates.Count == 0 || sourceStates.Count > 0); } +#endif + } - /// Check if any underlying core state is unconditionally nullable. 
- public static bool IsNullable(ref CurrentState state) - { - SymbolicRegexBuilder builder = state.NfaState!.Builder; - foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values)) - { - if (builder.GetCoreState(nfaState.Key).Node.IsNullable) - { - return true; - } - } + /// + /// Interface for mapping positions in the input to position IDs, which capture all the information necessary to + /// both take transitions and decide nullability. For positions of valid characters that are handled normally, + /// these IDs coincide with minterm IDs (i.e. indices to ). Positions outside the bounds + /// of the input are mapped to -1. Optionally, an end-of-line as the very last character in the input may be + /// mapped to _minterms.Length for supporting the \Z anchor. + /// + private interface IInputReader + { + public static abstract int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan input, int pos); + } - return false; - } + /// This reader omits the special handling of \n for the \Z anchor. + private readonly struct NoZAnchorInputReader : IInputReader + { + public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan input, int pos) => + (uint)pos >= (uint)input.Length ? -1 : matcher._mintermClassifier.GetMintermID(input[pos]); + } - /// Check if any underlying core state can be nullable. - public static bool CanBeNullable(ref CurrentState state) + /// This reader includes full handling of an \n as the last character of input for the \Z anchor. 
+ private readonly struct FullInputReader : IInputReader + { + public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan input, int pos) { - SymbolicRegexBuilder builder = state.NfaState!.Builder; - foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values)) - { - if (builder.GetCoreState(nfaState.Key).Node.CanBeNullable) - { - return true; - } - } + if ((uint)pos >= (uint)input.Length) + return -1; - return false; + int c = input[pos]; + + // Find the minterm, handling the special case for the last \n for states that start with a relevant anchor + return c == '\n' && pos == input.Length - 1 ? + matcher._minterms.Length : // mintermId = minterms.Length represents an \n at the very end of input + matcher._mintermClassifier.GetMintermID(c); } -#endif } /// @@ -1240,7 +1238,8 @@ public static bool CanBeNullable(ref CurrentState state) /// private interface IInitialStateHandler { - public static abstract bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos); + public static abstract bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) + where TInputReader : struct, IInputReader; } /// @@ -1249,7 +1248,8 @@ private interface IInitialStateHandler private readonly struct NoOptimizationsInitialStateHandler : IInitialStateHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) + public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) + where TInputReader : struct, IInputReader { // return true to indicate that the current position is a possible starting position return true; @@ -1262,7 +1262,8 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matche 
private readonly struct InitialStateFindOptimizationsHandler : IInitialStateHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) + public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) + where TInputReader : struct, IInputReader { // Find the first position that matches with some likely character. if (!matcher._findOpts!.TryFindNextStartingPosition(input, ref pos, 0)) @@ -1273,7 +1274,7 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matche // Update the starting state based on where TryFindNextStartingPosition moved us to. // As with the initial starting state, if it's a dead end, no match exists. - state = new CurrentState(matcher._dotstarredInitialStates[matcher.GetCharKind(input, pos - 1)]); + state = new CurrentState(matcher._dotstarredInitialStates[matcher.GetCharKind(input, pos - 1)]); return true; } } @@ -1283,7 +1284,7 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matche /// private interface INullabilityHandler { - public static abstract bool IsNullableAt(SymbolicRegexMatcher matcher, ref CurrentState state, ReadOnlySpan input, int pos, bool isNullable, bool canBeNullable) + public static abstract bool IsNullableAt(SymbolicRegexMatcher matcher, in CurrentState state, int positionId, bool isNullable, bool canBeNullable) where TStateHandler : struct, IStateHandler; } @@ -1293,7 +1294,7 @@ private interface INullabilityHandler private readonly struct NoAnchorsNullabilityHandler : INullabilityHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool IsNullableAt(SymbolicRegexMatcher matcher, ref CurrentState state, ReadOnlySpan input, int pos, bool isNullable, bool canBeNullable) + public static bool IsNullableAt(SymbolicRegexMatcher matcher, in CurrentState state, int 
positionId, bool isNullable, bool canBeNullable) where TStateHandler : struct, IStateHandler { Debug.Assert(!matcher._pattern._info.ContainsSomeAnchor); @@ -1307,10 +1308,10 @@ public static bool IsNullableAt(SymbolicRegexMatcher matche private readonly struct FullNullabilityHandler : INullabilityHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool IsNullableAt(SymbolicRegexMatcher matcher, ref CurrentState state, ReadOnlySpan input, int pos, bool isNullable, bool canBeNullable) + public static bool IsNullableAt(SymbolicRegexMatcher matcher, in CurrentState state, int positionId, bool isNullable, bool canBeNullable) where TStateHandler : struct, IStateHandler { - return isNullable || (canBeNullable && TStateHandler.IsNullableFor(matcher._builder, ref state, matcher.GetCharKind(input, pos))); + return isNullable || (canBeNullable && TStateHandler.IsNullableFor(matcher, in state, matcher.GetPositionKind(positionId))); } } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs index 849fad5f82e939cb734a655a81a68e6963e5c1e7..f3fcc33a2a261301051f5ee11751d493b793bffc 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs @@ -33,24 +33,25 @@ internal sealed class SymbolicRegexNode where TSet : IComparable, IE /// internal const int SubsumptionCheckDepthLimit = 50; - internal readonly SymbolicRegexBuilder _builder; internal readonly SymbolicRegexNodeKind _kind; internal readonly int _lower; internal readonly int _upper; internal readonly TSet? _set; internal readonly SymbolicRegexNode? _left; internal readonly SymbolicRegexNode? 
_right; + internal readonly SymbolicRegexInfo _info; /// /// Caches nullability of this node for any given context (0 <= context < ContextLimit) /// when _info.StartsWithSomeAnchor and _info.CanBeNullable are true. Otherwise the cache is null. /// - private byte[]? _nullabilityCache; + private readonly byte[]? _nullabilityCache; - private TSet _startSet; +#if DEBUG + internal SymbolicRegexBuilder? _debugBuilder; +#endif /// AST node of a symbolic regex - /// the builder /// what kind of node /// left child /// right child @@ -58,9 +59,8 @@ internal sealed class SymbolicRegexNode where TSet : IComparable, IE /// upper boubd of a loop /// singelton set /// misc flags including laziness - private SymbolicRegexNode(SymbolicRegexBuilder builder, SymbolicRegexNodeKind kind, SymbolicRegexNode? left, SymbolicRegexNode? right, int lower, int upper, TSet? set, SymbolicRegexInfo info) + private SymbolicRegexNode(SymbolicRegexNodeKind kind, SymbolicRegexNode? left, SymbolicRegexNode? right, int lower, int upper, TSet? set, SymbolicRegexInfo info) { - _builder = builder; _kind = kind; _left = left; _right = right; @@ -68,7 +68,6 @@ private SymbolicRegexNode(SymbolicRegexBuilder builder, SymbolicRegexNodeK _upper = upper; _set = set; _info = info; - _startSet = ComputeStartSet(); _nullabilityCache = info.StartsWithSomeAnchor && info.CanBeNullable ? new byte[CharKind.ContextLimit] : null; } @@ -78,7 +77,10 @@ private static SymbolicRegexNode Create(SymbolicRegexBuilder builder var key = (kind, left, right, lower, upper, set, info); if (!builder._nodeCache.TryGetValue(key, out SymbolicRegexNode? 
node)) { - node = new SymbolicRegexNode(builder, kind, left, right, lower, upper, set, info); + node = new SymbolicRegexNode(kind, left, right, lower, upper, set, info); +#if DEBUG + node._debugBuilder = builder; +#endif builder._nodeCache[key] = node; } return node; @@ -172,9 +174,6 @@ internal bool CanBeNullable } } - internal SymbolicRegexInfo _info; - - /// /// Converts a list of a given kind, e.g. Concat or Alternate, into an array, /// returns anything else in a singleton array. @@ -331,71 +330,31 @@ bool WithCache(uint context) } /// Returns true if this is equivalent to .* (the node must be eager also) - public bool IsAnyStar + public bool IsAnyStar(ISolver solver) { - get + if (IsStar) { - if (IsStar) - { - Debug.Assert(_left is not null); - if (_left._kind == SymbolicRegexNodeKind.Singleton) - { - Debug.Assert(_left._set is not null); - return !IsLazy && _builder._solver.Full.Equals(_left._set); - } - } - - return false; - } - } - - /// Returns true if this is equivalent to .+ (the node must be eager also) - public bool IsAnyPlus - { - get - { - if (IsPlus) + Debug.Assert(_left is not null); + if (_left._kind == SymbolicRegexNodeKind.Singleton) { - Debug.Assert(_left is not null); - if (_left._kind == SymbolicRegexNodeKind.Singleton) - { - Debug.Assert(_left._set is not null); - return !IsLazy && _builder._solver.Full.Equals(_left._set); - } + Debug.Assert(_left._set is not null); + return !IsLazy && solver.Full.Equals(_left._set); } - - return false; } - } - - /// Returns true if this is equivalent to [\0-\xFFFF] - public bool IsAnyChar - { - get - { - if (_kind == SymbolicRegexNodeKind.Singleton) - { - Debug.Assert(_set is not null); - return _builder._solver.IsFull(_set); - } - return false; - } + return false; } /// Returns true if this is equivalent to [0-[0]] - public bool IsNothing + public bool IsNothing(ISolver solver) { - get + if (_kind == SymbolicRegexNodeKind.Singleton) { - if (_kind == SymbolicRegexNodeKind.Singleton) - { - Debug.Assert(_set 
is not null); - return _builder._solver.IsEmpty(_set); - } - - return false; + Debug.Assert(_set is not null); + return solver.IsEmpty(_set); } + + return false; } /// Returns true iff this is a loop whose lower bound is 0 and upper bound is max @@ -415,39 +374,33 @@ public bool IsNothing #region called only once, in the constructor of SymbolicRegexBuilder internal static SymbolicRegexNode CreateFalse(SymbolicRegexBuilder builder) => - Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Empty, SymbolicRegexInfo.Create()); + Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Empty, default); internal static SymbolicRegexNode CreateTrue(SymbolicRegexBuilder builder) => - Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Full, SymbolicRegexInfo.Create()); + Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Full, default); internal static SymbolicRegexNode CreateFixedLengthMarker(SymbolicRegexBuilder builder, int length) => - Create(builder, SymbolicRegexNodeKind.FixedLengthMarker, null, null, length, -1, default, SymbolicRegexInfo.Create(isAlwaysNullable: true, isHighPriorityNullable: true)); + Create(builder, SymbolicRegexNodeKind.FixedLengthMarker, null, null, length, -1, default, SymbolicRegexInfo.Epsilon()); internal static SymbolicRegexNode CreateEpsilon(SymbolicRegexBuilder builder) => - Create(builder, SymbolicRegexNodeKind.Epsilon, null, null, -1, -1, default, SymbolicRegexInfo.Create(isAlwaysNullable: true, isHighPriorityNullable: true)); + Create(builder, SymbolicRegexNodeKind.Epsilon, null, null, -1, -1, default, SymbolicRegexInfo.Epsilon()); - internal static SymbolicRegexNode CreateBeginEndAnchor(SymbolicRegexBuilder builder, SymbolicRegexNodeKind kind) + internal static SymbolicRegexNode CreateAnchor(SymbolicRegexBuilder builder, SymbolicRegexNodeKind kind) { Debug.Assert(kind is + SymbolicRegexNodeKind.BoundaryAnchor 
or SymbolicRegexNodeKind.NonBoundaryAnchor or SymbolicRegexNodeKind.BeginningAnchor or SymbolicRegexNodeKind.EndAnchor or SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor); - return Create(builder, kind, null, null, -1, -1, default, SymbolicRegexInfo.Create(startsWithSomeAnchor: true, canBeNullable: true, - startsWithLineAnchor: kind is + return Create(builder, kind, null, null, -1, -1, default, SymbolicRegexInfo.Anchor(isLineAnchor: kind is SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor)); } - internal static SymbolicRegexNode CreateBoundaryAnchor(SymbolicRegexBuilder builder, SymbolicRegexNodeKind kind) - { - Debug.Assert(kind is SymbolicRegexNodeKind.BoundaryAnchor or SymbolicRegexNodeKind.NonBoundaryAnchor); - return Create(builder, kind, null, null, -1, -1, default, SymbolicRegexInfo.Create(startsWithSomeAnchor: true, canBeNullable: true)); - } - #endregion internal static SymbolicRegexNode CreateSingleton(SymbolicRegexBuilder builder, TSet set) => - Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, set, SymbolicRegexInfo.Create()); + Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, set, default); internal static SymbolicRegexNode CreateLoop(SymbolicRegexBuilder builder, SymbolicRegexNode body, int lower, int upper, bool isLazy) { @@ -480,10 +433,10 @@ internal static SymbolicRegexNode CreateEffect(SymbolicRegexBuilder } internal static SymbolicRegexNode CreateCaptureStart(SymbolicRegexBuilder builder, int captureNum) => - Create(builder, SymbolicRegexNodeKind.CaptureStart, null, null, captureNum, -1, default, SymbolicRegexInfo.Create(isAlwaysNullable: true, isHighPriorityNullable: true)); + Create(builder, SymbolicRegexNodeKind.CaptureStart, null, null, captureNum, -1, default, SymbolicRegexInfo.Epsilon()); internal static 
SymbolicRegexNode CreateCaptureEnd(SymbolicRegexBuilder builder, int captureNum) => - Create(builder, SymbolicRegexNodeKind.CaptureEnd, null, null, captureNum, -1, default, SymbolicRegexInfo.Create(isAlwaysNullable: true, isHighPriorityNullable: true)); + Create(builder, SymbolicRegexNodeKind.CaptureEnd, null, null, captureNum, -1, default, SymbolicRegexInfo.Epsilon()); internal static SymbolicRegexNode CreateDisableBacktrackingSimulation(SymbolicRegexBuilder builder, SymbolicRegexNode child) => Create(builder, SymbolicRegexNodeKind.DisableBacktrackingSimulation, child, null, -1, -1, default, child._info); @@ -530,7 +483,7 @@ internal static SymbolicRegexNode CreateConcat(SymbolicRegexBuilder /// internal static SymbolicRegexNode CreateAlternate(SymbolicRegexBuilder builder, SymbolicRegexNode left, SymbolicRegexNode right, bool deduplicated = false, bool hintRightLikelySubsumes = false) { - if (left.IsAnyStar || right == builder._nothing || left == right || (left.IsNullable && right.IsEpsilon)) + if (left.IsAnyStar(builder._solver) || right.IsNothing(builder._solver) || left == right || (left.IsNullable && right.IsEpsilon)) return left; if (left == builder._nothing) return right; @@ -541,16 +494,16 @@ internal static SymbolicRegexNode CreateAlternate(SymbolicRegexBuilder tail = right._kind == SymbolicRegexNodeKind.Alternate ? right._right! : builder._nothing; // Simplify away right side if left side subsumes it. For example X?Y|Y|Z would simplify to just X?Y|Z. - if (!hintRightLikelySubsumes && left.Subsumes(head)) + if (!hintRightLikelySubsumes && left.Subsumes(builder, head)) return CreateAlternate(builder, left, tail); // Simplify by folding right side into left side if right side subsumes the left side. For example Y|X?Y|Z // would simplify to X??Y|Z. - if (head.Subsumes(left) && TryFoldAlternation(left, head, out SymbolicRegexNode? result)) + if (head.Subsumes(builder, left) && TryFoldAlternation(builder, left, head, out SymbolicRegexNode? 
result)) return CreateAlternate(builder, result, tail); // This is a repeat of a rule above, but for the case when the hint tells us to try reverse subsumption first. - if (hintRightLikelySubsumes && left.Subsumes(head)) + if (hintRightLikelySubsumes && left.Subsumes(builder, head)) return CreateAlternate(builder, left, tail); // If left is not an Alternate, try to avoid allocation by checking if deduplication is necessary @@ -647,53 +600,54 @@ internal static SymbolicRegexNode CreateAlternate(SymbolicRegexBuilder + /// the builder that owns this node /// the node to check for being subsumed /// the current recursion depth /// - internal bool Subsumes(SymbolicRegexNode other, int depth = 0) + internal bool Subsumes(SymbolicRegexBuilder builder, SymbolicRegexNode other, int depth = 0) { // A node subsumes itself if (this == other) return true; // Nothing has an empty language, which is subsumed by anything - if (other == _builder._nothing) + if (other.IsNothing(builder._solver)) return true; // Early exit if we've gone too deep if (depth >= SubsumptionCheckDepthLimit) return false; - if (_builder._subsumptionCache.TryGetValue((this, other), out bool cached)) + if (builder._subsumptionCache.TryGetValue((this, other), out bool cached)) { return cached; } if (!StackHelper.TryEnsureSufficientExecutionStack()) { - return StackHelper.CallOnEmptyStack(Subsumes, other, depth); + return StackHelper.CallOnEmptyStack(Subsumes, builder, other, depth); } // Try to apply all subsumption rules - bool? subsumes = ApplySubsumptionRules(this, other, depth + 1); + bool? subsumes = ApplySubsumptionRules(builder, this, other, depth + 1); // Cache and return the result if any rule applied if (subsumes.HasValue) { - return (_builder._subsumptionCache[(this, other)] = subsumes.Value); + return (builder._subsumptionCache[(this, other)] = subsumes.Value); } // Assume false if no rule applied return false; - static bool? 
ApplySubsumptionRules(SymbolicRegexNode left, SymbolicRegexNode right, int depth) + static bool? ApplySubsumptionRules(SymbolicRegexBuilder builder, SymbolicRegexNode left, SymbolicRegexNode right, int depth) { // Rule: Effect(X,E) subsumes Y iff X subsumes Y // Effectively this ignores any effects if (left._kind == SymbolicRegexNodeKind.Effect) { Debug.Assert(left._left is not null && left._right is not null); - return left._left.Subsumes(right, depth); + return left._left.Subsumes(builder, right, depth); } // Rule: X subsumes Effect(Y,E) iff X subsumes Y @@ -701,7 +655,7 @@ internal bool Subsumes(SymbolicRegexNode other, int depth = 0) if (right._kind == SymbolicRegexNodeKind.Effect) { Debug.Assert(right._left is not null && right._right is not null); - return left.Subsumes(right._left, depth); + return left.Subsumes(builder, right._left, depth); } // Rule: XY subsumes (X')??Y' if X equals X' and Y subsumes Y' @@ -714,7 +668,7 @@ internal bool Subsumes(SymbolicRegexNode other, int depth = 0) { Debug.Assert(rl._left is not null); if (TrySkipPrefix(left, rl._left, out SymbolicRegexNode? tail)) - return tail.Subsumes(right._right, depth); + return tail.Subsumes(builder, right._right, depth); } } @@ -728,7 +682,7 @@ internal bool Subsumes(SymbolicRegexNode other, int depth = 0) { Debug.Assert(ll._left is not null); if (TrySkipPrefix(right, ll._left, out SymbolicRegexNode? tail)) - return left._right.Subsumes(tail, depth); + return left._right.Subsumes(builder, tail, depth); } } @@ -738,7 +692,7 @@ internal bool Subsumes(SymbolicRegexNode other, int depth = 0) Debug.Assert(left._left is not null && left._right is not null); if (left._left.IsNullable) { - return left._right.Subsumes(right, depth); + return left._right.Subsumes(builder, right, depth); } } @@ -804,18 +758,19 @@ private SymbolicRegexNode UnwrapEffects() /// eliminate the alternation by simplifying to (xyz){0,3}?abc. 
Note that the transformation preserves the priority /// of the shorter "abc" match by making the prefix lazy. /// + /// the builder that owns this node /// the lower priority alternative /// the higher priority alternative /// the folded regex that eliminates alternation, or null if the operation fails /// accumulated effects from the right side /// whether folding was successful - private static bool TryFoldAlternation(SymbolicRegexNode left, SymbolicRegexNode right, [NotNullWhen(true)] out SymbolicRegexNode? result, + private static bool TryFoldAlternation(SymbolicRegexBuilder builder, SymbolicRegexNode left, SymbolicRegexNode right, [NotNullWhen(true)] out SymbolicRegexNode? result, SymbolicRegexNode? rightEffects = null) { // The rules below assume that the right side subsumes the left side - Debug.Assert(right.Subsumes(left)); + Debug.Assert(right.Subsumes(builder, left)); - rightEffects ??= left._builder.Epsilon; + rightEffects ??= builder.Epsilon; // If the sides are equal (ignoring effects) then just return the higher priority left side if (left.UnwrapEffects() == right.UnwrapEffects()) @@ -830,20 +785,20 @@ private SymbolicRegexNode UnwrapEffects() if (left._kind == SymbolicRegexNodeKind.Effect) { Debug.Assert(left._left is not null && left._right is not null); - Debug.Assert(right.Subsumes(left._left)); + Debug.Assert(right.Subsumes(builder, left._left)); // If there are any accumulated effects we don't know how to handle them here. // This shouldn't normally happen because this rule has priority over the rule // for effects on the right side. - if (rightEffects != left._builder.Epsilon) + if (rightEffects != builder.Epsilon) { result = null; return false; } - if (TryFoldAlternation(left._left, right, out SymbolicRegexNode? innerResult, rightEffects)) + if (TryFoldAlternation(builder, left._left, right, out SymbolicRegexNode? 
innerResult, rightEffects)) { - result = CreateEffect(left._builder, innerResult, left._right); + result = CreateEffect(builder, innerResult, left._right); return true; } } @@ -853,19 +808,19 @@ private SymbolicRegexNode UnwrapEffects() if (right._kind == SymbolicRegexNodeKind.Effect) { Debug.Assert(right._left is not null && right._right is not null); - Debug.Assert(right._left.Subsumes(left)); - rightEffects = CreateConcat(left._builder, right._right, rightEffects); - return TryFoldAlternation(left, right._left, out result, rightEffects); + Debug.Assert(right._left.Subsumes(builder, left)); + rightEffects = CreateConcat(builder, right._right, rightEffects); + return TryFoldAlternation(builder, left, right._left, out result, rightEffects); } // If we have Y | XY then this rule will find X and fold to X??Y. if (right._kind == SymbolicRegexNodeKind.Concat) { Debug.Assert(right._left is not null && right._right is not null); - if (right._left.IsNullable && TrySplitConcatSubsumption(left, right, out SymbolicRegexNode? prefix)) + if (right._left.IsNullable && TrySplitConcatSubsumption(builder, left, right, out SymbolicRegexNode? prefix)) { - prefix = CreateEffect(left._builder, prefix, rightEffects); - result = left._builder.CreateConcat(CreateLoop(left._builder, prefix, 0, 1, true), left); + prefix = CreateEffect(builder, prefix, rightEffects); + result = builder.CreateConcat(CreateLoop(builder, prefix, 0, 1, true), left); return true; } } @@ -875,7 +830,7 @@ private SymbolicRegexNode UnwrapEffects() return false; // This rule tries to find a prefix P that the right side has such that right is PR and left is equivalent to R - static bool TrySplitConcatSubsumption(SymbolicRegexNode left, SymbolicRegexNode right, + static bool TrySplitConcatSubsumption(SymbolicRegexBuilder builder, SymbolicRegexNode left, SymbolicRegexNode right, [NotNullWhen(true)] out SymbolicRegexNode? 
prefix) { List> prefixElements = new(); @@ -884,25 +839,25 @@ private SymbolicRegexNode UnwrapEffects() { Debug.Assert(suffix._left is not null && suffix._right is not null); // We maintain a loop invariant that the suffix subsumes the left hand side - Debug.Assert(suffix.Subsumes(left)); + Debug.Assert(suffix.Subsumes(builder, left)); if (suffix == left) { // We found a split, so store the prefix and return success prefixElements.Reverse(); - prefix = left._builder.CreateConcatAlreadyReversed(prefixElements); + prefix = builder.CreateConcatAlreadyReversed(prefixElements); return true; } - else if (suffix._right.Subsumes(left)) + else if (suffix._right.Subsumes(builder, left)) { // The tail of the suffix still subsumes left, so we can extend the prefix prefixElements.Add(suffix._left); suffix = suffix._right; } - else if (left.Subsumes(suffix)) + else if (left.Subsumes(builder, suffix)) { // If left subsumes the suffix, then due to the loop invariant we have equivalence prefixElements.Reverse(); - prefix = left._builder.CreateConcatAlreadyReversed(prefixElements); + prefix = builder.CreateConcatAlreadyReversed(prefixElements); return true; } else @@ -1015,9 +970,10 @@ public int GetFixedLength() /// This function will rebuild concatenations because it pushes the FixedLengthMarker into the rightmost element. /// Due to this this function should not be called on every character. 
/// + /// the builder that owns this node /// accumulater used in the recursion for lengths of paths /// the node with fixed length markers added - public SymbolicRegexNode AddFixedLengthMarkers(int lengthSoFar = 0) + public SymbolicRegexNode AddFixedLengthMarkers(SymbolicRegexBuilder builder, int lengthSoFar = 0) { if (!StackHelper.TryEnsureSufficientExecutionStack()) { @@ -1029,9 +985,9 @@ public SymbolicRegexNode AddFixedLengthMarkers(int lengthSoFar = 0) case SymbolicRegexNodeKind.Alternate: Debug.Assert(_left is not null && _right is not null); // For an Alternate attempt to add markers separately for each element - return CreateAlternate(_builder, - _left.AddFixedLengthMarkers(lengthSoFar), - _right.AddFixedLengthMarkers(lengthSoFar), deduplicated: true); + return CreateAlternate(builder, + _left.AddFixedLengthMarkers(builder, lengthSoFar), + _right.AddFixedLengthMarkers(builder, lengthSoFar), deduplicated: true); case SymbolicRegexNodeKind.Concat: Debug.Assert(_left is not null && _right is not null); @@ -1039,13 +995,13 @@ public SymbolicRegexNode AddFixedLengthMarkers(int lengthSoFar = 0) int leftLength = _left.GetFixedLength(); if (leftLength >= 0) { - return CreateConcat(_builder, _left, _right.AddFixedLengthMarkers(lengthSoFar + leftLength)); + return CreateConcat(builder, _left, _right.AddFixedLengthMarkers(builder, lengthSoFar + leftLength)); } // If the right side is always zero length, then just recurse to the left side int rightLength = _right.GetFixedLength(); if (rightLength == 0) { - return CreateConcat(_builder, _left.AddFixedLengthMarkers(lengthSoFar), _right); + return CreateConcat(builder, _left.AddFixedLengthMarkers(builder, lengthSoFar), _right); } break; @@ -1058,80 +1014,82 @@ public SymbolicRegexNode AddFixedLengthMarkers(int lengthSoFar = 0) // if there is one. int thisLength = GetFixedLength(); return thisLength < 0 ? 
this : - CreateConcat(_builder, this, CreateFixedLengthMarker(_builder, lengthSoFar + thisLength)); + CreateConcat(builder, this, CreateFixedLengthMarker(builder, lengthSoFar + thisLength)); } /// - /// Create a derivative ( and ) and then strip + /// Create a derivative ( and ) and then strip /// effects with . /// This derivative simulates backtracking, i.e. it only considers paths that backtracking would /// take before accepting the empty string for this pattern and returns the pattern ordered in the order backtracking /// would explore paths. For example the derivative of a*ab places a*ab before b, while for a*?ab the order is reversed. /// + /// the builder that owns this node /// given element wrt which the derivative is taken /// immediately surrounding character context that affects nullability of anchors /// the derivative - internal SymbolicRegexNode CreateDerivativeWithoutEffects(TSet elem, uint context) => CreateDerivativeWrapper(elem, context).StripEffects(); + internal SymbolicRegexNode CreateDerivativeWithoutEffects(SymbolicRegexBuilder builder, TSet elem, uint context) => CreateDerivativeWrapper(builder, elem, context).StripEffects(builder); /// - /// Create a derivative ( and ) and then strip + /// Create a derivative ( and ) and then strip /// and map effects for use in NFA simulation with . /// This derivative simulates backtracking, i.e. it only considers paths that backtracking would /// take before accepting the empty string for this pattern and returns the pattern ordered in the order backtracking /// would explore paths. For example the derivative of a*ab places a*ab before b, while for a*?ab the order is reversed. /// /// - /// The differences of this to are that (1) effects (e.g. capture starts and ends) + /// The differences of this to are that (1) effects (e.g. 
capture starts and ends) /// are considered and (2) the different elements that would form a top level union are instead returned as separate /// nodes (paired with their associated effects). This function is meant to be used for NFA simulation, where top level /// unions would be broken up into separate states. /// + /// the builder that owns this node /// given element wrt which the derivative is taken /// immediately surrounding character context that affects nullability of anchors /// the derivative - internal List<(SymbolicRegexNode, DerivativeEffect[])> CreateNfaDerivativeWithEffects(TSet elem, uint context) + internal List<(SymbolicRegexNode, DerivativeEffect[])> CreateNfaDerivativeWithEffects(SymbolicRegexBuilder builder, TSet elem, uint context) { List<(SymbolicRegexNode, DerivativeEffect[])> transitions = new(); - CreateDerivativeWrapper(elem, context).StripAndMapEffects(context, transitions); + CreateDerivativeWrapper(builder, elem, context).StripAndMapEffects(builder, context, transitions); return transitions; } // This wrapper handles the shared top-level concerns of constructing derivatives. 
Namely: // -Unwrapping and rewrapping nodes in DisableBacktrackingSimulation // -When backtracking is being simulated calling into PruneLowerPriorityThanNullability - private SymbolicRegexNode CreateDerivativeWrapper(TSet elem, uint context) + private SymbolicRegexNode CreateDerivativeWrapper(SymbolicRegexBuilder builder, TSet elem, uint context) { if (this._kind == SymbolicRegexNodeKind.DisableBacktrackingSimulation) { // This node kind can only occur at the top level and indicates that backtracking simulation is turned off Debug.Assert(_left is not null); - SymbolicRegexNode derivative = _left.CreateDerivative(elem, context); + SymbolicRegexNode derivative = _left.CreateDerivative(builder, elem, context); // Reinsert the marker that maintains the non-backtracking semantics - return _builder.CreateDisableBacktrackingSimulation(derivative); + return builder.CreateDisableBacktrackingSimulation(derivative); } else { // If this node is nullable for the given context then prune any branches that are less preferred than // just the empty match. This is done in order to maintain backtracking semantics. - SymbolicRegexNode node = IsNullableFor(context) ? PruneLowerPriorityThanNullability(context) : this; - return node.CreateDerivative(elem, context); + SymbolicRegexNode node = IsNullableFor(context) ? PruneLowerPriorityThanNullability(builder, context) : this; + return node.CreateDerivative(builder, elem, context); } } /// Prune this node wrt the given context in order to maintain backtracking semantics. Mimics how backtracking chooses a path. - private SymbolicRegexNode PruneLowerPriorityThanNullability(uint context) + private SymbolicRegexNode PruneLowerPriorityThanNullability(SymbolicRegexBuilder builder, uint context) { //caching pruning to avoid otherwise potential quadratic worst case behavior SymbolicRegexNode? 
prunedNode; (SymbolicRegexNode, uint) key = (this, context); - if (_builder._pruneLowerPriorityThanNullabilityCache.TryGetValue(key, out prunedNode)) + if (builder._pruneLowerPriorityThanNullabilityCache.TryGetValue(key, out prunedNode)) { return prunedNode; } if (!StackHelper.TryEnsureSufficientExecutionStack()) { - return StackHelper.CallOnEmptyStack(PruneLowerPriorityThanNullability, context); + return StackHelper.CallOnEmptyStack(PruneLowerPriorityThanNullability, builder, context); } switch (_kind) @@ -1143,8 +1101,8 @@ private SymbolicRegexNode PruneLowerPriorityThanNullability(uint context) // In a alternation (X|Y) where X is nullable (in the given context), Y must be eliminated. // Thus, taking the higher-priority branch in backtracking that is known to lead to a match // at which point the other branches become irrelevant and must no longer be used. - prunedNode = _left.IsNullableFor(context) ? _left.PruneLowerPriorityThanNullability(context) : - CreateAlternate(_builder, _left, _right.PruneLowerPriorityThanNullability(context), deduplicated: true); + prunedNode = _left.IsNullableFor(context) ? _left.PruneLowerPriorityThanNullability(builder, context) : + CreateAlternate(builder, _left, _right.PruneLowerPriorityThanNullability(builder, context), deduplicated: true); break; case SymbolicRegexNodeKind.Concat: @@ -1159,20 +1117,20 @@ private SymbolicRegexNode PruneLowerPriorityThanNullability(uint context) //e.g. a{0,5}?b{0,5}? reduces to () prunedNode = _left._kind == SymbolicRegexNodeKind.Alternate ? (_left._left!.IsNullableFor(context) ? 
- CreateConcat(_builder, _left._left, _right).PruneLowerPriorityThanNullability(context) : - CreateAlternate(_builder, CreateConcat(_builder, _left._left, _right), CreateConcat(_builder, _left._right!, _right).PruneLowerPriorityThanNullability(context))) : - CreateConcat(_builder, _left.PruneLowerPriorityThanNullability(context), _right.PruneLowerPriorityThanNullability(context)); + CreateConcat(builder, _left._left, _right).PruneLowerPriorityThanNullability(builder, context) : + CreateAlternate(builder, CreateConcat(builder, _left._left, _right), CreateConcat(builder, _left._right!, _right).PruneLowerPriorityThanNullability(builder, context))) : + CreateConcat(builder, _left.PruneLowerPriorityThanNullability(builder, context), _right.PruneLowerPriorityThanNullability(builder, context)); break; case SymbolicRegexNodeKind.Loop when _info.IsLazyLoop && _lower == 0: //lazy nullable loop reduces to (), i.e., the loop body is just forgotten - prunedNode = _builder.Epsilon; + prunedNode = builder.Epsilon; break; case SymbolicRegexNodeKind.Effect: //Effects are maintained and the pruning is propagated to the body of the effect Debug.Assert(_left is not null && _right is not null); - prunedNode = CreateEffect(_builder, _left.PruneLowerPriorityThanNullability(context), _right); + prunedNode = CreateEffect(builder, _left.PruneLowerPriorityThanNullability(builder, context), _right); break; default: @@ -1181,7 +1139,7 @@ private SymbolicRegexNode PruneLowerPriorityThanNullability(uint context) break; } - _builder._pruneLowerPriorityThanNullabilityCache[key] = prunedNode; + builder._pruneLowerPriorityThanNullabilityCache[key] = prunedNode; return prunedNode; } @@ -1205,19 +1163,20 @@ private SymbolicRegexNode PruneLowerPriorityThanNullability(uint context) /// positions for capture starts and ends. 
For example, given a DerivativeEffect for CaptureStart of capture number 0 /// and an input position 5, applying it to a Registers instance is simply assigning the relevant value to 5. /// + /// the builder that owns this node /// given element wrt which the derivative is taken /// immediately surrounding character context that affects nullability of anchors /// the derivative - private SymbolicRegexNode CreateDerivative(TSet elem, uint context) + private SymbolicRegexNode CreateDerivative(SymbolicRegexBuilder builder, TSet elem, uint context) { if (!StackHelper.TryEnsureSufficientExecutionStack()) { - return StackHelper.CallOnEmptyStack(CreateDerivative, elem, context); + return StackHelper.CallOnEmptyStack(CreateDerivative, builder, elem, context); } SymbolicRegexNode? derivative; (SymbolicRegexNode, TSet, uint) key = (this, elem, context); - if (_builder._derivativeCache.TryGetValue(key, out derivative)) + if (builder._derivativeCache.TryGetValue(key, out derivative)) { return derivative; } @@ -1230,14 +1189,14 @@ private SymbolicRegexNode CreateDerivative(TSet elem, uint context) // The following check assumes that either (1) the element and set are minterms, in which case // the element is exactly the set if the intersection is non-empty (satisfiable), or (2) the element is a singleton // set in which case it is fully contained in the set if the intersection is non-empty. - if (!_builder._solver.IsEmpty(_builder._solver.And(elem, _set))) + if (!builder._solver.IsEmpty(builder._solver.And(elem, _set))) { // the sigleton is consumed so the derivative is epsilon - derivative = _builder.Epsilon; + derivative = builder.Epsilon; } else { - derivative = _builder._nothing; + derivative = builder._nothing; } break; } @@ -1250,12 +1209,12 @@ private SymbolicRegexNode CreateDerivative(TSet elem, uint context) { // If the left side is not nullable then the character must be consumed there. // For example, Da(ab) = Da(a)b = b. 
- derivative = _builder.CreateConcat(_left.CreateDerivative(elem, context), _right); + derivative = builder.CreateConcat(_left.CreateDerivative(builder, elem, context), _right); } else { - SymbolicRegexNode leftDerivative = _builder.CreateConcat(_left.CreateDerivative(elem, context), _right); - SymbolicRegexNode rightDerivative = _builder.CreateEffect(_right.CreateDerivative(elem, context), _left); + SymbolicRegexNode leftDerivative = builder.CreateConcat(_left.CreateDerivative(builder, elem, context), _right); + SymbolicRegexNode rightDerivative = builder.CreateEffect(_right.CreateDerivative(builder, elem, context), _left); // If the left alternative is high-priority-nullable then // the priority is to skip left and prioritize rderiv over lderivR // Two examples: suppose elem = a @@ -1268,8 +1227,8 @@ private SymbolicRegexNode CreateDerivative(TSet elem, uint context) // In the second case backtracking would try to continue to follow (ab)* after reading b // This backtracking semantics is effectively being recorded into the order of the alternatives derivative = _left.IsHighPriorityNullableFor(context) ? 
- CreateAlternate(_builder, rightDerivative, leftDerivative, hintRightLikelySubsumes: true) : - CreateAlternate(_builder, leftDerivative, rightDerivative); + CreateAlternate(builder, rightDerivative, leftDerivative, hintRightLikelySubsumes: true) : + CreateAlternate(builder, leftDerivative, rightDerivative); } break; } @@ -1279,10 +1238,10 @@ private SymbolicRegexNode CreateDerivative(TSet elem, uint context) Debug.Assert(_left is not null); Debug.Assert(_upper > 0); - SymbolicRegexNode bodyDerivative = _left.CreateDerivative(elem, context); - if (bodyDerivative.IsNothing) + SymbolicRegexNode bodyDerivative = _left.CreateDerivative(builder, elem, context); + if (bodyDerivative.IsNothing(builder._solver)) { - derivative = _builder._nothing; + derivative = builder._nothing; } else { @@ -1294,7 +1253,7 @@ private SymbolicRegexNode CreateDerivative(TSet elem, uint context) int newlower = _lower == 0 || _lower == int.MaxValue ? _lower : _lower - 1; // the continued loop becomes epsilon when newlower == newupper == 0 // in which case the returned concatenation will be just bodyDerivative - derivative = _builder.CreateConcat(bodyDerivative, _builder.CreateLoop(_left, IsLazy, newlower, newupper)); + derivative = builder.CreateConcat(bodyDerivative, builder.CreateLoop(_left, IsLazy, newlower, newupper)); } break; } @@ -1302,7 +1261,7 @@ private SymbolicRegexNode CreateDerivative(TSet elem, uint context) case SymbolicRegexNodeKind.Alternate: { Debug.Assert(_left is not null && _right is not null); - derivative = CreateAlternate(_builder, _left.CreateDerivative(elem, context), _right.CreateDerivative(elem, context)); + derivative = CreateAlternate(builder, _left.CreateDerivative(builder, elem, context), _right.CreateDerivative(builder, elem, context)); break; } @@ -1314,11 +1273,11 @@ private SymbolicRegexNode CreateDerivative(TSet elem, uint context) default: // The derivative of any other case is nothing // e.g. 
taking the derivative of () (epsilon) is [] (nothing) - derivative = _builder._nothing; + derivative = builder._nothing; break; } - _builder._derivativeCache[key] = derivative; + builder._derivativeCache[key] = derivative; return derivative; } @@ -1327,11 +1286,11 @@ private SymbolicRegexNode CreateDerivative(TSet elem, uint context) /// So Effect(R,E) would be simplified to just R. /// /// the node with all Effect nodes stripped away - internal SymbolicRegexNode StripEffects() + internal SymbolicRegexNode StripEffects(SymbolicRegexBuilder builder) { if (!StackHelper.TryEnsureSufficientExecutionStack()) { - return StackHelper.CallOnEmptyStack(StripEffects); + return StackHelper.CallOnEmptyStack(StripEffects, builder); } // If the node doesn't contain any Effect nodes under it we are done @@ -1344,12 +1303,12 @@ internal SymbolicRegexNode StripEffects() case SymbolicRegexNodeKind.Effect: Debug.Assert(_left is not null && _right is not null); // This is the place where the effect (the right child) is getting ignored - return _left.StripEffects(); + return _left.StripEffects(builder); case SymbolicRegexNodeKind.Concat: Debug.Assert(_left is not null && _right is not null); Debug.Assert(_left._info.ContainsEffect && !_right._info.ContainsEffect); - return _builder.CreateConcat(_left.StripEffects(), _right); + return builder.CreateConcat(_left.StripEffects(builder), _right); case SymbolicRegexNodeKind.Alternate: Debug.Assert(_left is not null && _right is not null); @@ -1357,16 +1316,16 @@ internal SymbolicRegexNode StripEffects() // the elements. We don't want to omit deduplication here, since he stripping may make nodes equal. 
List> elems = ToList(listKind: SymbolicRegexNodeKind.Alternate); for (int i = 0; i < elems.Count; i++) - elems[i] = elems[i].StripEffects(); - return _builder.Alternate(elems); + elems[i] = elems[i].StripEffects(builder); + return builder.Alternate(elems); case SymbolicRegexNodeKind.DisableBacktrackingSimulation: Debug.Assert(_left is not null); - return _builder.CreateDisableBacktrackingSimulation(_left.StripEffects()); + return builder.CreateDisableBacktrackingSimulation(_left.StripEffects(builder)); case SymbolicRegexNodeKind.Loop: Debug.Assert(_left is not null); - return _builder.CreateLoop(_left.StripEffects(), IsLazy, _lower, _upper); + return builder.CreateLoop(_left.StripEffects(builder), IsLazy, _lower, _upper); default: Debug.Fail($"{nameof(StripEffects)}:{_kind}"); @@ -1386,15 +1345,16 @@ internal SymbolicRegexNode StripEffects() /// Here both include the CaptureStart_0 effect, since both are nested inside the outer Effect node, /// while only R includes the CaptureStart_1 effect. /// + /// the builder that owns this node /// immediately surrounding character context that affects nullability of anchors /// the list to insert the pairs of nodes and their effects into in priority order /// a helper list this function uses to accumulate effects in recursive calls - internal void StripAndMapEffects(uint context, List<(SymbolicRegexNode, DerivativeEffect[])> alternativesAndEffects, + internal void StripAndMapEffects(SymbolicRegexBuilder builder, uint context, List<(SymbolicRegexNode, DerivativeEffect[])> alternativesAndEffects, List? 
currentEffects = null) { if (!StackHelper.TryEnsureSufficientExecutionStack()) { - StackHelper.CallOnEmptyStack(StripAndMapEffects, context, alternativesAndEffects, currentEffects); + StackHelper.CallOnEmptyStack(StripAndMapEffects, builder, context, alternativesAndEffects, currentEffects); return; } @@ -1418,7 +1378,7 @@ internal SymbolicRegexNode StripEffects() int oldEffectCount = currentEffects.Count; _right.ApplyEffects((e, s) => s.Add(e), context, currentEffects); // Recurse into the main child - _left.StripAndMapEffects(context, alternativesAndEffects, currentEffects); + _left.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects); // Pop all the effects that were pushed above currentEffects.RemoveRange(oldEffectCount, currentEffects.Count - oldEffectCount); return; @@ -1430,19 +1390,19 @@ internal SymbolicRegexNode StripEffects() // For concat the nodes for the left hand side are added first and then fixed up by concatenating // the right side to each of them. 
int oldAlternativesCount = alternativesAndEffects.Count; - _left.StripAndMapEffects(context, alternativesAndEffects, currentEffects); + _left.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects); for (int i = oldAlternativesCount; i < alternativesAndEffects.Count; i++) { var (node, effects) = alternativesAndEffects[i]; - alternativesAndEffects[i] = (_builder.CreateConcat(node, _right), effects); + alternativesAndEffects[i] = (builder.CreateConcat(node, _right), effects); } break; } case SymbolicRegexNodeKind.Alternate: Debug.Assert(_left is not null && _right is not null); - _left.StripAndMapEffects(context, alternativesAndEffects, currentEffects); - _right.StripAndMapEffects(context, alternativesAndEffects, currentEffects); + _left.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects); + _right.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects); break; case SymbolicRegexNodeKind.Loop when _lower == 0 && _upper == 1: @@ -1452,14 +1412,14 @@ internal SymbolicRegexNode StripEffects() Debug.Assert(_left is not null); // For lazy loops skipping is preferred, so output the epsilon first if (IsLazy) - alternativesAndEffects.Add((_builder.Epsilon, currentEffects.Count > 0 ? + alternativesAndEffects.Add((builder.Epsilon, currentEffects.Count > 0 ? currentEffects.ToArray() : Array.Empty())); // Recurse into the body - _left.StripAndMapEffects(context, alternativesAndEffects, currentEffects); + _left.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects); // For eager loops the body is preferred, so output the epsilon last if (!IsLazy) - alternativesAndEffects.Add((_builder.Epsilon, currentEffects.Count > 0 ? + alternativesAndEffects.Add((builder.Epsilon, currentEffects.Count > 0 ? 
currentEffects.ToArray() : Array.Empty())); break; @@ -1468,11 +1428,11 @@ internal SymbolicRegexNode StripEffects() { Debug.Assert(_left is not null); int oldAlternativesCount = alternativesAndEffects.Count; - _left.StripAndMapEffects(context, alternativesAndEffects, currentEffects); + _left.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects); for (int i = oldAlternativesCount; i < alternativesAndEffects.Count; i++) { var (node, effects) = alternativesAndEffects[i]; - alternativesAndEffects[i] = (_builder.CreateDisableBacktrackingSimulation(node), effects); + alternativesAndEffects[i] = (builder.CreateDisableBacktrackingSimulation(node), effects); } break; } @@ -1632,12 +1592,12 @@ internal void ToStringHelper(StringBuilder sb) case SymbolicRegexNodeKind.Singleton: Debug.Assert(_set is not null); - sb.Append(_builder._solver.PrettyPrint(_set, _builder._charSetSolver)); + sb.Append(_debugBuilder!._solver.PrettyPrint(_set, _debugBuilder._charSetSolver)); return; case SymbolicRegexNodeKind.Loop: Debug.Assert(_left is not null); - if (IsAnyStar) + if (IsAnyStar(_debugBuilder!._solver)) { sb.Append(".*"); } @@ -1782,19 +1742,19 @@ static void AppendNumberSuperscript(StringBuilder sb, int value) /// /// Returns all sets that occur in the regex or the full set if there are no sets in the regex (e.g. the regex is "^"). /// - public HashSet GetSets() + public HashSet GetSets(SymbolicRegexBuilder builder) { var sets = new HashSet(); - CollectSets(sets); + CollectSets(builder, sets); return sets; } /// Collects all sets that occur in the regex into the specified collection. 
- private void CollectSets(HashSet sets) + private void CollectSets(SymbolicRegexBuilder builder, HashSet sets) { if (!StackHelper.TryEnsureSufficientExecutionStack()) { - StackHelper.CallOnEmptyStack(CollectSets, sets); + StackHelper.CallOnEmptyStack(CollectSets, builder, sets); return; } @@ -1804,7 +1764,7 @@ private void CollectSets(HashSet sets) case SymbolicRegexNodeKind.EOLAnchor: case SymbolicRegexNodeKind.EndAnchorZ: case SymbolicRegexNodeKind.EndAnchorZReverse: - sets.Add(_builder._newLineSet); + sets.Add(builder._newLineSet); return; case SymbolicRegexNodeKind.BeginningAnchor: @@ -1822,13 +1782,13 @@ private void CollectSets(HashSet sets) case SymbolicRegexNodeKind.Loop: Debug.Assert(_left is not null); - _left.CollectSets(sets); + _left.CollectSets(builder, sets); return; case SymbolicRegexNodeKind.Alternate: Debug.Assert(_left is not null && _right is not null); - _left.CollectSets(sets); - _right.CollectSets(sets); + _left.CollectSets(builder, sets); + _right.CollectSets(builder, sets); return; case SymbolicRegexNodeKind.Concat: @@ -1837,20 +1797,20 @@ private void CollectSets(HashSet sets) while (conc._kind == SymbolicRegexNodeKind.Concat) { Debug.Assert(conc._left is not null && conc._right is not null); - conc._left.CollectSets(sets); + conc._left.CollectSets(builder, sets); conc = conc._right; } - conc.CollectSets(sets); + conc.CollectSets(builder, sets); return; case SymbolicRegexNodeKind.DisableBacktrackingSimulation: Debug.Assert(_left is not null); - _left.CollectSets(sets); + _left.CollectSets(builder, sets); return; case SymbolicRegexNodeKind.NonBoundaryAnchor: case SymbolicRegexNodeKind.BoundaryAnchor: - sets.Add(_builder._wordLetterForBoundariesSet); + sets.Add(builder._wordLetterForBoundariesSet); return; default: @@ -1860,10 +1820,10 @@ private void CollectSets(HashSet sets) } /// Compute and sort all the minterms from the sets in this regex. 
- public TSet[] ComputeMinterms() + public TSet[] ComputeMinterms(SymbolicRegexBuilder builder) { - HashSet sets = GetSets(); - List minterms = MintermGenerator.GenerateMinterms(_builder._solver, sets); + HashSet sets = GetSets(builder); + List minterms = MintermGenerator.GenerateMinterms(builder._solver, sets); minterms.Sort(); return minterms.ToArray(); } @@ -1871,69 +1831,69 @@ public TSet[] ComputeMinterms() /// /// Create the reverse of this regex /// - public SymbolicRegexNode Reverse() + public SymbolicRegexNode Reverse(SymbolicRegexBuilder builder) { if (!StackHelper.TryEnsureSufficientExecutionStack()) { - return StackHelper.CallOnEmptyStack(Reverse); + return StackHelper.CallOnEmptyStack(Reverse, builder); } switch (_kind) { case SymbolicRegexNodeKind.Loop: Debug.Assert(_left is not null); - return _builder.CreateLoop(_left.Reverse(), IsLazy, _lower, _upper); + return builder.CreateLoop(_left.Reverse(builder), IsLazy, _lower, _upper); case SymbolicRegexNodeKind.Concat: { Debug.Assert(_left is not null && _right is not null); - SymbolicRegexNode rev = _left.Reverse(); + SymbolicRegexNode rev = _left.Reverse(builder); SymbolicRegexNode rest = _right; while (rest._kind == SymbolicRegexNodeKind.Concat) { Debug.Assert(rest._left is not null && rest._right is not null); - SymbolicRegexNode rev1 = rest._left.Reverse(); - rev = _builder.CreateConcat(rev1, rev); + SymbolicRegexNode rev1 = rest._left.Reverse(builder); + rev = builder.CreateConcat(rev1, rev); rest = rest._right; } - SymbolicRegexNode restr = rest.Reverse(); - rev = _builder.CreateConcat(restr, rev); + SymbolicRegexNode restr = rest.Reverse(builder); + rev = builder.CreateConcat(restr, rev); return rev; } case SymbolicRegexNodeKind.Alternate: Debug.Assert(_left is not null && _right is not null); - return CreateAlternate(_builder, _left.Reverse(), _right.Reverse()); + return CreateAlternate(builder, _left.Reverse(builder), _right.Reverse(builder)); case SymbolicRegexNodeKind.FixedLengthMarker: // 
Fixed length markers are omitted in reverse - return _builder.Epsilon; + return builder.Epsilon; case SymbolicRegexNodeKind.BeginningAnchor: // The reverse of BeginningAnchor is EndAnchor - return _builder.EndAnchor; + return builder.EndAnchor; case SymbolicRegexNodeKind.EndAnchor: - return _builder.BeginningAnchor; + return builder.BeginningAnchor; case SymbolicRegexNodeKind.BOLAnchor: // The reverse of BOLanchor is EOLanchor - return _builder.EolAnchor; + return builder.EolAnchor; case SymbolicRegexNodeKind.EOLAnchor: - return _builder.BolAnchor; + return builder.BolAnchor; case SymbolicRegexNodeKind.EndAnchorZ: // The reversal of the \Z anchor - return _builder.EndAnchorZReverse; + return builder.EndAnchorZReverse; case SymbolicRegexNodeKind.EndAnchorZReverse: Debug.Fail("Should only happen if a reversed regex is reversed again, which isn't expected"); - return _builder.EndAnchorZ; + return builder.EndAnchorZ; case SymbolicRegexNodeKind.DisableBacktrackingSimulation: Debug.Assert(_left is not null); - return _builder.CreateDisableBacktrackingSimulation(_left.Reverse()); + return builder.CreateDisableBacktrackingSimulation(_left.Reverse(builder)); // Remaining cases map to themselves: case SymbolicRegexNodeKind.Epsilon: @@ -1974,12 +1934,8 @@ internal bool StartsWithLoop(int upperBoundLowestValue = 1) }; } - - /// Gets the set that includes all elements that can start a match. - internal TSet GetStartSet() => _startSet; - /// Computes the set that includes all elements that can start a match. 
- private TSet ComputeStartSet() + public TSet GetStartSet(SymbolicRegexBuilder builder) { switch (_kind) { @@ -1996,7 +1952,7 @@ private TSet ComputeStartSet() case SymbolicRegexNodeKind.BOLAnchor: case SymbolicRegexNodeKind.CaptureStart: case SymbolicRegexNodeKind.CaptureEnd: - return _builder._solver.Empty; + return builder._solver.Empty; case SymbolicRegexNodeKind.Singleton: Debug.Assert(_set is not null); @@ -2004,44 +1960,64 @@ private TSet ComputeStartSet() case SymbolicRegexNodeKind.Loop: Debug.Assert(_left is not null); - return _left._startSet; + return _left.GetStartSet(builder); case SymbolicRegexNodeKind.Concat: { Debug.Assert(_left is not null && _right is not null); - TSet startSet = _left.CanBeNullable ? _builder._solver.Or(_left._startSet, _right._startSet) : _left._startSet; + TSet startSet = _left.CanBeNullable ? builder._solver.Or(_left.GetStartSet(builder), _right.GetStartSet(builder)) : _left.GetStartSet(builder); return startSet; } case SymbolicRegexNodeKind.Alternate: { Debug.Assert(_left is not null && _right is not null); - return _builder._solver.Or(_left._startSet, _right._startSet); + return builder._solver.Or(_left.GetStartSet(builder), _right.GetStartSet(builder)); } case SymbolicRegexNodeKind.DisableBacktrackingSimulation: case SymbolicRegexNodeKind.Effect: Debug.Assert(_left is not null); - return _left._startSet; + return _left.GetStartSet(builder); default: - Debug.Fail($"{nameof(ComputeStartSet)}:{_kind}"); - return _builder._solver.Full; + Debug.Fail($"{nameof(GetStartSet)}:{_kind}"); + return builder._solver.Full; } } /// /// Replace anchors that are infeasible by [] wrt the given previous character kind and what continuation is possible. /// + /// + /// This helps the matcher detect deadend states that have no viable matches in situations where the pattern's + /// language is empty due to interactions between anchors and the rest of the pattern. For example, a*\ba would + /// be simplified to [] when prevKind is a word letter. 
This allows the matcher to avoid spurious work and return + /// early. + /// + /// the builder that owns this node /// previous character kind - /// if true the continuation can start with wordletter or stop - /// if true the continuation can start with nonwordletter or stop - internal SymbolicRegexNode PruneAnchors(uint prevKind, bool contWithWL, bool contWithNWL) + internal SymbolicRegexNode PruneAnchors(SymbolicRegexBuilder builder, uint prevKind) + { + //first prune the anchors in the node + TSet wlbSet = builder._wordLetterForBoundariesSet; + TSet startSet = GetStartSet(builder); + + //true if the startset of the node overlaps with some wordletter or the node can be nullable + bool contWithWL = CanBeNullable || !builder._solver.IsEmpty(builder._solver.And(wlbSet, startSet)); + + //true if the startset of the node overlaps with some nonwordletter or the node can be nullable + bool contWithNWL = CanBeNullable || !builder._solver.IsEmpty(builder._solver.And(builder._solver.Not(wlbSet), startSet)); + + return PruneAnchorsImpl(builder, prevKind, contWithWL, contWithNWL); + } + + private SymbolicRegexNode PruneAnchorsImpl(SymbolicRegexBuilder builder, uint prevKind, bool contWithWL, bool contWithNWL) { // Guard against stack overflow due to deep recursion if (!StackHelper.TryEnsureSufficientExecutionStack()) { - return StackHelper.CallOnEmptyStack(PruneAnchors, prevKind, contWithWL, contWithNWL); + return StackHelper.CallOnEmptyStack(PruneAnchorsImpl, builder, prevKind, contWithWL, contWithNWL); } if (!_info.StartsWithSomeAnchor) @@ -2052,73 +2028,73 @@ internal SymbolicRegexNode PruneAnchors(uint prevKind, bool contWithWL, bo case SymbolicRegexNodeKind.BeginningAnchor: return prevKind == CharKind.BeginningEnd ? 
this : - _builder._nothing; //start anchor is only nullable if the previous character is Start + builder._nothing; //start anchor is only nullable if the previous character is Start case SymbolicRegexNodeKind.EndAnchorZReverse: return ((prevKind & CharKind.BeginningEnd) != 0) ? this : - _builder._nothing; //rev(\Z) is only nullable if the previous characters is Start or the very first \n + builder._nothing; //rev(\Z) is only nullable if the previous characters is Start or the very first \n case SymbolicRegexNodeKind.BoundaryAnchor: return (prevKind == CharKind.WordLetter ? contWithNWL : contWithWL) ? this : // \b is impossible when the previous character is \w but no continuation matches \W // or the previous character is \W but no continuation matches \w - _builder._nothing; + builder._nothing; case SymbolicRegexNodeKind.NonBoundaryAnchor: return (prevKind == CharKind.WordLetter ? contWithWL : contWithNWL) ? this : // \B is impossible when the previous character is \w but no continuation matches \w // or the previous character is \W but no continuation matches \W - _builder._nothing; + builder._nothing; case SymbolicRegexNodeKind.Loop: Debug.Assert(_left is not null); - SymbolicRegexNode body = _left.PruneAnchors(prevKind, contWithWL, contWithNWL); + SymbolicRegexNode body = _left.PruneAnchorsImpl(builder, prevKind, contWithWL, contWithNWL); return body == _left ? this : - CreateLoop(_builder, body, _lower, _upper, IsLazy); + CreateLoop(builder, body, _lower, _upper, IsLazy); case SymbolicRegexNodeKind.Concat: { Debug.Assert(_left is not null && _right is not null); - SymbolicRegexNode left1 = _left.PruneAnchors(prevKind, contWithWL, contWithNWL); - SymbolicRegexNode right1 = _left.IsNullable ? _right.PruneAnchors(prevKind, contWithWL, contWithNWL) : _right; + SymbolicRegexNode left1 = _left.PruneAnchorsImpl(builder, prevKind, contWithWL, contWithNWL); + SymbolicRegexNode right1 = _left.IsNullable ? 
_right.PruneAnchorsImpl(builder, prevKind, contWithWL, contWithNWL) : _right; Debug.Assert(left1 is not null && right1 is not null); return left1 == _left && right1 == _right ? this : - CreateConcat(_builder, left1, right1); + CreateConcat(builder, left1, right1); } case SymbolicRegexNodeKind.Alternate: { Debug.Assert(_left is not null && _right is not null); - SymbolicRegexNode left1 = _left.PruneAnchors(prevKind, contWithWL, contWithNWL); - SymbolicRegexNode right1 = _right.PruneAnchors(prevKind, contWithWL, contWithNWL); + SymbolicRegexNode left1 = _left.PruneAnchorsImpl(builder, prevKind, contWithWL, contWithNWL); + SymbolicRegexNode right1 = _right.PruneAnchorsImpl(builder, prevKind, contWithWL, contWithNWL); Debug.Assert(left1 is not null && right1 is not null); return left1 == _left && right1 == _right ? this : - CreateAlternate(_builder, left1, right1); + CreateAlternate(builder, left1, right1); } case SymbolicRegexNodeKind.Effect: { Debug.Assert(_left is not null && _right is not null); - SymbolicRegexNode left1 = _left.PruneAnchors(prevKind, contWithWL, contWithNWL); + SymbolicRegexNode left1 = _left.PruneAnchorsImpl(builder, prevKind, contWithWL, contWithNWL); return left1 == _left ? this : - CreateEffect(_builder, left1, _right); + CreateEffect(builder, left1, _right); } case SymbolicRegexNodeKind.DisableBacktrackingSimulation: Debug.Assert(_left is not null); - SymbolicRegexNode child = _left.PruneAnchors(prevKind, contWithWL, contWithNWL); + SymbolicRegexNode child = _left.PruneAnchorsImpl(builder, prevKind, contWithWL, contWithNWL); return child == _left ? this : - _builder.CreateDisableBacktrackingSimulation(child); + builder.CreateDisableBacktrackingSimulation(child); default: return this; @@ -2175,7 +2151,7 @@ internal int ResolveFixedLength(uint context) /// and the resulting elements re-wrapped to maintain the metadata. 
/// /// an enumeration of the elements of the alternation, or just the node itself if there is no alternation - internal IEnumerable> EnumerateAlternationBranches() + internal IEnumerable> EnumerateAlternationBranches(SymbolicRegexBuilder builder) { switch (_kind) { @@ -2183,10 +2159,10 @@ internal IEnumerable> EnumerateAlternationBranches() Debug.Assert(_left is not null); // This call should never recurse more than one level Debug.Assert(_left._kind is not SymbolicRegexNodeKind.DisableBacktrackingSimulation); - foreach (SymbolicRegexNode element in _left.EnumerateAlternationBranches()) + foreach (SymbolicRegexNode element in _left.EnumerateAlternationBranches(builder)) { // Re-wrap the element nodes in DisableBacktrackingSimulation if the top level node was too - yield return _builder.CreateDisableBacktrackingSimulation(element); + yield return builder.CreateDisableBacktrackingSimulation(element); } break; case SymbolicRegexNodeKind.Alternate: diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs index 431a85590c7d6bd5c8df89e6a125f2f369efe347..079b42e523ac036907b5d3c7bb00e179e7a13510 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs @@ -37,8 +37,8 @@ public SymbolicRegexRunnerFactory(RegexTree regexTree, RegexOptions options, Tim } } - rootNode = rootNode.AddFixedLengthMarkers(); - BDD[] minterms = rootNode.ComputeMinterms(); + rootNode = rootNode.AddFixedLengthMarkers(bddBuilder); + BDD[] minterms = rootNode.ComputeMinterms(bddBuilder); _matcher = minterms.Length > 64 ? 
SymbolicRegexMatcher.Create(regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new BitVectorSolver(minterms, charSetSolver), matchTimeout) : diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs b/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs index ee39405583e2551083ede273810c7ac7c7e5ecf5..48d94c51037961e17f56bf46e0d29f4a3f59c717 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs @@ -64,6 +64,38 @@ public static bool TryEnsureSufficientExecutionStack() .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) .GetAwaiter().GetResult(); + /// Calls the provided function on the stack of a different thread pool thread. + /// The type of the first argument to pass to the function. + /// The type of the second argument to pass to the function. + /// The type of the third argument to pass to the function. + /// The type of the fourth argument to pass to the function. + /// The action to invoke. + /// The first argument to pass to the action. + /// The second argument to pass to the action. + /// The third argument to pass to the action. + /// The fourth argument to pass to the action. + public static void CallOnEmptyStack(Action action, TArg1 arg1, TArg2 arg2, TArg3 arg3, TArg4 arg4) => + Task.Run(() => action(arg1, arg2, arg3, arg4)) + .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) + .GetAwaiter().GetResult(); + + /// Calls the provided function on the stack of a different thread pool thread. + /// The type of the first argument to pass to the function. + /// The type of the second argument to pass to the function. 
+ /// The type of the third argument to pass to the function. + /// The type of the fourth argument to pass to the function. + /// The type of the fifth argument to pass to the function. + /// The action to invoke. + /// The first argument to pass to the action. + /// The second argument to pass to the action. + /// The third argument to pass to the action. + /// The fourth argument to pass to the action. + /// The fifth argument to pass to the action. + public static void CallOnEmptyStack(Action action, TArg1 arg1, TArg2 arg2, TArg3 arg3, TArg4 arg4, TArg5 arg5) => + Task.Run(() => action(arg1, arg2, arg3, arg4, arg5)) + .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) + .GetAwaiter().GetResult(); + /// Calls the provided function on the stack of a different thread pool thread. /// The type of the first argument to pass to the function. /// The type of the second argument to pass to the function. @@ -126,5 +158,21 @@ public static bool TryEnsureSufficientExecutionStack() Task.Run(() => func(arg1, arg2, arg3)) .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) .GetAwaiter().GetResult(); + + /// Calls the provided function on the stack of a different thread pool thread. + /// The type of the first argument to pass to the function. + /// The type of the second argument to pass to the function. + /// The type of the third argument to pass to the function. + /// The type of the fourth argument to pass to the function. + /// The return type of the function. + /// The function to invoke. + /// The first argument to pass to the function. + /// The second argument to pass to the function. + /// The third argument to pass to the function. + /// The fourth argument to pass to the function. 
+ public static TResult CallOnEmptyStack(Func func, TArg1 arg1, TArg2 arg2, TArg3 arg3, TArg4 arg4) => + Task.Run(() => func(arg1, arg2, arg3, arg4)) + .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) + .GetAwaiter().GetResult(); } } diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs index db9a4fd61eb45069f3640790b70b727bd6f50c22..56ce038cb09184b8781ea5e59ba291ad86b3aed1 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs @@ -86,7 +86,7 @@ public static IEnumerable SafeThresholdTests_MemberData() { RegexNode tree = RegexParser.Parse(Pattern, options | RegexOptions.ExplicitCapture, CultureInfo.CurrentCulture).Root; SymbolicRegexNode rootNode = converter.ConvertToSymbolicRegexNode(tree); - yield return new object[] { rootNode, ExpectedSafeSize }; + yield return new object[] { bddBuilder, rootNode, ExpectedSafeSize }; } // add .*? in front of the pattern, this adds 1 more NFA state @@ -94,7 +94,7 @@ public static IEnumerable SafeThresholdTests_MemberData() { RegexNode tree = RegexParser.Parse(".*?" 
+ Pattern, options | RegexOptions.ExplicitCapture, CultureInfo.CurrentCulture).Root; SymbolicRegexNode rootNode = converter.ConvertToSymbolicRegexNode(tree); - yield return new object[] { rootNode, 1 + ExpectedSafeSize}; + yield return new object[] { bddBuilder, rootNode, 1 + ExpectedSafeSize}; } // use of anchors increases the estimate by 5x in general but in reality much less, at most 3x @@ -102,7 +102,7 @@ public static IEnumerable SafeThresholdTests_MemberData() { RegexNode tree = RegexParser.Parse(Pattern + "$", options | RegexOptions.ExplicitCapture, CultureInfo.CurrentCulture).Root; SymbolicRegexNode rootNode = converter.ConvertToSymbolicRegexNode(tree); - yield return new object[] { rootNode, 5 * ExpectedSafeSize }; + yield return new object[] { bddBuilder, rootNode, 5 * ExpectedSafeSize }; } // use of captures has no effect on the estimations @@ -110,31 +110,32 @@ public static IEnumerable SafeThresholdTests_MemberData() { RegexNode tree = RegexParser.Parse(Pattern, options, CultureInfo.CurrentCulture).Root; SymbolicRegexNode rootNode = converter.ConvertToSymbolicRegexNode(tree); - yield return new object[] { rootNode, ExpectedSafeSize }; + yield return new object[] { bddBuilder, rootNode, ExpectedSafeSize }; } } [Theory] [MemberData(nameof(SafeThresholdTests_MemberData))] - public void SafeThresholdTests(object obj, int expectedSafeSize) + public void SafeThresholdTests(object builderObj, object nodeObj, int expectedSafeSize) { - SymbolicRegexNode node = (SymbolicRegexNode)obj; + SymbolicRegexBuilder builder = (SymbolicRegexBuilder)builderObj; + SymbolicRegexNode node = (SymbolicRegexNode)nodeObj; int safeSize = node.EstimateNfaSize(); Assert.Equal(expectedSafeSize, safeSize); - int nfaStateCount = CalculateNfaStateCount(node); + int nfaStateCount = CalculateNfaStateCount(builder, node); Assert.True(nfaStateCount <= expectedSafeSize); } /// /// Compute the closure of all NFA states from root and return the size of the resulting state space. 
/// - private static int CalculateNfaStateCount(SymbolicRegexNode root) + private static int CalculateNfaStateCount(SymbolicRegexBuilder builder, SymbolicRegexNode root) { // Here we are actually using the original BDD algebra (not converting to the BV or Uint64 algebra) // because it does not matter which algebra we use here (this matters only for performance) HashSet<(uint, SymbolicRegexNode)> states = new(); Stack<(uint, SymbolicRegexNode)> frontier = new(); - List minterms = MintermGenerator.GenerateMinterms(root._builder._solver, root.GetSets()); + List minterms = MintermGenerator.GenerateMinterms(builder._solver, root.GetSets(builder)); // Start from the initial state that has kind 'General' when no anchors are being used, else kind 'BeginningEnd' (uint, SymbolicRegexNode) initialState = (root._info.ContainsSomeAnchor ? CharKind.BeginningEnd : CharKind.General, root); @@ -150,7 +151,7 @@ private static int CalculateNfaStateCount(SymbolicRegexNode root) foreach (BDD minterm in minterms) { uint kind = GetCharKind(minterm); - SymbolicRegexNode target = source.Node.CreateDerivativeWithoutEffects(minterm, source.Kind); + SymbolicRegexNode target = source.Node.CreateDerivativeWithoutEffects(builder, minterm, source.Kind); //In the case of an NFA all the different alternatives in the DFA state become individual states themselves foreach (SymbolicRegexNode node in GetAlternatives(target)) @@ -169,7 +170,7 @@ private static int CalculateNfaStateCount(SymbolicRegexNode root) return states.Count; // Enumerates the alternatives from a node, for eaxmple (ab|(bc|cd)) has three alternatives - static IEnumerable> GetAlternatives(SymbolicRegexNode node) + IEnumerable> GetAlternatives(SymbolicRegexNode node) { if (node._kind == SymbolicRegexNodeKind.Alternate) { @@ -178,7 +179,7 @@ static IEnumerable> GetAlternatives(SymbolicRegexNode elem in GetAlternatives(node._right!)) yield return elem; } - else if (!node.IsNothing) // omit deadend states + else if 
(!node.IsNothing(builder._solver)) // omit deadend states { yield return node; } @@ -187,8 +188,8 @@ static IEnumerable> GetAlternatives(SymbolicRegexNode - minterm.Equals(root._builder._newLineSet) ? CharKind.Newline : // is \n - (!root._builder._solver.IsEmpty(root._builder._solver.And(root._builder._wordLetterForBoundariesSet, minterm)) ? + minterm.Equals(builder._newLineSet) ? CharKind.Newline : // is \n + (!builder._solver.IsEmpty(builder._solver.And(builder._wordLetterForBoundariesSet, minterm)) ? CharKind.WordLetter : // in \w CharKind.General); // anything else, thus in particular in \W } diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/System.Text.RegularExpressions.Unit.Tests.csproj b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/System.Text.RegularExpressions.Unit.Tests.csproj index 62948390630d38be8c1b41ab56d399dfe80fd7f6..8c45a0c5adbc7e63ce578c4ce6840e5952f972ef 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/System.Text.RegularExpressions.Unit.Tests.csproj +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/System.Text.RegularExpressions.Unit.Tests.csproj @@ -51,7 +51,7 @@ - +