diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs index dc23cd9af0749db674a5bb898333e99e79539850..485fdd42bb9209010c44e7b9aa8a0d81f0cb4ba8 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs @@ -51,7 +51,7 @@ internal int FixedLength /// If true then the state is a dead-end, rejects all inputs. internal bool IsNothing => Node.IsNothing; - /// If true then state starts with a ^ or $ or \A or \z or \Z + /// If true then state starts with a ^ or $ or \Z internal bool StartsWithLineAnchor => Node._info.StartsWithLineAnchor; /// @@ -134,7 +134,9 @@ internal List<(DfaMatchingState State, DerivativeEffect[] Effects)> NfaNex // nextCharKind will be the PrevCharKind of the target state // use an existing state instead if one exists already // otherwise create a new new id for it - list.Add((Node._builder.CreateState(node, nextCharKind, capturing: true), effects)); + DfaMatchingState state = Node._builder.CreateState(node, nextCharKind, capturing: true); + if (!state.IsDeadend) + list.Add((state, effects)); } return list; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DgmlWriter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DgmlWriter.cs index dadd291ecd6436c3ec28708bf9222262fcc84836..cfd1048d109405264fd4fb07442ce0857bc44a2c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DgmlWriter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DgmlWriter.cs @@ -191,8 +191,8 @@ internal DfaExplorer(SymbolicRegexMatcher srm, 
bool nfa, bool addDotStar, { _builder = srm._builder; uint startId = reverse ? - (srm._reversePattern._info.StartsWithLineAnchor ? CharKind.BeginningEnd : 0) : - (srm._pattern._info.StartsWithLineAnchor ? CharKind.BeginningEnd : 0); + (srm._reversePattern._info.StartsWithSomeAnchor ? CharKind.BeginningEnd : 0) : + (srm._pattern._info.StartsWithSomeAnchor ? CharKind.BeginningEnd : 0); // Create the initial state _initialState = _builder.CreateState( diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs index 17377d10a9a7bc350aeec2449d6c3ce30c6139bb..651902547e1a34873d335723b678489b78df9c2a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs @@ -24,6 +24,7 @@ internal sealed class SymbolicRegexBuilder where TSet : IComparable internal readonly SymbolicRegexNode _nothing; internal readonly SymbolicRegexNode _anyChar; internal readonly SymbolicRegexNode _anyStar; + internal readonly SymbolicRegexNode _anyStarLazy; private SymbolicRegexNode? 
_epsilon; internal SymbolicRegexNode Epsilon => _epsilon ??= SymbolicRegexNode.CreateEpsilon(this); @@ -173,6 +174,7 @@ internal SymbolicRegexBuilder(ISolver solver, CharSetSolver charSetSolver) _nothing = SymbolicRegexNode.CreateFalse(this); _anyChar = SymbolicRegexNode.CreateTrue(this); _anyStar = SymbolicRegexNode.CreateLoop(this, _anyChar, 0, int.MaxValue, isLazy: false); + _anyStarLazy = SymbolicRegexNode.CreateLoop(this, _anyChar, 0, int.MaxValue, isLazy: true); // --- initialize singletonCache --- _singletonCache[_solver.Empty] = _nothing; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs index 89fc8e5bf551a93c56d4a07d2f5bebce8644f4fe..cc5b5af109e320918a2657f3a3a200d34a8b52cf 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs @@ -11,17 +11,14 @@ namespace System.Text.RegularExpressions.Symbolic private const uint IsLazyMask = 4; private const uint CanBeNullableMask = 8; private const uint ContainsSomeAnchorMask = 16; - private const uint ContainsLineAnchorMask = 32; - private const uint ContainsSomeCharacterMask = 64; - private const uint StartsWithBoundaryAnchorMask = 128; + private const uint StartsWithSomeAnchorMask = 32; private readonly uint _info; private SymbolicRegexInfo(uint i) => _info = i; - internal static SymbolicRegexInfo Create(bool isAlwaysNullable = false, bool canBeNullable = false, bool startsWithLineAnchor = false, - bool startsWithBoundaryAnchor = false, bool containsSomeAnchor = false, - bool containsLineAnchor = false, bool containsSomeCharacter = false, bool isLazy = true) + internal static SymbolicRegexInfo Create(bool isAlwaysNullable = false, bool canBeNullable = 
false, + bool startsWithLineAnchor = false, bool startsWithSomeAnchor = false, bool containsSomeAnchor = false, bool isLazy = true) { uint i = 0; @@ -35,31 +32,21 @@ namespace System.Text.RegularExpressions.Symbolic } } - if (startsWithLineAnchor || containsLineAnchor || startsWithBoundaryAnchor || containsSomeAnchor) + if (containsSomeAnchor || startsWithLineAnchor || startsWithSomeAnchor) { i |= ContainsSomeAnchorMask; - if (startsWithLineAnchor || containsLineAnchor) + if (startsWithLineAnchor) { - i |= ContainsLineAnchorMask; - - if (startsWithLineAnchor) - { - i |= StartsWithLineAnchorMask; - } + i |= StartsWithLineAnchorMask; } - if (startsWithBoundaryAnchor) + if (startsWithLineAnchor || startsWithSomeAnchor) { - i |= StartsWithBoundaryAnchorMask; + i |= StartsWithSomeAnchorMask; } } - if (containsSomeCharacter) - { - i |= ContainsSomeCharacterMask; - } - if (isLazy) { i |= IsLazyMask; @@ -72,18 +59,12 @@ namespace System.Text.RegularExpressions.Symbolic public bool CanBeNullable => (_info & CanBeNullableMask) != 0; - public bool StartsWithSomeAnchor => (_info & (StartsWithLineAnchorMask | StartsWithBoundaryAnchorMask)) != 0; - public bool StartsWithLineAnchor => (_info & StartsWithLineAnchorMask) != 0; - public bool StartsWithBoundaryAnchor => (_info & StartsWithBoundaryAnchorMask) != 0; + public bool StartsWithSomeAnchor => (_info & StartsWithSomeAnchorMask) != 0; public bool ContainsSomeAnchor => (_info & ContainsSomeAnchorMask) != 0; - public bool ContainsLineAnchor => (_info & ContainsLineAnchorMask) != 0; - - public bool ContainsSomeCharacter => (_info & ContainsSomeCharacterMask) != 0; - public bool IsLazy => (_info & IsLazyMask) != 0; public static SymbolicRegexInfo Or(params SymbolicRegexInfo[] infos) @@ -121,20 +102,14 @@ public static SymbolicRegexInfo And(params SymbolicRegexInfo[] infos) return new SymbolicRegexInfo(i); } - public static SymbolicRegexInfo Concat(SymbolicRegexInfo left_info, SymbolicRegexInfo right_info) - { - bool isNullable = 
left_info.IsNullable && right_info.IsNullable; - bool canBeNullable = left_info.CanBeNullable && right_info.CanBeNullable; - bool isLazy = left_info.IsLazy && right_info.IsLazy; - - bool startsWithLineAnchor = left_info.StartsWithLineAnchor || (left_info.CanBeNullable && right_info.StartsWithLineAnchor); - bool startsWithBoundaryAnchor = left_info.StartsWithBoundaryAnchor || (left_info.CanBeNullable && right_info.StartsWithBoundaryAnchor); - bool containsSomeAnchor = left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor; - bool containsLineAnchor = left_info.ContainsLineAnchor || right_info.ContainsLineAnchor; - bool containsSomeCharacter = left_info.ContainsSomeCharacter || right_info.ContainsSomeCharacter; - - return Create(isNullable, canBeNullable, startsWithLineAnchor, startsWithBoundaryAnchor, containsSomeAnchor, containsLineAnchor, containsSomeCharacter, isLazy); - } + public static SymbolicRegexInfo Concat(SymbolicRegexInfo left_info, SymbolicRegexInfo right_info) => + Create( + isAlwaysNullable: left_info.IsNullable && right_info.IsNullable, + canBeNullable: left_info.CanBeNullable && right_info.CanBeNullable, + startsWithLineAnchor: left_info.StartsWithLineAnchor || (left_info.CanBeNullable && right_info.StartsWithLineAnchor), + startsWithSomeAnchor: left_info.StartsWithSomeAnchor || (left_info.CanBeNullable && right_info.StartsWithSomeAnchor), + containsSomeAnchor: left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor, + isLazy: left_info.IsLazy && right_info.IsLazy); public static SymbolicRegexInfo Loop(SymbolicRegexInfo body_info, int lowerBound, bool isLazy) { @@ -171,10 +146,7 @@ public static SymbolicRegexInfo Loop(SymbolicRegexInfo body_info, int lowerBound Create(isAlwaysNullable: !info.CanBeNullable, canBeNullable: !info.IsNullable, startsWithLineAnchor: info.StartsWithLineAnchor, - startsWithBoundaryAnchor: info.StartsWithBoundaryAnchor, containsSomeAnchor: info.ContainsSomeAnchor, - containsLineAnchor: info.ContainsLineAnchor, - 
containsSomeCharacter: info.ContainsSomeCharacter, isLazy: info.IsLazy); public override bool Equals(object? obj) => obj is SymbolicRegexInfo i && Equals(i); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index d9efcc3d7d08a6b2168d6d3b9cffdbdd57c88bd5..4a454d80c9ba9898592e0152d0040e8f09da65fe 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -210,7 +210,7 @@ private SymbolicRegexMatcher(SymbolicRegexNode rootNode, int captureCount, // Create the dot-star pattern (a concatenation of any* with the original pattern) // and all of its initial states. - _dotStarredPattern = _builder.CreateConcat(_builder._anyStar, _pattern); + _dotStarredPattern = _builder.CreateConcat(_builder._anyStarLazy, _pattern); var dotstarredInitialStates = new DfaMatchingState[statesCount]; for (uint i = 0; i < dotstarredInitialStates.Length; i++) { @@ -280,8 +280,9 @@ private bool TryTakeTransition(SymbolicRegexBuilder builder { int c = input[i]; + // Find the minterm, handling the special case for the last \n for states that start with a relevant anchor int mintermId = c == '\n' && i == input.Length - 1 && TStateHandler.StartsWithLineAnchor(ref state) ? 
- builder._minterms!.Length : // mintermId = minterms.Length represents \Z (last \n) + builder._minterms!.Length : // mintermId = minterms.Length represents an \n at the very end of input _mintermClassifier.GetMintermID(c); return TStateHandler.TakeTransition(builder, ref state, mintermId); @@ -335,29 +336,17 @@ public SymbolicMatch FindMatch(bool isMatch, ReadOnlySpan input, int start timeoutOccursAt = Environment.TickCount + (int)(_timeout + 0.5); } - // If we're starting at the end of the input, we don't need to do any work other than - // determine whether an empty match is valid, i.e. whether the pattern is "nullable" - // given the kinds of characters at and just before the end. - if (startat == input.Length) - { - // TODO https://github.com/dotnet/runtime/issues/65606: Handle capture groups. - uint prevKind = GetCharKind(input, startat - 1); - uint nextKind = GetCharKind(input, startat); - return _pattern.IsNullableFor(CharKind.Context(prevKind, nextKind)) ? - new SymbolicMatch(startat, 0) : - SymbolicMatch.NoMatch; - } - // Phase 1: - // Determine whether there is a match by finding the first final state position. This only tells - // us whether there is a match but needn't give us the longest possible match. This may return -1 as - // a legitimate value when the initial state is nullable and startat == 0. It returns NoMatchExists (-2) - // when there is no match. As an example, consider the pattern a{5,10}b* run against an input - // of aaaaaaaaaaaaaaabbbc: phase 1 will find the position of the first b: aaaaaaaaaaaaaaab. - int i = FindFinalStatePosition(input, startat, timeoutOccursAt, out int matchStartLowBoundary, out int matchStartLengthMarker, perThreadData); + // Determine the end point of the match. The returned index is one-past-the-end index for the characters + // in the match. Note that -1 is a valid end point for an empty match at the beginning of the input. + // It returns NoMatchExists (-2) when there is no match. 
+ // As an example, consider the pattern a{1,3}(b*) run against an input of aacaaaabbbc: phase 1 will find + // the position of the last b: aacaaaabbbc. It additionally records the position of the first a after + // the c as the low boundary for the starting position. + int matchEnd = FindEndPosition(input, startat, timeoutOccursAt, isMatch, out int matchStartLowBoundary, out int matchStartLengthMarker, perThreadData); // If there wasn't a match, we're done. - if (i == NoMatchExists) + if (matchEnd == NoMatchExists) { return SymbolicMatch.NoMatch; } @@ -374,101 +363,138 @@ public SymbolicMatch FindMatch(bool isMatch, ReadOnlySpan input, int start // start position. That tells us the actual starting position of the match. We can skip this phase if we // recorded a fixed-length marker for the portion of the pattern that matched, as we can then jump that // exact number of positions backwards. Continuing the previous example, phase 2 will walk backwards from - // that first b until it finds the 6th a: aaaaaaaaaab. + // that last b until it finds the 4th a: aaabbbc. int matchStart; if (matchStartLengthMarker >= 0) { - matchStart = i - matchStartLengthMarker + 1; + matchStart = matchEnd - matchStartLengthMarker; + } + else if (_fixedMatchLength.HasValue) + { + matchStart = matchEnd - _fixedMatchLength.GetValueOrDefault(); } else { - Debug.Assert(i >= startat - 1); - matchStart = i < startat ? + Debug.Assert(matchEnd >= startat - 1); + matchStart = matchEnd < startat ? startat : - FindStartPosition(input, i, matchStartLowBoundary, perThreadData); + FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData); } // Phase 3: - // Match again, this time from the computed start position, to find the latest end position. That start - // and end then represent the bounds of the match. 
If the pattern has subcaptures (captures other than - // the top-level capture for the whole match), we need to do more work to compute their exact bounds, so we - // take a faster path if captures aren't required. Further, if captures aren't needed, and if any possible - // match of the whole pattern is a fixed length, we can skip this phase as well, just using that fixed-length - // to compute the ending position based on the starting position. Continuing the previous example, phase 3 - // will walk forwards from the 6th a until it finds the end of the match: aaaaaaaaaabbb. + // If there are no subcaptures, the matching process is done. For patterns with subcaptures (captures other + // than the top-level capture for the whole match), we need to do an additional pass to find their bounds. + // Continuing the previous example, phase 3 will be executed for the characters inside the match, aaabbbc, + // and will associate the one capture (b*) with its match: bbb. if (!HasSubcaptures) { - if (_fixedMatchLength.HasValue) - { - return new SymbolicMatch(matchStart, _fixedMatchLength.GetValueOrDefault()); - } - - int matchEnd = FindEndPosition(input, matchStart, perThreadData); - return new SymbolicMatch(matchStart, matchEnd + 1 - matchStart); + return new SymbolicMatch(matchStart, matchEnd - matchStart); } else { - int matchEnd = FindEndPositionCapturing(input, matchStart, out Registers endRegisters, perThreadData); - return new SymbolicMatch(matchStart, matchEnd + 1 - matchStart, endRegisters.CaptureStarts, endRegisters.CaptureEnds); + Registers endRegisters = FindSubcaptures(input, matchStart, matchEnd, perThreadData); + return new SymbolicMatch(matchStart, matchEnd - matchStart, endRegisters.CaptureStarts, endRegisters.CaptureEnds); } } - /// Phase 3 of matching. From a found starting position, find the ending position of the match using the original pattern. 
- /// - /// The ending position is known to exist; this function just needs to determine exactly what it is. - /// We need to find the longest possible match and thus the latest valid ending position. - /// + /// Performs the initial Phase 1 match to find the end position of the match, or first final state if this is an isMatch call. /// The input text. - /// The starting position of the match. + /// The starting position in . + /// The time at which timeout occurs, if timeouts are being checked. + /// Whether this is an isMatch call. + /// The last position the initial state of was visited before the end position was found. + /// Length of the match if there's a match; otherwise, -1. /// Per thread data reused between calls. - /// The found ending position of the match. - private int FindEndPosition(ReadOnlySpan input, int i, PerThreadData perThreadData) + /// + /// A one-past-the-end index into input for the preferred match, or first final state position if isMatch is true, or NoMatchExists if no match exists. + /// + private int FindEndPosition(ReadOnlySpan input, int i, int timeoutOccursAt, bool isMatch, out int initialStateIndex, out int matchLength, PerThreadData perThreadData) { - // Get the starting state based on the current context. - DfaMatchingState dfaStartState = _initialStates[GetCharKind(input, i - 1)]; - - // If the starting state is nullable (accepts the empty string), then it's a valid - // match and we need to record the position as a possible end, but keep going looking - // for a better one. - int end = input.Length; // invalid sentinel value - if (dfaStartState.IsNullable(GetCharKind(input, i))) - { - // Empty match exists because the initial state is accepting. 
- end = i - 1; - } + int endPosition = NoMatchExists; - if ((uint)i < (uint)input.Length) + matchLength = -1; + initialStateIndex = i; + int initialStateIndexCandidate = i; + + var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, i - 1)]); + SymbolicRegexBuilder builder = _pattern._builder; + + while (true) { - // Iterate from the starting state until we've found the best ending state. - SymbolicRegexBuilder builder = dfaStartState.Node._builder; - var currentState = new CurrentState(dfaStartState); - while (true) + if (currentState.DfaState is { IsInitialState: true }) + { + if (_findOpts is RegexFindOptimizations findOpts) + { + // Find the first position i that matches with some likely character. + if (!findOpts.TryFindNextStartingPosition(input, ref i, 0)) + { + // no match was found + break; + } + } + + initialStateIndexCandidate = i; + + // Update the starting state based on where TryFindNextStartingPosition moved us to. + // As with the initial starting state, if it's a dead end, no match exists. + currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, i - 1)]); + } + + // Now run the DFA or NFA traversal from the current point using the current state. If timeouts are being checked, + // we need to pop out of the inner loop every now and then to do the timeout check in this outer loop. + const int CharsPerTimeoutCheck = 1_000; + ReadOnlySpan inputForInnerLoop = _checkTimeout && input.Length - i > CharsPerTimeoutCheck ? + input.Slice(0, i + CharsPerTimeoutCheck) : + input; + + int newEndPosition; + int findResult = currentState.NfaState is not null ? 
+ FindEndPositionDeltas(builder, inputForInnerLoop, isMatch, ref i, ref currentState, ref matchLength, out newEndPosition) : + FindEndPositionDeltas(builder, inputForInnerLoop, isMatch, ref i, ref currentState, ref matchLength, out newEndPosition); + + // If a new end position was found, commit to the matching initial state index + if (newEndPosition != -1) { - // Run the DFA or NFA traversal backwards from the current point using the current state. - bool done = currentState.NfaState is not null ? - FindEndPositionDeltas(builder, input, ref i, ref currentState, ref end) : - FindEndPositionDeltas(builder, input, ref i, ref currentState, ref end); + endPosition = newEndPosition; + initialStateIndex = initialStateIndexCandidate; + } - // If we successfully found the ending position, we're done. - if (done || (uint)i >= (uint)input.Length) + // If we reached the end of input or a deadend state, we're done. + if (findResult > 0) + { + break; + } + + // The search did not finish, so we either hit an initial state (in which case we want to loop around to apply our initial + // state processing logic and optimizations), or failed to transition (which should only happen if we were in DFA mode and + // need to switch over to NFA mode). If we exited because we hit an initial state, find result will be 0, otherwise -1. + if (findResult < 0) + { + if (i >= input.Length) { + // We ran out of input. break; } - // We exited out of the inner processing loop, but we didn't hit a dead end or run out - // of input, and that should only happen if we failed to transition from one state to - // the next, which should only happen if we were in DFA mode and we tried to create - // a new state and exceeded the graph size. 
Upgrade to NFA mode and continue; - Debug.Assert(currentState.DfaState is not null); - NfaMatchingState nfaState = perThreadData.NfaState; - nfaState.InitializeFrom(currentState.DfaState); - currentState = new CurrentState(nfaState); + if (i < inputForInnerLoop.Length) + { + // We failed to transition. Upgrade to NFA mode. + Debug.Assert(i < inputForInnerLoop.Length); + Debug.Assert(currentState.DfaState is not null); + NfaMatchingState nfaState = perThreadData.NfaState; + nfaState.InitializeFrom(currentState.DfaState); + currentState = new CurrentState(nfaState); + } + } + + // Check for a timeout before continuing. + if (_checkTimeout) + { + DoCheckTimeout(timeoutOccursAt); } } - // Return the found ending position. - Debug.Assert(end < input.Length, "Expected to find an ending position but didn't"); - return end; + return endPosition; } /// @@ -476,158 +502,68 @@ private int FindEndPosition(ReadOnlySpan input, int i, PerThreadData perTh /// starting at , for each character transitioning from one state in the DFA or NFA graph to the next state, /// lazily building out the graph as needed. /// - private bool FindEndPositionDeltas(SymbolicRegexBuilder builder, ReadOnlySpan input, ref int i, ref CurrentState currentState, ref int endingIndex) + /// + /// The supplies the actual transitioning logic, controlling whether processing is + /// performed in DFA mode or in NFA mode. However, it expects to be configured to match, + /// so for example if is a , it expects the 's + /// to be non-null and its to be null; vice versa for + /// . + /// + /// + /// A positive value if iteration completed because it reached a deadend state or nullable state and the call is an isMatch. + /// 0 if iteration completed because we reached an initial state. + /// A negative value if iteration completed because we ran out of input or we failed to transition. 
+ /// + private int FindEndPositionDeltas(SymbolicRegexBuilder builder, ReadOnlySpan input, bool isMatch, ref int i, ref CurrentState currentState, ref int matchLength, out int endPosition) where TStateHandler : struct, IStateHandler { - // To avoid frequent reads/writes to ref values, make and operate on local copies, which we then copy back once before returning. + // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning. int pos = i; CurrentState state = currentState; - - // Repeatedly read the next character from the input and use it to transition the current state to the next. - // We're looking for the furthest final state we can find. - while ((uint)pos < (uint)input.Length && TryTakeTransition(builder, input, pos, ref state)) + int endPos = -1; + try { - if (TStateHandler.IsNullable(ref state, GetCharKind(input, pos + 1))) - { - // If the new state accepts the empty string, we found an ending state. Record the position. - endingIndex = pos; - } - else if (TStateHandler.IsDeadend(ref state)) + // Loop through each character in the input, transitioning from state to state for each. + while (true) { - // If the new state is a dead end, the match ended the last time endingIndex was updated. - currentState = state; - i = pos; - return true; - } - - // We successfully transitioned to the next state and consumed the current character, - // so move along to the next. - pos++; - } - - // We either ran out of input, in which case we successfully recorded an ending index, - // or we failed to transition to the next state due to the graph becoming too large. - currentState = state; - i = pos; - return false; - } - - /// Find match end position using the original pattern, end position is known to exist. This version also produces captures. 
- /// input span - /// inclusive start position - /// out parameter for the final register values, which indicate capture starts and ends - /// Per thread data reused between calls. - /// the match end position - private int FindEndPositionCapturing(ReadOnlySpan input, int i, out Registers resultRegisters, PerThreadData perThreadData) - { - int i_end = input.Length; - Registers endRegisters = default; - DfaMatchingState? endState = null; + // If the state is nullable for the next character, meaning it accepts the empty string, + // we found a potential end state. + if (TStateHandler.IsNullable(ref state, GetCharKind(input, pos))) + { + // Check whether there's a fixed-length marker for the current state. If there is, we can + // use that length to optimize subsequent matching phases. + matchLength = TStateHandler.FixedLength(ref state); + endPos = pos; + // If this is an isMatch call we are done, since a match is now known to exist. + if (isMatch) + return 1; + } - // Pick the correct start state based on previous character kind. - DfaMatchingState initialState = _initialStates[GetCharKind(input, i - 1)]; + // If the state is a dead end, such that we can't transition anywhere else, end the search. + if (TStateHandler.IsDeadend(ref state)) + return 1; - Registers initialRegisters = perThreadData.InitialRegisters; + // If there is more input available try to transition with the next character. + if ((uint)pos >= (uint)input.Length || !TryTakeTransition(builder, input, pos, ref state)) + return -1; - // Initialize registers with -1, which means "not seen yet" - Array.Fill(initialRegisters.CaptureStarts, -1); - Array.Fill(initialRegisters.CaptureEnds, -1); + // We successfully transitioned, so update our current input index to match. + pos++; - if (initialState.IsNullable(GetCharKind(input, i))) - { - // Empty match exists because the initial state is accepting. 
- i_end = i - 1; - endRegisters.Assign(initialRegisters); - endState = initialState; + // Now that currentState and our position are coherent, check if currentState represents an initial state. + // If it does, we exit out in order to allow our find optimizations to kick in to hopefully more quickly + // find the next possible starting location. + if (TStateHandler.IsInitialState(ref state)) + return 0; + } } - - // Use two maps from state IDs to register values for the current and next set of states. - // Note that these maps use insertion order, which is used to maintain priorities between states in a way - // that matches the order the backtracking engines visit paths. - Debug.Assert(perThreadData.Current is not null && perThreadData.Next is not null); - SparseIntMap current = perThreadData.Current, next = perThreadData.Next; - current.Clear(); - next.Clear(); - current.Add(initialState.Id, initialRegisters); - - SymbolicRegexBuilder builder = _builder; - - while ((uint)i < (uint)input.Length) + finally { - Debug.Assert(next.Count == 0); - - int c = input[i]; - int normalMintermId = _mintermClassifier.GetMintermID(c); - - foreach ((int sourceId, Registers sourceRegisters) in current.Values) - { - Debug.Assert(builder._capturingStateArray is not null); - DfaMatchingState sourceState = builder._capturingStateArray[sourceId]; - - // Find the minterm, handling the special case for the last \n - int mintermId = c == '\n' && i == input.Length - 1 && sourceState.StartsWithLineAnchor ? - builder._minterms!.Length : - normalMintermId; // mintermId = minterms.Length represents \Z (last \n) - TSet minterm = builder.GetMinterm(mintermId); - - // Get or create the transitions - int offset = (sourceId << builder._mintermsLog) | mintermId; - Debug.Assert(builder._capturingDelta is not null); - List<(DfaMatchingState, DerivativeEffect[])>? transitions = - builder._capturingDelta[offset] ?? 
- CreateNewCapturingTransitions(sourceState, minterm, offset); - - // Take the transitions in their prioritized order - for (int j = 0; j < transitions.Count; ++j) - { - (DfaMatchingState targetState, DerivativeEffect[] effects) = transitions[j]; - if (targetState.IsDeadend) - continue; - - // Try to add the state and handle the case where it didn't exist before. If the state already - // exists, then the transition can be safely ignored, as the existing state was generated by a - // higher priority transition. - if (next.Add(targetState.Id, out int index)) - { - // Avoid copying the registers on the last transition from this state, reusing the registers instead - Registers newRegisters = j != transitions.Count - 1 ? sourceRegisters.Clone() : sourceRegisters; - newRegisters.ApplyEffects(effects, i); - next.Update(index, targetState.Id, newRegisters); - if (targetState.IsNullable(GetCharKind(input, i + 1))) - { - // Accepting state has been reached. Record the position. - i_end = i; - endRegisters.Assign(newRegisters); - endState = targetState; - // No lower priority transitions from this or other source states are taken because the - // backtracking engines would return the match ending here. - goto BreakNullable; - } - } - } - } - - BreakNullable: - if (next.Count == 0) - { - // If all states died out some nullable state must have been seen before - break; - } - - // Swap the state sets and prepare for the next character - SparseIntMap tmp = current; - current = next; - next = tmp; - next.Clear(); - i++; + // Write back the local copies of the ref and out values. 
+ currentState = state; + i = pos; + endPosition = endPos; } - - Debug.Assert(i_end != input.Length && endState is not null); - // Apply effects for finishing at the stored end state - endState.Node.ApplyEffects((effect, args) => args.Registers.ApplyEffect(effect, args.Pos), - CharKind.Context(endState.PrevCharKind, GetCharKind(input, i_end + 1)), (Registers: endRegisters, Pos: i_end + 1)); - resultRegisters = endRegisters; - return i_end; } /// @@ -639,30 +575,24 @@ private int FindEndPositionCapturing(ReadOnlySpan input, int i, out Regist /// We need to find the earliest (lowest index) starting position that's not earlier than . /// /// The input text. - /// The ending position to walk backwards from. points at the last character of the match. + /// The ending position to walk backwards from. points one past the last character of the match. /// The initial starting location discovered in phase 1, a point we must not walk earlier than. /// Per thread data reused between calls. /// The found starting position for the match. private int FindStartPosition(ReadOnlySpan input, int i, int matchStartBoundary, PerThreadData perThreadData) { Debug.Assert(i >= 0, $"{nameof(i)} == {i}"); - Debug.Assert(matchStartBoundary >= 0 && matchStartBoundary < input.Length, $"{nameof(matchStartBoundary)} == {matchStartBoundary}"); + Debug.Assert(matchStartBoundary >= 0 && matchStartBoundary <= input.Length, $"{nameof(matchStartBoundary)} == {matchStartBoundary}"); Debug.Assert(i >= matchStartBoundary, $"Expected {i} >= {matchStartBoundary}."); // Get the starting state for the reverse pattern. This depends on previous character (which, because we're - // going backwards, is character number i + 1). - var currentState = new CurrentState(_reverseInitialStates[GetCharKind(input, i + 1)]); + // going backwards, is character number i). 
+ var currentState = new CurrentState(_reverseInitialStates[GetCharKind(input, i)]); - // If the initial state is nullable, meaning it accepts the empty string, then we've already discovered - // a valid starting position, and we just need to keep looking for an earlier one in case there is one. int lastStart = -1; // invalid sentinel value - if (currentState.DfaState!.IsNullable(GetCharKind(input, i))) - { - lastStart = i + 1; - } // Walk backwards to the furthest accepting state of the reverse pattern but no earlier than matchStartBoundary. - SymbolicRegexBuilder builder = currentState.DfaState.Node._builder; + SymbolicRegexBuilder builder = currentState.DfaState!.Node._builder; while (true) { // Run the DFA or NFA traversal backwards from the current point using the current state. @@ -701,232 +631,145 @@ private bool FindStartPositionDeltas(SymbolicRegexBuilder b // To avoid frequent reads/writes to ref values, make and operate on local copies, which we then copy back once before returning. int pos = i; CurrentState state = currentState; - - // Loop backwards through each character in the input, transitioning from state to state for each. - while (TryTakeTransition(builder, input, pos, ref state)) + try { - // We successfully transitioned. If the new state is a dead end, we're done, as we must have already seen - // and recorded a larger lastStart value that was the earliest valid starting position. - if (TStateHandler.IsDeadend(ref state)) - { - Debug.Assert(lastStart != -1); - currentState = state; - i = pos; - return true; - } - - // If the new state accepts the empty string, we found a valid starting position. Record it and keep going, - // since we're looking for the earliest one to occur within bounds. - if (TStateHandler.IsNullable(ref state, GetCharKind(input, pos - 1))) + // Loop backwards through each character in the input, transitioning from state to state for each. 
+ while (true) { - lastStart = pos; - } + // If the state accepts the empty string, we found a valid starting position. Record it and keep going, + // since we're looking for the earliest one to occur within bounds. + if (TStateHandler.IsNullable(ref state, GetCharKind(input, pos - 1))) + lastStart = pos; + + // If we are past the start threshold or if the state is a dead end, bail; we should have already + // found a valid starting location. + if (pos <= startThreshold || TStateHandler.IsDeadend(ref state)) + { + Debug.Assert(lastStart != -1); + return true; + } - // Since we successfully transitioned, update our current index to match the fact that we consumed the previous character in the input. - pos--; + // Try to transition with the next character, the one before the current position. + if (!TryTakeTransition(builder, input, pos - 1, ref state)) + // Return false to indicate the search didn't finish. + return false; - // If doing so now puts us below the start threshold, bail; we should have already found a valid starting location. - if (pos < startThreshold) - { - Debug.Assert(lastStart != -1); - currentState = state; - i = pos; - return true; + // Since we successfully transitioned, update our current index to match the fact that we consumed the previous character in the input. + pos--; } } - - // Unable to transition further. - currentState = state; - i = pos; - return false; + finally + { + // Write back the local copies of the ref values. + currentState = state; + i = pos; + } } - /// Performs the initial Phase 1 match to find the first final state encountered. - /// The input text. - /// The starting position in . - /// The time at which timeout occurs, if timeouts are being checked. - /// The last position the initial state of was visited. - /// Length of the match if there's a match; otherwise, -1. + + /// Run the pattern on a match to record the capture starts and ends. 
+ /// input span + /// inclusive start position + /// exclusive end position /// Per thread data reused between calls. - /// The index into input that matches the final state, or NoMatchExists if no match exists. It returns -1 when i=0 and the initial state is nullable. - private int FindFinalStatePosition(ReadOnlySpan input, int i, int timeoutOccursAt, out int initialStateIndex, out int matchLength, PerThreadData perThreadData) + /// the final register values, which indicate capture starts and ends + private Registers FindSubcaptures(ReadOnlySpan input, int i, int iEnd, PerThreadData perThreadData) { - matchLength = -1; - initialStateIndex = i; + // Pick the correct start state based on previous character kind. + DfaMatchingState initialState = _initialStates[GetCharKind(input, i - 1)]; - // Start with the start state of the dot-star pattern, which in general depends on the previous character kind in the input in order to handle anchors. - // If the starting state is a dead end, then no match exists. - var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, i - 1)]); - if (currentState.DfaState!.IsNothing) - { - // This can happen, for example, when the original regex starts with a beginning anchor but the previous char kind is not Beginning. - return NoMatchExists; - } + Registers initialRegisters = perThreadData.InitialRegisters; - // If the starting state accepts the empty string in this context (factoring in anchors), we're done. - if (currentState.DfaState.IsNullable(GetCharKind(input, i))) - { - // The initial state is nullable in this context so at least an empty match exists. - // The last position of the match is i - 1 because the match is empty. - // This value is -1 if i == 0. 
- return i - 1; - } + // Initialize registers with -1, which means "not seen yet" + Array.Fill(initialRegisters.CaptureStarts, -1); + Array.Fill(initialRegisters.CaptureEnds, -1); - // Otherwise, start searching from the current position until the end of the input. - if ((uint)i < (uint)input.Length) - { - SymbolicRegexBuilder builder = currentState.DfaState.Node._builder; - while (true) - { - // If we're at an initial state, try to search ahead for the next possible match location - // using any find optimizations that may have previously been computed. - if (currentState.DfaState is { IsInitialState: true }) - { - // i is the most recent position in the input when the dot-star pattern is in the initial state - initialStateIndex = i; + // Use two maps from state IDs to register values for the current and next set of states. + // Note that these maps use insertion order, which is used to maintain priorities between states in a way + // that matches the order the backtracking engines visit paths. + Debug.Assert(perThreadData.Current is not null && perThreadData.Next is not null); + SparseIntMap current = perThreadData.Current, next = perThreadData.Next; + current.Clear(); + next.Clear(); + current.Add(initialState.Id, initialRegisters); - if (_findOpts is RegexFindOptimizations findOpts) - { - // Find the first position i that matches with some likely character. - if (!findOpts.TryFindNextStartingPosition(input, ref i, 0)) - { - // no match was found - return NoMatchExists; - } + SymbolicRegexBuilder builder = _builder; - initialStateIndex = i; + while ((uint)i < (uint)iEnd) + { + Debug.Assert(next.Count == 0); - // Update the starting state based on where TryFindNextStartingPosition moved us to. - // As with the initial starting state, if it's a dead end, no match exists. 
- currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, i - 1)]); - if (currentState.DfaState!.IsNothing) - { - return NoMatchExists; - } - } - } + // Read the next character and find its minterm + int c = input[i]; + int normalMintermId = _mintermClassifier.GetMintermID(c); - // Now run the DFA or NFA traversal from the current point using the current state. If timeouts are being checked, - // we need to pop out of the inner loop every now and then to do the timeout check in this outer loop. - const int CharsPerTimeoutCheck = 10_000; - ReadOnlySpan inputForInnerLoop = _checkTimeout && input.Length - i > CharsPerTimeoutCheck ? - input.Slice(0, i + CharsPerTimeoutCheck) : - input; + foreach ((int sourceId, Registers sourceRegisters) in current.Values) + { + Debug.Assert(builder._capturingStateArray is not null); + DfaMatchingState sourceState = builder._capturingStateArray[sourceId]; - int finalStatePosition; - int findResult = currentState.NfaState is not null ? - FindFinalStatePositionDeltas(builder, inputForInnerLoop, ref i, ref currentState, ref matchLength, out finalStatePosition) : - FindFinalStatePositionDeltas(builder, inputForInnerLoop, ref i, ref currentState, ref matchLength, out finalStatePosition); + // Handle the special case for the last \n for states that start with a relevant anchor + int mintermId = c == '\n' && i == input.Length - 1 && sourceState.StartsWithLineAnchor ? + builder._minterms!.Length : // mintermId = minterms.Length represents an \n at the very end of input + normalMintermId; + TSet minterm = builder.GetMinterm(mintermId); - // If we reached a final or deadend state, we're done. - if (findResult > 0) - { - return finalStatePosition; - } + // Get or create the transitions + int offset = (sourceId << builder._mintermsLog) | mintermId; + Debug.Assert(builder._capturingDelta is not null); + List<(DfaMatchingState, DerivativeEffect[])>? transitions = + builder._capturingDelta[offset] ?? 
+ CreateNewCapturingTransitions(sourceState, minterm, offset); - // We're not at an end state, so we either ran out of input (in which case no match exists), hit an initial state (in which case - // we want to loop around to apply our initial state processing logic and optimizations), or failed to transition (which should - // only happen if we were in DFA mode and need to switch over to NFA mode). If we exited because we hit an initial state, - // find result will be 0, otherwise negative. - if (findResult < 0) + // Take the transitions in their prioritized order + for (int j = 0; j < transitions.Count; ++j) { - if (i >= input.Length) - { - // We ran out of input. No match. - break; - } + (DfaMatchingState targetState, DerivativeEffect[] effects) = transitions[j]; + Debug.Assert(!targetState.IsDeadend, "Transitions should not include dead ends."); - if (i < inputForInnerLoop.Length) + // Try to add the state and handle the case where it didn't exist before. If the state already + // exists, then the transition can be safely ignored, as the existing state was generated by a + // higher priority transition. + if (next.Add(targetState.Id, out int index)) { - // We failed to transition. Upgrade to DFA mode. - Debug.Assert(currentState.DfaState is not null); - NfaMatchingState nfaState = perThreadData.NfaState; - nfaState.InitializeFrom(currentState.DfaState); - currentState = new CurrentState(nfaState); + // Avoid copying the registers on the last transition from this state, reusing the registers instead + Registers newRegisters = j != transitions.Count - 1 ? sourceRegisters.Clone() : sourceRegisters; + newRegisters.ApplyEffects(effects, i); + next.Update(index, targetState.Id, newRegisters); + if (targetState.IsNullable(GetCharKind(input, i + 1))) + { + // No lower priority transitions from this or other source states are taken because the + // backtracking engines would return the match ending here. 
+ goto BreakNullable; + } } } - - // Check for a timeout before continuing. - if (_checkTimeout) - { - DoCheckTimeout(timeoutOccursAt); - } } - } - - // No match was found. - return NoMatchExists; - } - /// - /// Workhorse inner loop for . Consumes the character by character, - /// starting at , for each character transitioning from one state in the DFA or NFA graph to the next state, - /// lazily building out the graph as needed. - /// - /// - /// The supplies the actual transitioning logic, controlling whether processing is - /// performed in DFA mode or in NFA mode. However, it expects to be configured to match, - /// so for example if is a , it expects the 's - /// to be non-null and its to be null; vice versa for - /// . - /// - /// - /// A positive value if iteration completed because it reached a nullable or deadend state. - /// 0 if iteration completed because we reached an initial state. - /// A negative value if iteration completed because we ran out of input or we failed to transition. - /// - private int FindFinalStatePositionDeltas(SymbolicRegexBuilder builder, ReadOnlySpan input, ref int i, ref CurrentState currentState, ref int matchLength, out int finalStatePosition) - where TStateHandler : struct, IStateHandler - { - // To avoid frequent reads/writes to ref values, make and operate on local copies, which we then copy back once before returning. - int pos = i; - CurrentState state = currentState; + BreakNullable: + // Swap the state sets and prepare for the next character + SparseIntMap tmp = current; + current = next; + next = tmp; + next.Clear(); + i++; + } - // Loop through each character in the input, transitioning from state to state for each. 
- while ((uint)pos < (uint)input.Length && TryTakeTransition(builder, input, pos, ref state)) + Debug.Assert(current.Count > 0); + Debug.Assert(_builder._capturingStateArray is not null); + foreach (var (endStateId, endRegisters) in current.Values) { - // We successfully transitioned for the character at index i. If the new state is nullable for - // the next character, meaning it accepts the empty string, we found a final state and are done! - if (TStateHandler.IsNullable(ref state, GetCharKind(input, pos + 1))) + DfaMatchingState endState = _builder._capturingStateArray[endStateId]; + if (endState.IsNullable(GetCharKind(input, iEnd))) { - // Check whether there's a fixed-length marker for the current state. If there is, we can - // use that length to optimize subsequent matching phases. - matchLength = TStateHandler.FixedLength(ref state); - currentState = state; - i = pos; - finalStatePosition = pos; - return 1; - } - - // If the new state is a dead end, such that we didn't match and we can't transition anywhere - // else, then no match exists. - if (TStateHandler.IsDeadend(ref state)) - { - currentState = state; - i = pos; - finalStatePosition = NoMatchExists; - return 1; - } - - // We successfully transitioned, so update our current input index to match. - pos++; - - // Now that currentState and our position are coherent, check if currentState represents an initial state. - // If it does, we exit out in order to allow our find optimizations to kick in to hopefully more quickly - // find the next possible starting location. 
- if (TStateHandler.IsInitialState(ref state)) - { - currentState = state; - i = pos; - finalStatePosition = 0; - return 0; + // Apply effects for finishing at the stored end state + endState.Node.ApplyEffects((effect, args) => args.Registers.ApplyEffect(effect, args.Pos), + CharKind.Context(endState.PrevCharKind, GetCharKind(input, iEnd)), (Registers: endRegisters, Pos: iEnd)); + return endRegisters; } } - - currentState = state; - i = pos; - finalStatePosition = 0; - return -1; + Debug.Fail("No nullable state found in the set of end states"); + return default; } [MethodImpl(MethodImplOptions.AggressiveInlining)] diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs index 0a1668f5348c1b2fcc46e020f773059443c9cb44..dc62260b9e8ec252dbcf318da708fe6bfc4f5445 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs @@ -382,7 +382,7 @@ public bool IsNothing Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Empty, null, SymbolicRegexInfo.Create()); internal static SymbolicRegexNode CreateTrue(SymbolicRegexBuilder builder) => - Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Full, null, SymbolicRegexInfo.Create(containsSomeCharacter: true)); + Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Full, null, SymbolicRegexInfo.Create()); internal static SymbolicRegexNode CreateFixedLengthMarker(SymbolicRegexBuilder builder, int length) => Create(builder, SymbolicRegexNodeKind.FixedLengthMarker, null, null, length, -1, default, null, SymbolicRegexInfo.Create(isAlwaysNullable: true)); @@ -399,19 +399,22 @@ internal 
static SymbolicRegexNode CreateBeginEndAnchor(SymbolicRegexBuilde SymbolicRegexNodeKind.BeginningAnchor or SymbolicRegexNodeKind.EndAnchor or SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor); - return Create(builder, kind, null, null, -1, -1, default, null, SymbolicRegexInfo.Create(startsWithLineAnchor: true, canBeNullable: true)); + return Create(builder, kind, null, null, -1, -1, default, null, SymbolicRegexInfo.Create(startsWithSomeAnchor: true, canBeNullable: true, + startsWithLineAnchor: kind is + SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or + SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor)); } internal static SymbolicRegexNode CreateBoundaryAnchor(SymbolicRegexBuilder builder, SymbolicRegexNodeKind kind) { Debug.Assert(kind is SymbolicRegexNodeKind.BoundaryAnchor or SymbolicRegexNodeKind.NonBoundaryAnchor); - return Create(builder, kind, null, null, -1, -1, default, null, SymbolicRegexInfo.Create(startsWithBoundaryAnchor: true, canBeNullable: true)); + return Create(builder, kind, null, null, -1, -1, default, null, SymbolicRegexInfo.Create(startsWithSomeAnchor: true, canBeNullable: true)); } #endregion internal static SymbolicRegexNode CreateSingleton(SymbolicRegexBuilder builder, TSet set) => - Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, set, null, SymbolicRegexInfo.Create(containsSomeCharacter: !set.Equals(builder._solver.Empty))); + Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, set, null, SymbolicRegexInfo.Create()); internal static SymbolicRegexNode CreateLoop(SymbolicRegexBuilder builder, SymbolicRegexNode body, int lower, int upper, bool isLazy) { @@ -589,40 +592,6 @@ internal static SymbolicRegexNode OrderedOr(SymbolicRegexBuilder bui Debug.Assert(left._kind != SymbolicRegexNodeKind.OrderedOr); Debug.Assert(deduplicated); - // Apply the counter 
subsumption/combining optimization if possible - (SymbolicRegexNode loop, SymbolicRegexNode rest) = left.FirstCounterInfo(); - if (loop != builder._nothing) - { - Debug.Assert(loop._kind == SymbolicRegexNodeKind.Loop && loop._left is not null); - (SymbolicRegexNode otherLoop, SymbolicRegexNode otherRest) = right.FirstCounterInfo(); - if (otherLoop != builder._nothing && rest == otherRest) - { - // Found two adjacent counters with the same continuation, check that the loops are equivalent apart from bounds - // and that the bounds form a contiguous interval. Two integer intervals [x1,x2] and [y1,y2] overlap when - // x1 <= y2 and y1 <= x2. The union of intervals that just touch is still contiguous, e.g. [2,5] and [6,10] make - // [2,10], so the lower bounds are decremented by 1 in the check. - Debug.Assert(otherLoop._kind == SymbolicRegexNodeKind.Loop && otherLoop._left is not null); - if (loop._left == otherLoop._left && loop.IsLazy == otherLoop.IsLazy && - loop._lower - 1 <= otherLoop._upper && otherLoop._lower - 1 <= loop._upper) - { - // Loops are equivalent apart from bounds, and the union of the bounds is a contiguous interval - // Build a new counter for the union of the ranges - SymbolicRegexNode newCounter = CreateConcat(builder, CreateLoop(builder, loop._left, - Math.Min(loop._lower, otherLoop._lower), Math.Max(loop._upper, otherLoop._upper), loop.IsLazy), rest); - if (right._kind == SymbolicRegexNodeKind.OrderedOr) - { - // The right counter came from an or, so include the rest of that or - Debug.Assert(right._right is not null); - return OrderedOr(builder, newCounter, right._right, deduplicated: true); - } - else - { - return newCounter; - } - } - } - } - // Counter optimization did not apply, just build the or return Create(builder, SymbolicRegexNodeKind.OrderedOr, left, right, -1, -1, default, null, SymbolicRegexInfo.Or(left._info, right._info)); } @@ -1052,6 +1021,8 @@ internal List<(SymbolicRegexNode, DerivativeEffect[])> CreateNfaDerivative private 
void AddTransitions(TSet elem, uint context, List<(SymbolicRegexNode, DerivativeEffect[])> transitions, List> continuation, Stack? effects, bool simulateBacktracking) { + Debug.Assert(!_builder._solver.IsEmpty(elem), "False element or minterm should not make it into derivative construction."); + // Helper function for concatenating a head node and a list of continuation nodes. The continuation nodes // are added in reverse order and the function below uses the list as a stack, so the nodes added to the // stack first end up at the tail of the concatenation. diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index 99f458ad5b20e55d7f93f1aa3ec881a301f46b06..6e939ec1e50fd686f2f9bb1c384dac3873a11688 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -754,6 +754,9 @@ static IEnumerable<(string Pattern, string Input, RegexOptions Options, int Begi yield return (@".*?\dFo{2}", "This1Foo should 2fOo match", RegexOptions.IgnoreCase, 0, 26, true, "This1Foo"); yield return (@".*?\dfoo", "1fooThis1FOO should 1foo match", RegexOptions.IgnoreCase, 4, 9, true, "This1FOO"); + // Earliest match, not match with earliest end + yield return (@".{5}Foo|Bar", "FooBarFoo", RegexOptions.None, 1, 8, true, "ooBarFoo"); + if (!RegexHelpers.IsNonBacktracking(engine)) { // RightToLeft