未验证 提交 6ebcb3d6 编写于 作者: O Olli Saarikivi 提交者: GitHub

NonBacktracking locking fixes and cleanup (#71234)

* Concurrency fixes and refactoring for clarity

Removed builder reference from SymbolicRegexNode instances; builder now
has to be passed in. Since the builder is not thread safe this clarifies
the locking required in the matcher when using it.
Moved matching specific state from the builder to the matcher. This
includes state and transition arrays.
Simplify character kind code by eliminating duplication of logic.

* Changes from review and cleanup

DfaMatchingState is now just MatchingState

* Comment on NFA mode IDs
上级 8cd9d636
......@@ -63,7 +63,7 @@
<Compile Include="System\Text\RegularExpressions\Symbolic\CharKind.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\CharSetSolver.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\DerivativeEffect.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\DfaMatchingState.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\MatchingState.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\DoublyLinkedList.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\ISolver.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\MintermClassifier.cs" />
......@@ -75,6 +75,7 @@
<Compile Include="System\Text\RegularExpressions\Symbolic\SymbolicRegexNode.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\SymbolicRegexKind.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\SymbolicRegexInfo.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\SymbolicRegexMatcher.Automata.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\SymbolicRegexMatcher.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\SymbolicRegexMatcher.Dgml.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\SymbolicRegexMatcher.Explore.cs" />
......
......@@ -43,5 +43,8 @@ internal static class CharKind
WordLetter => @"\w",
_ => string.Empty,
};
/// <summary>Returns whether the given value is in the range of valid character kinds.</summary>
internal static bool IsValidCharKind(uint charKind) => charKind < CharKindCount;
}
}
......@@ -8,135 +8,104 @@
namespace System.Text.RegularExpressions.Symbolic
{
/// <summary>Captures a state of a DFA explored during matching.</summary>
internal sealed class DfaMatchingState<TSet> where TSet : IComparable<TSet>, IEquatable<TSet>
/// <summary>Captures a state explored during matching.</summary>
internal sealed class MatchingState<TSet> where TSet : IComparable<TSet>, IEquatable<TSet>
{
internal DfaMatchingState(SymbolicRegexNode<TSet> node, uint prevCharKind)
internal MatchingState(SymbolicRegexNode<TSet> node, uint prevCharKind)
{
Node = node;
PrevCharKind = prevCharKind;
}
/// <summary>The regular expression that labels this state and gives it its semantics.</summary>
internal SymbolicRegexNode<TSet> Node { get; }
/// <summary>
/// The kind of the previous character in the input. The <see cref="SymbolicRegexMatcher{TSet}"/> is responsible
/// for ensuring that in all uses of this state this invariant holds by both selecting initial states accordingly
/// and transitioning on each character to states that match that character's kind.
/// </summary>
/// <remarks>
/// Tracking this information is an optimization that allows each transition taken in the matcher to only depend
/// on the next character (and its kind). In general, the transitions from a state with anchors in its pattern
/// depend on both the previous and the next character. Creating distinct states for each kind of the previous
/// character embeds the necessary information about the previous character into the state space of the automaton.
/// However, this does incur a memory overhead due to the duplication of states. For patterns with no anchors
/// this will always be set to <see cref="CharKind.General"/>, which can reduce the number of states created.
///
/// The performance effect of this optimization has not been investigated. If this optimization were removed, the
/// transition logic would in turn have to become more complicated for derivatives that depend on the nullability
/// of anchors. Care should be taken to not slow down transitions without anchors involved.
/// </remarks>
internal uint PrevCharKind { get; }
/// <summary>
/// A unique identifier for this state, which is used in <see cref="SymbolicRegexMatcher{TSet}"/> to index into
/// state information and transition arrays. Valid IDs are always >= 1.
/// </summary>
internal int Id { get; set; }
/// <summary>This is a deadend state</summary>
internal bool IsDeadend => Node.IsNothing;
/// <summary>Whether this state is known to be a dead end, i.e. no nullable states are reachable from here.</summary>
internal bool IsDeadend(ISolver<TSet> solver) => Node.IsNothing(solver);
/// <summary>The node must be nullable here</summary>
/// <summary>
/// Returns the fixed length that any match ending with this state must have, or -1 if there is no such
/// fixed length, <see cref="SymbolicRegexNode{TSet}.ResolveFixedLength(uint)"/>. The context is defined
/// by <see cref="PrevCharKind"/> of this state and the given nextCharKind. The node must be nullable here.
/// </summary>
internal int FixedLength(uint nextCharKind)
{
Debug.Assert(nextCharKind is 0 or CharKind.BeginningEnd or CharKind.Newline or CharKind.WordLetter or CharKind.NewLineS);
Debug.Assert(IsNullableFor(nextCharKind));
Debug.Assert(CharKind.IsValidCharKind(nextCharKind));
uint context = CharKind.Context(PrevCharKind, nextCharKind);
return Node.ResolveFixedLength(context);
}
/// <summary>If true then the state is a dead-end, rejects all inputs.</summary>
internal bool IsNothing => Node.IsNothing;
/// <summary>If true then state starts with a ^ or $ or \Z</summary>
internal bool StartsWithLineAnchor => Node._info.StartsWithLineAnchor;
/// <summary>
/// Translates a minterm set to a character kind, which is a general categorization of characters used
/// for cheaply deciding the nullability of anchors.
/// </summary>
/// <remarks>
/// An empty set is handled as a special case to indicate the very last \n.
/// </remarks>
/// <param name="minterm">the minterm to translate</param>
/// <returns>the character kind of the minterm</returns>
private uint GetNextCharKind(ref TSet minterm)
{
ISolver<TSet> solver = Node._builder._solver;
TSet wordLetterPredicate = Node._builder._wordLetterForBoundariesSet;
TSet newLinePredicate = Node._builder._newLineSet;
// minterm == solver.False is used to represent the very last \n
uint nextCharKind = CharKind.General;
if (solver.Empty.Equals(minterm))
{
nextCharKind = CharKind.NewLineS;
minterm = newLinePredicate;
}
else if (newLinePredicate.Equals(minterm))
{
// If the previous state was the start state, mark this as the very FIRST \n.
// Essentially, this looks the same as the very last \n and is used to nullify
// rev(\Z) in the conext of a reversed automaton.
nextCharKind = PrevCharKind == CharKind.BeginningEnd ?
CharKind.NewLineS :
CharKind.Newline;
}
else if (!solver.IsEmpty(solver.And(wordLetterPredicate, minterm)))
{
nextCharKind = CharKind.WordLetter;
}
return nextCharKind;
}
/// <summary>
/// Compute the target state for the given input minterm.
/// If <paramref name="minterm"/> is False this means that this is \n and it is the last character of the input.
/// </summary>
/// <param name="builder">the builder that owns <see cref="Node"/></param>
/// <param name="minterm">minterm corresponding to some input character or False corresponding to last \n</param>
internal DfaMatchingState<TSet> Next(TSet minterm)
/// <param name="nextCharKind"></param>
internal SymbolicRegexNode<TSet> Next(SymbolicRegexBuilder<TSet> builder, TSet minterm, uint nextCharKind)
{
uint nextCharKind = GetNextCharKind(ref minterm);
// Combined character context
uint context = CharKind.Context(PrevCharKind, nextCharKind);
// Compute the derivative of the node for the given context
SymbolicRegexNode<TSet> derivative = Node.CreateDerivativeWithoutEffects(minterm, context);
// nextCharKind will be the PrevCharKind of the target state
// use an existing state instead if one exists already
// otherwise create a new new id for it
return Node._builder.CreateState(derivative, nextCharKind, capturing: false);
return Node.CreateDerivativeWithoutEffects(builder, minterm, context);
}
/// <summary>
/// Compute a set of transitions for the given minterm.
/// </summary>
/// <param name="builder">the builder that owns <see cref="Node"/></param>
/// <param name="minterm">minterm corresponding to some input character or False corresponding to last \n</param>
/// <param name="nextCharKind"></param>
/// <returns>an enumeration of the transitions as pairs of the target state and a list of effects to be applied</returns>
internal List<(DfaMatchingState<TSet> State, DerivativeEffect[] Effects)> NfaNextWithEffects(TSet minterm)
internal List<(SymbolicRegexNode<TSet> Node, DerivativeEffect[] Effects)> NfaNextWithEffects(SymbolicRegexBuilder<TSet> builder, TSet minterm, uint nextCharKind)
{
uint nextCharKind = GetNextCharKind(ref minterm);
// Combined character context
uint context = CharKind.Context(PrevCharKind, nextCharKind);
// Compute the transitions for the given context
List<(SymbolicRegexNode<TSet>, DerivativeEffect[])> nodesAndEffects = Node.CreateNfaDerivativeWithEffects(minterm, context);
var list = new List<(DfaMatchingState<TSet> State, DerivativeEffect[] Effects)>();
foreach ((SymbolicRegexNode<TSet> node, DerivativeEffect[]? effects) in nodesAndEffects)
{
// nextCharKind will be the PrevCharKind of the target state
// use an existing state instead if one exists already
// otherwise create a new new id for it
DfaMatchingState<TSet> state = Node._builder.CreateState(node, nextCharKind, capturing: true);
if (!state.IsDeadend)
list.Add((state, effects));
}
return list;
return Node.CreateNfaDerivativeWithEffects(builder, minterm, context);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal bool IsNullableFor(uint nextCharKind)
{
Debug.Assert(nextCharKind is 0 or CharKind.BeginningEnd or CharKind.Newline or CharKind.WordLetter or CharKind.NewLineS);
Debug.Assert(CharKind.IsValidCharKind(nextCharKind));
uint context = CharKind.Context(PrevCharKind, nextCharKind);
return Node.IsNullableFor(context);
}
public override bool Equals(object? obj) =>
obj is DfaMatchingState<TSet> s && PrevCharKind == s.PrevCharKind && Node.Equals(s.Node);
obj is MatchingState<TSet> s && PrevCharKind == s.PrevCharKind && Node.Equals(s.Node);
public override int GetHashCode() => (PrevCharKind, Node).GetHashCode();
......
......@@ -240,12 +240,12 @@ static string UnexpectedNodeType(RegexNode node)
SymbolicRegexNode<BDD> elem = childResult.Count == 1 ?
childResult.FirstElement :
_builder.CreateConcatAlreadyReversed(childResult);
if (elem.IsNothing)
if (elem.IsNothing(_builder._solver))
{
continue;
}
or = elem.IsAnyStar ?
or = elem.IsAnyStar(_builder._solver) ?
elem : // .* is the absorbing element
SymbolicRegexNode<BDD>.CreateAlternate(_builder, elem, or);
}
......
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Diagnostics;
namespace System.Text.RegularExpressions.Symbolic
{
/// <summary>Misc information of structural properties of a <see cref="SymbolicRegexNode{S}"/> that is computed bottom up.</summary>
......@@ -14,54 +16,34 @@ namespace System.Text.RegularExpressions.Symbolic
private const uint StartsWithSomeAnchorMask = 32;
private const uint IsHighPriorityNullableMask = 64;
private const uint ContainsEffectMask = 128;
private const uint ContainsLineAnchorMask = 256;
private readonly uint _info;
private SymbolicRegexInfo(uint i) => _info = i;
internal static SymbolicRegexInfo Create(
private static SymbolicRegexInfo Create(
bool isAlwaysNullable = false, bool canBeNullable = false,
bool startsWithLineAnchor = false, bool startsWithSomeAnchor = false, bool containsSomeAnchor = false,
bool startsWithLineAnchor = false, bool containsLineAnchor = false,
bool startsWithSomeAnchor = false, bool containsSomeAnchor = false,
bool isHighPriorityNullable = false, bool containsEffect = false)
{
uint i = 0;
if (canBeNullable || isAlwaysNullable)
{
i |= CanBeNullableMask;
if (isAlwaysNullable)
{
i |= IsAlwaysNullableMask;
}
}
if (containsSomeAnchor || startsWithLineAnchor || startsWithSomeAnchor)
{
i |= ContainsSomeAnchorMask;
if (startsWithLineAnchor)
{
i |= StartsWithLineAnchorMask;
}
if (startsWithLineAnchor || startsWithSomeAnchor)
{
i |= StartsWithSomeAnchorMask;
}
}
if (isHighPriorityNullable)
{
i |= IsHighPriorityNullableMask;
}
if (containsEffect)
{
i |= ContainsEffectMask;
}
return new SymbolicRegexInfo(i);
// Assert that the expected implications hold. For example, every node that contains a line anchor
// must also be marked as containing some anchor.
Debug.Assert(!isAlwaysNullable || canBeNullable);
Debug.Assert(!startsWithLineAnchor || containsLineAnchor);
Debug.Assert(!startsWithLineAnchor || startsWithSomeAnchor);
Debug.Assert(!containsLineAnchor || containsSomeAnchor);
Debug.Assert(!startsWithSomeAnchor || containsSomeAnchor);
return new SymbolicRegexInfo(
(isAlwaysNullable ? IsAlwaysNullableMask : 0) |
(canBeNullable ? CanBeNullableMask : 0) |
(startsWithLineAnchor ? StartsWithLineAnchorMask : 0) |
(containsLineAnchor ? ContainsLineAnchorMask : 0) |
(startsWithSomeAnchor ? StartsWithSomeAnchorMask : 0) |
(containsSomeAnchor ? ContainsSomeAnchorMask : 0) |
(isHighPriorityNullable ? IsHighPriorityNullableMask : 0) |
(containsEffect ? ContainsEffectMask : 0));
}
public bool IsNullable => (_info & IsAlwaysNullableMask) != 0;
......@@ -70,6 +52,8 @@ namespace System.Text.RegularExpressions.Symbolic
public bool StartsWithLineAnchor => (_info & StartsWithLineAnchorMask) != 0;
public bool ContainsLineAnchor => (_info & ContainsLineAnchorMask) != 0;
public bool StartsWithSomeAnchor => (_info & StartsWithSomeAnchorMask) != 0;
public bool ContainsSomeAnchor => (_info & ContainsSomeAnchorMask) != 0;
......@@ -80,6 +64,27 @@ namespace System.Text.RegularExpressions.Symbolic
public bool ContainsEffect => (_info & ContainsEffectMask) != 0;
/// <summary>
/// Used for any node that acts as an epsilon, i.e., something that always matches the empty string.
/// </summary>
public static SymbolicRegexInfo Epsilon() =>
Create(
isAlwaysNullable: true,
canBeNullable: true,
isHighPriorityNullable: true);
/// <summary>
/// Used for all anchors.
/// </summary>
/// <param name="isLineAnchor">whether this anchor is a line anchor</param>
public static SymbolicRegexInfo Anchor(bool isLineAnchor) =>
Create(
canBeNullable: true,
startsWithLineAnchor: isLineAnchor,
containsLineAnchor: isLineAnchor,
startsWithSomeAnchor: true,
containsSomeAnchor: true);
/// <summary>
/// The alternation remains high priority nullable if the left alternative is so.
/// All other info properties are the logical disjunction of the resepctive info properties
......@@ -90,6 +95,7 @@ namespace System.Text.RegularExpressions.Symbolic
isAlwaysNullable: left_info.IsNullable || right_info.IsNullable,
canBeNullable: left_info.CanBeNullable || right_info.CanBeNullable,
startsWithLineAnchor: left_info.StartsWithLineAnchor || right_info.StartsWithLineAnchor,
containsLineAnchor: left_info.ContainsLineAnchor || right_info.ContainsLineAnchor,
startsWithSomeAnchor: left_info.StartsWithSomeAnchor || right_info.StartsWithSomeAnchor,
containsSomeAnchor: left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor,
isHighPriorityNullable: left_info.IsHighPriorityNullable,
......@@ -105,6 +111,7 @@ namespace System.Text.RegularExpressions.Symbolic
isAlwaysNullable: left_info.IsNullable && right_info.IsNullable,
canBeNullable: left_info.CanBeNullable && right_info.CanBeNullable,
startsWithLineAnchor: left_info.StartsWithLineAnchor || (left_info.CanBeNullable && right_info.StartsWithLineAnchor),
containsLineAnchor: left_info.ContainsLineAnchor || right_info.ContainsLineAnchor,
startsWithSomeAnchor: left_info.StartsWithSomeAnchor || (left_info.CanBeNullable && right_info.StartsWithSomeAnchor),
containsSomeAnchor: left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor,
isHighPriorityNullable: left_info.IsHighPriorityNullable && right_info.IsHighPriorityNullable,
......
......@@ -48,7 +48,7 @@ internal enum SymbolicRegexNodeKind
/// <summary>Effects to be applied when taking a transition.</summary>
/// <remarks>
/// Left child is the pattern itself and the right child is a concatenation of nodes whose effects should be applied.
/// Effect nodes are created in the rule for concatenation in <see cref="SymbolicRegexNode{TSet}.CreateDerivative(TSet, uint)"/>,
/// Effect nodes are created in the rule for concatenation in <see cref="SymbolicRegexNode{TSet}.CreateDerivative(SymbolicRegexBuilder{TSet}, TSet, uint)"/>,
/// where they are used to represent additional operations that should be performed in the current position if
/// the pattern in the left child is used to match the input. Since these Effect nodes are relative to the current
/// position in the input, the effects from the right child must be applied in the transition that the derivative is
......
......@@ -16,89 +16,91 @@ internal sealed partial class SymbolicRegexMatcher<TSet>
[ExcludeFromCodeCoverage(Justification = "Currently only used for testing")]
public override void Explore(bool includeDotStarred, bool includeReverse, bool includeOriginal, bool exploreDfa, bool exploreNfa)
{
Debug.Assert(_builder._minterms is not null);
// Track seen states to avoid exploring twice
HashSet<DfaMatchingState<TSet>> seen = new();
// Use a queue for unexplored states
// This results in a breadth-first exploration
Queue<DfaMatchingState<TSet>> toExplore = new();
lock (this)
{
// Track seen states to avoid exploring twice
HashSet<MatchingState<TSet>> seen = new();
// Use a queue for unexplored states
// This results in a breadth-first exploration
Queue<MatchingState<TSet>> toExplore = new();
// Explore all initial states as requested
if (includeDotStarred)
EnqueueAll(_dotstarredInitialStates, seen, toExplore);
if (includeReverse)
EnqueueAll(_reverseInitialStates, seen, toExplore);
if (includeOriginal)
EnqueueAll(_initialStates, seen, toExplore);
// Explore all initial states as requested
if (includeDotStarred)
EnqueueAll(_dotstarredInitialStates, seen, toExplore);
if (includeReverse)
EnqueueAll(_reverseInitialStates, seen, toExplore);
if (includeOriginal)
EnqueueAll(_initialStates, seen, toExplore);
if (exploreDfa)
{
while (toExplore.Count > 0)
if (exploreDfa)
{
// Don't dequeue yet, because a transition might fail
DfaMatchingState<TSet> state = toExplore.Peek();
// Include the special minterm for the last end-of-line if the state is sensitive to it
int maxMinterm = state.StartsWithLineAnchor ? _builder._minterms.Length : _builder._minterms.Length - 1;
// Explore successor states for each minterm
for (int mintermId = 0; mintermId <= maxMinterm; ++mintermId)
while (toExplore.Count > 0)
{
int offset = (state.Id << _builder._mintermsLog) | mintermId;
if (!_builder.TryCreateNewTransition(state, mintermId, offset, true, out DfaMatchingState<TSet>? nextState))
goto DfaLimitReached;
EnqueueIfUnseen(nextState, seen, toExplore);
// Don't dequeue yet, because a transition might fail
MatchingState<TSet> state = toExplore.Peek();
// Include the special minterm for the last end-of-line if the state is sensitive to it
int maxMinterm = state.StartsWithLineAnchor ? _minterms!.Length : _minterms!.Length - 1;
// Explore successor states for each minterm
for (int mintermId = 0; mintermId <= maxMinterm; ++mintermId)
{
int offset = DeltaOffset(state.Id, mintermId);
if (!TryCreateNewTransition(state, mintermId, offset, true, out MatchingState<TSet>? nextState))
goto DfaLimitReached;
EnqueueIfUnseen(nextState, seen, toExplore);
}
// Safe to dequeue now that the state has been completely handled
toExplore.Dequeue();
}
// Safe to dequeue now that the state has been completely handled
toExplore.Dequeue();
}
}
DfaLimitReached:
if (exploreNfa && toExplore.Count > 0)
{
// DFA states are broken up into NFA states when they are alternations
DfaMatchingState<TSet>[] toBreakUp = toExplore.ToArray();
toExplore.Clear();
foreach (DfaMatchingState<TSet> dfaState in toBreakUp)
DfaLimitReached:
if (exploreNfa && toExplore.Count > 0)
{
// Remove state from seen so that it can be added back in if necessary
seen.Remove(dfaState);
// Enqueue all elements of a top level alternation or the state itself
foreach (var element in dfaState.Node.EnumerateAlternationBranches())
// DFA states are broken up into NFA states when they are alternations
MatchingState<TSet>[] toBreakUp = toExplore.ToArray();
toExplore.Clear();
foreach (MatchingState<TSet> dfaState in toBreakUp)
{
int nfaState = _builder.CreateNfaState(element, dfaState.PrevCharKind);
EnqueueIfUnseen(_builder.GetCoreState(nfaState), seen, toExplore);
// Remove state from seen so that it can be added back in if necessary
seen.Remove(dfaState);
// Enqueue all elements of a top level alternation or the state itself
ForEachNfaState(dfaState.Node, dfaState.PrevCharKind, (this, seen, toExplore),
static (int nfaId, (SymbolicRegexMatcher<TSet> Matcher, HashSet<MatchingState<TSet>> Seen, Queue<MatchingState<TSet>> ToExplore) args) =>
{
MatchingState<TSet>? coreState = args.Matcher.GetState(args.Matcher.GetCoreStateId(nfaId));
EnqueueIfUnseen(coreState, args.Seen, args.ToExplore);
});
}
}
while (toExplore.Count > 0)
{
// NFA transitions can't fail, so its safe to dequeue here
DfaMatchingState<TSet> state = toExplore.Dequeue();
// Include the special minterm for the last end-of-line if the state is sensitive to it
int maxMinterm = state.StartsWithLineAnchor ? _builder._minterms.Length : _builder._minterms.Length - 1;
// Explore successor states for each minterm
for (int mintermId = 0; mintermId <= maxMinterm; ++mintermId)
while (toExplore.Count > 0)
{
int nfaOffset = (_builder._nfaStateArrayInverse[state.Id] << _builder._mintermsLog) | mintermId;
int[] nextNfaStates = _builder.CreateNewNfaTransition(_builder._nfaStateArrayInverse[state.Id], mintermId, nfaOffset);
foreach (int nextNfaState in nextNfaStates)
// NFA transitions can't fail, so its safe to dequeue here
MatchingState<TSet> state = toExplore.Dequeue();
// Include the special minterm for the last end-of-line if the state is sensitive to it
int maxMinterm = state.StartsWithLineAnchor ? _minterms.Length : _minterms.Length - 1;
// Explore successor states for each minterm
for (int mintermId = 0; mintermId <= maxMinterm; ++mintermId)
{
EnqueueIfUnseen(_builder.GetCoreState(nextNfaState), seen, toExplore);
int nfaOffset = DeltaOffset(_nfaIdByCoreId[state.Id], mintermId);
int[] nextNfaStates = CreateNewNfaTransition(_nfaIdByCoreId[state.Id], mintermId, nfaOffset);
foreach (int nextNfaState in nextNfaStates)
{
EnqueueIfUnseen(GetState(GetCoreStateId(nextNfaState)), seen, toExplore);
}
}
}
}
}
static void EnqueueAll(DfaMatchingState<TSet>[] states, HashSet<DfaMatchingState<TSet>> seen, Queue<DfaMatchingState<TSet>> toExplore)
static void EnqueueAll(MatchingState<TSet>[] states, HashSet<MatchingState<TSet>> seen, Queue<MatchingState<TSet>> toExplore)
{
foreach (DfaMatchingState<TSet> state in states)
foreach (MatchingState<TSet> state in states)
{
EnqueueIfUnseen(state, seen, toExplore);
}
}
static void EnqueueIfUnseen(DfaMatchingState<TSet> state, HashSet<DfaMatchingState<TSet>> seen, Queue<DfaMatchingState<TSet>> queue)
static void EnqueueIfUnseen(MatchingState<TSet> state, HashSet<MatchingState<TSet>> seen, Queue<MatchingState<TSet>> queue)
{
if (seen.Add(state))
{
......
......@@ -30,133 +30,134 @@ internal sealed partial class SymbolicRegexMatcher<TSet>
[ExcludeFromCodeCoverage(Justification = "Currently only used for testing")]
public override IEnumerable<string> SampleMatches(int k, int randomseed)
{
// Zero is treated as no seed, instead using a system provided one
Random random = randomseed != 0 ? new Random(randomseed) : new Random();
ISolver<TSet> solver = _builder._solver;
CharSetSolver charSetSolver = _builder._charSetSolver;
lock (this)
{
// Zero is treated as no seed, instead using a system provided one
Random random = randomseed != 0 ? new Random(randomseed) : new Random();
CharSetSolver charSetSolver = _builder._charSetSolver;
// Create helper BDDs for handling anchors and preferentially generating ASCII inputs
BDD asciiWordCharacters = charSetSolver.Or(new BDD[] {
// Create helper BDDs for handling anchors and preferentially generating ASCII inputs
BDD asciiWordCharacters = charSetSolver.Or(new BDD[] {
charSetSolver.CreateBDDFromRange('A', 'Z'),
charSetSolver.CreateBDDFromRange('a', 'z'),
charSetSolver.CreateBDDFromChar('_'),
charSetSolver.CreateBDDFromRange('0', '9')});
// Visible ASCII range for input character generation
BDD ascii = charSetSolver.CreateBDDFromRange('\x20', '\x7E');
BDD asciiNonWordCharacters = charSetSolver.And(ascii, charSetSolver.Not(asciiWordCharacters));
// Set up two sets of minterms, one with the additional special minterm for the last end-of-line
Debug.Assert(_builder._minterms is not null);
int[] mintermIdsWithoutZ = new int[_builder._minterms.Length];
int[] mintermIdsWithZ = new int[_builder._minterms.Length + 1];
for (int i = 0; i < _builder._minterms.Length; ++i)
{
mintermIdsWithoutZ[i] = i;
mintermIdsWithZ[i] = i;
}
mintermIdsWithZ[_builder._minterms.Length] = _builder._minterms.Length;
for (int i = 0; i < k; i++)
{
// Holds the generated input so far
StringBuilder inputSoFar = new();
StringBuilder? latestCandidate = null;
// Visible ASCII range for input character generation
BDD ascii = charSetSolver.CreateBDDFromRange('\x20', '\x7E');
BDD asciiNonWordCharacters = charSetSolver.And(ascii, charSetSolver.Not(asciiWordCharacters));
// Set up two sets of minterms, one with the additional special minterm for the last end-of-line
Debug.Assert(_minterms is not null);
int[] mintermIdsWithoutZ = new int[_minterms.Length];
int[] mintermIdsWithZ = new int[_minterms.Length + 1];
for (int i = 0; i < _minterms.Length; ++i)
{
mintermIdsWithoutZ[i] = i;
mintermIdsWithZ[i] = i;
}
mintermIdsWithZ[_minterms.Length] = _minterms.Length;
// Current set of states reached initially contains just the root
NfaMatchingState states = new(_builder);
// Here one could also consider previous characters for example for \b, \B, and ^ anchors
// and initialize inputSoFar accordingly
states.InitializeFrom(_initialStates[GetCharKind(ReadOnlySpan<char>.Empty, -1)]);
CurrentState statesWrapper = new(states);
for (int i = 0; i < k; i++)
{
// Holds the generated input so far
StringBuilder inputSoFar = new();
StringBuilder? latestCandidate = null;
// Used for end suffixes
List<string> possibleEndings = new();
// Current set of states reached initially contains just the root
NfaMatchingState states = new();
// Here one could also consider previous characters for example for \b, \B, and ^ anchors
// and initialize inputSoFar accordingly
states.InitializeFrom(this, _initialStates[GetCharKind<FullInputReader>(ReadOnlySpan<char>.Empty, -1)]);
CurrentState statesWrapper = new(states);
while (true)
{
Debug.Assert(states.NfaStateSet.Count > 0);
// Used for end suffixes
List<string> possibleEndings = new();
// Gather the possible endings for satisfying nullability
possibleEndings.Clear();
if (NfaStateHandler.CanBeNullable(ref statesWrapper))
while (true)
{
// Unconditionally final state or end of the input due to \Z anchor for example
if (NfaStateHandler.IsNullable(ref statesWrapper) ||
NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.BeginningEnd))
{
possibleEndings.Add("");
}
Debug.Assert(states.NfaStateSet.Count > 0);
// End of line due to end-of-line anchor
if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.Newline))
// Gather the possible endings for satisfying nullability
possibleEndings.Clear();
if (SymbolicRegexMatcher<TSet>.NfaStateHandler.CanBeNullable(this, in statesWrapper))
{
possibleEndings.Add("\n");
// Unconditionally final state or end of the input due to \Z anchor for example
if (SymbolicRegexMatcher<TSet>.NfaStateHandler.IsNullable(this, in statesWrapper) ||
SymbolicRegexMatcher<TSet>.NfaStateHandler.IsNullableFor(this, in statesWrapper, CharKind.BeginningEnd))
{
possibleEndings.Add("");
}
// End of line due to end-of-line anchor
if (SymbolicRegexMatcher<TSet>.NfaStateHandler.IsNullableFor(this, in statesWrapper, CharKind.Newline))
{
possibleEndings.Add("\n");
}
// Related to wordborder due to \b or \B
if (SymbolicRegexMatcher<TSet>.NfaStateHandler.IsNullableFor(this, in statesWrapper, CharKind.WordLetter))
{
possibleEndings.Add(ChooseChar(random, asciiWordCharacters, ascii, charSetSolver).ToString());
}
// Related to wordborder due to \b or \B
if (SymbolicRegexMatcher<TSet>.NfaStateHandler.IsNullableFor(this, in statesWrapper, CharKind.General))
{
possibleEndings.Add(ChooseChar(random, asciiNonWordCharacters, ascii, charSetSolver).ToString());
}
}
// Related to wordborder due to \b or \B
if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.WordLetter))
// If we have a possible ending, then store a candidate input
if (possibleEndings.Count > 0)
{
possibleEndings.Add(ChooseChar(random, asciiWordCharacters, ascii, charSetSolver).ToString());
latestCandidate ??= new();
latestCandidate.Clear();
latestCandidate.Append(inputSoFar);
//Choose some suffix that allows some anchor (if any) to be nullable
latestCandidate.Append(Choose(random, possibleEndings));
// Choose to stop here based on a coin-toss
if (FlipBiasedCoin(random, SampleMatchesStoppingProbability))
{
yield return latestCandidate.ToString();
break;
}
}
// Related to wordborder due to \b or \B
if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.General))
// Shuffle the minterms, including the last end-of-line marker if appropriate
int[] mintermIds = SymbolicRegexMatcher<TSet>.NfaStateHandler.StartsWithLineAnchor(this, in statesWrapper) ?
Shuffle(random, mintermIdsWithZ) :
Shuffle(random, mintermIdsWithoutZ);
foreach (int mintermId in mintermIds)
{
possibleEndings.Add(ChooseChar(random, asciiNonWordCharacters, ascii, charSetSolver).ToString());
bool success = SymbolicRegexMatcher<TSet>.NfaStateHandler.TryTakeTransition(this, ref statesWrapper, mintermId);
Debug.Assert(success);
if (states.NfaStateSet.Count > 0)
{
TSet minterm = GetMintermFromId(mintermId);
// Append a random member of the minterm
inputSoFar.Append(ChooseChar(random, ToBDD(minterm, Solver, charSetSolver), ascii, charSetSolver));
break;
}
else
{
// The transition was a dead end, undo and continue to try another minterm
NfaStateHandler.UndoTransition(ref statesWrapper);
}
}
}
// If we have a possible ending, then store a candidate input
if (possibleEndings.Count > 0)
{
latestCandidate ??= new();
latestCandidate.Clear();
latestCandidate.Append(inputSoFar);
//Choose some suffix that allows some anchor (if any) to be nullable
latestCandidate.Append(Choose(random, possibleEndings));
// Choose to stop here based on a coin-toss
if (FlipBiasedCoin(random, SampleMatchesStoppingProbability))
// In the case that there are no next states or input has become too large: stop here
if (states.NfaStateSet.Count == 0 || inputSoFar.Length > SampleMatchesMaxInputLength)
{
yield return latestCandidate.ToString();
// Ending up here without an ending is unlikely but possible for example for infeasible patterns
// such as @"no\bway" or due to poor choice of c -- no anchor is enabled -- so this is a deadend.
if (latestCandidate != null)
{
yield return latestCandidate.ToString();
}
break;
}
}
// Shuffle the minterms, including the last end-of-line marker if appropriate
int[] mintermIds = NfaStateHandler.StartsWithLineAnchor(_builder, ref statesWrapper) ?
Shuffle(random, mintermIdsWithZ) :
Shuffle(random, mintermIdsWithoutZ);
foreach (int mintermId in mintermIds)
{
bool success = NfaStateHandler.TakeTransition(_builder, ref statesWrapper, mintermId);
Debug.Assert(success);
if (states.NfaStateSet.Count > 0)
{
TSet minterm = _builder.GetMinterm(mintermId);
// Append a random member of the minterm
inputSoFar.Append(ChooseChar(random, ToBDD(minterm, solver, charSetSolver), ascii, charSetSolver));
break;
}
else
{
// The transition was a dead end, undo and continue to try another minterm
NfaStateHandler.UndoTransition(ref statesWrapper);
}
}
// In the case that there are no next states or input has become too large: stop here
if (states.NfaStateSet.Count == 0 || inputSoFar.Length > SampleMatchesMaxInputLength)
{
// Ending up here without an ending is unlikely but possible for example for infeasible patterns
// such as @"no\bway" or due to poor choice of c -- no anchor is enabled -- so this is a deadend.
if (latestCandidate != null)
{
yield return latestCandidate.ToString();
}
break;
}
}
}
......
......@@ -37,8 +37,8 @@ public SymbolicRegexRunnerFactory(RegexTree regexTree, RegexOptions options, Tim
}
}
rootNode = rootNode.AddFixedLengthMarkers();
BDD[] minterms = rootNode.ComputeMinterms();
rootNode = rootNode.AddFixedLengthMarkers(bddBuilder);
BDD[] minterms = rootNode.ComputeMinterms(bddBuilder);
_matcher = minterms.Length > 64 ?
SymbolicRegexMatcher<BitVector>.Create(regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new BitVectorSolver(minterms, charSetSolver), matchTimeout) :
......
......@@ -64,6 +64,38 @@ public static bool TryEnsureSufficientExecutionStack()
.ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default)
.GetAwaiter().GetResult();
/// <summary>Calls the provided function on the stack of a different thread pool thread.</summary>
/// <typeparam name="TArg1">The type of the first argument to pass to the function.</typeparam>
/// <typeparam name="TArg2">The type of the second argument to pass to the function.</typeparam>
/// <typeparam name="TArg3">The type of the third argument to pass to the function.</typeparam>
/// <typeparam name="TArg4">The type of the fourth argument to pass to the function.</typeparam>
/// <param name="action">The action to invoke.</param>
/// <param name="arg1">The first argument to pass to the action.</param>
/// <param name="arg2">The second argument to pass to the action.</param>
/// <param name="arg3">The third argument to pass to the action.</param>
/// <param name="arg4">The fourth argument to pass to the action.</param>
public static void CallOnEmptyStack<TArg1, TArg2, TArg3, TArg4>(Action<TArg1, TArg2, TArg3, TArg4> action, TArg1 arg1, TArg2 arg2, TArg3 arg3, TArg4 arg4) =>
Task.Run(() => action(arg1, arg2, arg3, arg4))
.ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default)
.GetAwaiter().GetResult();
/// <summary>Calls the provided function on the stack of a different thread pool thread.</summary>
/// <typeparam name="TArg1">The type of the first argument to pass to the function.</typeparam>
/// <typeparam name="TArg2">The type of the second argument to pass to the function.</typeparam>
/// <typeparam name="TArg3">The type of the third argument to pass to the function.</typeparam>
/// <typeparam name="TArg4">The type of the fourth argument to pass to the function.</typeparam>
/// <typeparam name="TArg5">The type of the fifth argument to pass to the function.</typeparam>
/// <param name="action">The action to invoke.</param>
/// <param name="arg1">The first argument to pass to the action.</param>
/// <param name="arg2">The second argument to pass to the action.</param>
/// <param name="arg3">The third argument to pass to the action.</param>
/// <param name="arg4">The fourth argument to pass to the action.</param>
/// <param name="arg5">The fifth argument to pass to the action.</param>
public static void CallOnEmptyStack<TArg1, TArg2, TArg3, TArg4, TArg5>(Action<TArg1, TArg2, TArg3, TArg4, TArg5> action, TArg1 arg1, TArg2 arg2, TArg3 arg3, TArg4 arg4, TArg5 arg5) =>
Task.Run(() => action(arg1, arg2, arg3, arg4, arg5))
.ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default)
.GetAwaiter().GetResult();
/// <summary>Calls the provided function on the stack of a different thread pool thread.</summary>
/// <typeparam name="TArg1">The type of the first argument to pass to the function.</typeparam>
/// <typeparam name="TArg2">The type of the second argument to pass to the function.</typeparam>
......@@ -126,5 +158,21 @@ public static bool TryEnsureSufficientExecutionStack()
Task.Run(() => func(arg1, arg2, arg3))
.ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default)
.GetAwaiter().GetResult();
/// <summary>Calls the provided function on the stack of a different thread pool thread.</summary>
/// <typeparam name="TArg1">The type of the first argument to pass to the function.</typeparam>
/// <typeparam name="TArg2">The type of the second argument to pass to the function.</typeparam>
/// <typeparam name="TArg3">The type of the third argument to pass to the function.</typeparam>
/// <typeparam name="TArg4">The type of the fourth argument to pass to the function.</typeparam>
/// <typeparam name="TResult">The return type of the function.</typeparam>
/// <param name="func">The function to invoke.</param>
/// <param name="arg1">The first argument to pass to the function.</param>
/// <param name="arg2">The second argument to pass to the function.</param>
/// <param name="arg3">The third argument to pass to the function.</param>
/// <param name="arg4">The fourth argument to pass to the function.</param>
public static TResult CallOnEmptyStack<TArg1, TArg2, TArg3, TArg4, TResult>(Func<TArg1, TArg2, TArg3, TArg4, TResult> func, TArg1 arg1, TArg2 arg2, TArg3 arg3, TArg4 arg4) =>
Task.Run(() => func(arg1, arg2, arg3, arg4))
.ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default)
.GetAwaiter().GetResult();
}
}
......@@ -86,7 +86,7 @@ public static IEnumerable<object[]> SafeThresholdTests_MemberData()
{
RegexNode tree = RegexParser.Parse(Pattern, options | RegexOptions.ExplicitCapture, CultureInfo.CurrentCulture).Root;
SymbolicRegexNode<BDD> rootNode = converter.ConvertToSymbolicRegexNode(tree);
yield return new object[] { rootNode, ExpectedSafeSize };
yield return new object[] { bddBuilder, rootNode, ExpectedSafeSize };
}
// add .*? in front of the pattern, this adds 1 more NFA state
......@@ -94,7 +94,7 @@ public static IEnumerable<object[]> SafeThresholdTests_MemberData()
{
RegexNode tree = RegexParser.Parse(".*?" + Pattern, options | RegexOptions.ExplicitCapture, CultureInfo.CurrentCulture).Root;
SymbolicRegexNode<BDD> rootNode = converter.ConvertToSymbolicRegexNode(tree);
yield return new object[] { rootNode, 1 + ExpectedSafeSize};
yield return new object[] { bddBuilder, rootNode, 1 + ExpectedSafeSize};
}
// use of anchors increases the estimate by 5x in general but in reality much less, at most 3x
......@@ -102,7 +102,7 @@ public static IEnumerable<object[]> SafeThresholdTests_MemberData()
{
RegexNode tree = RegexParser.Parse(Pattern + "$", options | RegexOptions.ExplicitCapture, CultureInfo.CurrentCulture).Root;
SymbolicRegexNode<BDD> rootNode = converter.ConvertToSymbolicRegexNode(tree);
yield return new object[] { rootNode, 5 * ExpectedSafeSize };
yield return new object[] { bddBuilder, rootNode, 5 * ExpectedSafeSize };
}
// use of captures has no effect on the estimations
......@@ -110,31 +110,32 @@ public static IEnumerable<object[]> SafeThresholdTests_MemberData()
{
RegexNode tree = RegexParser.Parse(Pattern, options, CultureInfo.CurrentCulture).Root;
SymbolicRegexNode<BDD> rootNode = converter.ConvertToSymbolicRegexNode(tree);
yield return new object[] { rootNode, ExpectedSafeSize };
yield return new object[] { bddBuilder, rootNode, ExpectedSafeSize };
}
}
[Theory]
[MemberData(nameof(SafeThresholdTests_MemberData))]
public void SafeThresholdTests(object obj, int expectedSafeSize)
public void SafeThresholdTests(object builderObj, object nodeObj, int expectedSafeSize)
{
SymbolicRegexNode<BDD> node = (SymbolicRegexNode<BDD>)obj;
SymbolicRegexBuilder<BDD> builder = (SymbolicRegexBuilder<BDD>)builderObj;
SymbolicRegexNode<BDD> node = (SymbolicRegexNode<BDD>)nodeObj;
int safeSize = node.EstimateNfaSize();
Assert.Equal(expectedSafeSize, safeSize);
int nfaStateCount = CalculateNfaStateCount(node);
int nfaStateCount = CalculateNfaStateCount(builder, node);
Assert.True(nfaStateCount <= expectedSafeSize);
}
/// <summary>
/// Compute the closure of all NFA states from root and return the size of the resulting state space.
/// </summary>
private static int CalculateNfaStateCount(SymbolicRegexNode<BDD> root)
private static int CalculateNfaStateCount(SymbolicRegexBuilder<BDD> builder, SymbolicRegexNode<BDD> root)
{
// Here we are actually using the original BDD algebra (not converting to the BV or Uint64 algebra)
// because it does not matter which algebra we use here (this matters only for performance)
HashSet<(uint, SymbolicRegexNode<BDD>)> states = new();
Stack<(uint, SymbolicRegexNode<BDD>)> frontier = new();
List<BDD> minterms = MintermGenerator<BDD>.GenerateMinterms(root._builder._solver, root.GetSets());
List<BDD> minterms = MintermGenerator<BDD>.GenerateMinterms(builder._solver, root.GetSets(builder));
// Start from the initial state that has kind 'General' when no anchors are being used, else kind 'BeginningEnd'
(uint, SymbolicRegexNode<BDD>) initialState = (root._info.ContainsSomeAnchor ? CharKind.BeginningEnd : CharKind.General, root);
......@@ -150,7 +151,7 @@ private static int CalculateNfaStateCount(SymbolicRegexNode<BDD> root)
foreach (BDD minterm in minterms)
{
uint kind = GetCharKind(minterm);
SymbolicRegexNode<BDD> target = source.Node.CreateDerivativeWithoutEffects(minterm, source.Kind);
SymbolicRegexNode<BDD> target = source.Node.CreateDerivativeWithoutEffects(builder, minterm, source.Kind);
//In the case of an NFA all the different alternatives in the DFA state become individual states themselves
foreach (SymbolicRegexNode<BDD> node in GetAlternatives(target))
......@@ -169,7 +170,7 @@ private static int CalculateNfaStateCount(SymbolicRegexNode<BDD> root)
return states.Count;
// Enumerates the alternatives from a node, for eaxmple (ab|(bc|cd)) has three alternatives
static IEnumerable<SymbolicRegexNode<BDD>> GetAlternatives(SymbolicRegexNode<BDD> node)
IEnumerable<SymbolicRegexNode<BDD>> GetAlternatives(SymbolicRegexNode<BDD> node)
{
if (node._kind == SymbolicRegexNodeKind.Alternate)
{
......@@ -178,7 +179,7 @@ static IEnumerable<SymbolicRegexNode<BDD>> GetAlternatives(SymbolicRegexNode<BDD
foreach (SymbolicRegexNode<BDD> elem in GetAlternatives(node._right!))
yield return elem;
}
else if (!node.IsNothing) // omit deadend states
else if (!node.IsNothing(builder._solver)) // omit deadend states
{
yield return node;
}
......@@ -187,8 +188,8 @@ static IEnumerable<SymbolicRegexNode<BDD>> GetAlternatives(SymbolicRegexNode<BDD
// Simplified character kind calculation that omits the special case that minterm can be the very last \n
// This omission has practically no effect of the size of the state space, but would complicate the logic
uint GetCharKind(BDD minterm) =>
minterm.Equals(root._builder._newLineSet) ? CharKind.Newline : // is \n
(!root._builder._solver.IsEmpty(root._builder._solver.And(root._builder._wordLetterForBoundariesSet, minterm)) ?
minterm.Equals(builder._newLineSet) ? CharKind.Newline : // is \n
(!builder._solver.IsEmpty(builder._solver.And(builder._wordLetterForBoundariesSet, minterm)) ?
CharKind.WordLetter : // in \w
CharKind.General); // anything else, thus in particular in \W
}
......
......@@ -51,7 +51,7 @@
<Compile Include="..\..\src\System\Text\RegularExpressions\Symbolic\CharKind.cs" Link="Production\CharKind.cs" />
<Compile Include="..\..\src\System\Text\RegularExpressions\Symbolic\CharSetSolver.cs" Link="Production\CharSetSolver.cs" />
<Compile Include="..\..\src\System\Text\RegularExpressions\Symbolic\DerivativeEffect.cs" Link="Production\DerivativeEffect.cs" />
<Compile Include="..\..\src\System\Text\RegularExpressions\Symbolic\DfaMatchingState.cs" Link="Production\DfaMatchingState.cs" />
<Compile Include="..\..\src\System\Text\RegularExpressions\Symbolic\MatchingState.cs" Link="Production\MatchingState.cs" />
<Compile Include="..\..\src\System\Text\RegularExpressions\Symbolic\DoublyLinkedList.cs" Link="Production\DoublyLinkedList.cs" />
<Compile Include="..\..\src\System\Text\RegularExpressions\Symbolic\ISolver.cs" Link="Production\ISolver.cs" />
<Compile Include="..\..\src\System\Text\RegularExpressions\Symbolic\MintermGenerator.cs" Link="Production\MintermGenerator.cs" />
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册