未验证 提交 6ebcb3d6 编写于 作者: O Olli Saarikivi 提交者: GitHub

NonBacktracking locking fixes and cleanup (#71234)

* Concurrency fixes and refactoring for clarity

Removed builder reference from SymbolicRegexNode instances; builder now
has to be passed in. Since the builder is not thread safe this clarifies
the locking required in the matcher when using it.
Moved matching specific state from the builder to the matcher. This
includes state and transition arrays.
Simplify character kind code by eliminating duplication of logic.

* Changes from review and cleanup

DfaMatchingState is now just MatchingState

* Comment on NFA mode IDs
上级 8cd9d636
......@@ -63,7 +63,7 @@
<Compile Include="System\Text\RegularExpressions\Symbolic\CharKind.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\CharSetSolver.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\DerivativeEffect.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\DfaMatchingState.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\MatchingState.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\DoublyLinkedList.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\ISolver.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\MintermClassifier.cs" />
......@@ -75,6 +75,7 @@
<Compile Include="System\Text\RegularExpressions\Symbolic\SymbolicRegexNode.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\SymbolicRegexKind.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\SymbolicRegexInfo.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\SymbolicRegexMatcher.Automata.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\SymbolicRegexMatcher.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\SymbolicRegexMatcher.Dgml.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\SymbolicRegexMatcher.Explore.cs" />
......
......@@ -43,5 +43,8 @@ internal static class CharKind
WordLetter => @"\w",
_ => string.Empty,
};
/// <summary>
/// Checks that <paramref name="charKind"/> is strictly below <see cref="CharKindCount"/>, i.e. that it is
/// within the range of valid character kinds.
/// </summary>
/// <param name="charKind">the value to check</param>
/// <returns>true when the value is less than <see cref="CharKindCount"/>; otherwise false</returns>
internal static bool IsValidCharKind(uint charKind)
{
    return charKind < CharKindCount;
}
}
}
......@@ -8,135 +8,104 @@
namespace System.Text.RegularExpressions.Symbolic
{
/// <summary>Captures a state of a DFA explored during matching.</summary>
internal sealed class DfaMatchingState<TSet> where TSet : IComparable<TSet>, IEquatable<TSet>
/// <summary>Captures a state explored during matching.</summary>
internal sealed class MatchingState<TSet> where TSet : IComparable<TSet>, IEquatable<TSet>
{
internal DfaMatchingState(SymbolicRegexNode<TSet> node, uint prevCharKind)
internal MatchingState(SymbolicRegexNode<TSet> node, uint prevCharKind)
{
Node = node;
PrevCharKind = prevCharKind;
}
/// <summary>The regular expression that labels this state and gives it its semantics.</summary>
internal SymbolicRegexNode<TSet> Node { get; }
/// <summary>
/// The kind of the previous character in the input. The <see cref="SymbolicRegexMatcher{TSet}"/> is responsible
/// for ensuring that in all uses of this state this invariant holds by both selecting initial states accordingly
/// and transitioning on each character to states that match that character's kind.
/// </summary>
/// <remarks>
/// Tracking this information is an optimization that allows each transition taken in the matcher to only depend
/// on the next character (and its kind). In general, the transitions from a state with anchors in its pattern
/// depend on both the previous and the next character. Creating distinct states for each kind of the previous
/// character embeds the necessary information about the previous character into the state space of the automaton.
/// However, this does incur a memory overhead due to the duplication of states. For patterns with no anchors
/// this will always be set to <see cref="CharKind.General"/>, which can reduce the number of states created.
///
/// The performance effect of this optimization has not been investigated. If this optimization were removed, the
/// transition logic would in turn have to become more complicated for derivatives that depend on the nullability
/// of anchors. Care should be taken to not slow down transitions without anchors involved.
/// </remarks>
internal uint PrevCharKind { get; }
/// <summary>
/// A unique identifier for this state, which is used in <see cref="SymbolicRegexMatcher{TSet}"/> to index into
/// state information and transition arrays. Valid IDs are always >= 1.
/// </summary>
internal int Id { get; set; }
/// <summary>This is a deadend state</summary>
internal bool IsDeadend => Node.IsNothing;
/// <summary>Whether this state is known to be a dead end, i.e. no nullable states are reachable from here.</summary>
internal bool IsDeadend(ISolver<TSet> solver) => Node.IsNothing(solver);
/// <summary>The node must be nullable here</summary>
/// <summary>
/// Returns the fixed length that any match ending with this state must have, or -1 if there is no such
/// fixed length, <see cref="SymbolicRegexNode{TSet}.ResolveFixedLength(uint)"/>. The context is defined
/// by <see cref="PrevCharKind"/> of this state and the given nextCharKind. The node must be nullable here.
/// </summary>
internal int FixedLength(uint nextCharKind)
{
Debug.Assert(nextCharKind is 0 or CharKind.BeginningEnd or CharKind.Newline or CharKind.WordLetter or CharKind.NewLineS);
Debug.Assert(IsNullableFor(nextCharKind));
Debug.Assert(CharKind.IsValidCharKind(nextCharKind));
uint context = CharKind.Context(PrevCharKind, nextCharKind);
return Node.ResolveFixedLength(context);
}
/// <summary>If true then the state is a dead-end, rejects all inputs.</summary>
internal bool IsNothing => Node.IsNothing;
/// <summary>If true then state starts with a ^ or $ or \Z</summary>
internal bool StartsWithLineAnchor => Node._info.StartsWithLineAnchor;
/// <summary>
/// Classifies a minterm set as a character kind, the coarse categorization of characters that is used
/// for cheaply deciding the nullability of anchors.
/// </summary>
/// <remarks>
/// The empty set is treated as a sentinel for the very last \n. In that case <paramref name="minterm"/>
/// is overwritten with the builder's newline set before returning.
/// </remarks>
/// <param name="minterm">the minterm to classify; replaced by the newline set when it is the empty-set sentinel</param>
/// <returns>the character kind of the minterm</returns>
private uint GetNextCharKind(ref TSet minterm)
{
    var owner = Node._builder;
    ISolver<TSet> algebra = owner._solver;
    TSet wordLetters = owner._wordLetterForBoundariesSet;
    TSet newLine = owner._newLineSet;

    // The empty set (solver.False) stands for the very last \n of the input.
    if (algebra.Empty.Equals(minterm))
    {
        minterm = newLine;
        return CharKind.NewLineS;
    }

    if (newLine.Equals(minterm))
    {
        // Coming from the start state this is the very FIRST \n. It is given the same kind as the very
        // last \n, which is used to nullify rev(\Z) in the context of a reversed automaton.
        return PrevCharKind == CharKind.BeginningEnd ? CharKind.NewLineS : CharKind.Newline;
    }

    // Any overlap with the word-letter set makes this a word letter; everything else is general.
    bool overlapsWordLetters = !algebra.IsEmpty(algebra.And(wordLetters, minterm));
    return overlapsWordLetters ? CharKind.WordLetter : CharKind.General;
}
/// <summary>
/// Compute the target state for the given input minterm.
/// If <paramref name="minterm"/> is False this means that this is \n and it is the last character of the input.
/// </summary>
/// <param name="builder">the builder that owns <see cref="Node"/></param>
/// <param name="minterm">minterm corresponding to some input character or False corresponding to last \n</param>
internal DfaMatchingState<TSet> Next(TSet minterm)
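/// <param name="nextCharKind">the kind of the next input character, combined with <see cref="PrevCharKind"/> to form the context of the derivative</param>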
internal SymbolicRegexNode<TSet> Next(SymbolicRegexBuilder<TSet> builder, TSet minterm, uint nextCharKind)
{
uint nextCharKind = GetNextCharKind(ref minterm);
// Combined character context
uint context = CharKind.Context(PrevCharKind, nextCharKind);
// Compute the derivative of the node for the given context
SymbolicRegexNode<TSet> derivative = Node.CreateDerivativeWithoutEffects(minterm, context);
// nextCharKind will be the PrevCharKind of the target state
// use an existing state instead if one exists already
// otherwise create a new id for it
return Node._builder.CreateState(derivative, nextCharKind, capturing: false);
return Node.CreateDerivativeWithoutEffects(builder, minterm, context);
}
/// <summary>
/// Compute a set of transitions for the given minterm.
/// </summary>
/// <param name="builder">the builder that owns <see cref="Node"/></param>
/// <param name="minterm">minterm corresponding to some input character or False corresponding to last \n</param>
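/// <param name="nextCharKind">the kind of the next input character, combined with <see cref="PrevCharKind"/> to form the context of the derivative</param>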
/// <returns>an enumeration of the transitions as pairs of the target state and a list of effects to be applied</returns>
internal List<(DfaMatchingState<TSet> State, DerivativeEffect[] Effects)> NfaNextWithEffects(TSet minterm)
internal List<(SymbolicRegexNode<TSet> Node, DerivativeEffect[] Effects)> NfaNextWithEffects(SymbolicRegexBuilder<TSet> builder, TSet minterm, uint nextCharKind)
{
uint nextCharKind = GetNextCharKind(ref minterm);
// Combined character context
uint context = CharKind.Context(PrevCharKind, nextCharKind);
// Compute the transitions for the given context
List<(SymbolicRegexNode<TSet>, DerivativeEffect[])> nodesAndEffects = Node.CreateNfaDerivativeWithEffects(minterm, context);
var list = new List<(DfaMatchingState<TSet> State, DerivativeEffect[] Effects)>();
foreach ((SymbolicRegexNode<TSet> node, DerivativeEffect[]? effects) in nodesAndEffects)
{
// nextCharKind will be the PrevCharKind of the target state
// use an existing state instead if one exists already
// otherwise create a new id for it
DfaMatchingState<TSet> state = Node._builder.CreateState(node, nextCharKind, capturing: true);
if (!state.IsDeadend)
list.Add((state, effects));
}
return list;
return Node.CreateNfaDerivativeWithEffects(builder, minterm, context);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal bool IsNullableFor(uint nextCharKind)
{
Debug.Assert(nextCharKind is 0 or CharKind.BeginningEnd or CharKind.Newline or CharKind.WordLetter or CharKind.NewLineS);
Debug.Assert(CharKind.IsValidCharKind(nextCharKind));
uint context = CharKind.Context(PrevCharKind, nextCharKind);
return Node.IsNullableFor(context);
}
public override bool Equals(object? obj) =>
obj is DfaMatchingState<TSet> s && PrevCharKind == s.PrevCharKind && Node.Equals(s.Node);
obj is MatchingState<TSet> s && PrevCharKind == s.PrevCharKind && Node.Equals(s.Node);
public override int GetHashCode() => (PrevCharKind, Node).GetHashCode();
......
......@@ -240,12 +240,12 @@ static string UnexpectedNodeType(RegexNode node)
SymbolicRegexNode<BDD> elem = childResult.Count == 1 ?
childResult.FirstElement :
_builder.CreateConcatAlreadyReversed(childResult);
if (elem.IsNothing)
if (elem.IsNothing(_builder._solver))
{
continue;
}
or = elem.IsAnyStar ?
or = elem.IsAnyStar(_builder._solver) ?
elem : // .* is the absorbing element
SymbolicRegexNode<BDD>.CreateAlternate(_builder, elem, or);
}
......
......@@ -30,43 +30,34 @@ internal sealed class SymbolicRegexBuilder<TSet> where TSet : IComparable<TSet>,
internal SymbolicRegexNode<TSet> Epsilon => _epsilon ??= SymbolicRegexNode<TSet>.CreateEpsilon(this);
private SymbolicRegexNode<TSet>? _beginningAnchor;
internal SymbolicRegexNode<TSet> BeginningAnchor => _beginningAnchor ??= SymbolicRegexNode<TSet>.CreateBeginEndAnchor(this, SymbolicRegexNodeKind.BeginningAnchor);
internal SymbolicRegexNode<TSet> BeginningAnchor => _beginningAnchor ??= SymbolicRegexNode<TSet>.CreateAnchor(this, SymbolicRegexNodeKind.BeginningAnchor);
private SymbolicRegexNode<TSet>? _endAnchor;
internal SymbolicRegexNode<TSet> EndAnchor => _endAnchor ??= SymbolicRegexNode<TSet>.CreateBeginEndAnchor(this, SymbolicRegexNodeKind.EndAnchor);
internal SymbolicRegexNode<TSet> EndAnchor => _endAnchor ??= SymbolicRegexNode<TSet>.CreateAnchor(this, SymbolicRegexNodeKind.EndAnchor);
private SymbolicRegexNode<TSet>? _endAnchorZ;
internal SymbolicRegexNode<TSet> EndAnchorZ => _endAnchorZ ??= SymbolicRegexNode<TSet>.CreateBeginEndAnchor(this, SymbolicRegexNodeKind.EndAnchorZ);
internal SymbolicRegexNode<TSet> EndAnchorZ => _endAnchorZ ??= SymbolicRegexNode<TSet>.CreateAnchor(this, SymbolicRegexNodeKind.EndAnchorZ);
private SymbolicRegexNode<TSet>? _endAnchorZReverse;
internal SymbolicRegexNode<TSet> EndAnchorZReverse => _endAnchorZReverse ??= SymbolicRegexNode<TSet>.CreateBeginEndAnchor(this, SymbolicRegexNodeKind.EndAnchorZReverse);
internal SymbolicRegexNode<TSet> EndAnchorZReverse => _endAnchorZReverse ??= SymbolicRegexNode<TSet>.CreateAnchor(this, SymbolicRegexNodeKind.EndAnchorZReverse);
private SymbolicRegexNode<TSet>? _bolAnchor;
internal SymbolicRegexNode<TSet> BolAnchor => _bolAnchor ??= SymbolicRegexNode<TSet>.CreateBeginEndAnchor(this, SymbolicRegexNodeKind.BOLAnchor);
internal SymbolicRegexNode<TSet> BolAnchor => _bolAnchor ??= SymbolicRegexNode<TSet>.CreateAnchor(this, SymbolicRegexNodeKind.BOLAnchor);
private SymbolicRegexNode<TSet>? _eolAnchor;
internal SymbolicRegexNode<TSet> EolAnchor => _eolAnchor ??= SymbolicRegexNode<TSet>.CreateBeginEndAnchor(this, SymbolicRegexNodeKind.EOLAnchor);
internal SymbolicRegexNode<TSet> EolAnchor => _eolAnchor ??= SymbolicRegexNode<TSet>.CreateAnchor(this, SymbolicRegexNodeKind.EOLAnchor);
private SymbolicRegexNode<TSet>? _wbAnchor;
internal SymbolicRegexNode<TSet> BoundaryAnchor => _wbAnchor ??= SymbolicRegexNode<TSet>.CreateBoundaryAnchor(this, SymbolicRegexNodeKind.BoundaryAnchor);
internal SymbolicRegexNode<TSet> BoundaryAnchor => _wbAnchor ??= SymbolicRegexNode<TSet>.CreateAnchor(this, SymbolicRegexNodeKind.BoundaryAnchor);
private SymbolicRegexNode<TSet>? _nwbAnchor;
internal SymbolicRegexNode<TSet> NonBoundaryAnchor => _nwbAnchor ??= SymbolicRegexNode<TSet>.CreateBoundaryAnchor(this, SymbolicRegexNodeKind.NonBoundaryAnchor);
internal SymbolicRegexNode<TSet> NonBoundaryAnchor => _nwbAnchor ??= SymbolicRegexNode<TSet>.CreateAnchor(this, SymbolicRegexNodeKind.NonBoundaryAnchor);
internal TSet _wordLetterForBoundariesSet;
internal TSet _newLineSet;
/// <summary>Partition of the input space of sets.</summary>
internal TSet[]? _minterms;
private readonly Dictionary<TSet, SymbolicRegexNode<TSet>> _singletonCache = new();
// states that have been created
internal HashSet<DfaMatchingState<TSet>> _stateCache = new();
// capturing states that have been created
internal HashSet<DfaMatchingState<TSet>> _capturingStateCache = new();
/// <summary>
/// This cache is used in <see cref="SymbolicRegexNode{TSet}.Create"/> to keep all nodes associated with this builder
/// unique. This ensures that reference equality can be used for syntactic equality and that all shared subexpressions
......@@ -84,7 +75,7 @@ internal sealed class SymbolicRegexBuilder<TSet> where TSet : IComparable<TSet>,
// matching when simplification rules fail to eliminate the portions being walked over.
/// <summary>
/// Cache for <see cref="SymbolicRegexNode{TSet}.CreateDerivative(TSet, uint)"/> keyed by:
/// Cache for <see cref="SymbolicRegexNode{TSet}.CreateDerivative(SymbolicRegexBuilder{TSet}, TSet, uint)"/> keyed by:
/// -The node to derivate
/// -The character or minterm to take the derivative with
/// -The surrounding character context
......@@ -93,7 +84,7 @@ internal sealed class SymbolicRegexBuilder<TSet> where TSet : IComparable<TSet>,
internal readonly Dictionary<(SymbolicRegexNode<TSet>, TSet elem, uint context), SymbolicRegexNode<TSet>> _derivativeCache = new();
/// <summary>
/// Cache for <see cref="SymbolicRegexNode{TSet}.PruneLowerPriorityThanNullability(uint)"/> keyed by:
/// Cache for <see cref="SymbolicRegexNode{TSet}.PruneLowerPriorityThanNullability(SymbolicRegexBuilder{TSet}, uint)"/> keyed by:
/// -The node to prune
/// -The surrounding character context
/// The value is the pruned node.
......@@ -101,74 +92,13 @@ internal sealed class SymbolicRegexBuilder<TSet> where TSet : IComparable<TSet>,
internal readonly Dictionary<(SymbolicRegexNode<TSet>, uint), SymbolicRegexNode<TSet>> _pruneLowerPriorityThanNullabilityCache = new();
/// <summary>
/// Cache for <see cref="SymbolicRegexNode{TSet}.Subsumes(SymbolicRegexNode{TSet}, int)"/> keyed by:
/// Cache for <see cref="SymbolicRegexNode{TSet}.Subsumes(SymbolicRegexBuilder{TSet}, SymbolicRegexNode{TSet}, int)"/> keyed by:
/// -The node R potentially subsuming S
/// -The node S potentially being subsumed by R
/// The value indicates if subsumption is known to hold.
/// </summary>
internal readonly Dictionary<(SymbolicRegexNode<TSet>, SymbolicRegexNode<TSet>), bool> _subsumptionCache = new();
/// <summary>
/// Maps state ids to states, initial capacity is 1024 states.
/// Each time more states are needed the length is increased by 1024.
/// </summary>
internal DfaMatchingState<TSet>[]? _stateArray;
internal DfaMatchingState<TSet>[]? _capturingStateArray;
/// <summary>
/// Maps state IDs to context-independent information for all states in <see cref="_stateArray"/>.
/// </summary>
private ContextIndependentState[] _stateInfo = Array.Empty<ContextIndependentState>();
/// <summary>Bit flags holding the context-independent information that is available for every state.</summary>
[Flags]
private enum ContextIndependentState : byte
{
    IsInitial = 1 << 0,
    IsDeadend = 1 << 1,
    IsNullable = 1 << 2,
    CanBeNullable = 1 << 3,
}
/// <remarks>
/// For these "delta" arrays, technically Volatile.Read should be used to read out an element,
/// but in practice that's not needed on the runtimes in use (though that needs to be documented
/// via https://github.com/dotnet/runtime/issues/63474), and use of Volatile.Read is
/// contributing non-trivial overhead (https://github.com/dotnet/runtime/issues/65789).
/// </remarks>
internal int[]? _delta;
internal List<(DfaMatchingState<TSet>, DerivativeEffect[])>?[]? _capturingDelta;
private const int InitialStateLimit = 1024;
/// <summary>1 + Log2(_minterms.Length), the smallest k s.t. 2^k >= minterms.Length + 1</summary>
internal int _mintermsLog;
/// <summary>
/// Maps each NFA state id to the state id of the DfaMatchingState stored in _stateArray.
/// This map is used to compactly represent NFA state ids in NFA mode in order to utilize
/// the property that all NFA states are small integers in one interval.
/// The valid entries are 0 to <see cref="NfaStateCount"/>-1.
/// </summary>
internal int[] _nfaStateArray = Array.Empty<int>();
/// <summary>
/// Maps the id of a DfaMatchingState to the NFA state id that it is being identified with in the NFA.
/// It is the inverse of used entries in _nfaStateArray.
/// The range of this map is 0 to <see cref="NfaStateCount"/>-1.
/// </summary>
internal readonly Dictionary<int, int> _nfaStateArrayInverse = new();
/// <summary>Gets <see cref="_nfaStateArrayInverse"/>.Count</summary>
internal int NfaStateCount => _nfaStateArrayInverse.Count;
/// <summary>
/// Transition function for NFA transitions in NFA mode.
/// Each NFA entry maps to a list of NFA target states.
/// Each list of target states is without repetitions.
/// If the entry is null then the targets states have not been computed yet.
/// </summary>
internal int[]?[] _nfaDelta = Array.Empty<int[]>();
/// <summary>Create a new symbolic regex builder.</summary>
internal SymbolicRegexBuilder(ISolver<TSet> solver, CharSetSolver charSetSolver)
{
......@@ -176,24 +106,6 @@ internal SymbolicRegexBuilder(ISolver<TSet> solver, CharSetSolver charSetSolver)
_charSetSolver = charSetSolver;
_solver = solver;
// minterms = null if partition of the solver is undefined and returned as null
_minterms = solver.GetMinterms();
if (_minterms == null)
{
_mintermsLog = -1;
}
else
{
_stateArray = new DfaMatchingState<TSet>[InitialStateLimit];
_capturingStateArray = new DfaMatchingState<TSet>[InitialStateLimit];
_stateInfo = new ContextIndependentState[InitialStateLimit];
// the extra +1 slot with id minterms.Length is reserved for \Z (last occurrence of \n)
_mintermsLog = BitOperations.Log2((uint)_minterms.Length) + 1;
_delta = new int[InitialStateLimit << _mintermsLog];
_capturingDelta = new List<(DfaMatchingState<TSet>, DerivativeEffect[])>[InitialStateLimit << _mintermsLog];
}
// initialized to False but updated later to the actual condition only if \b or \B occurs anywhere in the regex
// this implies that if a regex never uses \b or \B then the character context will never
// update the previous character context to distinguish word and nonword letters
......@@ -213,94 +125,6 @@ internal SymbolicRegexBuilder(ISolver<TSet> solver, CharSetSolver charSetSolver)
_singletonCache[_solver.Full] = _anyChar;
}
/// <summary>Stores the context-independent flags of the given state in <see cref="_stateInfo"/>.</summary>
/// <param name="stateId">ID of the state; must be positive</param>
/// <param name="isInitial">whether the state is an initial state</param>
/// <param name="isDeadend">whether the state is a deadend</param>
/// <param name="isNullable">whether the state is nullable; only recorded when <paramref name="canBeNullable"/> is also set</param>
/// <param name="canBeNullable">whether the state can be nullable</param>
internal void SetStateInfo(int stateId, bool isInitial, bool isDeadend, bool isNullable, bool canBeNullable)
{
    Debug.Assert(stateId > 0);
    Debug.Assert(canBeNullable || !isNullable);

    ContextIndependentState initialBit = isInitial ? ContextIndependentState.IsInitial : default;
    ContextIndependentState deadendBit = isDeadend ? ContextIndependentState.IsDeadend : default;

    // The nullable bit is only ever stored together with the can-be-nullable bit.
    ContextIndependentState nullabilityBits = default;
    if (canBeNullable)
    {
        nullabilityBits = isNullable
            ? ContextIndependentState.CanBeNullable | ContextIndependentState.IsNullable
            : ContextIndependentState.CanBeNullable;
    }

    _stateInfo[stateId] = initialBit | deadendBit | nullabilityBits;
}
/// <summary>Reads back the context-independent flags stored in <see cref="_stateInfo"/> for the given state.</summary>
/// <param name="stateId">ID of the state; must be positive</param>
/// <returns>the four flags of the state, each unpacked into its own boolean</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(int stateId)
{
    Debug.Assert(stateId > 0);

    ContextIndependentState flags = _stateInfo[stateId];
    bool isInitial = (flags & ContextIndependentState.IsInitial) != 0;
    bool isDeadend = (flags & ContextIndependentState.IsDeadend) != 0;
    bool isNullable = (flags & ContextIndependentState.IsNullable) != 0;
    bool canBeNullable = (flags & ContextIndependentState.CanBeNullable) != 0;
    return (isInitial, isDeadend, isNullable, canBeNullable);
}
/// <summary>Resolves a minterm ID to the minterm it denotes.</summary>
/// <param name="mintermId">index into <see cref="_minterms"/>; any out-of-range value denotes \Z</param>
/// <returns>the minterm at that index, or the solver's empty set for an out-of-range ID</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal TSet GetMinterm(int mintermId)
{
    TSet[]? partition = _minterms;
    Debug.Assert(partition is not null);

    // A single unsigned comparison rejects negative IDs as well as IDs past the end.
    if ((uint)mintermId >= (uint)partition.Length)
    {
        // minterm=False represents \Z
        return _solver.Empty;
    }

    return partition[mintermId];
}
/// <summary>Gets the slice of <see cref="_delta"/> that may hold the transitions of the given state.</summary>
/// <param name="state">the state whose transition slots are requested</param>
/// <returns>
/// an empty span when <see cref="_delta"/> or <see cref="_minterms"/> is null; otherwise a span starting at
/// the state's first slot with one entry per minterm, plus one more when the state starts with a line anchor
/// </returns>
internal Span<int> GetDeltasFor(DfaMatchingState<TSet> state)
{
    if (_delta is not null && _minterms is not null)
    {
        // States starting with a line anchor get one slot beyond the minterms
        // (presumably the slot reserved for \Z - confirm against the transition code).
        int slotCount = _minterms.Length + (state.StartsWithLineAnchor ? 1 : 0);
        return _delta.AsSpan(state.Id << _mintermsLog, slotCount);
    }

    return default;
}
/// <summary>Gets the slice of <see cref="_nfaDelta"/> that may hold the NFA transitions of the given state.</summary>
/// <param name="state">the core state whose NFA transition slots are requested</param>
/// <returns>
/// an empty span when <see cref="_nfaDelta"/> or <see cref="_minterms"/> is null or when the state has no
/// NFA state ID in <see cref="_nfaStateArrayInverse"/>; otherwise a span with one entry per minterm, plus
/// one more when the state starts with a line anchor
/// </returns>
internal Span<int[]?> GetNfaDeltasFor(DfaMatchingState<TSet> state)
{
    if (_nfaDelta is not null &&
        _minterms is not null &&
        _nfaStateArrayInverse.TryGetValue(state.Id, out int nfaStateId))
    {
        // Same slot layout as the DFA table, but indexed by the NFA state ID.
        int slotCount = _minterms.Length + (state.StartsWithLineAnchor ? 1 : 0);
        return _nfaDelta.AsSpan(nfaStateId << _mintermsLog, slotCount);
    }

    return default;
}
/// <summary>
/// Make an alternation of given nodes, simplify by eliminating any regex that accepts no inputs
/// </summary>
......@@ -509,224 +333,5 @@ internal SymbolicRegexNode<TNewSet> Transform<TNewSet>(SymbolicRegexNode<TSet> n
return null;
}
}
/// <summary>
/// Gets the state for the given node and previous character kind, creating it if it is not cached yet.
/// The node's anchors are pruned before the lookup.
/// </summary>
/// <param name="node">the pattern that this state will represent</param>
/// <param name="prevCharKind">the kind of the character that led to this state</param>
/// <param name="capturing">whether to use the separate space of states with capturing transitions or not</param>
/// <param name="isInitialState">whether to mark the state as an initial state or not</param>
/// <returns>the cached state equal to the requested one, or the newly registered state</returns>
public DfaMatchingState<TSet> CreateState(SymbolicRegexNode<TSet> node, uint prevCharKind, bool capturing = false, bool isInitialState = false)
{
    TSet wordLetters = _wordLetterForBoundariesSet;
    TSet startSet = node.GetStartSet();
    bool canBeNullable = node.CanBeNullable;

    // The node may continue with a word letter (resp. a non-word letter) when it can be nullable
    // or when its start set overlaps with that class of characters.
    bool continuesWithWordLetter = canBeNullable || !_solver.IsEmpty(_solver.And(wordLetters, startSet));
    bool continuesWithNonWordLetter = canBeNullable || !_solver.IsEmpty(_solver.And(_solver.Not(wordLetters), startSet));

    SymbolicRegexNode<TSet> prunedNode = node.PruneAnchors(prevCharKind, continuesWithWordLetter, continuesWithNonWordLetter);
    var candidate = new DfaMatchingState<TSet>(prunedNode, prevCharKind);

    HashSet<DfaMatchingState<TSet>> cache = capturing ? _capturingStateCache : _stateCache;
    return cache.TryGetValue(candidate, out DfaMatchingState<TSet>? existing)
        ? existing
        : MakeNewState(candidate, capturing, isInitialState);
}
/// <summary>
/// Critical region that registers <paramref name="state"/> in the appropriate state cache, assigns its ID
/// and stores it in the matching state array, growing the arrays when they are full.
/// </summary>
/// <param name="state">the candidate state to register</param>
/// <param name="capturing">whether to use the capturing cache and arrays instead of the regular ones</param>
/// <param name="isInitialState">whether to mark the state as initial in its state info (non-capturing only)</param>
/// <returns>
/// <paramref name="state"/> once registered, or the equal state that is already in the cache when another
/// caller registered it first
/// </returns>
private DfaMatchingState<TSet> MakeNewState(DfaMatchingState<TSet> state, bool capturing, bool isInitialState)
{
    // NOTE(review): 'lock (this)' is kept because the other critical regions of this class lock the same
    // object; switching only this method to a private lock object would break their mutual exclusion.
    lock (this)
    {
        HashSet<DfaMatchingState<TSet>> cache = capturing ? _capturingStateCache : _stateCache;

        // The caller's cache lookup happens outside this lock, so an equal state may have been registered
        // in the meantime. Without this re-check Add would be a no-op, the ID below would duplicate that of
        // the most recently added state, and that state's array slot would be overwritten.
        if (cache.TryGetValue(state, out DfaMatchingState<TSet>? existing))
        {
            return existing;
        }

        cache.Add(state); // Add to cache first to make 1 the first state ID
        state.Id = cache.Count;

        Debug.Assert(_stateArray is not null && _capturingStateArray is not null);

        const int GrowthSize = 1024;
        if (capturing)
        {
            if (state.Id == _capturingStateArray.Length)
            {
                // Grow the state array and its transition table together so that they stay in step.
                int newsize = _capturingStateArray.Length + GrowthSize;
                Array.Resize(ref _capturingStateArray, newsize);
                Array.Resize(ref _capturingDelta, newsize << _mintermsLog);
            }
            _capturingStateArray[state.Id] = state;
        }
        else
        {
            if (state.Id == _stateArray.Length)
            {
                // Grow the state array, its transition table and the state info together.
                int newsize = _stateArray.Length + GrowthSize;
                Array.Resize(ref _stateArray, newsize);
                Array.Resize(ref _delta, newsize << _mintermsLog);
                Array.Resize(ref _stateInfo, newsize);
            }
            _stateArray[state.Id] = state;
            SetStateInfo(state.Id, isInitialState, state.IsDeadend, state.Node.IsNullable, state.Node.CanBeNullable);
        }

        return state;
    }
}
/// <summary>
/// Gets the NFA state ID for the given node and previous character kind, creating the underlying core
/// state and the NFA state when they do not exist yet.
/// </summary>
/// <param name="node">the pattern of the state; must not be an alternation</param>
/// <param name="prevCharKind">the kind of the character that led to this state</param>
/// <returns>the NFA state ID associated with the core state</returns>
public int CreateNfaState(SymbolicRegexNode<TSet> node, uint prevCharKind)
{
    Debug.Assert(node.Kind != SymbolicRegexNodeKind.Alternate);

    // The core state comes first; the NFA state ID is keyed by the core state's ID.
    int coreStateId = CreateState(node, prevCharKind).Id;

    return _nfaStateArrayInverse.TryGetValue(coreStateId, out int existingNfaStateId)
        ? existingNfaStateId
        : MakeNewNfaState(coreStateId);
}
/// <summary>Critical region that allocates the next NFA state ID for the given core state.</summary>
/// <param name="coreStateId">ID of the core state that the new NFA state stands for</param>
/// <returns>the newly assigned NFA state ID</returns>
private int MakeNewNfaState(int coreStateId)
{
    lock (this)
    {
        // The next free NFA ID equals the number of NFA states registered so far.
        int nfaStateId = NfaStateCount;

        if (nfaStateId == _nfaStateArray.Length)
        {
            // TBD: is 1024 reasonable?
            int grownLength = _nfaStateArray.Length + 1024;
            Array.Resize(ref _nfaStateArray, grownLength);
            Array.Resize(ref _nfaDelta, grownLength << _mintermsLog);
            // TBD: capturing
        }

        // Record the mapping in both directions.
        _nfaStateArray[nfaStateId] = coreStateId;
        _nfaStateArrayInverse[coreStateId] = nfaStateId;
        return nfaStateId;
    }
}
/// <summary>Looks up the ID of the core state that the given NFA state stands for.</summary>
/// <param name="nfaStateId">the NFA state ID to translate</param>
/// <returns>the entry of <see cref="_nfaStateArray"/> at that index</returns>
public int GetCoreStateId(int nfaStateId)
{
    Debug.Assert(_stateArray is not null);
    Debug.Assert(nfaStateId < _nfaStateArray.Length);

    int coreStateId = _nfaStateArray[nfaStateId];
    Debug.Assert(coreStateId < _stateArray.Length);
    return coreStateId;
}
/// <summary>Looks up the core state that the given NFA state stands for.</summary>
/// <param name="nfaStateId">the NFA state ID to translate</param>
/// <returns>the entry of <see cref="_stateArray"/> for the corresponding core state ID</returns>
public DfaMatchingState<TSet> GetCoreState(int nfaStateId)
{
    Debug.Assert(_stateArray is not null);

    DfaMatchingState<TSet>[] coreStates = _stateArray;
    return coreStates[GetCoreStateId(nfaStateId)];
}
/// <summary>
/// Gets or creates the core transition at the given offset without applying the state-count threshold,
/// by delegating to <see cref="TryCreateNewTransition"/> with the threshold check disabled.
/// </summary>
/// <param name="sourceState">the state the transition starts from</param>
/// <param name="mintermId">ID of the minterm the transition is taken on</param>
/// <param name="offset">index of the transition's entry in the transition table</param>
/// <returns>the target state of the transition</returns>
public DfaMatchingState<TSet> CreateNewTransition(DfaMatchingState<TSet> sourceState, int mintermId, int offset)
{
    _ = TryCreateNewTransition(sourceState, mintermId, offset, checkThreshold: false, out DfaMatchingState<TSet>? target);

    Debug.Assert(target is not null);
    return target;
}
/// <summary>
/// Gets the target state of the DFA transition at <paramref name="offset"/>, creating and recording it
/// if the entry is still undefined.
/// </summary>
/// <param name="sourceState">the state the transition starts from</param>
/// <param name="mintermId">ID of the minterm the transition is taken on</param>
/// <param name="offset">index of the transition's entry in <see cref="_delta"/></param>
/// <param name="checkThreshold">
/// whether to refuse creating the transition once the number of cached states has reached
/// <see cref="SymbolicRegexThresholds.NfaThreshold"/>
/// </param>
/// <param name="nextState">the target state, or null when the threshold check refused the creation</param>
/// <returns>true when a target state is returned; false when the threshold check refused the creation</returns>
public bool TryCreateNewTransition(
    DfaMatchingState<TSet> sourceState, int mintermId, int offset, bool checkThreshold, [NotNullWhen(true)] out DfaMatchingState<TSet>? nextState)
{
    Debug.Assert(_delta is not null && _stateArray is not null);

    lock (this)
    {
        Debug.Assert(offset < _delta.Length);

        // The entry may have become defined in the meantime, possibly by another thread.
        DfaMatchingState<TSet>? known = _stateArray[_delta[offset]];
        if (known is not null)
        {
            nextState = known;
            return true;
        }

        if (checkThreshold && _stateCache.Count >= SymbolicRegexThresholds.NfaThreshold)
        {
            nextState = null;
            return false;
        }

        // Compute the target, then publish its ID in the transition table.
        DfaMatchingState<TSet> created = sourceState.Next(GetMinterm(mintermId));
        Volatile.Write(ref _delta[offset], created.Id);
        nextState = created;
        return true;
    }
}
/// <summary>
/// Gets the NFA target state IDs stored at <paramref name="nfaOffset"/> in <see cref="_nfaDelta"/>, computing
/// and storing them first if the entry is still null. The work is done under a lock on this builder.
/// </summary>
/// <param name="nfaStateId">ID of the source NFA state</param>
/// <param name="mintermId">ID of the minterm the transition is taken on</param>
/// <param name="nfaOffset">index into <see cref="_nfaDelta"/> of the entry for this transition</param>
/// <returns>the target NFA state IDs; an empty array when the core target state is a deadend</returns>
public int[] CreateNewNfaTransition(int nfaStateId, int mintermId, int nfaOffset)
{
Debug.Assert(_delta is not null && _stateArray is not null);
lock (this)
{
Debug.Assert(nfaOffset < _nfaDelta.Length);
// Check whether the entry at nfaOffset has become defined in the meantime, possibly by another thread
int[]? targets = _nfaDelta[nfaOffset];
if (targets is null)
{
// Create the underlying transition from the core state corresponding to the NFA state
DfaMatchingState<TSet> coreState = GetCoreState(nfaStateId);
int coreOffset = (coreState.Id << _mintermsLog) | mintermId;
int coreTargetId = _delta[coreOffset];
// A positive ID means the core transition already exists; otherwise it is created now
DfaMatchingState<TSet>? coreTarget = coreTargetId > 0 ?
_stateArray[coreTargetId] : CreateNewTransition(coreState, mintermId, coreOffset);
// Look through a top-level DisableBacktrackingSimulation wrapper to inspect the node underneath
SymbolicRegexNode<TSet> node = coreTarget.Node.Kind == SymbolicRegexNodeKind.DisableBacktrackingSimulation ?
coreTarget.Node._left! : coreTarget.Node;
if (node.Kind == SymbolicRegexNodeKind.Alternate)
{
// Create separate NFA states for all members of a disjunction
// Here duplicate NFA states cannot arise because there are no duplicate nodes in the disjunction
List<SymbolicRegexNode<TSet>> alts = node.ToList(listKind: SymbolicRegexNodeKind.Alternate);
targets = new int[alts.Count];
int targetIndex = 0;
foreach (SymbolicRegexNode<TSet> q in alts)
{
Debug.Assert(!q.IsNothing);
// Re-wrap the element nodes in DisableBacktrackingSimulation if the top level node was too
SymbolicRegexNode<TSet> targetNode = coreTarget.Node.Kind == SymbolicRegexNodeKind.DisableBacktrackingSimulation ?
CreateDisableBacktrackingSimulation(q) : q;
targets[targetIndex++] = CreateNfaState(targetNode, coreTarget.PrevCharKind);
}
Debug.Assert(targetIndex == targets.Length);
}
else if (coreTarget.IsDeadend)
{
// Omit deadend states from the target list of states
// target list being empty means that the NFA state itself is a deadend
targets = Array.Empty<int>();
}
else
{
// Add the single NFA target state corresponding to the core target state
if (!_nfaStateArrayInverse.TryGetValue(coreTarget.Id, out int nfaTargetId))
{
nfaTargetId = MakeNewNfaState(coreTarget.Id);
}
targets = new[] { nfaTargetId };
}
// Store the computed targets in the table with a volatile write
Volatile.Write(ref _nfaDelta[nfaOffset], targets);
}
return targets;
}
}
}
}
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Diagnostics;
namespace System.Text.RegularExpressions.Symbolic
{
/// <summary>Misc information of structural properties of a <see cref="SymbolicRegexNode{S}"/> that is computed bottom up.</summary>
......@@ -14,54 +16,34 @@ namespace System.Text.RegularExpressions.Symbolic
private const uint StartsWithSomeAnchorMask = 32;
private const uint IsHighPriorityNullableMask = 64;
private const uint ContainsEffectMask = 128;
private const uint ContainsLineAnchorMask = 256;
private readonly uint _info;
private SymbolicRegexInfo(uint i) => _info = i;
internal static SymbolicRegexInfo Create(
private static SymbolicRegexInfo Create(
bool isAlwaysNullable = false, bool canBeNullable = false,
bool startsWithLineAnchor = false, bool startsWithSomeAnchor = false, bool containsSomeAnchor = false,
bool startsWithLineAnchor = false, bool containsLineAnchor = false,
bool startsWithSomeAnchor = false, bool containsSomeAnchor = false,
bool isHighPriorityNullable = false, bool containsEffect = false)
{
uint i = 0;
if (canBeNullable || isAlwaysNullable)
{
i |= CanBeNullableMask;
if (isAlwaysNullable)
{
i |= IsAlwaysNullableMask;
}
}
if (containsSomeAnchor || startsWithLineAnchor || startsWithSomeAnchor)
{
i |= ContainsSomeAnchorMask;
if (startsWithLineAnchor)
{
i |= StartsWithLineAnchorMask;
}
if (startsWithLineAnchor || startsWithSomeAnchor)
{
i |= StartsWithSomeAnchorMask;
}
}
if (isHighPriorityNullable)
{
i |= IsHighPriorityNullableMask;
}
if (containsEffect)
{
i |= ContainsEffectMask;
}
return new SymbolicRegexInfo(i);
// Assert that the expected implications hold. For example, every node that contains a line anchor
// must also be marked as containing some anchor.
Debug.Assert(!isAlwaysNullable || canBeNullable);
Debug.Assert(!startsWithLineAnchor || containsLineAnchor);
Debug.Assert(!startsWithLineAnchor || startsWithSomeAnchor);
Debug.Assert(!containsLineAnchor || containsSomeAnchor);
Debug.Assert(!startsWithSomeAnchor || containsSomeAnchor);
return new SymbolicRegexInfo(
(isAlwaysNullable ? IsAlwaysNullableMask : 0) |
(canBeNullable ? CanBeNullableMask : 0) |
(startsWithLineAnchor ? StartsWithLineAnchorMask : 0) |
(containsLineAnchor ? ContainsLineAnchorMask : 0) |
(startsWithSomeAnchor ? StartsWithSomeAnchorMask : 0) |
(containsSomeAnchor ? ContainsSomeAnchorMask : 0) |
(isHighPriorityNullable ? IsHighPriorityNullableMask : 0) |
(containsEffect ? ContainsEffectMask : 0));
}
public bool IsNullable => (_info & IsAlwaysNullableMask) != 0;
......@@ -70,6 +52,8 @@ namespace System.Text.RegularExpressions.Symbolic
public bool StartsWithLineAnchor => (_info & StartsWithLineAnchorMask) != 0;
public bool ContainsLineAnchor => (_info & ContainsLineAnchorMask) != 0;
public bool StartsWithSomeAnchor => (_info & StartsWithSomeAnchorMask) != 0;
public bool ContainsSomeAnchor => (_info & ContainsSomeAnchorMask) != 0;
......@@ -80,6 +64,27 @@ namespace System.Text.RegularExpressions.Symbolic
public bool ContainsEffect => (_info & ContainsEffectMask) != 0;
/// <summary>
/// Builds the info for a node that behaves like epsilon, i.e., one that always matches the empty string.
/// The result is marked as always nullable, possibly nullable and high priority nullable; no anchor or
/// effect flags are set.
/// </summary>
public static SymbolicRegexInfo Epsilon()
{
    return Create(
        isAlwaysNullable: true,
        canBeNullable: true,
        isHighPriorityNullable: true);
}
/// <summary>
/// Builds the info shared by all anchor nodes. Every anchor can be nullable and both starts with and
/// contains some anchor; the line anchor flags are set only when requested.
/// </summary>
/// <param name="isLineAnchor">whether this anchor is a line anchor</param>
public static SymbolicRegexInfo Anchor(bool isLineAnchor)
{
    return Create(
        canBeNullable: true,
        startsWithLineAnchor: isLineAnchor,
        containsLineAnchor: isLineAnchor,
        startsWithSomeAnchor: true,
        containsSomeAnchor: true);
}
/// <summary>
/// The alternation remains high priority nullable if the left alternative is so.
/// All other info properties are the logical disjunction of the respective info properties
......@@ -90,6 +95,7 @@ namespace System.Text.RegularExpressions.Symbolic
isAlwaysNullable: left_info.IsNullable || right_info.IsNullable,
canBeNullable: left_info.CanBeNullable || right_info.CanBeNullable,
startsWithLineAnchor: left_info.StartsWithLineAnchor || right_info.StartsWithLineAnchor,
containsLineAnchor: left_info.ContainsLineAnchor || right_info.ContainsLineAnchor,
startsWithSomeAnchor: left_info.StartsWithSomeAnchor || right_info.StartsWithSomeAnchor,
containsSomeAnchor: left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor,
isHighPriorityNullable: left_info.IsHighPriorityNullable,
......@@ -105,6 +111,7 @@ namespace System.Text.RegularExpressions.Symbolic
isAlwaysNullable: left_info.IsNullable && right_info.IsNullable,
canBeNullable: left_info.CanBeNullable && right_info.CanBeNullable,
startsWithLineAnchor: left_info.StartsWithLineAnchor || (left_info.CanBeNullable && right_info.StartsWithLineAnchor),
containsLineAnchor: left_info.ContainsLineAnchor || right_info.ContainsLineAnchor,
startsWithSomeAnchor: left_info.StartsWithSomeAnchor || (left_info.CanBeNullable && right_info.StartsWithSomeAnchor),
containsSomeAnchor: left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor,
isHighPriorityNullable: left_info.IsHighPriorityNullable && right_info.IsHighPriorityNullable,
......
......@@ -48,7 +48,7 @@ internal enum SymbolicRegexNodeKind
/// <summary>Effects to be applied when taking a transition.</summary>
/// <remarks>
/// Left child is the pattern itself and the right child is a concatenation of nodes whose effects should be applied.
/// Effect nodes are created in the rule for concatenation in <see cref="SymbolicRegexNode{TSet}.CreateDerivative(TSet, uint)"/>,
/// Effect nodes are created in the rule for concatenation in <see cref="SymbolicRegexNode{TSet}.CreateDerivative(SymbolicRegexBuilder{TSet}, TSet, uint)"/>,
/// where they are used to represent additional operations that should be performed in the current position if
/// the pattern in the left child is used to match the input. Since these Effect nodes are relative to the current
/// position in the input, the effects from the right child must be applied in the transition that the derivative is
......
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Threading;
namespace System.Text.RegularExpressions.Symbolic
{
internal sealed partial class SymbolicRegexMatcher<TSet>
{
/// <summary>
/// Initial capacity for DFA related arrays.
/// </summary>
private const int InitialDfaStateCapacity = 1024;
/// <summary>
/// Minimum capacity for NFA related arrays when the matcher first enters NFA mode. The arrays start out empty,
/// but are resized to this capacity upon first use.
/// </summary>
private const int InitialNfaStateCapacity = 64;
/// <summary>
/// Cache for the states that have been created. Each state is uniquely identified by its associated
/// <see cref="SymbolicRegexNode{TSet}"/> and the kind of the previous character.
/// </summary>
private readonly Dictionary<(SymbolicRegexNode<TSet> Node, uint PrevCharKind), MatchingState<TSet>> _stateCache = new();
/// <summary>
/// Maps state ids to states, initial capacity is given by <see cref="InitialDfaStateCapacity"/>.
/// Each time more states are needed the length is doubled.
/// The first valid state is at index 1.
/// </summary>
private MatchingState<TSet>?[] _stateArray;
/// <summary>
/// Maps state IDs to context-independent information for all states in <see cref="_stateArray"/>.
/// The first valid entry is at index 1.
/// </summary>
private ContextIndependentState[] _stateInfo;
/// <summary>Context-independent information available for every state.</summary>
/// <remarks>
/// One value is stored per state ID in <see cref="_stateInfo"/> and unpacked into booleans by
/// <see cref="GetStateInfo(int)"/>. Each member occupies its own bit so that members can be combined.
/// </remarks>
[Flags]
private enum ContextIndependentState : byte
{
    // The state was created as an initial state.
    IsInitial = 1 << 0,

    // The state was reported as a dead end when it was created.
    IsDeadend = 1 << 1,

    // The node of the state is nullable. Only ever set together with CanBeNullable.
    IsNullable = 1 << 2,

    // The node of the state can be nullable.
    CanBeNullable = 1 << 3,
}
/// <summary>
/// The transition function for DFA mode.
/// Each state has a range of consecutive entries for each minterm ID. A range of size 2^L, where L is
/// the number of bits required to represent the largest minterm ID <see cref="_mintermsLog"/>, is reserved
/// for each state. This makes indexing into this array not require a multiplication
/// <see cref="DeltaOffset(int, int)"/>, but does mean some unused space may be present.
/// The first valid state ID is 1.
/// </summary>
/// <remarks>
/// For these "delta" arrays, technically Volatile.Read should be used to read out an element,
/// but in practice that's not needed on the runtimes in use (though that needs to be documented
/// via https://github.com/dotnet/runtime/issues/63474), and use of Volatile.Read is
/// contributing non-trivial overhead (https://github.com/dotnet/runtime/issues/65789).
/// </remarks>
private int[] _dfaDelta;
/// <summary>
/// Maps each NFA state id to the state id of the MatchingState stored in _stateArray.
/// This map is used to compactly represent NFA state ids in NFA mode in order to utilize
/// the property that all NFA states are small integers in one interval.
/// The valid entries are 0 to the size of <see cref="_nfaIdByCoreId"/> - 1.
/// </summary>
private int[] _nfaCoreIdArray = Array.Empty<int>();
/// <summary>
/// Maps the id of a MatchingState to the NFA state id that it is being identified with in the NFA.
/// It is the inverse of the used entries in <see cref="_nfaCoreIdArray"/>.
/// The range of this map is 0 to its size - 1.
/// </summary>
private readonly Dictionary<int, int> _nfaIdByCoreId = new();
/// <summary>
/// Transition function for NFA transitions in NFA mode.
/// Each NFA entry maps to a list of NFA target states.
/// Each list of target states is without repetitions.
/// If the entry is null then the target states have not been computed yet.
/// </summary>
private int[]?[] _nfaDelta = Array.Empty<int[]>();
/// <summary>
/// The transition function for <see cref="FindSubcaptures(ReadOnlySpan{char}, int, int, PerThreadData)"/>,
/// which is an NFA mode with additional state to track capture start and end positions.
/// Each entry is an array of pairs of target state and effects to be applied when taking the transition.
/// If the entry is null then the transition has not been computed yet.
/// </summary>
private (int, DerivativeEffect[])[]?[] _capturingNfaDelta = Array.Empty<(int, DerivativeEffect[])[]?>();
/// <summary>
/// Grows an array in the manner of <see cref="Array.Resize"/>, but only makes the replacement array visible
/// through <paramref name="array"/> after all existing elements have been copied into it.
/// </summary>
/// <remarks>
/// This may not be strictly necessary for arrays of primitive or reference types (which have atomic
/// reads/writes), as when, e.g., <see cref="_dfaDelta"/> is found to not have an entry the array is checked again
/// after a lock on the matcher has been acquired. However, in a highly threaded use case it still seems better
/// to avoid unnecessarily causing other threads to acquire the lock.
/// </remarks>
/// <typeparam name="T">the element type of the array</typeparam>
/// <param name="array">the array to replace; the replacement is stored with a volatile write</param>
/// <param name="newSize">the length of the replacement array, expected to be at least the current length</param>
private static void ArrayResizeAndVolatilePublish<T>(ref T[] array, int newSize)
{
    T[] current = array;
    Debug.Assert(newSize >= current.Length);

    // Fill the replacement completely before any other thread is able to observe it.
    T[] replacement = new T[newSize];
    Array.Copy(current, 0, replacement, 0, current.Length);

    Volatile.Write(ref array, replacement);
}
/// <summary>
/// Combines a state ID and a minterm ID into a single offset by shifting the state ID left by
/// <see cref="_mintermsLog"/> bits and placing the minterm ID in the low bits.
/// </summary>
/// <param name="stateId">the ID of the source state</param>
/// <param name="mintermId">the ID of the minterm</param>
/// <returns>the combined offset</returns>
private int DeltaOffset(int stateId, int mintermId)
{
    int stateBase = stateId << _mintermsLog;
    return stateBase | mintermId;
}
/// <summary>Gets the slice of <see cref="_dfaDelta"/> that may hold the transitions out of the given state.</summary>
/// <remarks>
/// The slice starts at the state's ID shifted by <see cref="_mintermsLog"/> and has one entry per minterm, plus
/// one additional entry when the state starts with a line anchor. The lock on the matcher is expected to be held.
/// </remarks>
/// <param name="state">the state whose transitions to get</param>
private Span<int> GetDeltasFor(MatchingState<TSet> state)
{
    Debug.Assert(Monitor.IsEntered(this));

    int mintermCount = _minterms.Length;
    int length = state.StartsWithLineAnchor ? mintermCount + 1 : mintermCount;
    return _dfaDelta.AsSpan(state.Id << _mintermsLog, length);
}
/// <summary>Gets the slice of <see cref="_nfaDelta"/> that may hold the transitions out of the given state.</summary>
/// <remarks>
/// The state is first translated to its NFA state ID through <see cref="_nfaIdByCoreId"/>. The slice has one
/// entry per minterm, plus one additional entry when the state starts with a line anchor. The lock on the
/// matcher is expected to be held.
/// </remarks>
/// <param name="state">the core state whose NFA transitions to get</param>
/// <returns>the slice of transitions, or an empty span if the state has no NFA state ID</returns>
private Span<int[]?> GetNfaDeltasFor(MatchingState<TSet> state)
{
    Debug.Assert(Monitor.IsEntered(this));

    if (_nfaIdByCoreId.TryGetValue(state.Id, out int nfaStateId))
    {
        int mintermCount = _minterms.Length;
        int length = state.StartsWithLineAnchor ? mintermCount + 1 : mintermCount;
        return _nfaDelta.AsSpan(nfaStateId << _mintermsLog, length);
    }

    // The state has never been given an NFA state ID, so it has no NFA transitions.
    return default;
}
/// <summary>Unpacks the context-independent flags stored in <see cref="_stateInfo"/> for the given state.</summary>
/// <param name="stateId">the ID of the state, which must be positive</param>
/// <returns>one boolean for each member of <see cref="ContextIndependentState"/></returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(int stateId)
{
    Debug.Assert(stateId > 0);

    ContextIndependentState flags = _stateInfo[stateId];
    bool isInitial = (flags & ContextIndependentState.IsInitial) != 0;
    bool isDeadend = (flags & ContextIndependentState.IsDeadend) != 0;
    bool isNullable = (flags & ContextIndependentState.IsNullable) != 0;
    bool canBeNullable = (flags & ContextIndependentState.CanBeNullable) != 0;
    return (isInitial, isDeadend, isNullable, canBeNullable);
}
/// <summary>
/// Gets the state for the given node and previous character context, creating it if it does not exist yet.
/// The lock on the matcher is expected to be held by the caller.
/// </summary>
/// <param name="node">the pattern that this state will represent</param>
/// <param name="prevCharKind">the kind of the character that led to this state</param>
/// <returns>the existing or newly created state, never marked as an initial state by this call</returns>
private MatchingState<TSet> GetOrCreateState(SymbolicRegexNode<TSet> node, uint prevCharKind)
{
    Debug.Assert(Monitor.IsEntered(this));

    MatchingState<TSet> state = GetOrCreateState_NoLock(node, prevCharKind, isInitialState: false);
    return state;
}
/// <summary>
/// Gets the state for the given node and previous character context, creating it if it does not exist yet.
/// Unlike <see cref="GetOrCreateState(SymbolicRegexNode{TSet}, uint)"/> this does not assert that the lock on
/// the matcher is held.
/// </summary>
/// <param name="node">the pattern that this state will represent</param>
/// <param name="prevCharKind">the kind of the character that led to this state</param>
/// <param name="isInitialState">
/// whether to mark the state as an initial state; only has an effect when the state is created by this call
/// </param>
/// <returns>the existing or newly created state</returns>
private MatchingState<TSet> GetOrCreateState_NoLock(SymbolicRegexNode<TSet> node, uint prevCharKind, bool isInitialState = false)
{
    // States are cached by the node after anchor pruning, paired with the previous character kind
    (SymbolicRegexNode<TSet> Node, uint PrevCharKind) cacheKey = (node.PruneAnchors(_builder, prevCharKind), prevCharKind);
    if (_stateCache.TryGetValue(cacheKey, out MatchingState<TSet>? cached))
    {
        return cached;
    }

    MatchingState<TSet> created = new MatchingState<TSet>(cacheKey.Node, cacheKey.PrevCharKind);

    // Adding to the cache before reading the count makes 1 the first state ID
    _stateCache.Add(cacheKey, created);
    int id = _stateCache.Count;
    created.Id = id;

    Debug.Assert(_stateArray is not null);
    if (id == _stateArray.Length)
    {
        // Double the capacity of all per-state arrays, matching the growth factor of List<T>
        int doubled = _stateArray.Length * 2;
        ArrayResizeAndVolatilePublish(ref _stateArray, doubled);
        ArrayResizeAndVolatilePublish(ref _dfaDelta, doubled << _mintermsLog);
        ArrayResizeAndVolatilePublish(ref _stateInfo, doubled);
    }

    _stateArray[id] = created;
    _stateInfo[id] = BuildStateInfo(id, isInitialState, created.IsDeadend(Solver), created.Node.IsNullable, created.Node.CanBeNullable);
    return created;

    // Packs the context-independent information for a state into flags
    static ContextIndependentState BuildStateInfo(int stateId, bool isInitial, bool isDeadend, bool isNullable, bool canBeNullable)
    {
        Debug.Assert(stateId > 0);
        Debug.Assert(!isNullable || canBeNullable);

        ContextIndependentState flags = 0;
        if (isInitial)
        {
            flags |= ContextIndependentState.IsInitial;
        }

        if (isDeadend)
        {
            flags |= ContextIndependentState.IsDeadend;
        }

        // IsNullable is only ever recorded together with CanBeNullable
        if (canBeNullable)
        {
            flags |= isNullable
                ? ContextIndependentState.CanBeNullable | ContextIndependentState.IsNullable
                : ContextIndependentState.CanBeNullable;
        }

        return flags;
    }
}
/// <summary>
/// Gets or makes an NFA state for the given node and previous character kind. Every NFA state is backed by a
/// "core state", a <see cref="MatchingState{TSet}"/> obtained from
/// <see cref="GetOrCreateState(SymbolicRegexNode{TSet}, uint)"/>, which holds the pattern and previous character
/// kind and is used for creating further NFA transitions. On top of the core state ID each NFA state gets an
/// NFA mode specific ID, which is the index used with the NFA mode transition arrays (e.g. <see cref="_nfaDelta"/>).
/// </summary>
/// <remarks>
/// Numbering NFA states separately from DFA mode keeps the IDs small, which saves space both in the NFA mode
/// arrays and in the <see cref="SparseIntMap{T}"/> instances used during matching for sets of NFA states.
/// The core state ID can be looked up by the NFA ID with <see cref="GetCoreStateId(int)"/>.
/// The lock on the matcher is expected to be held by the caller.
/// </remarks>
/// <param name="node">the pattern for the NFA state, which must not be an alternation</param>
/// <param name="prevCharKind">the kind of the character that led to this state</param>
/// <returns>the NFA ID of the state, or null if the state is a dead end</returns>
private int? CreateNfaState(SymbolicRegexNode<TSet> node, uint prevCharKind)
{
    Debug.Assert(Monitor.IsEntered(this));
    Debug.Assert(node.Kind != SymbolicRegexNodeKind.Alternate);

    // The core state is what further transitions out of this NFA state are created from
    MatchingState<TSet> core = GetOrCreateState(node, prevCharKind);

    // Dead ends get no NFA state, since NFA mode represents them as empty lists of states
    if (core.IsDeadend(Solver))
    {
        return null;
    }

    int coreId = core.Id;
    if (_nfaIdByCoreId.TryGetValue(coreId, out int existingNfaId))
    {
        return existingNfaId;
    }

    // NFA state IDs are handed out sequentially from zero: the next ID is the number of existing NFA states
    int newNfaId = _nfaIdByCoreId.Count;

    // Grow all NFA arrays together once the next ID no longer fits
    if (newNfaId == _nfaCoreIdArray.Length)
    {
        // Double the capacity like List<T> does, but never go below the initial NFA capacity
        int capacity = Math.Max(_nfaCoreIdArray.Length * 2, InitialNfaStateCapacity);
        ArrayResizeAndVolatilePublish(ref _nfaCoreIdArray, capacity);
        ArrayResizeAndVolatilePublish(ref _nfaDelta, capacity << _mintermsLog);
        ArrayResizeAndVolatilePublish(ref _capturingNfaDelta, capacity << _mintermsLog);
    }

    // Record the mapping in both directions. Adding to the dictionary is what advances the next NFA state ID.
    Debug.Assert(newNfaId < _nfaCoreIdArray.Length);
    _nfaCoreIdArray[newNfaId] = coreId;
    _nfaIdByCoreId.Add(coreId, newNfaId);

    return newNfaId;
}
/// <summary>Looks up the <see cref="MatchingState{TSet}"/> stored in <see cref="_stateArray"/> for the given state ID.</summary>
/// <param name="stateId">the ID of the state, which must be positive and refer to an existing state</param>
/// <returns>the state with the given ID</returns>
private MatchingState<TSet> GetState(int stateId)
{
    Debug.Assert(stateId > 0);

    MatchingState<TSet>? found = _stateArray[stateId];
    Debug.Assert(found is not null);

    return found;
}
/// <summary>Looks up the core state ID stored in <see cref="_nfaCoreIdArray"/> for the given NFA state ID.</summary>
/// <param name="nfaStateId">the ID of the NFA state</param>
/// <returns>the ID of the core state backing the NFA state</returns>
private int GetCoreStateId(int nfaStateId)
{
    Debug.Assert(nfaStateId < _nfaCoreIdArray.Length);

    int coreId = _nfaCoreIdArray[nfaStateId];
    Debug.Assert(coreId < _stateArray.Length);

    return coreId;
}
/// <summary>Gets the target of a DFA transition, creating the transition if it does not exist yet.</summary>
/// <remarks>This function locks the matcher for safe concurrent use of the <see cref="_builder"/></remarks>
/// <param name="sourceState">the state the transition starts from</param>
/// <param name="mintermId">the ID of the minterm the transition is taken on</param>
/// <param name="offset">the offset of the transition in <see cref="_dfaDelta"/></param>
/// <param name="checkThreshold">
/// whether to refuse to create a new transition once the number of cached states has reached
/// <see cref="SymbolicRegexThresholds.NfaThreshold"/>
/// </param>
/// <param name="nextState">the target state of the transition, or null if none was created</param>
/// <returns>true if the transition exists or was created, false if the threshold prevented creating it</returns>
private bool TryCreateNewTransition(
    MatchingState<TSet> sourceState, int mintermId, int offset, bool checkThreshold, [NotNullWhen(true)] out MatchingState<TSet>? nextState)
{
    Debug.Assert(offset < _dfaDelta.Length);

    lock (this)
    {
        // Another thread may have defined delta[offset] before the lock was acquired
        MatchingState<TSet>? existing = _stateArray[_dfaDelta[offset]];
        if (existing is not null)
        {
            nextState = existing;
            return true;
        }

        if (checkThreshold && _stateCache.Count >= SymbolicRegexThresholds.NfaThreshold)
        {
            nextState = null;
            return false;
        }

        TSet minterm = GetMintermFromId(mintermId);
        uint nextCharKind = GetPositionKind(mintermId);
        MatchingState<TSet> created = GetOrCreateState(sourceState.Next(_builder, minterm, nextCharKind), nextCharKind);

        // Publish the transition only after the target state has been fully set up
        Volatile.Write(ref _dfaDelta[offset], created.Id);

        nextState = created;
        return true;
    }
}
/// <summary>Gets the targets of an NFA transition, creating the transition if it does not exist yet.</summary>
/// <remarks>This function locks the matcher for safe concurrent use of the <see cref="_builder"/></remarks>
/// <param name="nfaStateId">the NFA ID of the state the transition starts from</param>
/// <param name="mintermId">the ID of the minterm the transition is taken on</param>
/// <param name="nfaOffset">the offset of the transition in <see cref="_nfaDelta"/></param>
/// <returns>the NFA IDs of the target states, which is empty if there are none</returns>
private int[] CreateNewNfaTransition(int nfaStateId, int mintermId, int nfaOffset)
{
    Debug.Assert(nfaOffset < _nfaDelta.Length);

    lock (this)
    {
        // Another thread may have defined the entry at nfaOffset before the lock was acquired
        int[]? existing = _nfaDelta[nfaOffset];
        if (existing is not null)
        {
            return existing;
        }

        // The transition is derived from the core state backing the NFA state. If DFA mode has already
        // computed the corresponding transition then its target node is reused.
        int coreId = GetCoreStateId(nfaStateId);
        int coreTargetId = _dfaDelta[DeltaOffset(coreId, mintermId)];
        MatchingState<TSet> coreState = GetState(coreId);
        TSet minterm = GetMintermFromId(mintermId);
        uint nextCharKind = GetPositionKind(mintermId);

        SymbolicRegexNode<TSet> targetNode;
        if (coreTargetId > 0)
        {
            targetNode = GetState(coreTargetId).Node;
        }
        else
        {
            targetNode = coreState.Next(_builder, minterm, nextCharKind);
        }

        // Collect one NFA state per alternation branch of the target node
        List<int> collected = new();
        ForEachNfaState(targetNode, nextCharKind, collected, static (int nfaId, List<int> sink) => sink.Add(nfaId));

        int[] targets = collected.ToArray();
        Volatile.Write(ref _nfaDelta[nfaOffset], targets);
        return targets;
    }
}
/// <summary>Gets the targets of a capturing NFA transition, creating the transition if it does not exist yet.</summary>
/// <remarks>This function locks the matcher for safe concurrent use of the <see cref="_builder"/></remarks>
/// <param name="nfaStateId">the NFA ID of the state the transition starts from</param>
/// <param name="mintermId">the ID of the minterm the transition is taken on</param>
/// <param name="offset">the offset of the transition in <see cref="_capturingNfaDelta"/></param>
/// <returns>pairs of target NFA state ID and the effects to apply when taking the transition</returns>
private (int, DerivativeEffect[])[] CreateNewCapturingTransition(int nfaStateId, int mintermId, int offset)
{
    lock (this)
    {
        // The caller should have found the entry to be null (not yet created), but another thread could have
        // created it before the lock was acquired.
        (int, DerivativeEffect[])[]? existing = _capturingNfaDelta[offset];
        if (existing is not null)
        {
            return existing;
        }

        MatchingState<TSet> coreState = GetState(GetCoreStateId(nfaStateId));
        TSet minterm = GetMintermFromId(mintermId);
        uint nextCharKind = GetPositionKind(mintermId);
        List<(SymbolicRegexNode<TSet> Node, DerivativeEffect[] Effects)> transitions =
            coreState.NfaNextWithEffects(_builder, minterm, nextCharKind);

        // Pair every NFA state made from a target node with the effects of the transition leading to that node
        List<(int, DerivativeEffect[])> collected = new();
        foreach ((SymbolicRegexNode<TSet> targetNode, DerivativeEffect[] effects) in transitions)
        {
            ForEachNfaState(
                targetNode,
                nextCharKind,
                (collected, effects),
                static (int nfaId, (List<(int, DerivativeEffect[])> Targets, DerivativeEffect[] Effects) args) =>
                    args.Targets.Add((nfaId, args.Effects)));
        }

        (int, DerivativeEffect[])[] targets = collected.ToArray();
        Volatile.Write(ref _capturingNfaDelta[offset], targets);
        return targets;
    }
}
/// <summary>
/// Walks the alternation branches of the node, as given by
/// <see cref="SymbolicRegexNode{TSet}.EnumerateAlternationBranches(SymbolicRegexBuilder{TSet})"/>, and tries to
/// create an NFA state for each one. The supplied action is invoked for every NFA state obtained. Dead ends are
/// never passed to the action, as <see cref="CreateNfaState(SymbolicRegexNode{TSet}, uint)"/> returns null for them.
/// </summary>
/// <remarks>This function locks the matcher for safe concurrent use of the <see cref="_builder"/></remarks>
/// <typeparam name="T">the type of the additional argument passed through to the action</typeparam>
/// <param name="node">the node to break up into NFA states</param>
/// <param name="prevCharKind">the previous character kind for each created NFA state</param>
/// <param name="arg">an additional argument passed through to each call to the action</param>
/// <param name="action">action to call for each NFA state</param>
private void ForEachNfaState<T>(SymbolicRegexNode<TSet> node, uint prevCharKind, T arg, Action<int, T> action)
{
    lock (this)
    {
        foreach (SymbolicRegexNode<TSet> branch in node.EnumerateAlternationBranches(_builder))
        {
            int? nfaId = CreateNfaState(branch, prevCharKind);
            if (nfaId.HasValue)
            {
                action(nfaId.Value, arg);
            }
        }
    }
}
}
}
......@@ -16,140 +16,140 @@ internal sealed partial class SymbolicRegexMatcher<TSet>
[ExcludeFromCodeCoverage(Justification = "Currently only used for testing")]
public override void SaveDGML(TextWriter writer, int maxLabelLength)
{
if (maxLabelLength < 0)
maxLabelLength = int.MaxValue;
Dictionary<(int Source, int Target), (TSet Rule, List<int> NfaTargets)> transitions = GatherTransitions(_builder);
writer.WriteLine("<?xml version=\"1.0\" encoding=\"utf-8\"?>");
writer.WriteLine("<DirectedGraph xmlns=\"http://schemas.microsoft.com/vs/2009/dgml\" ZoomLevel=\"1.5\" GraphDirection=\"TopToBottom\" >");
writer.WriteLine(" <Nodes>");
writer.WriteLine(" <Node Id=\"dfa\" Label=\" \" Group=\"Collapsed\" Category=\"DFA\" DFAInfo=\"{0}\" />", FormatInfo(_builder, transitions.Count));
writer.WriteLine(" <Node Id=\"dfainfo\" Category=\"DFAInfo\" Label=\"{0}\"/>", FormatInfo(_builder, transitions.Count));
foreach (DfaMatchingState<TSet> state in _builder._stateCache)
lock (this)
{
string info = CharKind.DescribePrev(state.PrevCharKind);
string deriv = WebUtility.HtmlEncode(state.Node.ToString());
string nodeDgmlView = $"{(info == string.Empty ? info : $"Previous: {info}&#13;")}{(deriv == string.Empty ? "()" : deriv)}";
if (maxLabelLength < 0)
maxLabelLength = int.MaxValue;
Dictionary<(int Source, int Target), (TSet Rule, List<int> NfaTargets)> transitions = GatherTransitions(this);
writer.WriteLine(" <Node Id=\"{0}\" Label=\"{0}\" Category=\"State\" Group=\"Collapsed\" StateInfo=\"{1}\">", state.Id, nodeDgmlView);
if (_builder.GetStateInfo(state.Id).IsInitial)
writer.WriteLine("<?xml version=\"1.0\" encoding=\"utf-8\"?>");
writer.WriteLine("<DirectedGraph xmlns=\"http://schemas.microsoft.com/vs/2009/dgml\" ZoomLevel=\"1.5\" GraphDirection=\"TopToBottom\" >");
writer.WriteLine(" <Nodes>");
writer.WriteLine(" <Node Id=\"dfa\" Label=\" \" Group=\"Collapsed\" Category=\"DFA\" DFAInfo=\"{0}\" />", FormatInfo(this, transitions.Count));
writer.WriteLine(" <Node Id=\"dfainfo\" Category=\"DFAInfo\" Label=\"{0}\"/>", FormatInfo(this, transitions.Count));
foreach (MatchingState<TSet> state in _stateCache.Values)
{
writer.WriteLine(" <Category Ref=\"InitialState\" />");
string info = CharKind.DescribePrev(state.PrevCharKind);
string deriv = WebUtility.HtmlEncode(state.Node.ToString());
string nodeDgmlView = $"{(info == string.Empty ? info : $"Previous: {info}&#13;")}{(deriv == string.Empty ? "()" : deriv)}";
writer.WriteLine(" <Node Id=\"{0}\" Label=\"{0}\" Category=\"State\" Group=\"Collapsed\" StateInfo=\"{1}\">", state.Id, nodeDgmlView);
if (GetStateInfo(state.Id).IsInitial)
{
writer.WriteLine(" <Category Ref=\"InitialState\" />");
}
if (state.Node.CanBeNullable)
{
writer.WriteLine(" <Category Ref=\"FinalState\" />");
}
writer.WriteLine(" </Node>");
writer.WriteLine(" <Node Id=\"{0}info\" Label=\"{1}\" Category=\"StateInfo\"/>", state.Id, nodeDgmlView);
}
if (state.Node.CanBeNullable)
writer.WriteLine(" </Nodes>");
writer.WriteLine(" <Links>");
foreach (MatchingState<TSet> initialState in GetInitialStates(this))
{
writer.WriteLine(" <Category Ref=\"FinalState\" />");
writer.WriteLine(" <Link Source=\"dfa\" Target=\"{0}\" Label=\"\" Category=\"StartTransition\" />", initialState.Id);
}
writer.WriteLine(" </Node>");
writer.WriteLine(" <Node Id=\"{0}info\" Label=\"{1}\" Category=\"StateInfo\"/>", state.Id, nodeDgmlView);
}
writer.WriteLine(" </Nodes>");
writer.WriteLine(" <Links>");
foreach (DfaMatchingState<TSet> initialState in GetInitialStates(this))
{
Debug.Assert(_builder._stateCache.Contains(initialState));
writer.WriteLine(" <Link Source=\"dfa\" Target=\"{0}\" Label=\"\" Category=\"StartTransition\" />", initialState.Id);
}
writer.WriteLine(" <Link Source=\"dfa\" Target=\"dfainfo\" Label=\"\" Category=\"Contains\" />");
writer.WriteLine(" <Link Source=\"dfa\" Target=\"dfainfo\" Label=\"\" Category=\"Contains\" />");
foreach (KeyValuePair<(int Source, int Target), (TSet Rule, List<int> NfaTargets)> transition in transitions)
{
string label = DescribeLabel(transition.Value.Rule, _builder);
string info = "";
if (label.Length > maxLabelLength)
foreach (KeyValuePair<(int Source, int Target), (TSet Rule, List<int> NfaTargets)> transition in transitions)
{
info = $"FullLabel = \"{label}\" ";
label = string.Concat(label.AsSpan(0, maxLabelLength), "..");
string label = DescribeLabel(transition.Value.Rule, _builder);
string info = "";
if (label.Length > maxLabelLength)
{
info = $"FullLabel = \"{label}\" ";
label = string.Concat(label.AsSpan(0, maxLabelLength), "..");
}
writer.WriteLine($" <Link Source=\"{transition.Key.Source}\" Target=\"{transition.Key.Target}\" Label=\"{label}\" Category=\"NonEpsilonTransition\" {info}/>");
// Render NFA transitions as labelless "epsilon" transitions (i.e. ones that don't consume a character)
// from the target of the DFA transition.
foreach (int nfaTarget in transition.Value.NfaTargets)
{
writer.WriteLine($" <Link Source=\"{transition.Key.Target}\" Target=\"{nfaTarget}\" Category=\"EpsilonTransition\"/>");
}
}
writer.WriteLine($" <Link Source=\"{transition.Key.Source}\" Target=\"{transition.Key.Target}\" Label=\"{label}\" Category=\"NonEpsilonTransition\" {info}/>");
// Render NFA transitions as labelless "epsilon" transitions (i.e. ones that don't consume a character)
// from the target of the DFA transition.
foreach (int nfaTarget in transition.Value.NfaTargets)
foreach (MatchingState<TSet> state in _stateCache.Values)
{
writer.WriteLine($" <Link Source=\"{transition.Key.Target}\" Target=\"{nfaTarget}\" Category=\"EpsilonTransition\"/>");
writer.WriteLine(" <Link Source=\"{0}\" Target=\"{0}info\" Category=\"Contains\" />", state.Id);
}
}
foreach (DfaMatchingState<TSet> state in _builder._stateCache)
{
writer.WriteLine(" <Link Source=\"{0}\" Target=\"{0}info\" Category=\"Contains\" />", state.Id);
writer.WriteLine(" </Links>");
writer.WriteLine(" <Categories>");
writer.WriteLine(" <Category Id=\"DFA\" Label=\"DFA\" IsTag=\"True\" />");
writer.WriteLine(" <Category Id=\"EpsilonTransition\" Label=\"Epsilon transition\" IsTag=\"True\" />");
writer.WriteLine(" <Category Id=\"StartTransition\" Label=\"Initial transition\" IsTag=\"True\" />");
writer.WriteLine(" <Category Id=\"FinalLabel\" Label=\"Final transition\" IsTag=\"True\" />");
writer.WriteLine(" <Category Id=\"FinalState\" Label=\"Final\" IsTag=\"True\" />");
writer.WriteLine(" <Category Id=\"SinkState\" Label=\"Sink state\" IsTag=\"True\" />");
writer.WriteLine(" <Category Id=\"EpsilonState\" Label=\"Epsilon state\" IsTag=\"True\" />");
writer.WriteLine(" <Category Id=\"InitialState\" Label=\"Initial\" IsTag=\"True\" />");
writer.WriteLine(" <Category Id=\"NonEpsilonTransition\" Label=\"Nonepsilon transition\" IsTag=\"True\" />");
writer.WriteLine(" <Category Id=\"State\" Label=\"State\" IsTag=\"True\" />");
writer.WriteLine(" </Categories>");
writer.WriteLine(" <Styles>");
writer.WriteLine(" <Style TargetType=\"Node\" GroupLabel=\"InitialState\" ValueLabel=\"True\">");
writer.WriteLine(" <Condition Expression=\"HasCategory('InitialState')\" />");
writer.WriteLine(" <Setter Property=\"Background\" Value=\"lightblue\" />");
writer.WriteLine(" <Setter Property=\"MinWidth\" Value=\"0\" />");
writer.WriteLine(" </Style>");
writer.WriteLine(" <Style TargetType=\"Node\" GroupLabel=\"FinalState\" ValueLabel=\"True\">");
writer.WriteLine(" <Condition Expression=\"HasCategory('FinalState')\" />");
writer.WriteLine(" <Setter Property=\"Background\" Value=\"lightgreen\" />");
writer.WriteLine(" <Setter Property=\"StrokeThickness\" Value=\"4\" />");
writer.WriteLine(" </Style>");
writer.WriteLine(" <Style TargetType=\"Node\" GroupLabel=\"State\" ValueLabel=\"True\">");
writer.WriteLine(" <Condition Expression=\"HasCategory('State')\" />");
writer.WriteLine(" <Setter Property=\"Stroke\" Value=\"black\" />");
writer.WriteLine(" <Setter Property=\"Background\" Value=\"white\" />");
writer.WriteLine(" <Setter Property=\"MinWidth\" Value=\"0\" />");
writer.WriteLine(" <Setter Property=\"FontSize\" Value=\"12\" />");
writer.WriteLine(" <Setter Property=\"FontFamily\" Value=\"Arial\" />");
writer.WriteLine(" </Style>");
writer.WriteLine(" <Style TargetType=\"Link\" GroupLabel=\"NonEpsilonTransition\" ValueLabel=\"True\">");
writer.WriteLine(" <Condition Expression=\"HasCategory('NonEpsilonTransition')\" />");
writer.WriteLine(" <Setter Property=\"FontSize\" Value=\"18\" />");
writer.WriteLine(" <Setter Property=\"FontFamily\" Value=\"Arial\" />");
writer.WriteLine(" </Style>");
writer.WriteLine(" <Style TargetType=\"Link\" GroupLabel=\"StartTransition\" ValueLabel=\"True\">");
writer.WriteLine(" <Condition Expression=\"HasCategory('StartTransition')\" />");
writer.WriteLine(" </Style>");
writer.WriteLine(" <Style TargetType=\"Link\" GroupLabel=\"EpsilonTransition\" ValueLabel=\"True\">");
writer.WriteLine(" <Condition Expression=\"HasCategory('EpsilonTransition')\" />");
writer.WriteLine(" <Setter Property=\"StrokeDashArray\" Value=\"8 8\" />");
writer.WriteLine(" </Style>");
writer.WriteLine(" <Style TargetType=\"Link\" GroupLabel=\"FinalLabel\" ValueLabel=\"False\">");
writer.WriteLine(" <Condition Expression=\"HasCategory('FinalLabel')\" />");
writer.WriteLine(" <Setter Property=\"StrokeDashArray\" Value=\"8 8\" />");
writer.WriteLine(" </Style>");
writer.WriteLine(" <Style TargetType=\"Node\" GroupLabel=\"StateInfo\" ValueLabel=\"True\">");
writer.WriteLine(" <Setter Property=\"Stroke\" Value=\"white\" />");
writer.WriteLine(" <Setter Property=\"FontSize\" Value=\"18\" />");
writer.WriteLine(" <Setter Property=\"FontFamily\" Value=\"Arial\" />");
writer.WriteLine(" </Style>");
writer.WriteLine(" <Style TargetType=\"Node\" GroupLabel=\"DFAInfo\" ValueLabel=\"True\">");
writer.WriteLine(" <Setter Property=\"Stroke\" Value=\"white\" />");
writer.WriteLine(" <Setter Property=\"FontSize\" Value=\"18\" />");
writer.WriteLine(" <Setter Property=\"FontFamily\" Value=\"Arial\" />");
writer.WriteLine(" </Style>");
writer.WriteLine(" </Styles>");
writer.WriteLine("</DirectedGraph>");
}
writer.WriteLine(" </Links>");
writer.WriteLine(" <Categories>");
writer.WriteLine(" <Category Id=\"DFA\" Label=\"DFA\" IsTag=\"True\" />");
writer.WriteLine(" <Category Id=\"EpsilonTransition\" Label=\"Epsilon transition\" IsTag=\"True\" />");
writer.WriteLine(" <Category Id=\"StartTransition\" Label=\"Initial transition\" IsTag=\"True\" />");
writer.WriteLine(" <Category Id=\"FinalLabel\" Label=\"Final transition\" IsTag=\"True\" />");
writer.WriteLine(" <Category Id=\"FinalState\" Label=\"Final\" IsTag=\"True\" />");
writer.WriteLine(" <Category Id=\"SinkState\" Label=\"Sink state\" IsTag=\"True\" />");
writer.WriteLine(" <Category Id=\"EpsilonState\" Label=\"Epsilon state\" IsTag=\"True\" />");
writer.WriteLine(" <Category Id=\"InitialState\" Label=\"Initial\" IsTag=\"True\" />");
writer.WriteLine(" <Category Id=\"NonEpsilonTransition\" Label=\"Nonepsilon transition\" IsTag=\"True\" />");
writer.WriteLine(" <Category Id=\"State\" Label=\"State\" IsTag=\"True\" />");
writer.WriteLine(" </Categories>");
writer.WriteLine(" <Styles>");
writer.WriteLine(" <Style TargetType=\"Node\" GroupLabel=\"InitialState\" ValueLabel=\"True\">");
writer.WriteLine(" <Condition Expression=\"HasCategory('InitialState')\" />");
writer.WriteLine(" <Setter Property=\"Background\" Value=\"lightblue\" />");
writer.WriteLine(" <Setter Property=\"MinWidth\" Value=\"0\" />");
writer.WriteLine(" </Style>");
writer.WriteLine(" <Style TargetType=\"Node\" GroupLabel=\"FinalState\" ValueLabel=\"True\">");
writer.WriteLine(" <Condition Expression=\"HasCategory('FinalState')\" />");
writer.WriteLine(" <Setter Property=\"Background\" Value=\"lightgreen\" />");
writer.WriteLine(" <Setter Property=\"StrokeThickness\" Value=\"4\" />");
writer.WriteLine(" </Style>");
writer.WriteLine(" <Style TargetType=\"Node\" GroupLabel=\"State\" ValueLabel=\"True\">");
writer.WriteLine(" <Condition Expression=\"HasCategory('State')\" />");
writer.WriteLine(" <Setter Property=\"Stroke\" Value=\"black\" />");
writer.WriteLine(" <Setter Property=\"Background\" Value=\"white\" />");
writer.WriteLine(" <Setter Property=\"MinWidth\" Value=\"0\" />");
writer.WriteLine(" <Setter Property=\"FontSize\" Value=\"12\" />");
writer.WriteLine(" <Setter Property=\"FontFamily\" Value=\"Arial\" />");
writer.WriteLine(" </Style>");
writer.WriteLine(" <Style TargetType=\"Link\" GroupLabel=\"NonEpsilonTransition\" ValueLabel=\"True\">");
writer.WriteLine(" <Condition Expression=\"HasCategory('NonEpsilonTransition')\" />");
writer.WriteLine(" <Setter Property=\"FontSize\" Value=\"18\" />");
writer.WriteLine(" <Setter Property=\"FontFamily\" Value=\"Arial\" />");
writer.WriteLine(" </Style>");
writer.WriteLine(" <Style TargetType=\"Link\" GroupLabel=\"StartTransition\" ValueLabel=\"True\">");
writer.WriteLine(" <Condition Expression=\"HasCategory('StartTransition')\" />");
writer.WriteLine(" </Style>");
writer.WriteLine(" <Style TargetType=\"Link\" GroupLabel=\"EpsilonTransition\" ValueLabel=\"True\">");
writer.WriteLine(" <Condition Expression=\"HasCategory('EpsilonTransition')\" />");
writer.WriteLine(" <Setter Property=\"StrokeDashArray\" Value=\"8 8\" />");
writer.WriteLine(" </Style>");
writer.WriteLine(" <Style TargetType=\"Link\" GroupLabel=\"FinalLabel\" ValueLabel=\"False\">");
writer.WriteLine(" <Condition Expression=\"HasCategory('FinalLabel')\" />");
writer.WriteLine(" <Setter Property=\"StrokeDashArray\" Value=\"8 8\" />");
writer.WriteLine(" </Style>");
writer.WriteLine(" <Style TargetType=\"Node\" GroupLabel=\"StateInfo\" ValueLabel=\"True\">");
writer.WriteLine(" <Setter Property=\"Stroke\" Value=\"white\" />");
writer.WriteLine(" <Setter Property=\"FontSize\" Value=\"18\" />");
writer.WriteLine(" <Setter Property=\"FontFamily\" Value=\"Arial\" />");
writer.WriteLine(" </Style>");
writer.WriteLine(" <Style TargetType=\"Node\" GroupLabel=\"DFAInfo\" ValueLabel=\"True\">");
writer.WriteLine(" <Setter Property=\"Stroke\" Value=\"white\" />");
writer.WriteLine(" <Setter Property=\"FontSize\" Value=\"18\" />");
writer.WriteLine(" <Setter Property=\"FontFamily\" Value=\"Arial\" />");
writer.WriteLine(" </Style>");
writer.WriteLine(" </Styles>");
writer.WriteLine("</DirectedGraph>");
// This function gathers all transitions in the given builder and groups them by (source,destination) state ID
static Dictionary<(int Source, int Target), (TSet Rule, List<int> NfaTargets)> GatherTransitions(SymbolicRegexBuilder<TSet> builder)
static Dictionary<(int Source, int Target), (TSet Rule, List<int> NfaTargets)> GatherTransitions(SymbolicRegexMatcher<TSet> matcher)
{
Debug.Assert(builder._delta is not null);
Debug.Assert(builder._minterms is not null);
Dictionary<(int Source, int Target), (TSet Rule, List<int> NfaTargets)> result = new();
foreach (DfaMatchingState<TSet> source in builder._stateCache)
foreach (MatchingState<TSet> source in matcher._stateCache.Values)
{
// Get the span of entries in delta that gives the transitions for the different minterms
Span<int> deltas = builder.GetDeltasFor(source);
Span<int[]?> nfaDeltas = builder.GetNfaDeltasFor(source);
Debug.Assert(deltas.Length == builder._minterms.Length);
Span<int> deltas = matcher.GetDeltasFor(source);
Span<int[]?> nfaDeltas = matcher.GetNfaDeltasFor(source);
Debug.Assert(deltas.Length == matcher._minterms.Length);
for (int i = 0; i < deltas.Length; ++i)
{
// negative entries are transitions not explored yet, so skip them
......@@ -160,7 +160,7 @@ static Dictionary<(int Source, int Target), (TSet Rule, List<int> NfaTargets)> G
(int Source, int Target) key = (source.Id, targetId);
if (!result.TryGetValue(key, out (TSet Rule, List<int> NfaTargets) entry))
{
entry = (builder._solver.Empty, new List<int>());
entry = (matcher.Solver.Empty, new List<int>());
}
// If this state has an NFA transition for the same minterm, then associate
// those with the transition.
......@@ -168,24 +168,24 @@ static Dictionary<(int Source, int Target), (TSet Rule, List<int> NfaTargets)> G
{
foreach (int nfaTarget in nfaTargets)
{
entry.NfaTargets.Add(builder._nfaStateArray[nfaTarget]);
entry.NfaTargets.Add(matcher._nfaCoreIdArray[nfaTarget]);
}
}
// Expand the rule for this minterm
result[key] = (builder._solver.Or(entry.Rule, builder._minterms[i]), entry.NfaTargets);
result[key] = (matcher.Solver.Or(entry.Rule, matcher._minterms[i]), entry.NfaTargets);
}
}
}
return result;
}
// Builds a summary string listing the number of states in the matcher's state cache, the given
// transition count, and the solver's minterms rendered through DescribeLabels.
// The "&#13;" sequences are appended verbatim after the first two entries.
static string FormatInfo(SymbolicRegexMatcher<TSet> matcher, int transitionCount)
{
    // Ask the solver for its minterms once and reuse the array for both the count and the labels.
    TSet[] minterms = matcher.Solver.GetMinterms()!;

    StringBuilder sb = new();
    sb.Append($"States = {matcher._stateCache.Count}&#13;");
    sb.Append($"Transitions = {transitionCount}&#13;");
    sb.Append($"Min Terms ({minterms.Length}) = ").AppendJoin(',',
        DescribeLabels(minterms, matcher._builder));
    return sb.ToString();
}
......@@ -200,13 +200,13 @@ static IEnumerable<string> DescribeLabels(IEnumerable<TSet> labels, SymbolicRege
// Pretty-prints the set using the builder's solver and character-set solver, then HTML-encodes
// the resulting text before returning it.
static string DescribeLabel(TSet label, SymbolicRegexBuilder<TSet> builder)
{
    string prettyPrinted = builder._solver.PrettyPrint(label, builder._charSetSolver);
    return WebUtility.HtmlEncode(prettyPrinted);
}
// Lazily enumerates the matcher's initial states in a fixed order:
// the dot-starred initial states, then the original ones, then the reverse ones.
static IEnumerable<MatchingState<TSet>> GetInitialStates(SymbolicRegexMatcher<TSet> matcher)
{
    foreach (MatchingState<TSet> state in matcher._dotstarredInitialStates)
    {
        yield return state;
    }

    foreach (MatchingState<TSet> state in matcher._initialStates)
    {
        yield return state;
    }

    foreach (MatchingState<TSet> state in matcher._reverseInitialStates)
    {
        yield return state;
    }
}
}
......
......@@ -16,89 +16,91 @@ internal sealed partial class SymbolicRegexMatcher<TSet>
[ExcludeFromCodeCoverage(Justification = "Currently only used for testing")]
public override void Explore(bool includeDotStarred, bool includeReverse, bool includeOriginal, bool exploreDfa, bool exploreNfa)
{
Debug.Assert(_builder._minterms is not null);
// Track seen states to avoid exploring twice
HashSet<DfaMatchingState<TSet>> seen = new();
// Use a queue for unexplored states
// This results in a breadth-first exploration
Queue<DfaMatchingState<TSet>> toExplore = new();
lock (this)
{
// Track seen states to avoid exploring twice
HashSet<MatchingState<TSet>> seen = new();
// Use a queue for unexplored states
// This results in a breadth-first exploration
Queue<MatchingState<TSet>> toExplore = new();
// Explore all initial states as requested
if (includeDotStarred)
EnqueueAll(_dotstarredInitialStates, seen, toExplore);
if (includeReverse)
EnqueueAll(_reverseInitialStates, seen, toExplore);
if (includeOriginal)
EnqueueAll(_initialStates, seen, toExplore);
// Explore all initial states as requested
if (includeDotStarred)
EnqueueAll(_dotstarredInitialStates, seen, toExplore);
if (includeReverse)
EnqueueAll(_reverseInitialStates, seen, toExplore);
if (includeOriginal)
EnqueueAll(_initialStates, seen, toExplore);
if (exploreDfa)
{
while (toExplore.Count > 0)
if (exploreDfa)
{
// Don't dequeue yet, because a transition might fail
DfaMatchingState<TSet> state = toExplore.Peek();
// Include the special minterm for the last end-of-line if the state is sensitive to it
int maxMinterm = state.StartsWithLineAnchor ? _builder._minterms.Length : _builder._minterms.Length - 1;
// Explore successor states for each minterm
for (int mintermId = 0; mintermId <= maxMinterm; ++mintermId)
while (toExplore.Count > 0)
{
int offset = (state.Id << _builder._mintermsLog) | mintermId;
if (!_builder.TryCreateNewTransition(state, mintermId, offset, true, out DfaMatchingState<TSet>? nextState))
goto DfaLimitReached;
EnqueueIfUnseen(nextState, seen, toExplore);
// Don't dequeue yet, because a transition might fail
MatchingState<TSet> state = toExplore.Peek();
// Include the special minterm for the last end-of-line if the state is sensitive to it
int maxMinterm = state.StartsWithLineAnchor ? _minterms!.Length : _minterms!.Length - 1;
// Explore successor states for each minterm
for (int mintermId = 0; mintermId <= maxMinterm; ++mintermId)
{
int offset = DeltaOffset(state.Id, mintermId);
if (!TryCreateNewTransition(state, mintermId, offset, true, out MatchingState<TSet>? nextState))
goto DfaLimitReached;
EnqueueIfUnseen(nextState, seen, toExplore);
}
// Safe to dequeue now that the state has been completely handled
toExplore.Dequeue();
}
// Safe to dequeue now that the state has been completely handled
toExplore.Dequeue();
}
}
DfaLimitReached:
if (exploreNfa && toExplore.Count > 0)
{
// DFA states are broken up into NFA states when they are alternations
DfaMatchingState<TSet>[] toBreakUp = toExplore.ToArray();
toExplore.Clear();
foreach (DfaMatchingState<TSet> dfaState in toBreakUp)
DfaLimitReached:
if (exploreNfa && toExplore.Count > 0)
{
// Remove state from seen so that it can be added back in if necessary
seen.Remove(dfaState);
// Enqueue all elements of a top level alternation or the state itself
foreach (var element in dfaState.Node.EnumerateAlternationBranches())
// DFA states are broken up into NFA states when they are alternations
MatchingState<TSet>[] toBreakUp = toExplore.ToArray();
toExplore.Clear();
foreach (MatchingState<TSet> dfaState in toBreakUp)
{
int nfaState = _builder.CreateNfaState(element, dfaState.PrevCharKind);
EnqueueIfUnseen(_builder.GetCoreState(nfaState), seen, toExplore);
// Remove state from seen so that it can be added back in if necessary
seen.Remove(dfaState);
// Enqueue all elements of a top level alternation or the state itself
ForEachNfaState(dfaState.Node, dfaState.PrevCharKind, (this, seen, toExplore),
static (int nfaId, (SymbolicRegexMatcher<TSet> Matcher, HashSet<MatchingState<TSet>> Seen, Queue<MatchingState<TSet>> ToExplore) args) =>
{
MatchingState<TSet>? coreState = args.Matcher.GetState(args.Matcher.GetCoreStateId(nfaId));
EnqueueIfUnseen(coreState, args.Seen, args.ToExplore);
});
}
}
while (toExplore.Count > 0)
{
// NFA transitions can't fail, so it's safe to dequeue here
DfaMatchingState<TSet> state = toExplore.Dequeue();
// Include the special minterm for the last end-of-line if the state is sensitive to it
int maxMinterm = state.StartsWithLineAnchor ? _builder._minterms.Length : _builder._minterms.Length - 1;
// Explore successor states for each minterm
for (int mintermId = 0; mintermId <= maxMinterm; ++mintermId)
while (toExplore.Count > 0)
{
int nfaOffset = (_builder._nfaStateArrayInverse[state.Id] << _builder._mintermsLog) | mintermId;
int[] nextNfaStates = _builder.CreateNewNfaTransition(_builder._nfaStateArrayInverse[state.Id], mintermId, nfaOffset);
foreach (int nextNfaState in nextNfaStates)
// NFA transitions can't fail, so it's safe to dequeue here
MatchingState<TSet> state = toExplore.Dequeue();
// Include the special minterm for the last end-of-line if the state is sensitive to it
int maxMinterm = state.StartsWithLineAnchor ? _minterms.Length : _minterms.Length - 1;
// Explore successor states for each minterm
for (int mintermId = 0; mintermId <= maxMinterm; ++mintermId)
{
EnqueueIfUnseen(_builder.GetCoreState(nextNfaState), seen, toExplore);
int nfaOffset = DeltaOffset(_nfaIdByCoreId[state.Id], mintermId);
int[] nextNfaStates = CreateNewNfaTransition(_nfaIdByCoreId[state.Id], mintermId, nfaOffset);
foreach (int nextNfaState in nextNfaStates)
{
EnqueueIfUnseen(GetState(GetCoreStateId(nextNfaState)), seen, toExplore);
}
}
}
}
}
// Passes each state in the array, in order, to EnqueueIfUnseen together with the shared
// "seen" set and exploration queue.
static void EnqueueAll(MatchingState<TSet>[] states, HashSet<MatchingState<TSet>> seen, Queue<MatchingState<TSet>> toExplore)
{
    foreach (MatchingState<TSet> state in states)
    {
        EnqueueIfUnseen(state, seen, toExplore);
    }
}
static void EnqueueIfUnseen(DfaMatchingState<TSet> state, HashSet<DfaMatchingState<TSet>> seen, Queue<DfaMatchingState<TSet>> queue)
static void EnqueueIfUnseen(MatchingState<TSet> state, HashSet<MatchingState<TSet>> seen, Queue<MatchingState<TSet>> queue)
{
if (seen.Add(state))
{
......
......@@ -30,133 +30,134 @@ internal sealed partial class SymbolicRegexMatcher<TSet>
[ExcludeFromCodeCoverage(Justification = "Currently only used for testing")]
public override IEnumerable<string> SampleMatches(int k, int randomseed)
{
// Zero is treated as no seed, instead using a system provided one
Random random = randomseed != 0 ? new Random(randomseed) : new Random();
ISolver<TSet> solver = _builder._solver;
CharSetSolver charSetSolver = _builder._charSetSolver;
lock (this)
{
// Zero is treated as no seed, instead using a system provided one
Random random = randomseed != 0 ? new Random(randomseed) : new Random();
CharSetSolver charSetSolver = _builder._charSetSolver;
// Create helper BDDs for handling anchors and preferentially generating ASCII inputs
BDD asciiWordCharacters = charSetSolver.Or(new BDD[] {
// Create helper BDDs for handling anchors and preferentially generating ASCII inputs
BDD asciiWordCharacters = charSetSolver.Or(new BDD[] {
charSetSolver.CreateBDDFromRange('A', 'Z'),
charSetSolver.CreateBDDFromRange('a', 'z'),
charSetSolver.CreateBDDFromChar('_'),
charSetSolver.CreateBDDFromRange('0', '9')});
// Visible ASCII range for input character generation
BDD ascii = charSetSolver.CreateBDDFromRange('\x20', '\x7E');
BDD asciiNonWordCharacters = charSetSolver.And(ascii, charSetSolver.Not(asciiWordCharacters));
// Set up two sets of minterms, one with the additional special minterm for the last end-of-line
Debug.Assert(_builder._minterms is not null);
int[] mintermIdsWithoutZ = new int[_builder._minterms.Length];
int[] mintermIdsWithZ = new int[_builder._minterms.Length + 1];
for (int i = 0; i < _builder._minterms.Length; ++i)
{
mintermIdsWithoutZ[i] = i;
mintermIdsWithZ[i] = i;
}
mintermIdsWithZ[_builder._minterms.Length] = _builder._minterms.Length;
for (int i = 0; i < k; i++)
{
// Holds the generated input so far
StringBuilder inputSoFar = new();
StringBuilder? latestCandidate = null;
// Visible ASCII range for input character generation
BDD ascii = charSetSolver.CreateBDDFromRange('\x20', '\x7E');
BDD asciiNonWordCharacters = charSetSolver.And(ascii, charSetSolver.Not(asciiWordCharacters));
// Set up two sets of minterms, one with the additional special minterm for the last end-of-line
Debug.Assert(_minterms is not null);
int[] mintermIdsWithoutZ = new int[_minterms.Length];
int[] mintermIdsWithZ = new int[_minterms.Length + 1];
for (int i = 0; i < _minterms.Length; ++i)
{
mintermIdsWithoutZ[i] = i;
mintermIdsWithZ[i] = i;
}
mintermIdsWithZ[_minterms.Length] = _minterms.Length;
// Current set of states reached initially contains just the root
NfaMatchingState states = new(_builder);
// Here one could also consider previous characters for example for \b, \B, and ^ anchors
// and initialize inputSoFar accordingly
states.InitializeFrom(_initialStates[GetCharKind(ReadOnlySpan<char>.Empty, -1)]);
CurrentState statesWrapper = new(states);
for (int i = 0; i < k; i++)
{
// Holds the generated input so far
StringBuilder inputSoFar = new();
StringBuilder? latestCandidate = null;
// Used for end suffixes
List<string> possibleEndings = new();
// Current set of states reached initially contains just the root
NfaMatchingState states = new();
// Here one could also consider previous characters for example for \b, \B, and ^ anchors
// and initialize inputSoFar accordingly
states.InitializeFrom(this, _initialStates[GetCharKind<FullInputReader>(ReadOnlySpan<char>.Empty, -1)]);
CurrentState statesWrapper = new(states);
while (true)
{
Debug.Assert(states.NfaStateSet.Count > 0);
// Used for end suffixes
List<string> possibleEndings = new();
// Gather the possible endings for satisfying nullability
possibleEndings.Clear();
if (NfaStateHandler.CanBeNullable(ref statesWrapper))
while (true)
{
// Unconditionally final state or end of the input due to \Z anchor for example
if (NfaStateHandler.IsNullable(ref statesWrapper) ||
NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.BeginningEnd))
{
possibleEndings.Add("");
}
Debug.Assert(states.NfaStateSet.Count > 0);
// End of line due to end-of-line anchor
if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.Newline))
// Gather the possible endings for satisfying nullability
possibleEndings.Clear();
if (SymbolicRegexMatcher<TSet>.NfaStateHandler.CanBeNullable(this, in statesWrapper))
{
possibleEndings.Add("\n");
// Unconditionally final state or end of the input due to \Z anchor for example
if (SymbolicRegexMatcher<TSet>.NfaStateHandler.IsNullable(this, in statesWrapper) ||
SymbolicRegexMatcher<TSet>.NfaStateHandler.IsNullableFor(this, in statesWrapper, CharKind.BeginningEnd))
{
possibleEndings.Add("");
}
// End of line due to end-of-line anchor
if (SymbolicRegexMatcher<TSet>.NfaStateHandler.IsNullableFor(this, in statesWrapper, CharKind.Newline))
{
possibleEndings.Add("\n");
}
// Related to wordborder due to \b or \B
if (SymbolicRegexMatcher<TSet>.NfaStateHandler.IsNullableFor(this, in statesWrapper, CharKind.WordLetter))
{
possibleEndings.Add(ChooseChar(random, asciiWordCharacters, ascii, charSetSolver).ToString());
}
// Related to wordborder due to \b or \B
if (SymbolicRegexMatcher<TSet>.NfaStateHandler.IsNullableFor(this, in statesWrapper, CharKind.General))
{
possibleEndings.Add(ChooseChar(random, asciiNonWordCharacters, ascii, charSetSolver).ToString());
}
}
// Related to wordborder due to \b or \B
if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.WordLetter))
// If we have a possible ending, then store a candidate input
if (possibleEndings.Count > 0)
{
possibleEndings.Add(ChooseChar(random, asciiWordCharacters, ascii, charSetSolver).ToString());
latestCandidate ??= new();
latestCandidate.Clear();
latestCandidate.Append(inputSoFar);
//Choose some suffix that allows some anchor (if any) to be nullable
latestCandidate.Append(Choose(random, possibleEndings));
// Choose to stop here based on a coin-toss
if (FlipBiasedCoin(random, SampleMatchesStoppingProbability))
{
yield return latestCandidate.ToString();
break;
}
}
// Related to wordborder due to \b or \B
if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.General))
// Shuffle the minterms, including the last end-of-line marker if appropriate
int[] mintermIds = SymbolicRegexMatcher<TSet>.NfaStateHandler.StartsWithLineAnchor(this, in statesWrapper) ?
Shuffle(random, mintermIdsWithZ) :
Shuffle(random, mintermIdsWithoutZ);
foreach (int mintermId in mintermIds)
{
possibleEndings.Add(ChooseChar(random, asciiNonWordCharacters, ascii, charSetSolver).ToString());
bool success = SymbolicRegexMatcher<TSet>.NfaStateHandler.TryTakeTransition(this, ref statesWrapper, mintermId);
Debug.Assert(success);
if (states.NfaStateSet.Count > 0)
{
TSet minterm = GetMintermFromId(mintermId);
// Append a random member of the minterm
inputSoFar.Append(ChooseChar(random, ToBDD(minterm, Solver, charSetSolver), ascii, charSetSolver));
break;
}
else
{
// The transition was a dead end, undo and continue to try another minterm
NfaStateHandler.UndoTransition(ref statesWrapper);
}
}
}
// If we have a possible ending, then store a candidate input
if (possibleEndings.Count > 0)
{
latestCandidate ??= new();
latestCandidate.Clear();
latestCandidate.Append(inputSoFar);
//Choose some suffix that allows some anchor (if any) to be nullable
latestCandidate.Append(Choose(random, possibleEndings));
// Choose to stop here based on a coin-toss
if (FlipBiasedCoin(random, SampleMatchesStoppingProbability))
// In the case that there are no next states or input has become too large: stop here
if (states.NfaStateSet.Count == 0 || inputSoFar.Length > SampleMatchesMaxInputLength)
{
yield return latestCandidate.ToString();
// Ending up here without an ending is unlikely but possible for example for infeasible patterns
// such as @"no\bway" or due to poor choice of c -- no anchor is enabled -- so this is a deadend.
if (latestCandidate != null)
{
yield return latestCandidate.ToString();
}
break;
}
}
// Shuffle the minterms, including the last end-of-line marker if appropriate
int[] mintermIds = NfaStateHandler.StartsWithLineAnchor(_builder, ref statesWrapper) ?
Shuffle(random, mintermIdsWithZ) :
Shuffle(random, mintermIdsWithoutZ);
foreach (int mintermId in mintermIds)
{
bool success = NfaStateHandler.TakeTransition(_builder, ref statesWrapper, mintermId);
Debug.Assert(success);
if (states.NfaStateSet.Count > 0)
{
TSet minterm = _builder.GetMinterm(mintermId);
// Append a random member of the minterm
inputSoFar.Append(ChooseChar(random, ToBDD(minterm, solver, charSetSolver), ascii, charSetSolver));
break;
}
else
{
// The transition was a dead end, undo and continue to try another minterm
NfaStateHandler.UndoTransition(ref statesWrapper);
}
}
// In the case that there are no next states or input has become too large: stop here
if (states.NfaStateSet.Count == 0 || inputSoFar.Length > SampleMatchesMaxInputLength)
{
// Ending up here without an ending is unlikely but possible for example for infeasible patterns
// such as @"no\bway" or due to poor choice of c -- no anchor is enabled -- so this is a deadend.
if (latestCandidate != null)
{
yield return latestCandidate.ToString();
}
break;
}
}
}
......
......@@ -5,6 +5,7 @@
using System.Diagnostics;
using System.Globalization;
using System.IO;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Threading;
......@@ -84,19 +85,31 @@ internal sealed partial class SymbolicRegexMatcher<TSet> : SymbolicRegexMatcher
/// <summary>The initial states for the original pattern, keyed off of the previous character kind.</summary>
/// <remarks>If the pattern doesn't contain any anchors, there will only be a single initial state.</remarks>
private readonly MatchingState<TSet>[] _initialStates;

/// <summary>The initial states for the dot-star pattern, keyed off of the previous character kind.</summary>
/// <remarks>If the pattern doesn't contain any anchors, there will only be a single initial state.</remarks>
private readonly MatchingState<TSet>[] _dotstarredInitialStates;

/// <summary>The initial states for the reverse pattern, keyed off of the previous character kind.</summary>
/// <remarks>If the pattern doesn't contain any anchors, there will only be a single initial state.</remarks>
private readonly MatchingState<TSet>[] _reverseInitialStates;
/// <summary>Lookup table to quickly determine the character kind for ASCII characters.</summary>
/// <remarks>Non-null iff the pattern contains anchors; otherwise, it's unused.</remarks>
private readonly uint[]? _asciiCharKinds;
/// <summary>Partition of the input space of sets.</summary>
private readonly TSet[] _minterms;
/// <summary>
/// Character kinds <see cref="CharKind"/> for all minterms in <see cref="_minterms"/> as well as two special
/// cases: character positions outside the input bounds and an end-of-line as the last input character.
/// </summary>
private readonly uint[] _positionKinds;
/// <summary>
/// The smallest k s.t. 2^k >= minterms.Length + 1. The "delta arrays", e.g., <see cref="_dfaDelta"/> allocate 2^k
/// consecutive slots for each state ID to represent the transitions for each minterm. The extra slot at index
/// _minterms.Length is used to represent an \n occurring at the very end of input, for supporting the \Z anchor.
/// </summary>
private readonly int _mintermsLog;
/// <summary>Number of capture groups.</summary>
private readonly int _capsize;
......@@ -105,14 +118,10 @@ internal sealed partial class SymbolicRegexMatcher<TSet> : SymbolicRegexMatcher
/// <remarks>This determines whether the matcher uses the special capturing NFA simulation mode.</remarks>
internal bool HasSubcaptures => _capsize > 1;
/// <summary>Looks up the minterm that the character code <paramref name="c"/> is classified into.</summary>
/// <param name="c">The character code to classify.</param>
/// <returns>The builder's minterm at the ID the minterm classifier reports for <paramref name="c"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private TSet GetMinterm(int c)
{
    var minterms = _builder._minterms;
    Debug.Assert(minterms is not null);

    int mintermId = _mintermClassifier.GetMintermID(c);
    return minterms[mintermId];
}
/// <summary>The solver held by this matcher's builder.</summary>
/// <remarks>
/// Both solvers supported here, <see cref="UInt64Solver"/> and <see cref="BitVectorSolver"/> are thread safe.
/// </remarks>
private ISolver<TSet> Solver
{
    get { return _builder._solver; }
}
/// <summary>Creates a new <see cref="SymbolicRegexMatcher{TSetType}"/>.</summary>
/// <param name="captureCount">The number of captures in the regular expression.</param>
......@@ -136,25 +145,46 @@ private TSet GetMinterm(int c)
_newLineSet = solver.ConvertFromBDD(bddBuilder._newLineSet, charSetSolver)
};
// Convert the BDD-based AST to TSetType-based AST
// Convert the BDD-based AST to TSet-based AST
SymbolicRegexNode<TSet> rootNode = bddBuilder.Transform(rootBddNode, builder, (builder, bdd) => builder._solver.ConvertFromBDD(bdd, charSetSolver));
return new SymbolicRegexMatcher<TSet>(rootNode, captureCount, findOptimizations, matchTimeout);
return new SymbolicRegexMatcher<TSet>(builder, rootNode, captureCount, findOptimizations, matchTimeout);
}
/// <summary>Constructs matcher for given symbolic regex.</summary>
private SymbolicRegexMatcher(SymbolicRegexNode<TSet> rootNode, int captureCount, RegexFindOptimizations findOptimizations, TimeSpan matchTimeout)
private SymbolicRegexMatcher(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNode<TSet> rootNode, int captureCount, RegexFindOptimizations findOptimizations, TimeSpan matchTimeout)
{
Debug.Assert(rootNode._builder._solver is UInt64Solver or BitVectorSolver, $"Unsupported solver: {rootNode._builder._solver}");
Debug.Assert(builder._solver is UInt64Solver or BitVectorSolver, $"Unsupported solver: {builder._solver}");
_pattern = rootNode;
_builder = rootNode._builder;
_builder = builder;
_checkTimeout = Regex.InfiniteMatchTimeout != matchTimeout;
_timeout = (int)(matchTimeout.TotalMilliseconds + 0.5); // Round up, so it will be at least 1ms
_mintermClassifier = _builder._solver is UInt64Solver bv64 ?
TSet[]? solverMinterms = builder._solver.GetMinterms();
Debug.Assert(solverMinterms is not null);
_minterms = solverMinterms;
// BitOperations.Log2 gives the integer floor of the log, so the +1 below either rounds up with non-power-of-two
// minterms or adds an extra bit with power-of-two minterms. The extra slot at index _minterms.Length is used to
// represent an \n occurring at the very end of input, for supporting the \Z anchor.
_mintermsLog = BitOperations.Log2((uint)_minterms.Length) + 1;
_mintermClassifier = builder._solver is UInt64Solver bv64 ?
bv64._classifier :
((BitVectorSolver)(object)_builder._solver)._classifier;
((BitVectorSolver)(object)builder._solver)._classifier;
_capsize = captureCount;
// Initialization for fields in SymbolicRegexMatcher.Automata.cs
_stateArray = new MatchingState<TSet>[InitialDfaStateCapacity];
_stateInfo = new ContextIndependentState[InitialDfaStateCapacity];
_dfaDelta = new int[InitialDfaStateCapacity << _mintermsLog];
// Initialize a lookup array for the character kinds of each minterm ID. This includes one "special" minterm
// ID _minterms.Length, which is used to represent a \n at the very end of input, and another ID -1,
// which is used to represent any position outside the bounds of the input.
_positionKinds = new uint[_minterms.Length + 2];
for (int mintermId = -1; mintermId < _positionKinds.Length - 1; mintermId++)
{
_positionKinds[mintermId + 1] = CalculateMintermIdKind(mintermId);
}
// Store the find optimizations that can be used to jump ahead to the next possible starting location.
// If there's a leading beginning anchor, the find optimizations are unnecessary on top of the DFA's
// handling for beginning anchors.
......@@ -168,26 +198,28 @@ private SymbolicRegexMatcher(SymbolicRegexNode<TSet> rootNode, int captureCount,
// character kind 0 is ever going to be used for all initial states.
int statesCount = _pattern._info.ContainsSomeAnchor ? CharKind.CharKindCount : 1;
// The loops below and how character kinds are calculated assume that the "general" character kind is zero
Debug.Assert(CharKind.General == 0);
// Create the initial states for the original pattern.
var initialStates = new DfaMatchingState<TSet>[statesCount];
for (uint i = 0; i < initialStates.Length; i++)
var initialStates = new MatchingState<TSet>[statesCount];
for (uint charKind = 0; charKind < initialStates.Length; charKind++)
{
initialStates[i] = _builder.CreateState(_pattern, i, capturing: HasSubcaptures);
initialStates[charKind] = GetOrCreateState_NoLock(_pattern, charKind);
}
_initialStates = initialStates;
// Create the dot-star pattern (a concatenation of any* with the original pattern)
// and all of its initial states.
_dotStarredPattern = _builder.CreateConcat(_builder._anyStarLazy, _pattern);
var dotstarredInitialStates = new DfaMatchingState<TSet>[statesCount];
for (uint i = 0; i < dotstarredInitialStates.Length; i++)
_dotStarredPattern = builder.CreateConcat(builder._anyStarLazy, _pattern);
var dotstarredInitialStates = new MatchingState<TSet>[statesCount];
for (uint charKind = 0; charKind < dotstarredInitialStates.Length; charKind++)
{
// Used to detect if initial state was reentered,
// but observe that the behavior from the state may ultimately depend on the previous
// input char e.g. possibly causing nullability of \b or \B or of a start-of-line anchor,
// in that sense there can be several "versions" (not more than StateCount) of the initial state.
DfaMatchingState<TSet> state = _builder.CreateState(_dotStarredPattern, i, capturing: false, isInitialState: true);
dotstarredInitialStates[i] = state;
dotstarredInitialStates[charKind] = GetOrCreateState_NoLock(_dotStarredPattern, charKind, isInitialState: true);
}
_dotstarredInitialStates = dotstarredInitialStates;
......@@ -195,84 +227,91 @@ private SymbolicRegexMatcher(SymbolicRegexNode<TSet> rootNode, int captureCount,
// initial states. Also disable backtracking simulation to ensure the reverse path from
// the final state that was found is followed. Not doing so might cause the earliest
// starting point to not be found.
_reversePattern = _builder.CreateDisableBacktrackingSimulation(_pattern.Reverse());
var reverseInitialStates = new DfaMatchingState<TSet>[statesCount];
for (uint i = 0; i < reverseInitialStates.Length; i++)
_reversePattern = builder.CreateDisableBacktrackingSimulation(_pattern.Reverse(builder));
var reverseInitialStates = new MatchingState<TSet>[statesCount];
for (uint charKind = 0; charKind < reverseInitialStates.Length; charKind++)
{
reverseInitialStates[i] = _builder.CreateState(_reversePattern, i, capturing: false);
reverseInitialStates[charKind] = GetOrCreateState_NoLock(_reversePattern, charKind);
}
_reverseInitialStates = reverseInitialStates;
// Initialize our fast-lookup for determining the character kind of ASCII characters.
// This is only required when the pattern contains anchors, as otherwise there's only
// ever a single kind used.
if (_pattern._info.ContainsSomeAnchor)
// Maps a minterm ID to a character kind
uint CalculateMintermIdKind(int mintermId)
{
var asciiCharKinds = new uint[128];
for (int i = 0; i < asciiCharKinds.Length; i++)
// Only patterns with anchors use anything except the general kind
if (_pattern._info.ContainsSomeAnchor)
{
TSet set;
uint charKind;
if (i == '\n')
// A minterm ID of -1 represents the positions before the first and after the last character
// in the input.
if (mintermId == -1)
{
set = _builder._newLineSet;
charKind = CharKind.Newline;
return CharKind.BeginningEnd;
}
else
// A minterm ID of minterms.Length represents a \n at the very end of input, which is matched
// by the \Z anchor.
if ((uint)mintermId == (uint)_minterms.Length)
{
set = _builder._wordLetterForBoundariesSet;
charKind = CharKind.WordLetter;
return CharKind.NewLineS;
}
asciiCharKinds[i] = _builder._solver.And(GetMinterm(i), set).Equals(_builder._solver.Empty) ? 0 : charKind;
TSet minterm = _minterms[mintermId];
// Examine the minterm to figure out its character kind
if (_builder._newLineSet.Equals(minterm))
{
// The minterm is a new line character
return CharKind.Newline;
}
else if (!Solver.IsEmpty(Solver.And(_builder._wordLetterForBoundariesSet, minterm)))
{
Debug.Assert(Solver.IsEmpty(Solver.And(Solver.Not(_builder._wordLetterForBoundariesSet), minterm)));
// The minterm is a subset of word letters as considered by \b and \B
return CharKind.WordLetter;
}
}
_asciiCharKinds = asciiCharKinds;
// All other minterms belong to the general kind
return CharKind.General;
}
}
/// <summary>
/// Create a PerThreadData with the appropriate parts initialized for this matcher's pattern.
/// </summary>
internal PerThreadData CreatePerThreadData() => new PerThreadData(_builder, _capsize);
internal PerThreadData CreatePerThreadData() => new PerThreadData(_capsize);
/// <summary>Compute the target state for the source state and input[i] character and transition to it.</summary>
/// <param name="builder">The associated builder.</param>
/// <param name="input">The input text.</param>
/// <param name="i">The index into <paramref name="input"/> at which the target character lives.</param>
/// <param name="state">The current state being transitioned from. Upon return it's the new state if the transition succeeded.</param>
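/// <summary>Looks up the character kind for the given position ID.</summary>
/// <remarks>
/// Position IDs start at -1 (a position outside the bounds of the input), so the lookup index is shifted by one.
/// </remarks>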
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool TryTakeTransition<TStateHandler>(SymbolicRegexBuilder<TSet> builder, ReadOnlySpan<char> input, int i, ref CurrentState state)
where TStateHandler : struct, IStateHandler
private uint GetPositionKind(int positionId) => _positionKinds[positionId + 1];
/// <summary>
/// Looks up the actual minterm based on its ID. Any ID that falls outside the minterm array, in particular
/// the special ID equal to the number of minterms that represents a \n at the very end of input, maps to the
/// new-line set.
/// </summary>
internal TSet GetMintermFromId(int mintermId)
{
int c = input[i];
TSet[] minterms = _minterms;
// Find the minterm, handling the special case for the last \n for states that start with a relevant anchor
int mintermId = c == '\n' && i == input.Length - 1 && TStateHandler.StartsWithLineAnchor(builder, ref state) ?
builder._minterms!.Length : // mintermId = minterms.Length represents an \n at the very end of input
_mintermClassifier.GetMintermID(c);
// A minterm ID of minterms.Length represents a \n at the very end of input, which is matched
// by the \Z anchor.
if ((uint)mintermId >= (uint)minterms.Length)
{
return _builder._newLineSet;
}
return TStateHandler.TakeTransition(builder, ref state, mintermId);
// Otherwise look up the minterm from the array
return minterms[mintermId];
}
private List<(DfaMatchingState<TSet>, DerivativeEffect[])> CreateNewCapturingTransitions(DfaMatchingState<TSet> state, TSet minterm, int offset)
{
Debug.Assert(_builder._capturingDelta is not null);
lock (this)
{
// Get the next state if it exists. The caller should have already tried and found it null (not yet created),
// but in the interim another thread could have created it.
List<(DfaMatchingState<TSet>, DerivativeEffect[])>? p = _builder._capturingDelta[offset];
if (p is null)
{
// Build the new state and store it into the array.
p = state.NfaNextWithEffects(minterm);
Volatile.Write(ref _builder._capturingDelta[offset], p);
}
/// <summary>
/// Gets the character kind for position <paramref name="i"/> of <paramref name="input"/>.
/// </summary>
/// <typeparam name="TInputReader">The input reader used to map the position to a position ID.</typeparam>
/// <param name="input">The input text.</param>
/// <param name="i">The position whose character kind is requested.</param>
/// <returns>
/// <see cref="CharKind.General"/> when the pattern contains no anchors; otherwise the kind recorded for the
/// position ID that <typeparamref name="TInputReader"/> reports for <paramref name="i"/>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private uint GetCharKind<TInputReader>(ReadOnlySpan<char> input, int i)
    where TInputReader : struct, IInputReader
{
    // Without anchors the character kind never influences matching, so skip the position lookup entirely.
    if (!_pattern._info.ContainsSomeAnchor)
    {
        return CharKind.General;
    }

    int positionId = TInputReader.GetPositionId(this, input, i);
    return GetPositionKind(positionId);
}
return p;
}
}
/// <summary>Determines whether a position ID denotes a minterm ID.</summary>
/// <param name="positionId">The position ID to test.</param>
/// <returns>
/// <see langword="true"/> when <paramref name="positionId"/> is non-negative; <see langword="false"/> for
/// negative IDs such as -1, which this matcher uses for positions outside the bounds of the input.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool IsMintermId(int positionId)
{
    return positionId >= 0;
}
private void CheckTimeout(long timeoutOccursAt)
{
......@@ -309,12 +348,16 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
// the position of the last b: aacaaaabbbc. It additionally records the position of the first a after
// the c as the low boundary for the starting position.
int matchStartLowBoundary, matchStartLengthMarker;
int matchEnd = (_findOpts is not null, _pattern._info.ContainsSomeAnchor) switch
int matchEnd = (_pattern._info.ContainsLineAnchor, _findOpts is not null, _pattern._info.ContainsSomeAnchor) switch
{
(true, true) => FindEndPosition<InitialStateFindOptimizationsHandler, FullNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
(true, false) => FindEndPosition<InitialStateFindOptimizationsHandler, NoAnchorsNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
(false, true) => FindEndPosition<NoOptimizationsInitialStateHandler, FullNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
(false, false) => FindEndPosition<NoOptimizationsInitialStateHandler, NoAnchorsNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
(true, true, true) => FindEndPosition<FullInputReader, InitialStateFindOptimizationsHandler, FullNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
(true, true, false) => FindEndPosition<FullInputReader, InitialStateFindOptimizationsHandler, NoAnchorsNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
(true, false, true) => FindEndPosition<FullInputReader, NoOptimizationsInitialStateHandler, FullNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
(true, false, false) => FindEndPosition<FullInputReader, NoOptimizationsInitialStateHandler, NoAnchorsNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
(false, true, true) => FindEndPosition<NoZAnchorInputReader, InitialStateFindOptimizationsHandler, FullNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
(false, true, false) => FindEndPosition<NoZAnchorInputReader, InitialStateFindOptimizationsHandler, NoAnchorsNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
(false, false, true) => FindEndPosition<NoZAnchorInputReader, NoOptimizationsInitialStateHandler, FullNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
(false, false, false) => FindEndPosition<NoZAnchorInputReader, NoOptimizationsInitialStateHandler, NoAnchorsNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
};
// If there wasn't a match, we're done.
......@@ -345,9 +388,13 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
{
Debug.Assert(matchEnd >= startat - 1);
matchStart = matchEnd < startat ?
startat : _pattern._info.ContainsSomeAnchor ?
FindStartPosition<FullNullabilityHandler>(input, matchEnd, matchStartLowBoundary, perThreadData) :
FindStartPosition<NoAnchorsNullabilityHandler>(input, matchEnd, matchStartLowBoundary, perThreadData);
startat : (_pattern._info.ContainsLineAnchor, _pattern._info.ContainsSomeAnchor) switch
{
(true, true) => FindStartPosition<FullInputReader, FullNullabilityHandler>(input, matchEnd, matchStartLowBoundary, perThreadData),
(true, false) => FindStartPosition<FullInputReader, NoAnchorsNullabilityHandler>(input, matchEnd, matchStartLowBoundary, perThreadData),
(false, true) => FindStartPosition<NoZAnchorInputReader, FullNullabilityHandler>(input, matchEnd, matchStartLowBoundary, perThreadData),
(false, false) => FindStartPosition<NoZAnchorInputReader, NoAnchorsNullabilityHandler>(input, matchEnd, matchStartLowBoundary, perThreadData),
};
}
// Phase 3:
......@@ -361,7 +408,9 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
}
else
{
Registers endRegisters = FindSubcaptures(input, matchStart, matchEnd, perThreadData);
Registers endRegisters = _pattern._info.ContainsLineAnchor ?
FindSubcaptures<FullInputReader>(input, matchStart, matchEnd, perThreadData) :
FindSubcaptures<NoZAnchorInputReader>(input, matchStart, matchEnd, perThreadData);
return new SymbolicMatch(matchStart, matchEnd - matchStart, endRegisters.CaptureStarts, endRegisters.CaptureEnds);
}
}
......@@ -377,15 +426,15 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
/// <returns>
/// A one-past-the-end index into input for the preferred match, or first final state position if isMatch is true, or NoMatchExists if no match exists.
/// </returns>
private int FindEndPosition<TFindOptimizationsHandler, TNullabilityHandler>(ReadOnlySpan<char> input, int pos, long timeoutOccursAt, RegexRunnerMode mode, out int initialStatePos, out int matchLength, PerThreadData perThreadData)
private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(ReadOnlySpan<char> input, int pos, long timeoutOccursAt, RegexRunnerMode mode, out int initialStatePos, out int matchLength, PerThreadData perThreadData)
where TInputReader : struct, IInputReader
where TFindOptimizationsHandler : struct, IInitialStateHandler
where TNullabilityHandler : struct, INullabilityHandler
{
initialStatePos = pos;
int initialStatePosCandidate = pos;
var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, pos - 1)]);
SymbolicRegexBuilder<TSet> builder = _builder;
var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind<TInputReader>(input, pos - 1)]);
int endPos = NoMatchExists;
int endStateId = -1;
......@@ -404,8 +453,8 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
input;
bool done = currentState.NfaState is not null ?
FindEndPositionDeltas<NfaStateHandler, TFindOptimizationsHandler, TNullabilityHandler>(builder, input, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) :
FindEndPositionDeltas<DfaStateHandler, TFindOptimizationsHandler, TNullabilityHandler>(builder, input, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate);
FindEndPositionDeltas<NfaStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(input, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) :
FindEndPositionDeltas<DfaStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(input, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate);
// If the inner loop indicates that the search finished (for example due to reaching a deadend state) or
// there is no more input available, then the whole search is done.
......@@ -421,10 +470,8 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
{
// Because there was still more input available, a failure to transition in DFA mode must be the cause
// of the early exit. Upgrade to NFA mode.
DfaMatchingState<TSet>? dfaState = currentState.DfaState(_builder);
Debug.Assert(dfaState is not null);
NfaMatchingState nfaState = perThreadData.NfaState;
nfaState.InitializeFrom(dfaState);
nfaState.InitializeFrom(this, GetState(currentState.DfaStateId));
currentState = new CurrentState(nfaState);
}
......@@ -437,7 +484,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
// Check whether there's a fixed-length marker for the current state. If there is, we can
// use that length to optimize subsequent matching phases.
matchLength = endStateId > 0 ? _builder._stateArray![endStateId].FixedLength(GetCharKind(input, endPos)) : -1;
matchLength = endStateId > 0 ? GetState(endStateId).FixedLength(GetCharKind<TInputReader>(input, endPos)) : -1;
return endPos;
}
......@@ -448,8 +495,8 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
/// </summary>
/// <remarks>
/// The <typeparamref name="TStateHandler"/> supplies the actual transitioning logic, controlling whether processing is
/// performed in DFA mode or in NFA mode. However, it expects <paramref name="stateRef"/> to be configured to match,
/// so for example if <typeparamref name="TStateHandler"/> is a <see cref="DfaStateHandler"/>, it expects the <paramref name="stateRef"/>'s
/// performed in DFA mode or in NFA mode. However, it expects <paramref name="state"/> to be configured to match,
/// so for example if <typeparamref name="TStateHandler"/> is a <see cref="DfaStateHandler"/>, it expects the <paramref name="state"/>'s
/// <see cref="CurrentState.DfaStateId"/> to be non-negative and its <see cref="CurrentState.NfaState"/> to be null; vice versa for
/// <see cref="NfaStateHandler"/>.
/// </remarks>
......@@ -458,15 +505,15 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
/// 0 if iteration completed because we reached an initial state.
/// A negative value if iteration completed because we ran out of input or we failed to transition.
/// </returns>
private bool FindEndPositionDeltas<TStateHandler, TFindOptimizationsHandler, TNullabilityHandler>(SymbolicRegexBuilder<TSet> builder, ReadOnlySpan<char> input, RegexRunnerMode mode,
ref int posRef, ref CurrentState stateRef, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
private bool FindEndPositionDeltas<TStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(ReadOnlySpan<char> input, RegexRunnerMode mode,
ref int posRef, ref CurrentState state, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
where TStateHandler : struct, IStateHandler
where TInputReader : struct, IInputReader
where TFindOptimizationsHandler : struct, IInitialStateHandler
where TNullabilityHandler : struct, INullabilityHandler
{
// To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning.
int pos = posRef;
CurrentState state = stateRef;
int endPos = endPosRef;
int endStateId = endStateIdRef;
int initialStatePos = initialStatePosRef;
......@@ -476,13 +523,13 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
// Loop through each character in the input, transitioning from state to state for each.
while (true)
{
(bool isInitial, bool isDeadend, bool isNullable, bool canBeNullable) = TStateHandler.GetStateInfo(builder, ref state);
(bool isInitial, bool isDeadend, bool isNullable, bool canBeNullable) = TStateHandler.GetStateInfo(this, in state);
// Check if currentState represents an initial state. If it does, call into any possible find optimizations
// to hopefully more quickly find the next possible starting location.
if (isInitial)
{
if (!TFindOptimizationsHandler.TryFindNextStartingPosition(this, input, ref state, ref pos))
if (!TFindOptimizationsHandler.TryFindNextStartingPosition<TInputReader>(this, input, ref state, ref pos))
{
return true;
}
......@@ -496,12 +543,14 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
return true;
}
int positionId = TInputReader.GetPositionId(this, input, pos);
// If the state is nullable for the next character, meaning it accepts the empty string,
// we found a potential end state.
if (TNullabilityHandler.IsNullableAt<TStateHandler>(this, ref state, input, pos, isNullable, canBeNullable))
if (TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId, isNullable, canBeNullable))
{
endPos = pos;
endStateId = TStateHandler.ExtractNullableCoreStateId(this, ref state, input, pos);
endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos);
initialStatePos = initialStatePosCandidate;
// A match is known to exist. If that's all we need to know, we're done.
......@@ -512,7 +561,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
}
// If there is more input available try to transition with the next character.
if ((uint)pos >= (uint)input.Length || !TryTakeTransition<TStateHandler>(builder, input, pos, ref state))
if (!IsMintermId(positionId) || !TStateHandler.TryTakeTransition(this, ref state, positionId))
{
return false;
}
......@@ -525,7 +574,6 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
{
// Write back the local copies of the ref values.
posRef = pos;
stateRef = state;
endPosRef = endPos;
endStateIdRef = endStateId;
initialStatePosRef = initialStatePos;
......@@ -546,7 +594,8 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
/// <param name="matchStartBoundary">The initial starting location discovered in phase 1, a point we must not walk earlier than.</param>
/// <param name="perThreadData">Per thread data reused between calls.</param>
/// <returns>The found starting position for the match.</returns>
private int FindStartPosition<TNullabilityHandler>(ReadOnlySpan<char> input, int i, int matchStartBoundary, PerThreadData perThreadData)
private int FindStartPosition<TInputReader, TNullabilityHandler>(ReadOnlySpan<char> input, int i, int matchStartBoundary, PerThreadData perThreadData)
where TInputReader : struct, IInputReader
where TNullabilityHandler : struct, INullabilityHandler
{
Debug.Assert(i >= 0, $"{nameof(i)} == {i}");
......@@ -555,18 +604,17 @@ private int FindStartPosition<TNullabilityHandler>(ReadOnlySpan<char> input, int
// Get the starting state for the reverse pattern. This depends on previous character (which, because we're
// going backwards, is character number i).
var currentState = new CurrentState(_reverseInitialStates[GetCharKind(input, i)]);
var currentState = new CurrentState(_reverseInitialStates[GetCharKind<TInputReader>(input, i)]);
int lastStart = -1; // invalid sentinel value
// Walk backwards to the furthest accepting state of the reverse pattern but no earlier than matchStartBoundary.
SymbolicRegexBuilder<TSet> builder = _builder;
while (true)
{
// Run the DFA or NFA traversal backwards from the current point using the current state.
bool done = currentState.NfaState is not null ?
FindStartPositionDeltas<NfaStateHandler, TNullabilityHandler>(builder, input, ref i, matchStartBoundary, ref currentState, ref lastStart) :
FindStartPositionDeltas<DfaStateHandler, TNullabilityHandler>(builder, input, ref i, matchStartBoundary, ref currentState, ref lastStart);
FindStartPositionDeltas<NfaStateHandler, TInputReader, TNullabilityHandler>(input, ref i, matchStartBoundary, ref currentState, ref lastStart) :
FindStartPositionDeltas<DfaStateHandler, TInputReader, TNullabilityHandler>(input, ref i, matchStartBoundary, ref currentState, ref lastStart);
// If we found the starting position, we're done.
if (done)
......@@ -578,10 +626,8 @@ private int FindStartPosition<TNullabilityHandler>(ReadOnlySpan<char> input, int
// if we were unable to transition, which should only happen if we were in DFA mode and exceeded our graph size.
// Upgrade to NFA mode and continue.
Debug.Assert(i >= matchStartBoundary);
DfaMatchingState<TSet>? dfaState = currentState.DfaState(_builder);
Debug.Assert(dfaState is not null);
NfaMatchingState nfaState = perThreadData.NfaState;
nfaState.InitializeFrom(dfaState);
nfaState.InitializeFrom(this, GetState(currentState.DfaStateId));
currentState = new CurrentState(nfaState);
}
......@@ -594,23 +640,25 @@ private int FindStartPosition<TNullabilityHandler>(ReadOnlySpan<char> input, int
/// starting at <paramref name="i"/>, for each character transitioning from one state in the DFA or NFA graph to the next state,
/// lazily building out the graph as needed.
/// </summary>
private bool FindStartPositionDeltas<TStateHandler, TNullabilityHandler>(SymbolicRegexBuilder<TSet> builder, ReadOnlySpan<char> input, ref int i, int startThreshold, ref CurrentState currentState, ref int lastStart)
private bool FindStartPositionDeltas<TStateHandler, TInputReader, TNullabilityHandler>(ReadOnlySpan<char> input, ref int i, int startThreshold, ref CurrentState state, ref int lastStart)
where TStateHandler : struct, IStateHandler
where TInputReader : struct, IInputReader
where TNullabilityHandler : struct, INullabilityHandler
{
// To avoid frequent reads/writes to ref values, make and operate on local copies, which we then copy back once before returning.
int pos = i;
CurrentState state = currentState;
try
{
// Loop backwards through each character in the input, transitioning from state to state for each.
while (true)
{
(bool isInitial, bool isDeadend, bool isNullable, bool canBeNullable) = TStateHandler.GetStateInfo(builder, ref state);
(bool isInitial, bool isDeadend, bool isNullable, bool canBeNullable) = TStateHandler.GetStateInfo(this, in state);
int positionId = TInputReader.GetPositionId(this, input, pos - 1);
// If the state accepts the empty string, we found a valid starting position. Record it and keep going,
// since we're looking for the earliest one to occur within bounds.
if (TNullabilityHandler.IsNullableAt<TStateHandler>(this, ref state, input, pos - 1, isNullable, canBeNullable))
if (TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId, isNullable, canBeNullable))
{
lastStart = pos;
}
......@@ -624,7 +672,7 @@ private int FindStartPosition<TNullabilityHandler>(ReadOnlySpan<char> input, int
}
// Try to transition with the next character, the one before the current position.
if (!TryTakeTransition<TStateHandler>(builder, input, pos - 1, ref state))
if (!TStateHandler.TryTakeTransition(this, ref state, positionId))
{
// Return false to indicate the search didn't finish.
return false;
......@@ -637,7 +685,6 @@ private int FindStartPosition<TNullabilityHandler>(ReadOnlySpan<char> input, int
finally
{
// Write back the local copies of the ref values.
currentState = state;
i = pos;
}
}
......@@ -649,10 +696,11 @@ private int FindStartPosition<TNullabilityHandler>(ReadOnlySpan<char> input, int
/// <param name="iEnd">exclusive end position</param>
/// <param name="perThreadData">Per thread data reused between calls.</param>
/// <returns>the final register values, which indicate capture starts and ends</returns>
private Registers FindSubcaptures(ReadOnlySpan<char> input, int i, int iEnd, PerThreadData perThreadData)
private Registers FindSubcaptures<TInputReader>(ReadOnlySpan<char> input, int i, int iEnd, PerThreadData perThreadData)
where TInputReader : struct, IInputReader
{
// Pick the correct start state based on previous character kind.
DfaMatchingState<TSet> initialState = _initialStates[GetCharKind(input, i - 1)];
MatchingState<TSet> initialState = _initialStates[GetCharKind<TInputReader>(input, i - 1)];
Registers initialRegisters = perThreadData.InitialRegisters;
......@@ -667,52 +715,45 @@ private Registers FindSubcaptures(ReadOnlySpan<char> input, int i, int iEnd, Per
SparseIntMap<Registers> current = perThreadData.Current, next = perThreadData.Next;
current.Clear();
next.Clear();
current.Add(initialState.Id, initialRegisters);
SymbolicRegexBuilder<TSet> builder = _builder;
ForEachNfaState(initialState.Node, initialState.PrevCharKind, (current, initialRegisters),
static (int nfaId, (SparseIntMap<Registers> Current, Registers InitialRegisters) args) =>
args.Current.Add(nfaId, args.InitialRegisters.Clone()));
while ((uint)i < (uint)iEnd)
{
Debug.Assert(next.Count == 0);
// Read the next character and find its minterm
int c = input[i];
int normalMintermId = _mintermClassifier.GetMintermID(c);
// i is guaranteed to be within bounds, so the position ID is a minterm ID
int mintermId = TInputReader.GetPositionId(this, input, i);
foreach ((int sourceId, Registers sourceRegisters) in current.Values)
{
Debug.Assert(builder._capturingStateArray is not null);
DfaMatchingState<TSet> sourceState = builder._capturingStateArray[sourceId];
// Handle the special case for the last \n for states that start with a relevant anchor
int mintermId = c == '\n' && i == input.Length - 1 && sourceState.StartsWithLineAnchor ?
builder._minterms!.Length : // mintermId = minterms.Length represents an \n at the very end of input
normalMintermId;
TSet minterm = builder.GetMinterm(mintermId);
// Get or create the transitions
int offset = (sourceId << builder._mintermsLog) | mintermId;
Debug.Assert(builder._capturingDelta is not null);
List<(DfaMatchingState<TSet>, DerivativeEffect[])>? transitions =
builder._capturingDelta[offset] ??
CreateNewCapturingTransitions(sourceState, minterm, offset);
int offset = DeltaOffset(sourceId, mintermId);
(int, DerivativeEffect[])[] transitions = _capturingNfaDelta[offset] ??
CreateNewCapturingTransition(sourceId, mintermId, offset);
// Take the transitions in their prioritized order
for (int j = 0; j < transitions.Count; ++j)
for (int j = 0; j < transitions.Length; ++j)
{
(DfaMatchingState<TSet> targetState, DerivativeEffect[] effects) = transitions[j];
Debug.Assert(!targetState.IsDeadend, "Transitions should not include dead ends.");
(int targetStateId, DerivativeEffect[] effects) = transitions[j];
// Try to add the state and handle the case where it didn't exist before. If the state already
// exists, then the transition can be safely ignored, as the existing state was generated by a
// higher priority transition.
if (next.Add(targetState.Id, out int index))
if (next.Add(targetStateId, out int index))
{
// Avoid copying the registers on the last transition from this state, reusing the registers instead
Registers newRegisters = j != transitions.Count - 1 ? sourceRegisters.Clone() : sourceRegisters;
Registers newRegisters = j != transitions.Length - 1 ? sourceRegisters.Clone() : sourceRegisters;
newRegisters.ApplyEffects(effects, i);
next.Update(index, targetState.Id, newRegisters);
if (targetState.IsNullableFor(GetCharKind(input, i + 1)))
next.Update(index, targetStateId, newRegisters);
int coreStateId = GetCoreStateId(targetStateId);
(bool isInitial, bool isDeadend, bool isNullable, bool canBeNullable) = GetStateInfo(coreStateId);
Debug.Assert(!isDeadend);
if (isNullable || (canBeNullable && GetState(coreStateId).IsNullableFor(GetCharKind<TInputReader>(input, i + 1))))
{
// No lower priority transitions from this or other source states are taken because the
// backtracking engines would return the match ending here.
......@@ -732,15 +773,14 @@ private Registers FindSubcaptures(ReadOnlySpan<char> input, int i, int iEnd, Per
}
Debug.Assert(current.Count > 0);
Debug.Assert(_builder._capturingStateArray is not null);
foreach (var (endStateId, endRegisters) in current.Values)
{
DfaMatchingState<TSet> endState = _builder._capturingStateArray[endStateId];
if (endState.IsNullableFor(GetCharKind(input, iEnd)))
MatchingState<TSet> endState = GetState(GetCoreStateId(endStateId));
if (endState.IsNullableFor(GetCharKind<TInputReader>(input, iEnd)))
{
// Apply effects for finishing at the stored end state
endState.Node.ApplyEffects((effect, args) => args.Registers.ApplyEffect(effect, args.Pos),
CharKind.Context(endState.PrevCharKind, GetCharKind(input, iEnd)), (Registers: endRegisters, Pos: iEnd));
CharKind.Context(endState.PrevCharKind, GetCharKind<TInputReader>(input, iEnd)), (Registers: endRegisters, Pos: iEnd));
return endRegisters;
}
}
......@@ -749,39 +789,6 @@ private Registers FindSubcaptures(ReadOnlySpan<char> input, int i, int iEnd, Per
return default;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private uint GetCharKind(ReadOnlySpan<char> input, int i)
{
return !_pattern._info.ContainsSomeAnchor ?
CharKind.General : // The previous character kind is irrelevant when anchors are not used.
GetCharKindWithAnchor(input, i);
uint GetCharKindWithAnchor(ReadOnlySpan<char> input, int i)
{
Debug.Assert(_asciiCharKinds is not null);
if ((uint)i >= (uint)input.Length)
{
return CharKind.BeginningEnd;
}
char nextChar = input[i];
if (nextChar == '\n')
{
return
_builder._newLineSet.Equals(_builder._solver.Empty) ? 0 : // ignore \n
i == 0 || i == input.Length - 1 ? CharKind.NewLineS : // very first or very last \n. Detection of very first \n is needed for rev(\Z).
CharKind.Newline;
}
uint[] asciiCharKinds = _asciiCharKinds;
return
nextChar < (uint)asciiCharKinds.Length ? asciiCharKinds[nextChar] :
_builder._solver.And(GetMinterm(nextChar), _builder._wordLetterForBoundariesSet).Equals(_builder._solver.Empty) ? 0 : // intersect with the wordletter set to compute the kind of the next character
CharKind.WordLetter;
}
}
/// <summary>Stores additional data for tracking capture start and end positions.</summary>
/// <remarks>The NFA simulation based third phase has one of these for each current state in the current set of live states.</remarks>
internal struct Registers
......@@ -867,9 +874,9 @@ internal sealed class PerThreadData
/// <summary>Registers used for the capturing third phase.</summary>
public readonly Registers InitialRegisters;
public PerThreadData(SymbolicRegexBuilder<TSet> builder, int capsize)
public PerThreadData(int capsize)
{
NfaState = new NfaMatchingState(builder);
NfaState = new NfaMatchingState();
// Only create data used for capturing mode if there are subcaptures
if (capsize > 1)
......@@ -883,11 +890,9 @@ public PerThreadData(SymbolicRegexBuilder<TSet> builder, int capsize)
/// <summary>Stores the state that represents a current state in NFA mode.</summary>
/// <remarks>The entire state is composed of a list of individual states.</remarks>
/// <remarks>New instances should only be created once per runner.</remarks>
internal sealed class NfaMatchingState
{
/// <summary>The associated builder used to lazily add new DFA or NFA nodes to the graph.</summary>
public readonly SymbolicRegexBuilder<TSet> Builder;
/// <summary>Ordered set used to store the current NFA states.</summary>
/// <remarks>The value is unused. The type is used purely for its keys.</remarks>
public SparseIntMap<int> NfaStateSet = new();
......@@ -899,24 +904,17 @@ internal sealed class NfaMatchingState
/// </remarks>
public SparseIntMap<int> NfaStateSetScratch = new();
/// <summary>Create the instance.</summary>
/// <remarks>New instances should only be created once per runner.</remarks>
public NfaMatchingState(SymbolicRegexBuilder<TSet> builder) => Builder = builder;
/// <summary>Resets this NFA state to represent the supplied DFA state.</summary>
/// <param name="matcher">The matcher whose <c>ForEachNfaState</c> is used to enumerate the NFA states for the DFA state.</param>
/// <param name="dfaMatchingState">The DFA state to use to initialize the NFA state.</param>
public void InitializeFrom(DfaMatchingState<TSet> dfaMatchingState)
public void InitializeFrom(SymbolicRegexMatcher<TSet> matcher, MatchingState<TSet> dfaMatchingState)
{
NfaStateSet.Clear();
// If the DFA state is a union of multiple DFA states, loop through all of them
// adding an NFA state for each.
foreach (SymbolicRegexNode<TSet> element in dfaMatchingState.Node.EnumerateAlternationBranches())
{
// Create (possibly new) NFA states for all the members.
// Add their IDs to the current set of NFA states and into the list.
NfaStateSet.Add(Builder.CreateNfaState(element, dfaMatchingState.PrevCharKind), out _);
}
matcher.ForEachNfaState(dfaMatchingState.Node, dfaMatchingState.PrevCharKind, NfaStateSet,
static (int nfaId, SparseIntMap<int> nfaStateSet) => nfaStateSet.Add(nfaId, out _));
}
}
......@@ -925,7 +923,7 @@ public void InitializeFrom(DfaMatchingState<TSet> dfaMatchingState)
private struct CurrentState
{
/// <summary>Initializes the state as a DFA state.</summary>
public CurrentState(DfaMatchingState<TSet> dfaState)
public CurrentState(MatchingState<TSet> dfaState)
{
DfaStateId = dfaState.Id;
NfaState = null;
......@@ -942,51 +940,48 @@ public CurrentState(NfaMatchingState nfaState)
public int DfaStateId;
/// <summary>The NFA state.</summary>
public NfaMatchingState? NfaState;
public DfaMatchingState<TSet>? DfaState(SymbolicRegexBuilder<TSet> builder) => DfaStateId > 0 ? builder._stateArray![DfaStateId] : null;
}
/// <summary>Represents a set of routines for operating over a <see cref="CurrentState"/>.</summary>
private interface IStateHandler
{
public static abstract bool StartsWithLineAnchor(SymbolicRegexBuilder<TSet> builder, ref CurrentState state);
public static abstract bool IsNullableFor(SymbolicRegexBuilder<TSet> builder, ref CurrentState state, uint nextCharKind);
public static abstract int ExtractNullableCoreStateId(SymbolicRegexMatcher<TSet> matcher, ref CurrentState state, ReadOnlySpan<char> input, int pos);
public static abstract int FixedLength(SymbolicRegexBuilder<TSet> builder, ref CurrentState state, uint nextCharKind);
public static abstract bool TakeTransition(SymbolicRegexBuilder<TSet> builder, ref CurrentState state, int mintermId);
public static abstract (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(SymbolicRegexBuilder<TSet> builder, ref CurrentState state);
public static abstract bool StartsWithLineAnchor(SymbolicRegexMatcher<TSet> matcher, in CurrentState state);
public static abstract bool IsNullableFor(SymbolicRegexMatcher<TSet> matcher, in CurrentState state, uint nextCharKind);
public static abstract int ExtractNullableCoreStateId(SymbolicRegexMatcher<TSet> matcher, in CurrentState state, ReadOnlySpan<char> input, int pos);
public static abstract int FixedLength(SymbolicRegexMatcher<TSet> matcher, in CurrentState state, uint nextCharKind);
public static abstract bool TryTakeTransition(SymbolicRegexMatcher<TSet> matcher, ref CurrentState state, int mintermId);
public static abstract (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(SymbolicRegexMatcher<TSet> matcher, in CurrentState state);
}
/// <summary>An <see cref="IStateHandler"/> for operating over <see cref="CurrentState"/> instances configured as DFA states.</summary>
private readonly struct DfaStateHandler : IStateHandler
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool StartsWithLineAnchor(SymbolicRegexBuilder<TSet> builder, ref CurrentState state) => state.DfaState(builder)!.StartsWithLineAnchor;
public static bool StartsWithLineAnchor(SymbolicRegexMatcher<TSet> matcher, in CurrentState state) => matcher.GetState(state.DfaStateId).StartsWithLineAnchor;
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsNullableFor(SymbolicRegexBuilder<TSet> builder, ref CurrentState state, uint nextCharKind) => state.DfaState(builder)!.IsNullableFor(nextCharKind);
public static bool IsNullableFor(SymbolicRegexMatcher<TSet> matcher, in CurrentState state, uint nextCharKind) => matcher.GetState(state.DfaStateId).IsNullableFor(nextCharKind);
/// <summary>Gets the preferred DFA state for nullability. In DFA mode this is just the state itself.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int ExtractNullableCoreStateId(SymbolicRegexMatcher<TSet> matcher, ref CurrentState state, ReadOnlySpan<char> input, int pos) => state.DfaStateId;
public static int ExtractNullableCoreStateId(SymbolicRegexMatcher<TSet> matcher, in CurrentState state, ReadOnlySpan<char> input, int pos) => state.DfaStateId;
/// <summary>Gets the length of any fixed-length marker that exists for this state, or -1 if there is none.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int FixedLength(SymbolicRegexBuilder<TSet> builder, ref CurrentState state, uint nextCharKind) => state.DfaState(builder)!.FixedLength(nextCharKind);
public static int FixedLength(SymbolicRegexMatcher<TSet> matcher, in CurrentState state, uint nextCharKind) => matcher.GetState(state.DfaStateId).FixedLength(nextCharKind);
/// <summary>Take the transition to the next DFA state.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool TakeTransition(SymbolicRegexBuilder<TSet> builder, ref CurrentState state, int mintermId)
public static bool TryTakeTransition(SymbolicRegexMatcher<TSet> matcher, ref CurrentState state, int mintermId)
{
Debug.Assert(state.DfaStateId > 0, $"Expected non-zero {nameof(state.DfaStateId)}.");
Debug.Assert(state.NfaState is null, $"Expected null {nameof(state.NfaState)}.");
Debug.Assert(builder._delta is not null);
// Use the mintermId for the character being read to look up which state to transition to.
// If that state has already been materialized, move to it, and we're done. If that state
// hasn't been materialized, try to create it; if we can, move to it, and we're done.
int dfaOffset = (state.DfaStateId << builder._mintermsLog) | mintermId;
int nextStateId = builder._delta[dfaOffset];
int dfaOffset = matcher.DeltaOffset(state.DfaStateId, mintermId);
int nextStateId = matcher._dfaDelta[dfaOffset];
if (nextStateId > 0)
{
// There was an existing DFA transition to some state. Move to it and
......@@ -995,7 +990,7 @@ public static bool TakeTransition(SymbolicRegexBuilder<TSet> builder, ref Curren
return true;
}
if (builder.TryCreateNewTransition(state.DfaState(builder)!, mintermId, dfaOffset, checkThreshold: true, out DfaMatchingState<TSet>? nextState))
if (matcher.TryCreateNewTransition(matcher.GetState(state.DfaStateId), mintermId, dfaOffset, checkThreshold: true, out MatchingState<TSet>? nextState))
{
// We were able to create a new DFA transition to some state. Move to it and
// return that we're still operating as a DFA and can keep going.
......@@ -1014,22 +1009,19 @@ public static bool TakeTransition(SymbolicRegexBuilder<TSet> builder, ref Curren
/// - whether this state may be contextually nullable
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(SymbolicRegexBuilder<TSet> builder, ref CurrentState state)
{
Debug.Assert(state.DfaStateId > 0);
return builder.GetStateInfo(state.DfaStateId);
}
public static (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(SymbolicRegexMatcher<TSet> matcher, in CurrentState state)
=> matcher.GetStateInfo(state.DfaStateId);
}
/// <summary>An <see cref="IStateHandler"/> for operating over <see cref="CurrentState"/> instances configured as NFA states.</summary>
private readonly struct NfaStateHandler : IStateHandler
{
/// <summary>Check if any underlying core state starts with a line anchor.</summary>
public static bool StartsWithLineAnchor(SymbolicRegexBuilder<TSet> builder, ref CurrentState state)
public static bool StartsWithLineAnchor(SymbolicRegexMatcher<TSet> matcher, in CurrentState state)
{
foreach (ref KeyValuePair<int, int> nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values))
{
if (builder.GetCoreState(nfaState.Key).StartsWithLineAnchor)
if (matcher.GetState(matcher.GetCoreStateId(nfaState.Key)).StartsWithLineAnchor)
{
return true;
}
......@@ -1039,11 +1031,11 @@ public static bool StartsWithLineAnchor(SymbolicRegexBuilder<TSet> builder, ref
}
/// <summary>Check if any underlying core state is nullable in the context of the next character kind.</summary>
public static bool IsNullableFor(SymbolicRegexBuilder<TSet> builder, ref CurrentState state, uint nextCharKind)
public static bool IsNullableFor(SymbolicRegexMatcher<TSet> matcher, in CurrentState state, uint nextCharKind)
{
foreach (ref KeyValuePair<int, int> nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values))
{
if (builder.GetCoreState(nfaState.Key).IsNullableFor(nextCharKind))
if (matcher.GetState(matcher.GetCoreStateId(nfaState.Key)).IsNullableFor(nextCharKind))
{
return true;
}
......@@ -1053,12 +1045,12 @@ public static bool IsNullableFor(SymbolicRegexBuilder<TSet> builder, ref Current
}
/// <summary>Gets the preferred DFA state for nullability. In NFA mode this is the first core state in the current set that is nullable for the next character kind.</summary>
public static int ExtractNullableCoreStateId(SymbolicRegexMatcher<TSet> matcher, ref CurrentState state, ReadOnlySpan<char> input, int pos)
public static int ExtractNullableCoreStateId(SymbolicRegexMatcher<TSet> matcher, in CurrentState state, ReadOnlySpan<char> input, int pos)
{
uint nextCharKind = matcher.GetCharKind(input, pos);
uint nextCharKind = matcher.GetCharKind<FullInputReader>(input, pos);
foreach (ref KeyValuePair<int, int> nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values))
{
DfaMatchingState<TSet> coreState = matcher._builder.GetCoreState(nfaState.Key);
MatchingState<TSet> coreState = matcher.GetState(matcher.GetCoreStateId(nfaState.Key));
if (coreState.IsNullableFor(nextCharKind))
{
return coreState.Id;
......@@ -1070,11 +1062,11 @@ public static int ExtractNullableCoreStateId(SymbolicRegexMatcher<TSet> matcher,
}
/// <summary>Gets the length of any fixed-length marker that exists for this state, or -1 if there is none.</summary>
public static int FixedLength(SymbolicRegexBuilder<TSet> builder, ref CurrentState state, uint nextCharKind)
public static int FixedLength(SymbolicRegexMatcher<TSet> matcher, in CurrentState state, uint nextCharKind)
{
foreach (ref KeyValuePair<int, int> nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values))
{
DfaMatchingState<TSet> coreState = builder.GetCoreState(nfaState.Key);
MatchingState<TSet> coreState = matcher.GetState(matcher.GetCoreStateId(nfaState.Key));
if (coreState.IsNullableFor(nextCharKind))
{
return coreState.FixedLength(nextCharKind);
......@@ -1086,7 +1078,7 @@ public static int FixedLength(SymbolicRegexBuilder<TSet> builder, ref CurrentSta
}
/// <summary>Take the transition to the next NFA state.</summary>
public static bool TakeTransition(SymbolicRegexBuilder<TSet> builder, ref CurrentState state, int mintermId)
public static bool TryTakeTransition(SymbolicRegexMatcher<TSet> matcher, ref CurrentState state, int mintermId)
{
Debug.Assert(state.DfaStateId < 0, $"Expected negative {nameof(state.DfaStateId)}.");
Debug.Assert(state.NfaState is not null, $"Expected non-null {nameof(state.NfaState)}.");
......@@ -1105,7 +1097,7 @@ public static bool TakeTransition(SymbolicRegexBuilder<TSet> builder, ref Curren
{
// We have a single source state. We know its next states are already deduped,
// so we can just add them directly to the destination states list.
foreach (int nextState in GetNextStates(sourceStates.Values[0].Key, mintermId, builder))
foreach (int nextState in GetNextStates(sourceStates.Values[0].Key, mintermId, matcher))
{
nextStates.Add(nextState, out _);
}
......@@ -1118,7 +1110,7 @@ public static bool TakeTransition(SymbolicRegexBuilder<TSet> builder, ref Curren
// to the set, then add the known-unique state to the destination list.
foreach (ref KeyValuePair<int, int> sourceState in CollectionsMarshal.AsSpan(sourceStates.Values))
{
foreach (int nextState in GetNextStates(sourceState.Key, mintermId, builder))
foreach (int nextState in GetNextStates(sourceState.Key, mintermId, matcher))
{
nextStates.Add(nextState, out _);
}
......@@ -1128,13 +1120,13 @@ public static bool TakeTransition(SymbolicRegexBuilder<TSet> builder, ref Curren
return true;
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static int[] GetNextStates(int sourceState, int mintermId, SymbolicRegexBuilder<TSet> builder)
static int[] GetNextStates(int sourceState, int mintermId, SymbolicRegexMatcher<TSet> matcher)
{
// Calculate the offset into the NFA transition table.
int nfaOffset = (sourceState << builder._mintermsLog) | mintermId;
int nfaOffset = matcher.DeltaOffset(sourceState, mintermId);
// Get the next NFA state.
return builder._nfaDelta[nfaOffset] ?? builder.CreateNewNfaTransition(sourceState, mintermId, nfaOffset);
return matcher._nfaDelta[nfaOffset] ?? matcher.CreateNewNfaTransition(sourceState, mintermId, nfaOffset);
}
}
......@@ -1153,15 +1145,15 @@ static int[] GetNextStates(int sourceState, int mintermId, SymbolicRegexBuilder<
/// can transition back to a DFA state.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(SymbolicRegexBuilder<TSet> builder, ref CurrentState state) =>
(false, state.NfaState!.NfaStateSet.Count == 0, IsNullable(builder, ref state), CanBeNullable(builder, ref state));
public static (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(SymbolicRegexMatcher<TSet> matcher, in CurrentState state) =>
(false, state.NfaState!.NfaStateSet.Count == 0, IsNullable(matcher, in state), CanBeNullable(matcher, in state));
/// <summary>Check if any underlying core state is unconditionally nullable.</summary>
private static bool IsNullable(SymbolicRegexBuilder<TSet> builder, ref CurrentState state)
public static bool IsNullable(SymbolicRegexMatcher<TSet> matcher, in CurrentState state)
{
foreach (ref KeyValuePair<int, int> nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values))
{
if (builder.GetStateInfo(builder.GetCoreStateId(nfaState.Key)).IsNullable)
if (matcher.GetStateInfo(matcher.GetCoreStateId(nfaState.Key)).IsNullable)
{
return true;
}
......@@ -1171,11 +1163,11 @@ private static bool IsNullable(SymbolicRegexBuilder<TSet> builder, ref CurrentSt
}
/// <summary>Check if any underlying core state can be nullable in some context.</summary>
private static bool CanBeNullable(SymbolicRegexBuilder<TSet> builder, ref CurrentState state)
public static bool CanBeNullable(SymbolicRegexMatcher<TSet> matcher, in CurrentState state)
{
foreach (ref KeyValuePair<int, int> nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values))
{
if (builder.GetStateInfo(builder.GetCoreStateId(nfaState.Key)).CanBeNullable)
if (matcher.GetStateInfo(matcher.GetCoreStateId(nfaState.Key)).CanBeNullable)
{
return true;
}
......@@ -1185,10 +1177,10 @@ private static bool CanBeNullable(SymbolicRegexBuilder<TSet> builder, ref Curren
}
#if DEBUG
/// <summary>Undo a previous call to <see cref="TakeTransition"/>.</summary>
/// <summary>Undo a previous call to <see cref="TryTakeTransition"/>.</summary>
public static void UndoTransition(ref CurrentState state)
{
Debug.Assert(state.DfaStateId < 0, $"Expected negative {nameof(state.DfaState)}.");
Debug.Assert(state.DfaStateId < 0, $"Expected negative {nameof(state.DfaStateId)}.");
Debug.Assert(state.NfaState is not null, $"Expected non-null {nameof(state.NfaState)}.");
NfaMatchingState nfaState = state.NfaState!;
......@@ -1202,37 +1194,43 @@ public static void UndoTransition(ref CurrentState state)
// Sanity check: if there are any next states, then there must have been some source states.
Debug.Assert(nextStates.Count == 0 || sourceStates.Count > 0);
}
#endif
}
/// <summary>Check if any underlying core state is unconditionally nullable.</summary>
public static bool IsNullable(ref CurrentState state)
{
SymbolicRegexBuilder<TSet> builder = state.NfaState!.Builder;
foreach (ref KeyValuePair<int, int> nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values))
{
if (builder.GetCoreState(nfaState.Key).Node.IsNullable)
{
return true;
}
}
/// <summary>
/// Interface for mapping positions in the input to position IDs, which capture all the information necessary to
/// both take transitions and decide nullability. For positions of valid characters that are handled normally,
/// these IDs coincide with minterm IDs (i.e. indices to <see cref="_minterms"/>). Positions outside the bounds
/// of the input are mapped to -1. Optionally, an end-of-line as the very last character in the input may be
/// mapped to _minterms.Length for supporting the \Z anchor.
/// </summary>
private interface IInputReader
{
/// <summary>Maps the position <paramref name="pos"/> in <paramref name="input"/> to a position ID.</summary>
/// <param name="matcher">The matcher passed to the reader; the implementations in this file use its minterm data to classify the character.</param>
/// <param name="input">The input being matched.</param>
/// <param name="pos">The position to map; it may lie outside the bounds of <paramref name="input"/>.</param>
/// <returns>
/// -1 when <paramref name="pos"/> is outside the bounds of <paramref name="input"/>; otherwise the minterm ID of the
/// character at that position, or (for readers that support the \Z anchor) _minterms.Length for an end-of-line that is the very last character.
/// </returns>
public static abstract int GetPositionId(SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, int pos);
}
return false;
}
/// <summary>This reader omits the special handling of \n for the \Z anchor.</summary>
private readonly struct NoZAnchorInputReader : IInputReader
{
    /// <summary>
    /// Maps <paramref name="pos"/> to the minterm ID of the character at that position, or to -1 when the
    /// position is not inside <paramref name="input"/>. A trailing \n gets no special treatment here.
    /// </summary>
    public static int GetPositionId(SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, int pos)
    {
        // A single unsigned comparison rejects both negative positions and positions at or past the end.
        if ((uint)pos >= (uint)input.Length)
        {
            return -1;
        }

        return matcher._mintermClassifier.GetMintermID(input[pos]);
    }
}
/// <summary>Check if any underlying core state can be nullable.</summary>
public static bool CanBeNullable(ref CurrentState state)
/// <summary>This reader includes full handling of an \n as the last character of input for the \Z anchor.</summary>
private readonly struct FullInputReader : IInputReader
{
public static int GetPositionId(SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, int pos)
{
SymbolicRegexBuilder<TSet> builder = state.NfaState!.Builder;
foreach (ref KeyValuePair<int, int> nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values))
{
if (builder.GetCoreState(nfaState.Key).Node.CanBeNullable)
{
return true;
}
}
if ((uint)pos >= (uint)input.Length)
return -1;
return false;
int c = input[pos];
// Find the minterm, handling the special case of an \n that is the very last character of the input
return c == '\n' && pos == input.Length - 1 ?
matcher._minterms.Length : // mintermId = minterms.Length represents an \n at the very end of input
matcher._mintermClassifier.GetMintermID(c);
}
#endif
}
/// <summary>
......@@ -1240,7 +1238,8 @@ public static bool CanBeNullable(ref CurrentState state)
/// </summary>
private interface IInitialStateHandler
{
public static abstract bool TryFindNextStartingPosition(SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, ref CurrentState state, ref int pos);
public static abstract bool TryFindNextStartingPosition<TInputReader>(SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, ref CurrentState state, ref int pos)
where TInputReader : struct, IInputReader;
}
/// <summary>
......@@ -1249,7 +1248,8 @@ private interface IInitialStateHandler
private readonly struct NoOptimizationsInitialStateHandler : IInitialStateHandler
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool TryFindNextStartingPosition(SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, ref CurrentState state, ref int pos)
public static bool TryFindNextStartingPosition<TInputReader>(SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, ref CurrentState state, ref int pos)
where TInputReader : struct, IInputReader
{
// return true to indicate that the current position is a possible starting position
return true;
......@@ -1262,7 +1262,8 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatcher<TSet> matche
private readonly struct InitialStateFindOptimizationsHandler : IInitialStateHandler
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool TryFindNextStartingPosition(SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, ref CurrentState state, ref int pos)
public static bool TryFindNextStartingPosition<TInputReader>(SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, ref CurrentState state, ref int pos)
where TInputReader : struct, IInputReader
{
// Find the first position that matches with some likely character.
if (!matcher._findOpts!.TryFindNextStartingPosition(input, ref pos, 0))
......@@ -1273,7 +1274,7 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatcher<TSet> matche
// Update the starting state based on where TryFindNextStartingPosition moved us to.
// As with the initial starting state, if it's a dead end, no match exists.
state = new CurrentState(matcher._dotstarredInitialStates[matcher.GetCharKind(input, pos - 1)]);
state = new CurrentState(matcher._dotstarredInitialStates[matcher.GetCharKind<TInputReader>(input, pos - 1)]);
return true;
}
}
......@@ -1283,7 +1284,7 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatcher<TSet> matche
/// </summary>
private interface INullabilityHandler
{
public static abstract bool IsNullableAt<TStateHandler>(SymbolicRegexMatcher<TSet> matcher, ref CurrentState state, ReadOnlySpan<char> input, int pos, bool isNullable, bool canBeNullable)
public static abstract bool IsNullableAt<TStateHandler>(SymbolicRegexMatcher<TSet> matcher, in CurrentState state, int positionId, bool isNullable, bool canBeNullable)
where TStateHandler : struct, IStateHandler;
}
......@@ -1293,7 +1294,7 @@ private interface INullabilityHandler
private readonly struct NoAnchorsNullabilityHandler : INullabilityHandler
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsNullableAt<TStateHandler>(SymbolicRegexMatcher<TSet> matcher, ref CurrentState state, ReadOnlySpan<char> input, int pos, bool isNullable, bool canBeNullable)
public static bool IsNullableAt<TStateHandler>(SymbolicRegexMatcher<TSet> matcher, in CurrentState state, int positionId, bool isNullable, bool canBeNullable)
where TStateHandler : struct, IStateHandler
{
Debug.Assert(!matcher._pattern._info.ContainsSomeAnchor);
......@@ -1307,10 +1308,10 @@ public static bool IsNullableAt<TStateHandler>(SymbolicRegexMatcher<TSet> matche
private readonly struct FullNullabilityHandler : INullabilityHandler
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsNullableAt<TStateHandler>(SymbolicRegexMatcher<TSet> matcher, ref CurrentState state, ReadOnlySpan<char> input, int pos, bool isNullable, bool canBeNullable)
public static bool IsNullableAt<TStateHandler>(SymbolicRegexMatcher<TSet> matcher, in CurrentState state, int positionId, bool isNullable, bool canBeNullable)
where TStateHandler : struct, IStateHandler
{
return isNullable || (canBeNullable && TStateHandler.IsNullableFor(matcher._builder, ref state, matcher.GetCharKind(input, pos)));
return isNullable || (canBeNullable && TStateHandler.IsNullableFor(matcher, in state, matcher.GetPositionKind(positionId)));
}
}
}
......
......@@ -33,24 +33,25 @@ internal sealed class SymbolicRegexNode<TSet> where TSet : IComparable<TSet>, IE
/// </remarks>
internal const int SubsumptionCheckDepthLimit = 50;
internal readonly SymbolicRegexBuilder<TSet> _builder;
internal readonly SymbolicRegexNodeKind _kind;
internal readonly int _lower;
internal readonly int _upper;
internal readonly TSet? _set;
internal readonly SymbolicRegexNode<TSet>? _left;
internal readonly SymbolicRegexNode<TSet>? _right;
internal readonly SymbolicRegexInfo _info;
/// <summary>
/// Caches nullability of this node for any given context (0 &lt;= context &lt; ContextLimit)
/// when _info.StartsWithSomeAnchor and _info.CanBeNullable are true. Otherwise the cache is null.
/// </summary>
private byte[]? _nullabilityCache;
private readonly byte[]? _nullabilityCache;
private TSet _startSet;
#if DEBUG
internal SymbolicRegexBuilder<TSet>? _debugBuilder;
#endif
/// <summary>AST node of a symbolic regex</summary>
/// <param name="builder">the builder</param>
/// <param name="kind">what kind of node</param>
/// <param name="left">left child</param>
/// <param name="right">right child</param>
......@@ -58,9 +59,8 @@ internal sealed class SymbolicRegexNode<TSet> where TSet : IComparable<TSet>, IE
/// <param name="upper">upper bound of a loop</param>
/// <param name="set">singleton set</param>
/// <param name="info">misc flags including laziness</param>
private SymbolicRegexNode(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNodeKind kind, SymbolicRegexNode<TSet>? left, SymbolicRegexNode<TSet>? right, int lower, int upper, TSet? set, SymbolicRegexInfo info)
private SymbolicRegexNode(SymbolicRegexNodeKind kind, SymbolicRegexNode<TSet>? left, SymbolicRegexNode<TSet>? right, int lower, int upper, TSet? set, SymbolicRegexInfo info)
{
_builder = builder;
_kind = kind;
_left = left;
_right = right;
......@@ -68,7 +68,6 @@ private SymbolicRegexNode(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNodeK
_upper = upper;
_set = set;
_info = info;
_startSet = ComputeStartSet();
_nullabilityCache = info.StartsWithSomeAnchor && info.CanBeNullable ? new byte[CharKind.ContextLimit] : null;
}
......@@ -78,7 +77,10 @@ private static SymbolicRegexNode<TSet> Create(SymbolicRegexBuilder<TSet> builder
var key = (kind, left, right, lower, upper, set, info);
if (!builder._nodeCache.TryGetValue(key, out SymbolicRegexNode<TSet>? node))
{
node = new SymbolicRegexNode<TSet>(builder, kind, left, right, lower, upper, set, info);
node = new SymbolicRegexNode<TSet>(kind, left, right, lower, upper, set, info);
#if DEBUG
node._debugBuilder = builder;
#endif
builder._nodeCache[key] = node;
}
return node;
......@@ -172,9 +174,6 @@ internal bool CanBeNullable
}
}
internal SymbolicRegexInfo _info;
/// <summary>
/// Converts a list of a given kind, e.g. Concat or Alternate, into an array,
/// returns anything else in a singleton array.
......@@ -331,71 +330,31 @@ bool WithCache(uint context)
}
/// <summary>Returns true if this is equivalent to .* (the node must be eager also)</summary>
/// <param name="solver">solver whose <c>Full</c> set the loop body's set is compared against</param>
/// <returns>true only when <c>IsStar</c> holds, the body is a Singleton whose set equals the full set, and the loop is not lazy</returns>
public bool IsAnyStar
public bool IsAnyStar(ISolver<TSet> solver)
{
get
if (IsStar)
{
if (IsStar)
{
Debug.Assert(_left is not null);
if (_left._kind == SymbolicRegexNodeKind.Singleton)
{
Debug.Assert(_left._set is not null);
return !IsLazy && _builder._solver.Full.Equals(_left._set);
}
}
return false;
}
}
/// <summary>Returns true if this is equivalent to .+ (the node must be eager also)</summary>
public bool IsAnyPlus
{
get
{
if (IsPlus)
Debug.Assert(_left is not null);
if (_left._kind == SymbolicRegexNodeKind.Singleton)
{
Debug.Assert(_left is not null);
if (_left._kind == SymbolicRegexNodeKind.Singleton)
{
Debug.Assert(_left._set is not null);
return !IsLazy && _builder._solver.Full.Equals(_left._set);
}
Debug.Assert(_left._set is not null);
// Lazy loops are excluded: only an eager loop over the solver's full set qualifies
return !IsLazy && solver.Full.Equals(_left._set);
}
return false;
}
}
/// <summary>Returns true if this is equivalent to [\0-\xFFFF] </summary>
public bool IsAnyChar
{
get
{
if (_kind == SymbolicRegexNodeKind.Singleton)
{
Debug.Assert(_set is not null);
return _builder._solver.IsFull(_set);
}
return false;
}
// Any node that is not a star loop over a Singleton body falls through to here
return false;
}
/// <summary>Returns true if this is equivalent to [0-[0]], i.e. this is a Singleton node whose set is empty</summary>
/// <param name="solver">solver used to test the node's set for emptiness</param>
/// <returns>true only for a Singleton node whose set the solver reports as empty; false for every other node kind</returns>
public bool IsNothing
public bool IsNothing(ISolver<TSet> solver)
{
get
if (_kind == SymbolicRegexNodeKind.Singleton)
{
if (_kind == SymbolicRegexNodeKind.Singleton)
{
Debug.Assert(_set is not null);
return _builder._solver.IsEmpty(_set);
}
return false;
Debug.Assert(_set is not null);
return solver.IsEmpty(_set);
}
// Non-Singleton nodes are never reported as nothing by this check
return false;
}
/// <summary>Returns true iff this is a loop whose lower bound is 0 and upper bound is max</summary>
......@@ -415,39 +374,33 @@ public bool IsNothing
#region called only once, in the constructor of SymbolicRegexBuilder
/// <summary>Creates the Singleton node over the solver's empty set.</summary>
internal static SymbolicRegexNode<TSet> CreateFalse(SymbolicRegexBuilder<TSet> builder) =>
Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Empty, SymbolicRegexInfo.Create());
Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Empty, default);
/// <summary>Creates the Singleton node over the solver's full set.</summary>
internal static SymbolicRegexNode<TSet> CreateTrue(SymbolicRegexBuilder<TSet> builder) =>
Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Full, SymbolicRegexInfo.Create());
Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Full, default);
/// <summary>Creates a FixedLengthMarker node carrying the given length, using the same info as an epsilon node.</summary>
internal static SymbolicRegexNode<TSet> CreateFixedLengthMarker(SymbolicRegexBuilder<TSet> builder, int length) =>
Create(builder, SymbolicRegexNodeKind.FixedLengthMarker, null, null, length, -1, default, SymbolicRegexInfo.Create(isAlwaysNullable: true, isHighPriorityNullable: true));
Create(builder, SymbolicRegexNodeKind.FixedLengthMarker, null, null, length, -1, default, SymbolicRegexInfo.Epsilon());
/// <summary>Creates the Epsilon node, which has no children, bounds or set.</summary>
internal static SymbolicRegexNode<TSet> CreateEpsilon(SymbolicRegexBuilder<TSet> builder) =>
Create(builder, SymbolicRegexNodeKind.Epsilon, null, null, -1, -1, default, SymbolicRegexInfo.Create(isAlwaysNullable: true, isHighPriorityNullable: true));
Create(builder, SymbolicRegexNodeKind.Epsilon, null, null, -1, -1, default, SymbolicRegexInfo.Epsilon());
/// <summary>Creates an anchor node of the given kind; the kind must be one of the anchor kinds asserted below.</summary>
internal static SymbolicRegexNode<TSet> CreateBeginEndAnchor(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNodeKind kind)
internal static SymbolicRegexNode<TSet> CreateAnchor(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNodeKind kind)
{
Debug.Assert(kind is
SymbolicRegexNodeKind.BoundaryAnchor or SymbolicRegexNodeKind.NonBoundaryAnchor or
SymbolicRegexNodeKind.BeginningAnchor or SymbolicRegexNodeKind.EndAnchor or
SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or
SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor);
// The line-anchor flag is set only for EndAnchorZ, EndAnchorZReverse, EOLAnchor and BOLAnchor
return Create(builder, kind, null, null, -1, -1, default, SymbolicRegexInfo.Create(startsWithSomeAnchor: true, canBeNullable: true,
startsWithLineAnchor: kind is
return Create(builder, kind, null, null, -1, -1, default, SymbolicRegexInfo.Anchor(isLineAnchor: kind is
SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or
SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor));
}
/// <summary>Creates a BoundaryAnchor or NonBoundaryAnchor node; these two kinds are also accepted by CreateAnchor's assertion.</summary>
internal static SymbolicRegexNode<TSet> CreateBoundaryAnchor(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNodeKind kind)
{
Debug.Assert(kind is SymbolicRegexNodeKind.BoundaryAnchor or SymbolicRegexNodeKind.NonBoundaryAnchor);
return Create(builder, kind, null, null, -1, -1, default, SymbolicRegexInfo.Create(startsWithSomeAnchor: true, canBeNullable: true));
}
#endregion
/// <summary>Creates (or fetches via <c>Create</c>) a Singleton node for the given set, with no children and no loop bounds.</summary>
/// <param name="builder">the builder passed through to <c>Create</c></param>
/// <param name="set">the set the Singleton node stands for</param>
internal static SymbolicRegexNode<TSet> CreateSingleton(SymbolicRegexBuilder<TSet> builder, TSet set) =>
Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, set, SymbolicRegexInfo.Create());
Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, set, default);
internal static SymbolicRegexNode<TSet> CreateLoop(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNode<TSet> body, int lower, int upper, bool isLazy)
{
......@@ -480,10 +433,10 @@ internal static SymbolicRegexNode<TSet> CreateEffect(SymbolicRegexBuilder<TSet>
}
/// <summary>Creates a CaptureStart node for the given capture number, using the same info as an epsilon node.</summary>
/// <param name="builder">the builder passed through to <c>Create</c></param>
/// <param name="captureNum">the capture number carried by the node</param>
internal static SymbolicRegexNode<TSet> CreateCaptureStart(SymbolicRegexBuilder<TSet> builder, int captureNum) =>
Create(builder, SymbolicRegexNodeKind.CaptureStart, null, null, captureNum, -1, default, SymbolicRegexInfo.Create(isAlwaysNullable: true, isHighPriorityNullable: true));
Create(builder, SymbolicRegexNodeKind.CaptureStart, null, null, captureNum, -1, default, SymbolicRegexInfo.Epsilon());
/// <summary>Creates a CaptureEnd node for the given capture number, using the same info as an epsilon node.</summary>
/// <param name="builder">the builder passed through to <c>Create</c></param>
/// <param name="captureNum">the capture number carried by the node</param>
internal static SymbolicRegexNode<TSet> CreateCaptureEnd(SymbolicRegexBuilder<TSet> builder, int captureNum) =>
Create(builder, SymbolicRegexNodeKind.CaptureEnd, null, null, captureNum, -1, default, SymbolicRegexInfo.Create(isAlwaysNullable: true, isHighPriorityNullable: true));
Create(builder, SymbolicRegexNodeKind.CaptureEnd, null, null, captureNum, -1, default, SymbolicRegexInfo.Epsilon());
/// <summary>Wraps <paramref name="child"/> in a DisableBacktrackingSimulation node that reuses the child's info unchanged.</summary>
/// <param name="builder">the builder passed through to <c>Create</c></param>
/// <param name="child">the node to wrap; it becomes the left child of the new node</param>
internal static SymbolicRegexNode<TSet> CreateDisableBacktrackingSimulation(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNode<TSet> child) =>
Create(builder, SymbolicRegexNodeKind.DisableBacktrackingSimulation, child, null, -1, -1, default, child._info);
......@@ -530,7 +483,7 @@ internal static SymbolicRegexNode<TSet> CreateConcat(SymbolicRegexBuilder<TSet>
/// <returns></returns>
internal static SymbolicRegexNode<TSet> CreateAlternate(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNode<TSet> left, SymbolicRegexNode<TSet> right, bool deduplicated = false, bool hintRightLikelySubsumes = false)
{
if (left.IsAnyStar || right == builder._nothing || left == right || (left.IsNullable && right.IsEpsilon))
if (left.IsAnyStar(builder._solver) || right.IsNothing(builder._solver) || left == right || (left.IsNullable && right.IsEpsilon))
return left;
if (left == builder._nothing)
return right;
......@@ -541,16 +494,16 @@ internal static SymbolicRegexNode<TSet> CreateAlternate(SymbolicRegexBuilder<TSe
SymbolicRegexNode<TSet> tail = right._kind == SymbolicRegexNodeKind.Alternate ? right._right! : builder._nothing;
// Simplify away right side if left side subsumes it. For example X?Y|Y|Z would simplify to just X?Y|Z.
if (!hintRightLikelySubsumes && left.Subsumes(head))
if (!hintRightLikelySubsumes && left.Subsumes(builder, head))
return CreateAlternate(builder, left, tail);
// Simplify by folding right side into left side if right side subsumes the left side. For example Y|X?Y|Z
// would simplify to X??Y|Z.
if (head.Subsumes(left) && TryFoldAlternation(left, head, out SymbolicRegexNode<TSet>? result))
if (head.Subsumes(builder, left) && TryFoldAlternation(builder, left, head, out SymbolicRegexNode<TSet>? result))
return CreateAlternate(builder, result, tail);
// This is a repeat of a rule above, but for the case when the hint tells us to try reverse subsumption first.
if (hintRightLikelySubsumes && left.Subsumes(head))
if (hintRightLikelySubsumes && left.Subsumes(builder, head))
return CreateAlternate(builder, left, tail);
// If left is not an Alternate, try to avoid allocation by checking if deduplication is necessary
......@@ -647,53 +600,54 @@ internal static SymbolicRegexNode<TSet> CreateAlternate(SymbolicRegexBuilder<TSe
/// recursive subsumption check. The rationale is that if the answer could be produced locally then recomputing
/// it is better than caching.
/// </remarks>
/// <param name="builder">the builder that owns this node</param>
/// <param name="other">the node to check for being subsumed</param>
/// <param name="depth">the current recursion depth</param>
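/// <returns>true if this node is found to subsume <paramref name="other"/>; false otherwise, including when the depth limit is reached or no subsumption rule applies</returns>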
internal bool Subsumes(SymbolicRegexNode<TSet> other, int depth = 0)
internal bool Subsumes(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNode<TSet> other, int depth = 0)
{
// A node subsumes itself
if (this == other)
return true;
// Nothing has an empty language, which is subsumed by anything
if (other == _builder._nothing)
if (other.IsNothing(builder._solver))
return true;
// Early exit if we've gone too deep
if (depth >= SubsumptionCheckDepthLimit)
return false;
if (_builder._subsumptionCache.TryGetValue((this, other), out bool cached))
if (builder._subsumptionCache.TryGetValue((this, other), out bool cached))
{
return cached;
}
if (!StackHelper.TryEnsureSufficientExecutionStack())
{
return StackHelper.CallOnEmptyStack(Subsumes, other, depth);
return StackHelper.CallOnEmptyStack(Subsumes, builder, other, depth);
}
// Try to apply all subsumption rules
bool? subsumes = ApplySubsumptionRules(this, other, depth + 1);
bool? subsumes = ApplySubsumptionRules(builder, this, other, depth + 1);
// Cache and return the result if any rule applied
if (subsumes.HasValue)
{
return (_builder._subsumptionCache[(this, other)] = subsumes.Value);
return (builder._subsumptionCache[(this, other)] = subsumes.Value);
}
// Assume false if no rule applied
return false;
static bool? ApplySubsumptionRules(SymbolicRegexNode<TSet> left, SymbolicRegexNode<TSet> right, int depth)
static bool? ApplySubsumptionRules(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNode<TSet> left, SymbolicRegexNode<TSet> right, int depth)
{
// Rule: Effect(X,E) subsumes Y iff X subsumes Y
// Effectively this ignores any effects
if (left._kind == SymbolicRegexNodeKind.Effect)
{
Debug.Assert(left._left is not null && left._right is not null);
return left._left.Subsumes(right, depth);
return left._left.Subsumes(builder, right, depth);
}
// Rule: X subsumes Effect(Y,E) iff X subsumes Y
......@@ -701,7 +655,7 @@ internal bool Subsumes(SymbolicRegexNode<TSet> other, int depth = 0)
if (right._kind == SymbolicRegexNodeKind.Effect)
{
Debug.Assert(right._left is not null && right._right is not null);
return left.Subsumes(right._left, depth);
return left.Subsumes(builder, right._left, depth);
}
// Rule: XY subsumes (X')??Y' if X equals X' and Y subsumes Y'
......@@ -714,7 +668,7 @@ internal bool Subsumes(SymbolicRegexNode<TSet> other, int depth = 0)
{
Debug.Assert(rl._left is not null);
if (TrySkipPrefix(left, rl._left, out SymbolicRegexNode<TSet>? tail))
return tail.Subsumes(right._right, depth);
return tail.Subsumes(builder, right._right, depth);
}
}
......@@ -728,7 +682,7 @@ internal bool Subsumes(SymbolicRegexNode<TSet> other, int depth = 0)
{
Debug.Assert(ll._left is not null);
if (TrySkipPrefix(right, ll._left, out SymbolicRegexNode<TSet>? tail))
return left._right.Subsumes(tail, depth);
return left._right.Subsumes(builder, tail, depth);
}
}
......@@ -738,7 +692,7 @@ internal bool Subsumes(SymbolicRegexNode<TSet> other, int depth = 0)
Debug.Assert(left._left is not null && left._right is not null);
if (left._left.IsNullable)
{
return left._right.Subsumes(right, depth);
return left._right.Subsumes(builder, right, depth);
}
}
......@@ -804,18 +758,19 @@ private SymbolicRegexNode<TSet> UnwrapEffects()
/// eliminate the alternation by simplifying to (xyz){0,3}?abc. Note that the transformation preserves the priority
/// of the shorter "abc" match by making the prefix lazy.
/// </summary>
/// <param name="builder">the builder that owns this node</param>
/// <param name="left">the lower priority alternative</param>
/// <param name="right">the higher priority alternative</param>
/// <param name="result">the folded regex that eliminates alternation, or null if the operation fails</param>
/// <param name="rightEffects">accumulated effects from the right side</param>
/// <returns>whether folding was successful</returns>
private static bool TryFoldAlternation(SymbolicRegexNode<TSet> left, SymbolicRegexNode<TSet> right, [NotNullWhen(true)] out SymbolicRegexNode<TSet>? result,
private static bool TryFoldAlternation(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNode<TSet> left, SymbolicRegexNode<TSet> right, [NotNullWhen(true)] out SymbolicRegexNode<TSet>? result,
SymbolicRegexNode<TSet>? rightEffects = null)
{
// The rules below assume that the right side subsumes the left side
Debug.Assert(right.Subsumes(left));
Debug.Assert(right.Subsumes(builder, left));
rightEffects ??= left._builder.Epsilon;
rightEffects ??= builder.Epsilon;
// If the sides are equal (ignoring effects) then just return the higher priority left side
if (left.UnwrapEffects() == right.UnwrapEffects())
......@@ -830,20 +785,20 @@ private SymbolicRegexNode<TSet> UnwrapEffects()
if (left._kind == SymbolicRegexNodeKind.Effect)
{
Debug.Assert(left._left is not null && left._right is not null);
Debug.Assert(right.Subsumes(left._left));
Debug.Assert(right.Subsumes(builder, left._left));
// If there are any accumulated effects we don't know how to handle them here.
// This shouldn't normally happen because this rule has priority over the rule
// for effects on the right side.
if (rightEffects != left._builder.Epsilon)
if (rightEffects != builder.Epsilon)
{
result = null;
return false;
}
if (TryFoldAlternation(left._left, right, out SymbolicRegexNode<TSet>? innerResult, rightEffects))
if (TryFoldAlternation(builder, left._left, right, out SymbolicRegexNode<TSet>? innerResult, rightEffects))
{
result = CreateEffect(left._builder, innerResult, left._right);
result = CreateEffect(builder, innerResult, left._right);
return true;
}
}
......@@ -853,19 +808,19 @@ private SymbolicRegexNode<TSet> UnwrapEffects()
if (right._kind == SymbolicRegexNodeKind.Effect)
{
Debug.Assert(right._left is not null && right._right is not null);
Debug.Assert(right._left.Subsumes(left));
rightEffects = CreateConcat(left._builder, right._right, rightEffects);
return TryFoldAlternation(left, right._left, out result, rightEffects);
Debug.Assert(right._left.Subsumes(builder, left));
rightEffects = CreateConcat(builder, right._right, rightEffects);
return TryFoldAlternation(builder, left, right._left, out result, rightEffects);
}
// If we have Y | XY then this rule will find X and fold to X??Y.
if (right._kind == SymbolicRegexNodeKind.Concat)
{
Debug.Assert(right._left is not null && right._right is not null);
if (right._left.IsNullable && TrySplitConcatSubsumption(left, right, out SymbolicRegexNode<TSet>? prefix))
if (right._left.IsNullable && TrySplitConcatSubsumption(builder, left, right, out SymbolicRegexNode<TSet>? prefix))
{
prefix = CreateEffect(left._builder, prefix, rightEffects);
result = left._builder.CreateConcat(CreateLoop(left._builder, prefix, 0, 1, true), left);
prefix = CreateEffect(builder, prefix, rightEffects);
result = builder.CreateConcat(CreateLoop(builder, prefix, 0, 1, true), left);
return true;
}
}
......@@ -875,7 +830,7 @@ private SymbolicRegexNode<TSet> UnwrapEffects()
return false;
// This rule tries to find a prefix P that the right side has such that right is PR and left is equivalent to R
static bool TrySplitConcatSubsumption(SymbolicRegexNode<TSet> left, SymbolicRegexNode<TSet> right,
static bool TrySplitConcatSubsumption(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNode<TSet> left, SymbolicRegexNode<TSet> right,
[NotNullWhen(true)] out SymbolicRegexNode<TSet>? prefix)
{
List<SymbolicRegexNode<TSet>> prefixElements = new();
......@@ -884,25 +839,25 @@ private SymbolicRegexNode<TSet> UnwrapEffects()
{
Debug.Assert(suffix._left is not null && suffix._right is not null);
// We maintain a loop invariant that the suffix subsumes the left hand side
Debug.Assert(suffix.Subsumes(left));
Debug.Assert(suffix.Subsumes(builder, left));
if (suffix == left)
{
// We found a split, so store the prefix and return success
prefixElements.Reverse();
prefix = left._builder.CreateConcatAlreadyReversed(prefixElements);
prefix = builder.CreateConcatAlreadyReversed(prefixElements);
return true;
}
else if (suffix._right.Subsumes(left))
else if (suffix._right.Subsumes(builder, left))
{
// The tail of the suffix still subsumes left, so we can extend the prefix
prefixElements.Add(suffix._left);
suffix = suffix._right;
}
else if (left.Subsumes(suffix))
else if (left.Subsumes(builder, suffix))
{
// If left subsumes the suffix, then due to the loop invariant we have equivalence
prefixElements.Reverse();
prefix = left._builder.CreateConcatAlreadyReversed(prefixElements);
prefix = builder.CreateConcatAlreadyReversed(prefixElements);
return true;
}
else
......@@ -1015,9 +970,10 @@ public int GetFixedLength()
/// This function will rebuild concatenations because it pushes the FixedLengthMarker into the rightmost element.
/// Because of this, the function should not be called on every character.
/// </remarks>
/// <param name="builder">the builder that owns this node</param>
/// <param name="lengthSoFar">accumulator used in the recursion for lengths of paths</param>
/// <returns>the node with fixed length markers added</returns>
public SymbolicRegexNode<TSet> AddFixedLengthMarkers(int lengthSoFar = 0)
public SymbolicRegexNode<TSet> AddFixedLengthMarkers(SymbolicRegexBuilder<TSet> builder, int lengthSoFar = 0)
{
if (!StackHelper.TryEnsureSufficientExecutionStack())
{
......@@ -1029,9 +985,9 @@ public SymbolicRegexNode<TSet> AddFixedLengthMarkers(int lengthSoFar = 0)
case SymbolicRegexNodeKind.Alternate:
Debug.Assert(_left is not null && _right is not null);
// For an Alternate attempt to add markers separately for each element
return CreateAlternate(_builder,
_left.AddFixedLengthMarkers(lengthSoFar),
_right.AddFixedLengthMarkers(lengthSoFar), deduplicated: true);
return CreateAlternate(builder,
_left.AddFixedLengthMarkers(builder, lengthSoFar),
_right.AddFixedLengthMarkers(builder, lengthSoFar), deduplicated: true);
case SymbolicRegexNodeKind.Concat:
Debug.Assert(_left is not null && _right is not null);
......@@ -1039,13 +995,13 @@ public SymbolicRegexNode<TSet> AddFixedLengthMarkers(int lengthSoFar = 0)
int leftLength = _left.GetFixedLength();
if (leftLength >= 0)
{
return CreateConcat(_builder, _left, _right.AddFixedLengthMarkers(lengthSoFar + leftLength));
return CreateConcat(builder, _left, _right.AddFixedLengthMarkers(builder, lengthSoFar + leftLength));
}
// If the right side is always zero length, then just recurse to the left side
int rightLength = _right.GetFixedLength();
if (rightLength == 0)
{
return CreateConcat(_builder, _left.AddFixedLengthMarkers(lengthSoFar), _right);
return CreateConcat(builder, _left.AddFixedLengthMarkers(builder, lengthSoFar), _right);
}
break;
......@@ -1058,80 +1014,82 @@ public SymbolicRegexNode<TSet> AddFixedLengthMarkers(int lengthSoFar = 0)
// if there is one.
int thisLength = GetFixedLength();
return thisLength < 0 ? this :
CreateConcat(_builder, this, CreateFixedLengthMarker(_builder, lengthSoFar + thisLength));
CreateConcat(builder, this, CreateFixedLengthMarker(builder, lengthSoFar + thisLength));
}
/// <summary>
/// Create a derivative (<see cref="CreateDerivative(TSet, uint)"/> and <see cref="CreateDerivativeWrapper"/>) and then strip
/// Create a derivative (<see cref="CreateDerivative(SymbolicRegexBuilder{TSet}, TSet, uint)"/> and <see cref="CreateDerivativeWrapper"/>) and then strip
/// effects with <see cref="StripEffects"/>.
/// This derivative simulates backtracking, i.e. it only considers paths that backtracking would
/// take before accepting the empty string for this pattern and returns the pattern ordered in the order backtracking
/// would explore paths. For example the derivative of a*ab places a*ab before b, while for a*?ab the order is reversed.
/// </summary>
/// <param name="builder">the builder that owns this node</param>
/// <param name="elem">given element wrt which the derivative is taken</param>
/// <param name="context">immediately surrounding character context that affects nullability of anchors</param>
/// <returns>the derivative</returns>
internal SymbolicRegexNode<TSet> CreateDerivativeWithoutEffects(TSet elem, uint context) => CreateDerivativeWrapper(elem, context).StripEffects();
internal SymbolicRegexNode<TSet> CreateDerivativeWithoutEffects(SymbolicRegexBuilder<TSet> builder, TSet elem, uint context) => CreateDerivativeWrapper(builder, elem, context).StripEffects(builder);
/// <summary>
/// Create a derivative (<see cref="CreateDerivative(TSet, uint)"/> and <see cref="CreateDerivativeWrapper"/>) and then strip
/// Create a derivative (<see cref="CreateDerivative(SymbolicRegexBuilder{TSet}, TSet, uint)"/> and <see cref="CreateDerivativeWrapper"/>) and then strip
/// and map effects for use in NFA simulation with <see cref="StripAndMapEffects"/>.
/// This derivative simulates backtracking, i.e. it only considers paths that backtracking would
/// take before accepting the empty string for this pattern and returns the pattern ordered in the order backtracking
/// would explore paths. For example the derivative of a*ab places a*ab before b, while for a*?ab the order is reversed.
/// </summary>
/// <remarks>
/// The differences of this to <see cref="CreateDerivativeWithoutEffects(TSet,uint)"/> are that (1) effects (e.g. capture starts and ends)
/// The differences of this to <see cref="CreateDerivativeWithoutEffects(SymbolicRegexBuilder{TSet}, TSet, uint)"/> are that (1) effects (e.g. capture starts and ends)
/// are considered and (2) the different elements that would form a top level union are instead returned as separate
/// nodes (paired with their associated effects). This function is meant to be used for NFA simulation, where top level
/// unions would be broken up into separate states.
/// </remarks>
/// <param name="builder">the builder that owns this node</param>
/// <param name="elem">given element wrt which the derivative is taken</param>
/// <param name="context">immediately surrounding character context that affects nullability of anchors</param>
/// <returns>the derivative, as a list of nodes each paired with its array of effects</returns>
internal List<(SymbolicRegexNode<TSet>, DerivativeEffect[])> CreateNfaDerivativeWithEffects(TSet elem, uint context)
internal List<(SymbolicRegexNode<TSet>, DerivativeEffect[])> CreateNfaDerivativeWithEffects(SymbolicRegexBuilder<TSet> builder, TSet elem, uint context)
{
// A fresh list is allocated per call and handed to StripAndMapEffects, which presumably fills it (its body is not visible here)
List<(SymbolicRegexNode<TSet>, DerivativeEffect[])> transitions = new();
CreateDerivativeWrapper(elem, context).StripAndMapEffects(context, transitions);
CreateDerivativeWrapper(builder, elem, context).StripAndMapEffects(builder, context, transitions);
return transitions;
}
// This wrapper handles the shared top-level concerns of constructing derivatives. Namely:
// -Unwrapping and rewrapping nodes in DisableBacktrackingSimulation
// -When backtracking is being simulated calling into PruneLowerPriorityThanNullability
// The builder is supplied by the caller and is used below for pruning, derivative construction
// and re-creating the DisableBacktrackingSimulation wrapper.
private SymbolicRegexNode<TSet> CreateDerivativeWrapper(TSet elem, uint context)
private SymbolicRegexNode<TSet> CreateDerivativeWrapper(SymbolicRegexBuilder<TSet> builder, TSet elem, uint context)
{
if (this._kind == SymbolicRegexNodeKind.DisableBacktrackingSimulation)
{
// This node kind can only occur at the top level and indicates that backtracking simulation is turned off
Debug.Assert(_left is not null);
// The derivative is taken of the wrapped child directly, with no nullability pruning on this path
SymbolicRegexNode<TSet> derivative = _left.CreateDerivative(elem, context);
SymbolicRegexNode<TSet> derivative = _left.CreateDerivative(builder, elem, context);
// Reinsert the marker that maintains the non-backtracking semantics
return _builder.CreateDisableBacktrackingSimulation(derivative);
return builder.CreateDisableBacktrackingSimulation(derivative);
}
else
{
// If this node is nullable for the given context then prune any branches that are less preferred than
// just the empty match. This is done in order to maintain backtracking semantics.
SymbolicRegexNode<TSet> node = IsNullableFor(context) ? PruneLowerPriorityThanNullability(context) : this;
return node.CreateDerivative(elem, context);
SymbolicRegexNode<TSet> node = IsNullableFor(context) ? PruneLowerPriorityThanNullability(builder, context) : this;
return node.CreateDerivative(builder, elem, context);
}
}
/// <summary>Prune this node wrt the given context in order to maintain backtracking semantics. Mimics how backtracking chooses a path.</summary>
private SymbolicRegexNode<TSet> PruneLowerPriorityThanNullability(uint context)
private SymbolicRegexNode<TSet> PruneLowerPriorityThanNullability(SymbolicRegexBuilder<TSet> builder, uint context)
{
//caching pruning to avoid otherwise potential quadratic worst case behavior
SymbolicRegexNode<TSet>? prunedNode;
(SymbolicRegexNode<TSet>, uint) key = (this, context);
if (_builder._pruneLowerPriorityThanNullabilityCache.TryGetValue(key, out prunedNode))
if (builder._pruneLowerPriorityThanNullabilityCache.TryGetValue(key, out prunedNode))
{
return prunedNode;
}
if (!StackHelper.TryEnsureSufficientExecutionStack())
{
return StackHelper.CallOnEmptyStack(PruneLowerPriorityThanNullability, context);
return StackHelper.CallOnEmptyStack(PruneLowerPriorityThanNullability, builder, context);
}
switch (_kind)
......@@ -1143,8 +1101,8 @@ private SymbolicRegexNode<TSet> PruneLowerPriorityThanNullability(uint context)
// In an alternation (X|Y) where X is nullable (in the given context), Y must be eliminated.
// Thus, taking the higher-priority branch in backtracking that is known to lead to a match
// at which point the other branches become irrelevant and must no longer be used.
prunedNode = _left.IsNullableFor(context) ? _left.PruneLowerPriorityThanNullability(context) :
CreateAlternate(_builder, _left, _right.PruneLowerPriorityThanNullability(context), deduplicated: true);
prunedNode = _left.IsNullableFor(context) ? _left.PruneLowerPriorityThanNullability(builder, context) :
CreateAlternate(builder, _left, _right.PruneLowerPriorityThanNullability(builder, context), deduplicated: true);
break;
case SymbolicRegexNodeKind.Concat:
......@@ -1159,20 +1117,20 @@ private SymbolicRegexNode<TSet> PruneLowerPriorityThanNullability(uint context)
//e.g. a{0,5}?b{0,5}? reduces to ()
prunedNode = _left._kind == SymbolicRegexNodeKind.Alternate ?
(_left._left!.IsNullableFor(context) ?
CreateConcat(_builder, _left._left, _right).PruneLowerPriorityThanNullability(context) :
CreateAlternate(_builder, CreateConcat(_builder, _left._left, _right), CreateConcat(_builder, _left._right!, _right).PruneLowerPriorityThanNullability(context))) :
CreateConcat(_builder, _left.PruneLowerPriorityThanNullability(context), _right.PruneLowerPriorityThanNullability(context));
CreateConcat(builder, _left._left, _right).PruneLowerPriorityThanNullability(builder, context) :
CreateAlternate(builder, CreateConcat(builder, _left._left, _right), CreateConcat(builder, _left._right!, _right).PruneLowerPriorityThanNullability(builder, context))) :
CreateConcat(builder, _left.PruneLowerPriorityThanNullability(builder, context), _right.PruneLowerPriorityThanNullability(builder, context));
break;
case SymbolicRegexNodeKind.Loop when _info.IsLazyLoop && _lower == 0:
//lazy nullable loop reduces to (), i.e., the loop body is just forgotten
prunedNode = _builder.Epsilon;
prunedNode = builder.Epsilon;
break;
case SymbolicRegexNodeKind.Effect:
//Effects are maintained and the pruning is propagated to the body of the effect
Debug.Assert(_left is not null && _right is not null);
prunedNode = CreateEffect(_builder, _left.PruneLowerPriorityThanNullability(context), _right);
prunedNode = CreateEffect(builder, _left.PruneLowerPriorityThanNullability(builder, context), _right);
break;
default:
......@@ -1181,7 +1139,7 @@ private SymbolicRegexNode<TSet> PruneLowerPriorityThanNullability(uint context)
break;
}
_builder._pruneLowerPriorityThanNullabilityCache[key] = prunedNode;
builder._pruneLowerPriorityThanNullabilityCache[key] = prunedNode;
return prunedNode;
}
......@@ -1205,19 +1163,20 @@ private SymbolicRegexNode<TSet> PruneLowerPriorityThanNullability(uint context)
/// positions for capture starts and ends. For example, given a DerivativeEffect for CaptureStart of capture number 0
/// and an input position 5, applying it to a Registers instance is simply assigning the relevant value to 5.
/// </remarks>
/// <param name="builder">the builder that owns this node</param>
/// <param name="elem">given element wrt which the derivative is taken</param>
/// <param name="context">immediately surrounding character context that affects nullability of anchors</param>
/// <returns>the derivative</returns>
private SymbolicRegexNode<TSet> CreateDerivative(TSet elem, uint context)
private SymbolicRegexNode<TSet> CreateDerivative(SymbolicRegexBuilder<TSet> builder, TSet elem, uint context)
{
if (!StackHelper.TryEnsureSufficientExecutionStack())
{
return StackHelper.CallOnEmptyStack(CreateDerivative, elem, context);
return StackHelper.CallOnEmptyStack(CreateDerivative, builder, elem, context);
}
SymbolicRegexNode<TSet>? derivative;
(SymbolicRegexNode<TSet>, TSet, uint) key = (this, elem, context);
if (_builder._derivativeCache.TryGetValue(key, out derivative))
if (builder._derivativeCache.TryGetValue(key, out derivative))
{
return derivative;
}
......@@ -1230,14 +1189,14 @@ private SymbolicRegexNode<TSet> CreateDerivative(TSet elem, uint context)
// The following check assumes that either (1) the element and set are minterms, in which case
// the element is exactly the set if the intersection is non-empty (satisfiable), or (2) the element is a singleton
// set in which case it is fully contained in the set if the intersection is non-empty.
if (!_builder._solver.IsEmpty(_builder._solver.And(elem, _set)))
if (!builder._solver.IsEmpty(builder._solver.And(elem, _set)))
{
// the singleton is consumed so the derivative is epsilon
derivative = _builder.Epsilon;
derivative = builder.Epsilon;
}
else
{
derivative = _builder._nothing;
derivative = builder._nothing;
}
break;
}
......@@ -1250,12 +1209,12 @@ private SymbolicRegexNode<TSet> CreateDerivative(TSet elem, uint context)
{
// If the left side is not nullable then the character must be consumed there.
// For example, Da(ab) = Da(a)b = b.
derivative = _builder.CreateConcat(_left.CreateDerivative(elem, context), _right);
derivative = builder.CreateConcat(_left.CreateDerivative(builder, elem, context), _right);
}
else
{
SymbolicRegexNode<TSet> leftDerivative = _builder.CreateConcat(_left.CreateDerivative(elem, context), _right);
SymbolicRegexNode<TSet> rightDerivative = _builder.CreateEffect(_right.CreateDerivative(elem, context), _left);
SymbolicRegexNode<TSet> leftDerivative = builder.CreateConcat(_left.CreateDerivative(builder, elem, context), _right);
SymbolicRegexNode<TSet> rightDerivative = builder.CreateEffect(_right.CreateDerivative(builder, elem, context), _left);
// If the left alternative is high-priority-nullable then
// the priority is to skip left and prioritize rderiv over lderivR
// Two examples: suppose elem = a
......@@ -1268,8 +1227,8 @@ private SymbolicRegexNode<TSet> CreateDerivative(TSet elem, uint context)
// In the second case backtracking would try to continue to follow (ab)* after reading b
// This backtracking semantics is effectively being recorded into the order of the alternatives
derivative = _left.IsHighPriorityNullableFor(context) ?
CreateAlternate(_builder, rightDerivative, leftDerivative, hintRightLikelySubsumes: true) :
CreateAlternate(_builder, leftDerivative, rightDerivative);
CreateAlternate(builder, rightDerivative, leftDerivative, hintRightLikelySubsumes: true) :
CreateAlternate(builder, leftDerivative, rightDerivative);
}
break;
}
......@@ -1279,10 +1238,10 @@ private SymbolicRegexNode<TSet> CreateDerivative(TSet elem, uint context)
Debug.Assert(_left is not null);
Debug.Assert(_upper > 0);
SymbolicRegexNode<TSet> bodyDerivative = _left.CreateDerivative(elem, context);
if (bodyDerivative.IsNothing)
SymbolicRegexNode<TSet> bodyDerivative = _left.CreateDerivative(builder, elem, context);
if (bodyDerivative.IsNothing(builder._solver))
{
derivative = _builder._nothing;
derivative = builder._nothing;
}
else
{
......@@ -1294,7 +1253,7 @@ private SymbolicRegexNode<TSet> CreateDerivative(TSet elem, uint context)
int newlower = _lower == 0 || _lower == int.MaxValue ? _lower : _lower - 1;
// the continued loop becomes epsilon when newlower == newupper == 0
// in which case the returned concatenation will be just bodyDerivative
derivative = _builder.CreateConcat(bodyDerivative, _builder.CreateLoop(_left, IsLazy, newlower, newupper));
derivative = builder.CreateConcat(bodyDerivative, builder.CreateLoop(_left, IsLazy, newlower, newupper));
}
break;
}
......@@ -1302,7 +1261,7 @@ private SymbolicRegexNode<TSet> CreateDerivative(TSet elem, uint context)
case SymbolicRegexNodeKind.Alternate:
{
Debug.Assert(_left is not null && _right is not null);
derivative = CreateAlternate(_builder, _left.CreateDerivative(elem, context), _right.CreateDerivative(elem, context));
derivative = CreateAlternate(builder, _left.CreateDerivative(builder, elem, context), _right.CreateDerivative(builder, elem, context));
break;
}
......@@ -1314,11 +1273,11 @@ private SymbolicRegexNode<TSet> CreateDerivative(TSet elem, uint context)
default:
// The derivative of any other case is nothing
// e.g. taking the derivative of () (epsilon) is [] (nothing)
derivative = _builder._nothing;
derivative = builder._nothing;
break;
}
_builder._derivativeCache[key] = derivative;
builder._derivativeCache[key] = derivative;
return derivative;
}
......@@ -1327,11 +1286,11 @@ private SymbolicRegexNode<TSet> CreateDerivative(TSet elem, uint context)
/// So Effect(R,E) would be simplified to just R.
/// </summary>
/// <returns>the node with all Effect nodes stripped away</returns>
internal SymbolicRegexNode<TSet> StripEffects()
internal SymbolicRegexNode<TSet> StripEffects(SymbolicRegexBuilder<TSet> builder)
{
if (!StackHelper.TryEnsureSufficientExecutionStack())
{
return StackHelper.CallOnEmptyStack(StripEffects);
return StackHelper.CallOnEmptyStack(StripEffects, builder);
}
// If the node doesn't contain any Effect nodes under it we are done
......@@ -1344,12 +1303,12 @@ internal SymbolicRegexNode<TSet> StripEffects()
case SymbolicRegexNodeKind.Effect:
Debug.Assert(_left is not null && _right is not null);
// This is the place where the effect (the right child) is getting ignored
return _left.StripEffects();
return _left.StripEffects(builder);
case SymbolicRegexNodeKind.Concat:
Debug.Assert(_left is not null && _right is not null);
Debug.Assert(_left._info.ContainsEffect && !_right._info.ContainsEffect);
return _builder.CreateConcat(_left.StripEffects(), _right);
return builder.CreateConcat(_left.StripEffects(builder), _right);
case SymbolicRegexNodeKind.Alternate:
Debug.Assert(_left is not null && _right is not null);
......@@ -1357,16 +1316,16 @@ internal SymbolicRegexNode<TSet> StripEffects()
// the elements. We don't want to omit deduplication here, since the stripping may make nodes equal.
List<SymbolicRegexNode<TSet>> elems = ToList(listKind: SymbolicRegexNodeKind.Alternate);
for (int i = 0; i < elems.Count; i++)
elems[i] = elems[i].StripEffects();
return _builder.Alternate(elems);
elems[i] = elems[i].StripEffects(builder);
return builder.Alternate(elems);
case SymbolicRegexNodeKind.DisableBacktrackingSimulation:
Debug.Assert(_left is not null);
return _builder.CreateDisableBacktrackingSimulation(_left.StripEffects());
return builder.CreateDisableBacktrackingSimulation(_left.StripEffects(builder));
case SymbolicRegexNodeKind.Loop:
Debug.Assert(_left is not null);
return _builder.CreateLoop(_left.StripEffects(), IsLazy, _lower, _upper);
return builder.CreateLoop(_left.StripEffects(builder), IsLazy, _lower, _upper);
default:
Debug.Fail($"{nameof(StripEffects)}:{_kind}");
......@@ -1386,15 +1345,16 @@ internal SymbolicRegexNode<TSet> StripEffects()
/// Here both include the CaptureStart_0 effect, since both are nested inside the outer Effect node,
/// while only R includes the CaptureStart_1 effect.
/// </summary>
/// <param name="builder">the builder that owns this node</param>
/// <param name="context">immediately surrounding character context that affects nullability of anchors</param>
/// <param name="alternativesAndEffects">the list to insert the pairs of nodes and their effects into in priority order</param>
/// <param name="currentEffects">a helper list this function uses to accumulate effects in recursive calls</param>
internal void StripAndMapEffects(uint context, List<(SymbolicRegexNode<TSet>, DerivativeEffect[])> alternativesAndEffects,
internal void StripAndMapEffects(SymbolicRegexBuilder<TSet> builder, uint context, List<(SymbolicRegexNode<TSet>, DerivativeEffect[])> alternativesAndEffects,
List<DerivativeEffect>? currentEffects = null)
{
if (!StackHelper.TryEnsureSufficientExecutionStack())
{
StackHelper.CallOnEmptyStack(StripAndMapEffects, context, alternativesAndEffects, currentEffects);
StackHelper.CallOnEmptyStack(StripAndMapEffects, builder, context, alternativesAndEffects, currentEffects);
return;
}
......@@ -1418,7 +1378,7 @@ internal SymbolicRegexNode<TSet> StripEffects()
int oldEffectCount = currentEffects.Count;
_right.ApplyEffects((e, s) => s.Add(e), context, currentEffects);
// Recurse into the main child
_left.StripAndMapEffects(context, alternativesAndEffects, currentEffects);
_left.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects);
// Pop all the effects that were pushed above
currentEffects.RemoveRange(oldEffectCount, currentEffects.Count - oldEffectCount);
return;
......@@ -1430,19 +1390,19 @@ internal SymbolicRegexNode<TSet> StripEffects()
// For concat the nodes for the left hand side are added first and then fixed up by concatenating
// the right side to each of them.
int oldAlternativesCount = alternativesAndEffects.Count;
_left.StripAndMapEffects(context, alternativesAndEffects, currentEffects);
_left.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects);
for (int i = oldAlternativesCount; i < alternativesAndEffects.Count; i++)
{
var (node, effects) = alternativesAndEffects[i];
alternativesAndEffects[i] = (_builder.CreateConcat(node, _right), effects);
alternativesAndEffects[i] = (builder.CreateConcat(node, _right), effects);
}
break;
}
case SymbolicRegexNodeKind.Alternate:
Debug.Assert(_left is not null && _right is not null);
_left.StripAndMapEffects(context, alternativesAndEffects, currentEffects);
_right.StripAndMapEffects(context, alternativesAndEffects, currentEffects);
_left.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects);
_right.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects);
break;
case SymbolicRegexNodeKind.Loop when _lower == 0 && _upper == 1:
......@@ -1452,14 +1412,14 @@ internal SymbolicRegexNode<TSet> StripEffects()
Debug.Assert(_left is not null);
// For lazy loops skipping is preferred, so output the epsilon first
if (IsLazy)
alternativesAndEffects.Add((_builder.Epsilon, currentEffects.Count > 0 ?
alternativesAndEffects.Add((builder.Epsilon, currentEffects.Count > 0 ?
currentEffects.ToArray() :
Array.Empty<DerivativeEffect>()));
// Recurse into the body
_left.StripAndMapEffects(context, alternativesAndEffects, currentEffects);
_left.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects);
// For eager loops the body is preferred, so output the epsilon last
if (!IsLazy)
alternativesAndEffects.Add((_builder.Epsilon, currentEffects.Count > 0 ?
alternativesAndEffects.Add((builder.Epsilon, currentEffects.Count > 0 ?
currentEffects.ToArray() :
Array.Empty<DerivativeEffect>()));
break;
......@@ -1468,11 +1428,11 @@ internal SymbolicRegexNode<TSet> StripEffects()
{
Debug.Assert(_left is not null);
int oldAlternativesCount = alternativesAndEffects.Count;
_left.StripAndMapEffects(context, alternativesAndEffects, currentEffects);
_left.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects);
for (int i = oldAlternativesCount; i < alternativesAndEffects.Count; i++)
{
var (node, effects) = alternativesAndEffects[i];
alternativesAndEffects[i] = (_builder.CreateDisableBacktrackingSimulation(node), effects);
alternativesAndEffects[i] = (builder.CreateDisableBacktrackingSimulation(node), effects);
}
break;
}
......@@ -1632,12 +1592,12 @@ internal void ToStringHelper(StringBuilder sb)
case SymbolicRegexNodeKind.Singleton:
Debug.Assert(_set is not null);
sb.Append(_builder._solver.PrettyPrint(_set, _builder._charSetSolver));
sb.Append(_debugBuilder!._solver.PrettyPrint(_set, _debugBuilder._charSetSolver));
return;
case SymbolicRegexNodeKind.Loop:
Debug.Assert(_left is not null);
if (IsAnyStar)
if (IsAnyStar(_debugBuilder!._solver))
{
sb.Append(".*");
}
......@@ -1782,19 +1742,19 @@ static void AppendNumberSuperscript(StringBuilder sb, int value)
/// <summary>
/// Returns all sets that occur in the regex or the full set if there are no sets in the regex (e.g. the regex is "^").
/// </summary>
public HashSet<TSet> GetSets()
public HashSet<TSet> GetSets(SymbolicRegexBuilder<TSet> builder)
{
var sets = new HashSet<TSet>();
CollectSets(sets);
CollectSets(builder, sets);
return sets;
}
/// <summary>Collects all sets that occur in the regex into the specified collection.</summary>
private void CollectSets(HashSet<TSet> sets)
private void CollectSets(SymbolicRegexBuilder<TSet> builder, HashSet<TSet> sets)
{
if (!StackHelper.TryEnsureSufficientExecutionStack())
{
StackHelper.CallOnEmptyStack(CollectSets, sets);
StackHelper.CallOnEmptyStack(CollectSets, builder, sets);
return;
}
......@@ -1804,7 +1764,7 @@ private void CollectSets(HashSet<TSet> sets)
case SymbolicRegexNodeKind.EOLAnchor:
case SymbolicRegexNodeKind.EndAnchorZ:
case SymbolicRegexNodeKind.EndAnchorZReverse:
sets.Add(_builder._newLineSet);
sets.Add(builder._newLineSet);
return;
case SymbolicRegexNodeKind.BeginningAnchor:
......@@ -1822,13 +1782,13 @@ private void CollectSets(HashSet<TSet> sets)
case SymbolicRegexNodeKind.Loop:
Debug.Assert(_left is not null);
_left.CollectSets(sets);
_left.CollectSets(builder, sets);
return;
case SymbolicRegexNodeKind.Alternate:
Debug.Assert(_left is not null && _right is not null);
_left.CollectSets(sets);
_right.CollectSets(sets);
_left.CollectSets(builder, sets);
_right.CollectSets(builder, sets);
return;
case SymbolicRegexNodeKind.Concat:
......@@ -1837,20 +1797,20 @@ private void CollectSets(HashSet<TSet> sets)
while (conc._kind == SymbolicRegexNodeKind.Concat)
{
Debug.Assert(conc._left is not null && conc._right is not null);
conc._left.CollectSets(sets);
conc._left.CollectSets(builder, sets);
conc = conc._right;
}
conc.CollectSets(sets);
conc.CollectSets(builder, sets);
return;
case SymbolicRegexNodeKind.DisableBacktrackingSimulation:
Debug.Assert(_left is not null);
_left.CollectSets(sets);
_left.CollectSets(builder, sets);
return;
case SymbolicRegexNodeKind.NonBoundaryAnchor:
case SymbolicRegexNodeKind.BoundaryAnchor:
sets.Add(_builder._wordLetterForBoundariesSet);
sets.Add(builder._wordLetterForBoundariesSet);
return;
default:
......@@ -1860,10 +1820,10 @@ private void CollectSets(HashSet<TSet> sets)
}
/// <summary>Compute and sort all the minterms from the sets in this regex.</summary>
public TSet[] ComputeMinterms()
public TSet[] ComputeMinterms(SymbolicRegexBuilder<TSet> builder)
{
HashSet<TSet> sets = GetSets();
List<TSet> minterms = MintermGenerator<TSet>.GenerateMinterms(_builder._solver, sets);
HashSet<TSet> sets = GetSets(builder);
List<TSet> minterms = MintermGenerator<TSet>.GenerateMinterms(builder._solver, sets);
minterms.Sort();
return minterms.ToArray();
}
......@@ -1871,69 +1831,69 @@ public TSet[] ComputeMinterms()
/// <summary>
/// Create the reverse of this regex
/// </summary>
public SymbolicRegexNode<TSet> Reverse()
public SymbolicRegexNode<TSet> Reverse(SymbolicRegexBuilder<TSet> builder)
{
if (!StackHelper.TryEnsureSufficientExecutionStack())
{
return StackHelper.CallOnEmptyStack(Reverse);
return StackHelper.CallOnEmptyStack(Reverse, builder);
}
switch (_kind)
{
case SymbolicRegexNodeKind.Loop:
Debug.Assert(_left is not null);
return _builder.CreateLoop(_left.Reverse(), IsLazy, _lower, _upper);
return builder.CreateLoop(_left.Reverse(builder), IsLazy, _lower, _upper);
case SymbolicRegexNodeKind.Concat:
{
Debug.Assert(_left is not null && _right is not null);
SymbolicRegexNode<TSet> rev = _left.Reverse();
SymbolicRegexNode<TSet> rev = _left.Reverse(builder);
SymbolicRegexNode<TSet> rest = _right;
while (rest._kind == SymbolicRegexNodeKind.Concat)
{
Debug.Assert(rest._left is not null && rest._right is not null);
SymbolicRegexNode<TSet> rev1 = rest._left.Reverse();
rev = _builder.CreateConcat(rev1, rev);
SymbolicRegexNode<TSet> rev1 = rest._left.Reverse(builder);
rev = builder.CreateConcat(rev1, rev);
rest = rest._right;
}
SymbolicRegexNode<TSet> restr = rest.Reverse();
rev = _builder.CreateConcat(restr, rev);
SymbolicRegexNode<TSet> restr = rest.Reverse(builder);
rev = builder.CreateConcat(restr, rev);
return rev;
}
case SymbolicRegexNodeKind.Alternate:
Debug.Assert(_left is not null && _right is not null);
return CreateAlternate(_builder, _left.Reverse(), _right.Reverse());
return CreateAlternate(builder, _left.Reverse(builder), _right.Reverse(builder));
case SymbolicRegexNodeKind.FixedLengthMarker:
// Fixed length markers are omitted in reverse
return _builder.Epsilon;
return builder.Epsilon;
case SymbolicRegexNodeKind.BeginningAnchor:
// The reverse of BeginningAnchor is EndAnchor
return _builder.EndAnchor;
return builder.EndAnchor;
case SymbolicRegexNodeKind.EndAnchor:
return _builder.BeginningAnchor;
return builder.BeginningAnchor;
case SymbolicRegexNodeKind.BOLAnchor:
// The reverse of BOLanchor is EOLanchor
return _builder.EolAnchor;
return builder.EolAnchor;
case SymbolicRegexNodeKind.EOLAnchor:
return _builder.BolAnchor;
return builder.BolAnchor;
case SymbolicRegexNodeKind.EndAnchorZ:
// The reversal of the \Z anchor
return _builder.EndAnchorZReverse;
return builder.EndAnchorZReverse;
case SymbolicRegexNodeKind.EndAnchorZReverse:
Debug.Fail("Should only happen if a reversed regex is reversed again, which isn't expected");
return _builder.EndAnchorZ;
return builder.EndAnchorZ;
case SymbolicRegexNodeKind.DisableBacktrackingSimulation:
Debug.Assert(_left is not null);
return _builder.CreateDisableBacktrackingSimulation(_left.Reverse());
return builder.CreateDisableBacktrackingSimulation(_left.Reverse(builder));
// Remaining cases map to themselves:
case SymbolicRegexNodeKind.Epsilon:
......@@ -1974,12 +1934,8 @@ internal bool StartsWithLoop(int upperBoundLowestValue = 1)
};
}
/// <summary>Gets the set that includes all elements that can start a match.</summary>
internal TSet GetStartSet() => _startSet;
/// <summary>Computes the set that includes all elements that can start a match.</summary>
private TSet ComputeStartSet()
public TSet GetStartSet(SymbolicRegexBuilder<TSet> builder)
{
switch (_kind)
{
......@@ -1996,7 +1952,7 @@ private TSet ComputeStartSet()
case SymbolicRegexNodeKind.BOLAnchor:
case SymbolicRegexNodeKind.CaptureStart:
case SymbolicRegexNodeKind.CaptureEnd:
return _builder._solver.Empty;
return builder._solver.Empty;
case SymbolicRegexNodeKind.Singleton:
Debug.Assert(_set is not null);
......@@ -2004,44 +1960,64 @@ private TSet ComputeStartSet()
case SymbolicRegexNodeKind.Loop:
Debug.Assert(_left is not null);
return _left._startSet;
return _left.GetStartSet(builder);
case SymbolicRegexNodeKind.Concat:
{
Debug.Assert(_left is not null && _right is not null);
TSet startSet = _left.CanBeNullable ? _builder._solver.Or(_left._startSet, _right._startSet) : _left._startSet;
TSet startSet = _left.CanBeNullable ? builder._solver.Or(_left.GetStartSet(builder), _right.GetStartSet(builder)) : _left.GetStartSet(builder);
return startSet;
}
case SymbolicRegexNodeKind.Alternate:
{
Debug.Assert(_left is not null && _right is not null);
return _builder._solver.Or(_left._startSet, _right._startSet);
return builder._solver.Or(_left.GetStartSet(builder), _right.GetStartSet(builder));
}
case SymbolicRegexNodeKind.DisableBacktrackingSimulation:
case SymbolicRegexNodeKind.Effect:
Debug.Assert(_left is not null);
return _left._startSet;
return _left.GetStartSet(builder);
default:
Debug.Fail($"{nameof(ComputeStartSet)}:{_kind}");
return _builder._solver.Full;
Debug.Fail($"{nameof(GetStartSet)}:{_kind}");
return builder._solver.Full;
}
}
/// <summary>
/// Replace anchors that are infeasible by [] wrt the given previous character kind and what continuation is possible.
/// </summary>
/// <remarks>
/// This helps the matcher detect deadend states that have no viable matches in situations where the pattern's
/// language is empty due to interactions between anchors and the rest of the pattern. For example, a*\ba would
/// be simplified to [] when prevKind is a word letter. This allows the matcher to avoid spurious work and return
/// early.
/// </remarks>
/// <param name="builder">the builder that owns this node</param>
/// <param name="prevKind">previous character kind</param>
/// <param name="contWithWL">if true the continuation can start with wordletter or stop</param>
/// <param name="contWithNWL">if true the continuation can start with nonwordletter or stop</param>
internal SymbolicRegexNode<TSet> PruneAnchors(uint prevKind, bool contWithWL, bool contWithNWL)
internal SymbolicRegexNode<TSet> PruneAnchors(SymbolicRegexBuilder<TSet> builder, uint prevKind)
{
    // Derive the two continuation flags from this node's start set, then hand off to the
    // recursive implementation that does the actual pruning.
    TSet wordLetters = builder._wordLetterForBoundariesSet;
    TSet firstChars = GetStartSet(builder);

    // A node that can be nullable may continue with either character kind,
    // so the solver is consulted only when the node cannot be nullable.
    bool contWithWL = true;
    bool contWithNWL = true;
    if (!CanBeNullable)
    {
        var solver = builder._solver;

        // The start set overlaps with some word letter.
        contWithWL = !solver.IsEmpty(solver.And(wordLetters, firstChars));

        // The start set overlaps with some non-word letter.
        contWithNWL = !solver.IsEmpty(solver.And(solver.Not(wordLetters), firstChars));
    }

    return PruneAnchorsImpl(builder, prevKind, contWithWL, contWithNWL);
}
private SymbolicRegexNode<TSet> PruneAnchorsImpl(SymbolicRegexBuilder<TSet> builder, uint prevKind, bool contWithWL, bool contWithNWL)
{
// Guard against stack overflow due to deep recursion
if (!StackHelper.TryEnsureSufficientExecutionStack())
{
return StackHelper.CallOnEmptyStack(PruneAnchors, prevKind, contWithWL, contWithNWL);
return StackHelper.CallOnEmptyStack(PruneAnchorsImpl, builder, prevKind, contWithWL, contWithNWL);
}
if (!_info.StartsWithSomeAnchor)
......@@ -2052,73 +2028,73 @@ internal SymbolicRegexNode<TSet> PruneAnchors(uint prevKind, bool contWithWL, bo
case SymbolicRegexNodeKind.BeginningAnchor:
return prevKind == CharKind.BeginningEnd ?
this :
_builder._nothing; //start anchor is only nullable if the previous character is Start
builder._nothing; //start anchor is only nullable if the previous character is Start
case SymbolicRegexNodeKind.EndAnchorZReverse:
return ((prevKind & CharKind.BeginningEnd) != 0) ?
this :
_builder._nothing; //rev(\Z) is only nullable if the previous characters is Start or the very first \n
builder._nothing; //rev(\Z) is only nullable if the previous characters is Start or the very first \n
case SymbolicRegexNodeKind.BoundaryAnchor:
return (prevKind == CharKind.WordLetter ? contWithNWL : contWithWL) ?
this :
// \b is impossible when the previous character is \w but no continuation matches \W
// or the previous character is \W but no continuation matches \w
_builder._nothing;
builder._nothing;
case SymbolicRegexNodeKind.NonBoundaryAnchor:
return (prevKind == CharKind.WordLetter ? contWithWL : contWithNWL) ?
this :
// \B is impossible when the previous character is \w but no continuation matches \w
// or the previous character is \W but no continuation matches \W
_builder._nothing;
builder._nothing;
case SymbolicRegexNodeKind.Loop:
Debug.Assert(_left is not null);
SymbolicRegexNode<TSet> body = _left.PruneAnchors(prevKind, contWithWL, contWithNWL);
SymbolicRegexNode<TSet> body = _left.PruneAnchorsImpl(builder, prevKind, contWithWL, contWithNWL);
return body == _left ?
this :
CreateLoop(_builder, body, _lower, _upper, IsLazy);
CreateLoop(builder, body, _lower, _upper, IsLazy);
case SymbolicRegexNodeKind.Concat:
{
Debug.Assert(_left is not null && _right is not null);
SymbolicRegexNode<TSet> left1 = _left.PruneAnchors(prevKind, contWithWL, contWithNWL);
SymbolicRegexNode<TSet> right1 = _left.IsNullable ? _right.PruneAnchors(prevKind, contWithWL, contWithNWL) : _right;
SymbolicRegexNode<TSet> left1 = _left.PruneAnchorsImpl(builder, prevKind, contWithWL, contWithNWL);
SymbolicRegexNode<TSet> right1 = _left.IsNullable ? _right.PruneAnchorsImpl(builder, prevKind, contWithWL, contWithNWL) : _right;
Debug.Assert(left1 is not null && right1 is not null);
return left1 == _left && right1 == _right ?
this :
CreateConcat(_builder, left1, right1);
CreateConcat(builder, left1, right1);
}
case SymbolicRegexNodeKind.Alternate:
{
Debug.Assert(_left is not null && _right is not null);
SymbolicRegexNode<TSet> left1 = _left.PruneAnchors(prevKind, contWithWL, contWithNWL);
SymbolicRegexNode<TSet> right1 = _right.PruneAnchors(prevKind, contWithWL, contWithNWL);
SymbolicRegexNode<TSet> left1 = _left.PruneAnchorsImpl(builder, prevKind, contWithWL, contWithNWL);
SymbolicRegexNode<TSet> right1 = _right.PruneAnchorsImpl(builder, prevKind, contWithWL, contWithNWL);
Debug.Assert(left1 is not null && right1 is not null);
return left1 == _left && right1 == _right ?
this :
CreateAlternate(_builder, left1, right1);
CreateAlternate(builder, left1, right1);
}
case SymbolicRegexNodeKind.Effect:
{
Debug.Assert(_left is not null && _right is not null);
SymbolicRegexNode<TSet> left1 = _left.PruneAnchors(prevKind, contWithWL, contWithNWL);
SymbolicRegexNode<TSet> left1 = _left.PruneAnchorsImpl(builder, prevKind, contWithWL, contWithNWL);
return left1 == _left ?
this :
CreateEffect(_builder, left1, _right);
CreateEffect(builder, left1, _right);
}
case SymbolicRegexNodeKind.DisableBacktrackingSimulation:
Debug.Assert(_left is not null);
SymbolicRegexNode<TSet> child = _left.PruneAnchors(prevKind, contWithWL, contWithNWL);
SymbolicRegexNode<TSet> child = _left.PruneAnchorsImpl(builder, prevKind, contWithWL, contWithNWL);
return child == _left ?
this :
_builder.CreateDisableBacktrackingSimulation(child);
builder.CreateDisableBacktrackingSimulation(child);
default:
return this;
......@@ -2175,7 +2151,7 @@ internal int ResolveFixedLength(uint context)
/// and the resulting elements re-wrapped to maintain the metadata.
/// </summary>
/// <returns>an enumeration of the elements of the alternation, or just the node itself if there is no alternation</returns>
internal IEnumerable<SymbolicRegexNode<TSet>> EnumerateAlternationBranches()
internal IEnumerable<SymbolicRegexNode<TSet>> EnumerateAlternationBranches(SymbolicRegexBuilder<TSet> builder)
{
switch (_kind)
{
......@@ -2183,10 +2159,10 @@ internal IEnumerable<SymbolicRegexNode<TSet>> EnumerateAlternationBranches()
Debug.Assert(_left is not null);
// This call should never recurse more than one level
Debug.Assert(_left._kind is not SymbolicRegexNodeKind.DisableBacktrackingSimulation);
foreach (SymbolicRegexNode<TSet> element in _left.EnumerateAlternationBranches())
foreach (SymbolicRegexNode<TSet> element in _left.EnumerateAlternationBranches(builder))
{
// Re-wrap the element nodes in DisableBacktrackingSimulation if the top level node was too
yield return _builder.CreateDisableBacktrackingSimulation(element);
yield return builder.CreateDisableBacktrackingSimulation(element);
}
break;
case SymbolicRegexNodeKind.Alternate:
......
......@@ -37,8 +37,8 @@ public SymbolicRegexRunnerFactory(RegexTree regexTree, RegexOptions options, Tim
}
}
rootNode = rootNode.AddFixedLengthMarkers();
BDD[] minterms = rootNode.ComputeMinterms();
rootNode = rootNode.AddFixedLengthMarkers(bddBuilder);
BDD[] minterms = rootNode.ComputeMinterms(bddBuilder);
_matcher = minterms.Length > 64 ?
SymbolicRegexMatcher<BitVector>.Create(regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new BitVectorSolver(minterms, charSetSolver), matchTimeout) :
......
......@@ -64,6 +64,38 @@ public static bool TryEnsureSufficientExecutionStack()
.ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default)
.GetAwaiter().GetResult();
/// <summary>Runs the provided action on a thread pool thread and blocks until it has finished.</summary>
/// <typeparam name="TArg1">The type of the first argument to pass to the action.</typeparam>
/// <typeparam name="TArg2">The type of the second argument to pass to the action.</typeparam>
/// <typeparam name="TArg3">The type of the third argument to pass to the action.</typeparam>
/// <typeparam name="TArg4">The type of the fourth argument to pass to the action.</typeparam>
/// <param name="action">The action to invoke.</param>
/// <param name="arg1">The first argument to pass to the action.</param>
/// <param name="arg2">The second argument to pass to the action.</param>
/// <param name="arg3">The third argument to pass to the action.</param>
/// <param name="arg4">The fourth argument to pass to the action.</param>
public static void CallOnEmptyStack<TArg1, TArg2, TArg3, TArg4>(Action<TArg1, TArg2, TArg3, TArg4> action, TArg1 arg1, TArg2 arg2, TArg3 arg3, TArg4 arg4)
{
    // Queue the work so that it executes on a different thread's stack.
    Task work = Task.Run(() => action(arg1, arg2, arg3, arg4));

    // NOTE(review): the extra continuation hop presumably keeps the blocking wait below from
    // inlining the work onto the current stack -- confirm against the other overloads.
    Task relay = work.ContinueWith(
        t => t.GetAwaiter().GetResult(),
        CancellationToken.None,
        TaskContinuationOptions.ExecuteSynchronously,
        TaskScheduler.Default);

    // Block until done; GetResult rethrows any exception thrown by the action.
    relay.GetAwaiter().GetResult();
}
/// <summary>Runs the provided action on a thread pool thread and blocks until it has finished.</summary>
/// <typeparam name="TArg1">The type of the first argument to pass to the action.</typeparam>
/// <typeparam name="TArg2">The type of the second argument to pass to the action.</typeparam>
/// <typeparam name="TArg3">The type of the third argument to pass to the action.</typeparam>
/// <typeparam name="TArg4">The type of the fourth argument to pass to the action.</typeparam>
/// <typeparam name="TArg5">The type of the fifth argument to pass to the action.</typeparam>
/// <param name="action">The action to invoke.</param>
/// <param name="arg1">The first argument to pass to the action.</param>
/// <param name="arg2">The second argument to pass to the action.</param>
/// <param name="arg3">The third argument to pass to the action.</param>
/// <param name="arg4">The fourth argument to pass to the action.</param>
/// <param name="arg5">The fifth argument to pass to the action.</param>
public static void CallOnEmptyStack<TArg1, TArg2, TArg3, TArg4, TArg5>(Action<TArg1, TArg2, TArg3, TArg4, TArg5> action, TArg1 arg1, TArg2 arg2, TArg3 arg3, TArg4 arg4, TArg5 arg5)
{
    // Queue the work so that it executes on a different thread's stack.
    Task work = Task.Run(() => action(arg1, arg2, arg3, arg4, arg5));

    // NOTE(review): the extra continuation hop presumably keeps the blocking wait below from
    // inlining the work onto the current stack -- confirm against the other overloads.
    Task relay = work.ContinueWith(
        t => t.GetAwaiter().GetResult(),
        CancellationToken.None,
        TaskContinuationOptions.ExecuteSynchronously,
        TaskScheduler.Default);

    // Block until done; GetResult rethrows any exception thrown by the action.
    relay.GetAwaiter().GetResult();
}
/// <summary>Calls the provided function on the stack of a different thread pool thread.</summary>
/// <typeparam name="TArg1">The type of the first argument to pass to the function.</typeparam>
/// <typeparam name="TArg2">The type of the second argument to pass to the function.</typeparam>
......@@ -126,5 +158,21 @@ public static bool TryEnsureSufficientExecutionStack()
Task.Run(() => func(arg1, arg2, arg3))
.ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default)
.GetAwaiter().GetResult();
/// <summary>Runs the provided function on a thread pool thread, blocks until it has finished, and returns its result.</summary>
/// <typeparam name="TArg1">The type of the first argument to pass to the function.</typeparam>
/// <typeparam name="TArg2">The type of the second argument to pass to the function.</typeparam>
/// <typeparam name="TArg3">The type of the third argument to pass to the function.</typeparam>
/// <typeparam name="TArg4">The type of the fourth argument to pass to the function.</typeparam>
/// <typeparam name="TResult">The return type of the function.</typeparam>
/// <param name="func">The function to invoke.</param>
/// <param name="arg1">The first argument to pass to the function.</param>
/// <param name="arg2">The second argument to pass to the function.</param>
/// <param name="arg3">The third argument to pass to the function.</param>
/// <param name="arg4">The fourth argument to pass to the function.</param>
/// <returns>The value returned by <paramref name="func"/>.</returns>
public static TResult CallOnEmptyStack<TArg1, TArg2, TArg3, TArg4, TResult>(Func<TArg1, TArg2, TArg3, TArg4, TResult> func, TArg1 arg1, TArg2 arg2, TArg3 arg3, TArg4 arg4)
{
    // Queue the work so that it executes on a different thread's stack.
    Task<TResult> work = Task.Run(() => func(arg1, arg2, arg3, arg4));

    // NOTE(review): the extra continuation hop presumably keeps the blocking wait below from
    // inlining the work onto the current stack -- confirm against the other overloads.
    Task<TResult> relay = work.ContinueWith(
        t => t.GetAwaiter().GetResult(),
        CancellationToken.None,
        TaskContinuationOptions.ExecuteSynchronously,
        TaskScheduler.Default);

    // Block until done; GetResult yields the function's value or rethrows its exception.
    return relay.GetAwaiter().GetResult();
}
}
}
......@@ -86,7 +86,7 @@ public static IEnumerable<object[]> SafeThresholdTests_MemberData()
{
RegexNode tree = RegexParser.Parse(Pattern, options | RegexOptions.ExplicitCapture, CultureInfo.CurrentCulture).Root;
SymbolicRegexNode<BDD> rootNode = converter.ConvertToSymbolicRegexNode(tree);
yield return new object[] { rootNode, ExpectedSafeSize };
yield return new object[] { bddBuilder, rootNode, ExpectedSafeSize };
}
// add .*? in front of the pattern, this adds 1 more NFA state
......@@ -94,7 +94,7 @@ public static IEnumerable<object[]> SafeThresholdTests_MemberData()
{
RegexNode tree = RegexParser.Parse(".*?" + Pattern, options | RegexOptions.ExplicitCapture, CultureInfo.CurrentCulture).Root;
SymbolicRegexNode<BDD> rootNode = converter.ConvertToSymbolicRegexNode(tree);
yield return new object[] { rootNode, 1 + ExpectedSafeSize};
yield return new object[] { bddBuilder, rootNode, 1 + ExpectedSafeSize};
}
// use of anchors increases the estimate by 5x in general but in reality much less, at most 3x
......@@ -102,7 +102,7 @@ public static IEnumerable<object[]> SafeThresholdTests_MemberData()
{
RegexNode tree = RegexParser.Parse(Pattern + "$", options | RegexOptions.ExplicitCapture, CultureInfo.CurrentCulture).Root;
SymbolicRegexNode<BDD> rootNode = converter.ConvertToSymbolicRegexNode(tree);
yield return new object[] { rootNode, 5 * ExpectedSafeSize };
yield return new object[] { bddBuilder, rootNode, 5 * ExpectedSafeSize };
}
// use of captures has no effect on the estimations
......@@ -110,31 +110,32 @@ public static IEnumerable<object[]> SafeThresholdTests_MemberData()
{
RegexNode tree = RegexParser.Parse(Pattern, options, CultureInfo.CurrentCulture).Root;
SymbolicRegexNode<BDD> rootNode = converter.ConvertToSymbolicRegexNode(tree);
yield return new object[] { rootNode, ExpectedSafeSize };
yield return new object[] { bddBuilder, rootNode, ExpectedSafeSize };
}
}
[Theory]
[MemberData(nameof(SafeThresholdTests_MemberData))]
public void SafeThresholdTests(object obj, int expectedSafeSize)
public void SafeThresholdTests(object builderObj, object nodeObj, int expectedSafeSize)
{
SymbolicRegexNode<BDD> node = (SymbolicRegexNode<BDD>)obj;
SymbolicRegexBuilder<BDD> builder = (SymbolicRegexBuilder<BDD>)builderObj;
SymbolicRegexNode<BDD> node = (SymbolicRegexNode<BDD>)nodeObj;
int safeSize = node.EstimateNfaSize();
Assert.Equal(expectedSafeSize, safeSize);
int nfaStateCount = CalculateNfaStateCount(node);
int nfaStateCount = CalculateNfaStateCount(builder, node);
Assert.True(nfaStateCount <= expectedSafeSize);
}
/// <summary>
/// Compute the closure of all NFA states from root and return the size of the resulting state space.
/// </summary>
private static int CalculateNfaStateCount(SymbolicRegexNode<BDD> root)
private static int CalculateNfaStateCount(SymbolicRegexBuilder<BDD> builder, SymbolicRegexNode<BDD> root)
{
// Here we are actually using the original BDD algebra (not converting to the BV or Uint64 algebra)
// because it does not matter which algebra we use here (this matters only for performance)
HashSet<(uint, SymbolicRegexNode<BDD>)> states = new();
Stack<(uint, SymbolicRegexNode<BDD>)> frontier = new();
List<BDD> minterms = MintermGenerator<BDD>.GenerateMinterms(root._builder._solver, root.GetSets());
List<BDD> minterms = MintermGenerator<BDD>.GenerateMinterms(builder._solver, root.GetSets(builder));
// Start from the initial state that has kind 'General' when no anchors are being used, else kind 'BeginningEnd'
(uint, SymbolicRegexNode<BDD>) initialState = (root._info.ContainsSomeAnchor ? CharKind.BeginningEnd : CharKind.General, root);
......@@ -150,7 +151,7 @@ private static int CalculateNfaStateCount(SymbolicRegexNode<BDD> root)
foreach (BDD minterm in minterms)
{
uint kind = GetCharKind(minterm);
SymbolicRegexNode<BDD> target = source.Node.CreateDerivativeWithoutEffects(minterm, source.Kind);
SymbolicRegexNode<BDD> target = source.Node.CreateDerivativeWithoutEffects(builder, minterm, source.Kind);
// In the case of an NFA, all the different alternatives in the DFA state become individual states themselves
foreach (SymbolicRegexNode<BDD> node in GetAlternatives(target))
......@@ -169,7 +170,7 @@ private static int CalculateNfaStateCount(SymbolicRegexNode<BDD> root)
return states.Count;
// Enumerates the alternatives from a node, for example (ab|(bc|cd)) has three alternatives
static IEnumerable<SymbolicRegexNode<BDD>> GetAlternatives(SymbolicRegexNode<BDD> node)
IEnumerable<SymbolicRegexNode<BDD>> GetAlternatives(SymbolicRegexNode<BDD> node)
{
if (node._kind == SymbolicRegexNodeKind.Alternate)
{
......@@ -178,7 +179,7 @@ static IEnumerable<SymbolicRegexNode<BDD>> GetAlternatives(SymbolicRegexNode<BDD
foreach (SymbolicRegexNode<BDD> elem in GetAlternatives(node._right!))
yield return elem;
}
else if (!node.IsNothing) // omit deadend states
else if (!node.IsNothing(builder._solver)) // omit deadend states
{
yield return node;
}
......@@ -187,8 +188,8 @@ static IEnumerable<SymbolicRegexNode<BDD>> GetAlternatives(SymbolicRegexNode<BDD
// Simplified character kind calculation that omits the special case that minterm can be the very last \n
// This omission has practically no effect on the size of the state space, whereas handling that case would complicate the logic
uint GetCharKind(BDD minterm) =>
minterm.Equals(root._builder._newLineSet) ? CharKind.Newline : // is \n
(!root._builder._solver.IsEmpty(root._builder._solver.And(root._builder._wordLetterForBoundariesSet, minterm)) ?
minterm.Equals(builder._newLineSet) ? CharKind.Newline : // is \n
(!builder._solver.IsEmpty(builder._solver.And(builder._wordLetterForBoundariesSet, minterm)) ?
CharKind.WordLetter : // in \w
CharKind.General); // anything else, thus in particular in \W
}
......
......@@ -51,7 +51,7 @@
<Compile Include="..\..\src\System\Text\RegularExpressions\Symbolic\CharKind.cs" Link="Production\CharKind.cs" />
<Compile Include="..\..\src\System\Text\RegularExpressions\Symbolic\CharSetSolver.cs" Link="Production\CharSetSolver.cs" />
<Compile Include="..\..\src\System\Text\RegularExpressions\Symbolic\DerivativeEffect.cs" Link="Production\DerivativeEffect.cs" />
<Compile Include="..\..\src\System\Text\RegularExpressions\Symbolic\DfaMatchingState.cs" Link="Production\DfaMatchingState.cs" />
<Compile Include="..\..\src\System\Text\RegularExpressions\Symbolic\MatchingState.cs" Link="Production\MatchingState.cs" />
<Compile Include="..\..\src\System\Text\RegularExpressions\Symbolic\DoublyLinkedList.cs" Link="Production\DoublyLinkedList.cs" />
<Compile Include="..\..\src\System\Text\RegularExpressions\Symbolic\ISolver.cs" Link="Production\ISolver.cs" />
<Compile Include="..\..\src\System\Text\RegularExpressions\Symbolic\MintermGenerator.cs" Link="Production\MintermGenerator.cs" />
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册