提交 622ef53a 编写于 作者: C Cyrus Najmabadi

Merge branch 'regexParsing4' into regexFeatures

......@@ -4,7 +4,7 @@ namespace Microsoft.CodeAnalysis.EmbeddedLanguages.RegularExpressions
{
internal enum RegexKind
{
None,
None = 0,
EndOfFile,
Sequence,
CompilationUnit,
......
......@@ -401,7 +401,7 @@ public RegexToken ScanHexCharacters(int count)
{
result = result.AddDiagnosticIfNone(new EmbeddedDiagnostic(
WorkspacesResources.Insufficient_hexadecimal_digits,
TextSpan.FromBounds(Text[beforeSlash].Span.Start, Text[Position - 1].Span.End)));
GetTextSpan(beforeSlash, Position)));
}
return result;
......
......@@ -501,7 +501,8 @@ public override void Accept(IRegexNodeVisitor visitor)
}
/// <summary>
/// ```a{...}```
/// Base type of all regex numeric quantifier nodes. i.e.
/// ```a{5}```, ```a{5,}``` and ```a{5,10}```
/// </summary>
internal abstract class RegexNumericQuantifierNode : RegexQuantifierNode
{
......
......@@ -43,13 +43,18 @@ namespace Microsoft.CodeAnalysis.EmbeddedLanguages.RegularExpressions
/// same error. Note: there is only one time we do this in this parser (see the deviation
/// documented in <see cref="ParsePossibleEcmascriptBackreferenceEscape"/>).
///
/// Note1: the above invariants make life difficult at times. This happens due to the fact that
/// Note1: "report the same error" means that we will attempt to report the error using the same
/// text the .net regex parser uses for its error messages. This is so that the user is not
/// confused by getting one message in the IDE and a different message at runtime for the
/// same issue.
///
/// Note2: the above invariants make life difficult at times. This happens due to the fact that
/// the .net parser is multi-pass. Meaning it does a first scan (which may report errors), then
/// does the full parse. This means that it might report an error in a later location during
/// the initial scan than it would during the parse. We replicate that behavior to follow the
/// second invariant.
///
/// Note2: It would be nice if we could check these invariants at runtime, so we could control
/// Note3: It would be nice if we could check these invariants at runtime, so we could control
/// our behavior by the behavior of the real .net regex engine. For example, if the .net regex
/// engine did not report any issues, we could suppress any diagnostics we generated and we
/// could log an NFW to record which pattern we deviated on so we could fix the issue for a
......@@ -59,9 +64,16 @@ namespace Microsoft.CodeAnalysis.EmbeddedLanguages.RegularExpressions
/// engine is not just a parser, but something that builds an actual recognizer using techniques
/// that are not necessarily bounded. As such, while we test ourselves around it during our
/// tests, we cannot do the same at runtime as part of the IDE.
///
///
/// This parser was based on the corefx RegexParser located at:
/// https://github.com/dotnet/corefx/blob/f759243d724f462da0bcef54e86588f8a55352c6/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs#L1
///
/// Note4: The .Net parser itself changes over time (for example to fix behavior that even it
/// thinks is buggy). When this happens, we have to make a choice as to which behavior to
/// follow. In general, the overall principle is that we should follow the more lenient
/// behavior. If we end up taking the more strict interpretation we risk giving people an error
/// during design time that they would not get at runtime. It's far worse to have that than to
/// not report an error, even though one might happen later.
/// </remarks>
internal partial struct RegexParser
{
......@@ -243,16 +255,12 @@ private RegexSequenceNode ParseSequence(bool consumeCloseParen)
{
var list = ArrayBuilder<RegexExpressionNode>.GetInstance();
if (ShouldConsumeSequenceElement(consumeCloseParen))
while (ShouldConsumeSequenceElement(consumeCloseParen))
{
do
{
var last = list.Count == 0 ? null : list.Last();
list.Add(ParsePrimaryExpressionAndQuantifiers(last));
var last = list.Count == 0 ? null : list.Last();
list.Add(ParsePrimaryExpressionAndQuantifiers(last));
TryMergeLastTwoNodes(list);
}
while (ShouldConsumeSequenceElement(consumeCloseParen));
TryMergeLastTwoNodes(list);
}
return new RegexSequenceNode(list.ToImmutableAndFree());
......@@ -726,8 +734,8 @@ private RegexConditionalGroupingNode ParseConditionalGrouping(RegexToken openPar
}
else
{
// If its a capture name, it's ok if it that capture doesn't exist. In that
// case we will just treat this as an conditional expression.
// If it's a capture name, it's ok if that capture doesn't exist. In that case we
// will just treat this as a conditional expression.
if (!HasCapture((string)capture.Value))
{
_lexer.Position = afterInnerOpenParen;
......@@ -1284,14 +1292,14 @@ private bool TryGetRangeComponentValueWorker(RegexNode component, out char ch)
ch = ((RegexSimpleEscapeNode)component).TypeToken.VirtualChars[0];
switch (ch)
{
case 'a': ch = '\u0007'; break;
case 'b': ch = '\b'; break;
case 'e': ch = '\u001B'; break;
case 'f': ch = '\f'; break;
case 'n': ch = '\n'; break;
case 'r': ch = '\r'; break;
case 't': ch = '\t'; break;
case 'v': ch = '\u000B'; break;
case 'a': ch = '\u0007'; break; // bell
case 'b': ch = '\b'; break; // backspace
case 'e': ch = '\u001B'; break; // escape
case 'f': ch = '\f'; break; // form feed
case 'n': ch = '\n'; break; // new line
case 'r': ch = '\r'; break; // carriage return
case 't': ch = '\t'; break; // tab
case 'v': ch = '\u000B'; break; // vertical tab
}
return true;
......@@ -1889,6 +1897,21 @@ private RegexControlEscapeNode ParseControlEscape(RegexToken backslashToken, boo
unchecked
{
// From: https://github.com/dotnet/corefx/blob/80e220fc7009de0f0611ee6b52d4d5ffd25eb6c7/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs#L1450
// Note: Roslyn accepts a control escape that the current .Net parser does not.
// Specifically: \c[
//
// It is a bug that the .Net parser does not support this construct. The bug was
// reported at: https://github.com/dotnet/corefx/issues/26501 and was fixed for
// CoreFx with https://github.com/dotnet/corefx/commit/80e220fc7009de0f0611ee6b52d4d5ffd25eb6c7
//
// Because it was a bug, we follow the correct behavior. That means we will not
// report a diagnostic for a Regex that someone might run on a previous version of
// .Net that ends up throwing at runtime. That's acceptable. Our goal is to match
// the latest .Net 'correct' behavior, not intermediary points with bugs that have
// since been fixed.
// \ca interpreted as \cA
if (ch >= 'a' && ch <= 'z')
{
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册