提交 622ef53a 编写于 作者: C Cyrus Najmabadi

Merge branch 'regexParsing4' into regexFeatures

...@@ -4,7 +4,7 @@ namespace Microsoft.CodeAnalysis.EmbeddedLanguages.RegularExpressions ...@@ -4,7 +4,7 @@ namespace Microsoft.CodeAnalysis.EmbeddedLanguages.RegularExpressions
{ {
internal enum RegexKind internal enum RegexKind
{ {
None, None = 0,
EndOfFile, EndOfFile,
Sequence, Sequence,
CompilationUnit, CompilationUnit,
......
...@@ -401,7 +401,7 @@ public RegexToken ScanHexCharacters(int count) ...@@ -401,7 +401,7 @@ public RegexToken ScanHexCharacters(int count)
{ {
result = result.AddDiagnosticIfNone(new EmbeddedDiagnostic( result = result.AddDiagnosticIfNone(new EmbeddedDiagnostic(
WorkspacesResources.Insufficient_hexadecimal_digits, WorkspacesResources.Insufficient_hexadecimal_digits,
TextSpan.FromBounds(Text[beforeSlash].Span.Start, Text[Position - 1].Span.End))); GetTextSpan(beforeSlash, Position)));
} }
return result; return result;
......
...@@ -501,7 +501,8 @@ public override void Accept(IRegexNodeVisitor visitor) ...@@ -501,7 +501,8 @@ public override void Accept(IRegexNodeVisitor visitor)
} }
/// <summary> /// <summary>
/// ```a{...}``` /// Base type of all regex numeric quantifier nodes. i.e.
/// ```a{5}```, ```a{5,}``` and ```a{5,10}```
/// </summary> /// </summary>
internal abstract class RegexNumericQuantifierNode : RegexQuantifierNode internal abstract class RegexNumericQuantifierNode : RegexQuantifierNode
{ {
......
...@@ -43,13 +43,18 @@ namespace Microsoft.CodeAnalysis.EmbeddedLanguages.RegularExpressions ...@@ -43,13 +43,18 @@ namespace Microsoft.CodeAnalysis.EmbeddedLanguages.RegularExpressions
/// same error. Note: there is only one time we do this in this parser (see the deviation /// same error. Note: there is only one time we do this in this parser (see the deviation
/// documented in <see cref="ParsePossibleEcmascriptBackreferenceEscape"/>). /// documented in <see cref="ParsePossibleEcmascriptBackreferenceEscape"/>).
/// ///
/// Note1: the above invariants make life difficult at times. This happens due to the fact that /// Note1: "report the same error" means that we will attempt to report the error using the same
/// text the .net regex parser uses for its error messages. This is so that the user is not
/// confused when they use the IDE vs running the regex by getting different messages for the
/// same issue.
///
/// Note2: the above invariants make life difficult at times. This happens due to the fact that
/// the .net parser is multi-pass. Meaning it does a first scan (which may report errors), then /// the .net parser is multi-pass. Meaning it does a first scan (which may report errors), then
/// does the full parse. This means that it might report an error in a later location during /// does the full parse. This means that it might report an error in a later location during
/// the initial scan than it would during the parse. We replicate that behavior to follow the /// the initial scan than it would during the parse. We replicate that behavior to follow the
/// second invariant. /// second invariant.
/// ///
/// Note2: It would be nice if we could check these invariants at runtime, so we could control /// Note3: It would be nice if we could check these invariants at runtime, so we could control
/// our behavior by the behavior of the real .net regex engine. For example, if the .net regex /// our behavior by the behavior of the real .net regex engine. For example, if the .net regex
/// engine did not report any issues, we could suppress any diagnostics we generated and we /// engine did not report any issues, we could suppress any diagnostics we generated and we
/// could log an NFW to record which pattern we deviated on so we could fix the issue for a /// could log an NFW to record which pattern we deviated on so we could fix the issue for a
...@@ -59,9 +64,16 @@ namespace Microsoft.CodeAnalysis.EmbeddedLanguages.RegularExpressions ...@@ -59,9 +64,16 @@ namespace Microsoft.CodeAnalysis.EmbeddedLanguages.RegularExpressions
/// engine is not just a parser, but something that builds an actual recognizer using techniques /// engine is not just a parser, but something that builds an actual recognizer using techniques
/// that are not necessarily bounded. As such, while we test ourselves around it during our /// that are not necessarily bounded. As such, while we test ourselves around it during our
/// tests, we cannot do the same at runtime as part of the IDE. /// tests, we cannot do the same at runtime as part of the IDE.
/// ///
/// This parser was based off the corefx RegexParser located at: /// This parser was based off the corefx RegexParser located at:
/// https://github.com/dotnet/corefx/blob/f759243d724f462da0bcef54e86588f8a55352c6/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs#L1 /// https://github.com/dotnet/corefx/blob/f759243d724f462da0bcef54e86588f8a55352c6/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs#L1
///
/// Note4: The .Net parser itself changes over time (for example to fix behavior that even it
/// thinks is buggy). When this happens, we have to make a choice as to which behavior to
/// follow. In general, the overall principle is that we should follow the more lenient
/// behavior. If we end up taking the more strict interpretation we risk giving people an error
/// during design time that they would not get at runtime. It's far worse to have that than to
/// not report an error, even though one might happen later.
/// </remarks> /// </remarks>
internal partial struct RegexParser internal partial struct RegexParser
{ {
...@@ -243,16 +255,12 @@ private RegexSequenceNode ParseSequence(bool consumeCloseParen) ...@@ -243,16 +255,12 @@ private RegexSequenceNode ParseSequence(bool consumeCloseParen)
{ {
var list = ArrayBuilder<RegexExpressionNode>.GetInstance(); var list = ArrayBuilder<RegexExpressionNode>.GetInstance();
if (ShouldConsumeSequenceElement(consumeCloseParen)) while (ShouldConsumeSequenceElement(consumeCloseParen))
{ {
do var last = list.Count == 0 ? null : list.Last();
{ list.Add(ParsePrimaryExpressionAndQuantifiers(last));
var last = list.Count == 0 ? null : list.Last();
list.Add(ParsePrimaryExpressionAndQuantifiers(last));
TryMergeLastTwoNodes(list); TryMergeLastTwoNodes(list);
}
while (ShouldConsumeSequenceElement(consumeCloseParen));
} }
return new RegexSequenceNode(list.ToImmutableAndFree()); return new RegexSequenceNode(list.ToImmutableAndFree());
...@@ -726,8 +734,8 @@ private RegexConditionalGroupingNode ParseConditionalGrouping(RegexToken openPar ...@@ -726,8 +734,8 @@ private RegexConditionalGroupingNode ParseConditionalGrouping(RegexToken openPar
} }
else else
{ {
// If its a capture name, it's ok if it that capture doesn't exist. In that // If it's a capture name, it's ok if that capture doesn't exist. In that case we
// case we will just treat this as an conditional expression. // will just treat this as a conditional expression.
if (!HasCapture((string)capture.Value)) if (!HasCapture((string)capture.Value))
{ {
_lexer.Position = afterInnerOpenParen; _lexer.Position = afterInnerOpenParen;
...@@ -1284,14 +1292,14 @@ private bool TryGetRangeComponentValueWorker(RegexNode component, out char ch) ...@@ -1284,14 +1292,14 @@ private bool TryGetRangeComponentValueWorker(RegexNode component, out char ch)
ch = ((RegexSimpleEscapeNode)component).TypeToken.VirtualChars[0]; ch = ((RegexSimpleEscapeNode)component).TypeToken.VirtualChars[0];
switch (ch) switch (ch)
{ {
case 'a': ch = '\u0007'; break; case 'a': ch = '\u0007'; break; // bell
case 'b': ch = '\b'; break; case 'b': ch = '\b'; break; // backspace
case 'e': ch = '\u001B'; break; case 'e': ch = '\u001B'; break; // escape
case 'f': ch = '\f'; break; case 'f': ch = '\f'; break; // form feed
case 'n': ch = '\n'; break; case 'n': ch = '\n'; break; // new line
case 'r': ch = '\r'; break; case 'r': ch = '\r'; break; // carriage return
case 't': ch = '\t'; break; case 't': ch = '\t'; break; // tab
case 'v': ch = '\u000B'; break; case 'v': ch = '\u000B'; break; // vertical tab
} }
return true; return true;
...@@ -1889,6 +1897,21 @@ private RegexControlEscapeNode ParseControlEscape(RegexToken backslashToken, boo ...@@ -1889,6 +1897,21 @@ private RegexControlEscapeNode ParseControlEscape(RegexToken backslashToken, boo
unchecked unchecked
{ {
// From: https://github.com/dotnet/corefx/blob/80e220fc7009de0f0611ee6b52d4d5ffd25eb6c7/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs#L1450
// Note: Roslyn accepts a control escape that the current .Net parser does not.
// Specifically: \c[
//
// It is a bug that the .Net parser does not support this construct. The bug was
// reported at: https://github.com/dotnet/corefx/issues/26501 and was fixed for
// CoreFx with https://github.com/dotnet/corefx/commit/80e220fc7009de0f0611ee6b52d4d5ffd25eb6c7
//
// Because it was a bug, we follow the correct behavior. That means we will not
// report a diagnostic for a Regex that someone might run on a previous version of
// .Net that ends up throwing at runtime. That's acceptable. Our goal is to match
// the latest .Net 'correct' behavior. Not intermediary points with bugs that have
// since been fixed.
// \ca interpreted as \cA // \ca interpreted as \cA
if (ch >= 'a' && ch <= 'z') if (ch >= 'a' && ch <= 'z')
{ {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册