提交 2fb306e0 编写于 作者: C Cyrus Najmabadi

Initial work on a regex classifier.

上级 997057fd
// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System.Threading;
using Microsoft.CodeAnalysis.CSharp.Extensions;
using Microsoft.CodeAnalysis.CSharp.RegularExpressions;
using Microsoft.CodeAnalysis.CSharp.Syntax;
using Microsoft.CodeAnalysis.Diagnostics;
using Microsoft.CodeAnalysis.LanguageServices;
using Microsoft.CodeAnalysis.RegularExpressions;
......@@ -19,12 +16,12 @@ public CSharpValidateRegexStringDiagnosticAnalyzer()
{
}
/// <summary>
/// Resolves the parameter that <paramref name="argumentNode"/> binds to by casting it to
/// <see cref="ArgumentSyntax"/> and delegating to its <c>DetermineParameter</c> helper
/// (with <c>allowParams</c> set to <c>false</c>).
/// </summary>
protected override IParameterSymbol DetermineParameter(SemanticModel semanticModel, SyntaxNode argumentNode, CancellationToken cancellationToken)
{
    var argument = (ArgumentSyntax)argumentNode;
    return argument.DetermineParameter(semanticModel, allowParams: false, cancellationToken);
}
/// <summary>
/// Returns the shared <see cref="CSharpSyntaxFactsService.Instance"/>.
/// </summary>
protected override ISyntaxFactsService GetSyntaxFactsService()
{
    return CSharpSyntaxFactsService.Instance;
}
/// <summary>
/// Returns the shared <see cref="CSharpSemanticFactsService.Instance"/>.
/// </summary>
protected override ISemanticFactsService GetSemanticFactsService()
{
    return CSharpSemanticFactsService.Instance;
}
/// <summary>
/// Returns the shared <see cref="CSharpVirtualCharService.Instance"/>.
/// </summary>
protected override IVirtualCharService GetVirtualCharService()
{
    return CSharpVirtualCharService.Instance;
}
}
......
// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Diagnostics;
using System.Linq;
using System.Text.RegularExpressions;
using System.Threading;
using Microsoft.CodeAnalysis.CodeStyle;
using Microsoft.CodeAnalysis.Diagnostics;
using Microsoft.CodeAnalysis.LanguageServices;
using Microsoft.CodeAnalysis.RegularExpressions;
using Microsoft.CodeAnalysis.Shared.Extensions;
using Roslyn.Utilities;
namespace Microsoft.CodeAnalysis.ValidateRegexString
{
......@@ -20,7 +13,6 @@ internal abstract class AbstractValidateRegexStringDiagnosticAnalyzer<TSyntaxKin
: AbstractCodeStyleDiagnosticAnalyzer
where TSyntaxKind : struct
{
private const string _patternName = "pattern";
private readonly int _stringLiteralKind;
protected AbstractValidateRegexStringDiagnosticAnalyzer(int stringLiteralKind)
......@@ -37,10 +29,9 @@ public override bool OpenFileOnly(Workspace workspace)
=> false;
// Language-specific hooks that each concrete analyzer must supply.

/// <summary>Supplies the <see cref="ISyntaxFactsService"/> this analyzer should use.</summary>
protected abstract ISyntaxFactsService GetSyntaxFactsService();
/// <summary>Supplies the <see cref="ISemanticFactsService"/> this analyzer should use.</summary>
protected abstract ISemanticFactsService GetSemanticFactsService();
/// <summary>
/// Supplies the <see cref="IVirtualCharService"/> used to convert a string literal token
/// into virtual chars before parsing. Callers in this file treat a null result as
/// "nothing to analyze".
/// </summary>
protected abstract IVirtualCharService GetVirtualCharService();
/// <summary>
/// Determines which parameter the given argument node corresponds to.
/// </summary>
protected abstract IParameterSymbol DetermineParameter(SemanticModel semanticModel, SyntaxNode argumentNode, CancellationToken cancellationToken);
/// <summary>
/// Registers <c>AnalyzeSemanticModel</c> as a semantic-model action on the supplied
/// <paramref name="context"/>.
/// </summary>
protected override void InitializeWorker(AnalysisContext context)
{
    context.RegisterSemanticModelAction(AnalyzeSemanticModel);
}
......@@ -62,221 +53,63 @@ private void AnalyzeSemanticModel(SemanticModelAnalysisContext context)
return;
}
var regexType = semanticModel.Compilation.GetTypeByMetadataName(typeof(Regex).FullName);
if (regexType == null)
var detector = RegexPatternDetector.TryCreate(
semanticModel, GetSyntaxFactsService(), GetSemanticFactsService());
if (detector == null)
{
return;
}
var syntaxFacts = GetSyntaxFactsService();
var methodNamesOfInterest = GetMethodNamesOfInterest(regexType, syntaxFacts);
var root = syntaxTree.GetRoot(cancellationToken);
var analyzer = new Analyzer(this, context, regexType, methodNamesOfInterest);
analyzer.Analyze(root);
}
private HashSet<string> GetMethodNamesOfInterest(INamedTypeSymbol regexType, ISyntaxFactsService syntaxFacts)
{
var result = syntaxFacts.IsCaseSensitive
? new HashSet<string>()
: new HashSet<string>(StringComparer.OrdinalIgnoreCase);
var methods = from method in regexType.GetMembers().OfType<IMethodSymbol>()
where method.DeclaredAccessibility == Accessibility.Public
where method.IsStatic
where method.Parameters.Any(p => p.Name == _patternName)
select method.Name;
result.AddRange(methods);
return result;
Analyze(context, detector, root, cancellationToken);
}
private struct Analyzer
private void Analyze(
SemanticModelAnalysisContext context, RegexPatternDetector detector,
SyntaxNode node, CancellationToken cancellationToken)
{
private readonly AbstractValidateRegexStringDiagnosticAnalyzer<TSyntaxKind> _analyzer;
private readonly SemanticModelAnalysisContext _context;
private readonly SemanticModel _semanticModel;
private readonly ISyntaxFactsService _syntaxFacts;
private readonly INamedTypeSymbol _regexType;
private readonly HashSet<string> _methodNamesOfInterest;
private readonly CancellationToken _cancellationToken;
cancellationToken.ThrowIfCancellationRequested();
public Analyzer(
AbstractValidateRegexStringDiagnosticAnalyzer<TSyntaxKind> analyzer,
SemanticModelAnalysisContext context, INamedTypeSymbol regexType,
HashSet<string> methodNamesOfInterest)
foreach (var child in node.ChildNodesAndTokens())
{
_analyzer = analyzer;
_context = context;
_semanticModel = context.SemanticModel;
_syntaxFacts = analyzer.GetSyntaxFactsService();
_regexType = regexType;
_methodNamesOfInterest = methodNamesOfInterest;
_cancellationToken = context.CancellationToken;
}
public void Analyze(SyntaxNode node)
{
_cancellationToken.ThrowIfCancellationRequested();
foreach (var child in node.ChildNodesAndTokens())
if (child.IsNode)
{
if (child.IsNode)
{
Analyze(child.AsNode());
}
else
{
var token = child.AsToken();
if (token.RawKind == _analyzer._stringLiteralKind)
{
AnalyzeStringLiteral(token);
}
}
Analyze(context, detector, child.AsNode(), cancellationToken);
}
}
private void AnalyzeStringLiteral(SyntaxToken stringLiteral)
{
var literalNode = stringLiteral.Parent;
var argumentNode = literalNode.Parent;
if (!_syntaxFacts.IsArgument(argumentNode))
{
return;
}
var argumentList = argumentNode.Parent;
var invocationOrCreation = argumentList.Parent;
if (_syntaxFacts.IsInvocationExpression(invocationOrCreation))
{
var invokedExpression = _syntaxFacts.GetExpressionOfInvocationExpression(invocationOrCreation);
var name = GetNameOfInvokedExpression(invokedExpression);
if (!_methodNamesOfInterest.Contains(name))
{
return;
}
// Is a string argument to a method that looks like it could be a Regex method.
// Need to do deeper analysis
var method = _semanticModel.GetSymbolInfo(invocationOrCreation, _cancellationToken).GetAnySymbol();
if (method?.ContainingType != _regexType)
{
return;
}
AnalyzeStringLiteral(stringLiteral, argumentNode);
}
else if (_syntaxFacts.IsObjectCreationExpression(invocationOrCreation))
else
{
var typeNode = _syntaxFacts.GetObjectCreationType(invocationOrCreation);
var name = GetNameOfType(typeNode, _syntaxFacts);
if (name == null)
var token = child.AsToken();
if (token.RawKind == _stringLiteralKind &&
detector.IsRegexPattern(token, cancellationToken, out var options))
{
return;
AnalyzePattern(context, token, options);
}
if (_syntaxFacts.StringComparer.Compare(nameof(Regex), name) != 0)
{
return;
}
// Argument to "new Regex". Need to do deeper analysis
AnalyzeStringLiteral(stringLiteral, argumentNode);
}
else
{
return;
}
}
private void AnalyzeStringLiteral(SyntaxToken stringLiteral, SyntaxNode argumentNode)
{
var parameter = _analyzer.DetermineParameter(_semanticModel, argumentNode, _cancellationToken);
if (parameter?.Name != _patternName)
{
return;
}
var options = GetRegexOptions(argumentNode);
var service = _analyzer.GetVirtualCharService();
if (service == null)
{
return;
}
var virtualChars = service.TryConvertToVirtualChars(stringLiteral);
if (virtualChars.IsDefaultOrEmpty)
{
return;
}
var tree = RegexParser.Parse(virtualChars, options);
foreach (var diag in tree.Diagnostics)
{
_context.ReportDiagnostic(Diagnostic.Create(
_analyzer.GetDescriptorWithSeverity(DiagnosticSeverity.Warning),
Location.Create(_semanticModel.SyntaxTree, diag.Span),
diag.Message));
}
}
}
private RegexOptions GetRegexOptions(SyntaxNode argumentNode)
private void AnalyzePattern(
SemanticModelAnalysisContext context, SyntaxToken stringLiteral, RegexOptions options)
{
var service = this.GetVirtualCharService();
if (service == null)
{
var argumentList = argumentNode.Parent;
var arguments = _syntaxFacts.GetArgumentsOfArgumentList(argumentList);
foreach (var siblingArg in arguments)
{
if (siblingArg != argumentNode)
{
var expr = _syntaxFacts.GetExpressionOfArgument(siblingArg);
if (expr != null)
{
var exprType = _semanticModel.GetTypeInfo(expr, _cancellationToken);
if (exprType.Type?.Name == nameof(RegexOptions))
{
var constVal = _semanticModel.GetConstantValue(expr, _cancellationToken);
if (constVal.HasValue)
{
return (RegexOptions)(int)constVal.Value;
}
}
}
}
}
return RegexOptions.None;
return;
}
private string GetNameOfType(SyntaxNode typeNode, ISyntaxFactsService syntaxFacts)
var virtualChars = service.TryConvertToVirtualChars(stringLiteral);
if (virtualChars.IsDefaultOrEmpty)
{
if (syntaxFacts.IsQualifiedName(typeNode))
{
return GetNameOfType(syntaxFacts.GetRightSideOfDot(typeNode), syntaxFacts);
}
else if (syntaxFacts.IsIdentifierName(typeNode))
{
return syntaxFacts.GetIdentifierOfSimpleName(typeNode).ValueText;
}
return null;
return;
}
private string GetNameOfInvokedExpression(SyntaxNode invokedExpression)
var tree = RegexParser.Parse(virtualChars, options);
foreach (var diag in tree.Diagnostics)
{
if (_syntaxFacts.IsSimpleMemberAccessExpression(invokedExpression))
{
return _syntaxFacts.GetIdentifierOfSimpleName(_syntaxFacts.GetNameOfMemberAccessExpression(invokedExpression)).ValueText;
}
else if (_syntaxFacts.IsIdentifierName(invokedExpression))
{
return _syntaxFacts.GetIdentifierOfSimpleName(invokedExpression).ValueText;
}
return null;
context.ReportDiagnostic(Diagnostic.Create(
this.GetDescriptorWithSeverity(DiagnosticSeverity.Warning),
Location.Create(context.SemanticModel.SyntaxTree, diag.Span),
diag.Message));
}
}
}
......
// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
using System.Collections.Immutable;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Threading;
using Microsoft.CodeAnalysis.Classification;
using Microsoft.CodeAnalysis.CSharp.RegularExpressions;
using Microsoft.CodeAnalysis.PooledObjects;
using Microsoft.CodeAnalysis.RegularExpressions;
namespace Microsoft.CodeAnalysis.CSharp.Classification.Classifiers
{
/// <summary>
/// Syntax classifier that looks at C# string literal tokens, decides whether they are
/// regex patterns (via <see cref="RegexPatternDetector"/>), parses them with
/// <see cref="RegexParser"/>, and hands the resulting tree off for classification.
/// </summary>
internal class RegexPatternTokenClassifier : AbstractSyntaxClassifier
{
    // One detector per semantic model, held weakly so that it goes away with the model.
    private static readonly ConditionalWeakTable<SemanticModel, RegexPatternDetector> _modelToDetector =
        new ConditionalWeakTable<SemanticModel, RegexPatternDetector>();

    /// <summary>
    /// The only token kind this classifier asks to see: string literal tokens.
    /// </summary>
    public override ImmutableArray<int> SyntaxTokenKinds { get; } = ImmutableArray.Create<int>((int)SyntaxKind.StringLiteralToken);

    /// <summary>
    /// Adds classifications for <paramref name="token"/> to <paramref name="result"/> when the
    /// token is a string literal that is used as a regex pattern. Does nothing for any other
    /// string literal.
    /// </summary>
    /// <param name="token">The string literal token being classified.</param>
    /// <param name="semanticModel">Semantic model for the tree containing <paramref name="token"/>.</param>
    /// <param name="result">Builder that receives the classified spans.</param>
    /// <param name="cancellationToken">Token forwarded to the semantic queries.</param>
    public override void AddClassifications(SyntaxToken token, SemanticModel semanticModel, ArrayBuilder<ClassifiedSpan> result, CancellationToken cancellationToken)
    {
        Debug.Assert(token.Kind() == SyntaxKind.StringLiteralToken);

        // Do some quick syntactic checks before doing any complex work.
        if (RegexPatternDetector.IsDefinitelyNotPattern(token, CSharpSyntaxFactsService.Instance))
        {
            return;
        }

        // Looks like it could be a regex pattern. Do more complex check.
        // Cache the detector we create, so we don't have to continually do
        // the same semantic work for every string literal token we visit.
        var detector = _modelToDetector.GetValue(
            semanticModel, m => RegexPatternDetector.TryCreate(
                m, CSharpSyntaxFactsService.Instance, CSharpSemanticFactsService.Instance));

        // TryCreate can produce null (other callers null-check its result), and the table
        // will hand that null back to us. Without this guard the IsRegexPattern call below
        // would throw a NullReferenceException.
        if (detector == null)
        {
            return;
        }

        if (!detector.IsRegexPattern(token, cancellationToken, out var options))
        {
            return;
        }

        var virtualCharService = CSharpVirtualCharService.Instance;
        var chars = virtualCharService.TryConvertToVirtualChars(token);
        if (chars.IsDefaultOrEmpty)
        {
            // The literal could not be converted (or is empty); nothing to parse.
            return;
        }

        var tree = RegexParser.Parse(chars, options);
        AddClassifications(tree, result);
    }

    /// <summary>
    /// Placeholder for turning a parsed <see cref="RegexTree"/> into classified spans.
    /// Not implemented yet: currently always throws <see cref="NotImplementedException"/>.
    /// </summary>
    private void AddClassifications(RegexTree tree, ArrayBuilder<ClassifiedSpan> result)
    {
        throw new NotImplementedException();
    }
}
}
......@@ -296,5 +296,8 @@ public bool IsPartial(ITypeSymbol typeSymbol, CancellationToken cancellationToke
return SpecializedCollections.SingletonEnumerable(
semanticModel.GetDeclaredSymbol(memberDeclaration, cancellationToken));
}
/// <summary>
/// Finds the parameter that <paramref name="argumentNode"/> maps to. The node is cast to
/// <see cref="ArgumentSyntax"/> and the lookup is delegated to its <c>DetermineParameter</c>
/// helper with <c>allowParams</c> set to <c>false</c>.
/// </summary>
public IParameterSymbol FindParameterForArgument(SemanticModel semanticModel, SyntaxNode argumentNode, CancellationToken cancellationToken)
{
    var argument = (ArgumentSyntax)argumentNode;
    return argument.DetermineParameter(semanticModel, allowParams: false, cancellationToken);
}
}
}
......@@ -4,16 +4,17 @@
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using Microsoft.CodeAnalysis.LanguageServices;
using Microsoft.CodeAnalysis.RegularExpressions;
using Microsoft.CodeAnalysis.Shared.Extensions;
using Roslyn.Utilities;
namespace Microsoft.CodeAnalysis.ValidateRegexString
namespace Microsoft.CodeAnalysis.RegularExpressions
{
/// <summary>
/// Helper class to detect regex pattern tokens in a document efficiently.
/// </summary>
internal class RegexPatternDetector
{
private const string _patternName = "pattern";
......@@ -23,29 +24,25 @@ internal class RegexPatternDetector
private readonly ISemanticFactsService _semanticFacts;
private readonly INamedTypeSymbol _regexType;
private readonly HashSet<string> _methodNamesOfInterest;
private readonly CancellationToken _cancellationToken;
public RegexPatternDetector(
SemanticModel semanticModel,
ISyntaxFactsService syntaxFacts,
ISemanticFactsService semanticFacts,
INamedTypeSymbol regexType,
HashSet<string> methodNamesOfInterest,
CancellationToken cancellationToken)
HashSet<string> methodNamesOfInterest)
{
_semanticModel = semanticModel;
_syntaxFacts = syntaxFacts;
_semanticFacts = semanticFacts;
_regexType = regexType;
_methodNamesOfInterest = methodNamesOfInterest;
_cancellationToken = cancellationToken;
}
public static RegexPatternDetector TryCreate(
SemanticModel semanticModel,
ISyntaxFactsService syntaxFacts,
ISemanticFactsService semanticFacts,
CancellationToken cancellationToken)
ISemanticFactsService semanticFacts)
{
var regexType = semanticModel.Compilation.GetTypeByMetadataName(typeof(Regex).FullName);
if (regexType == null)
......@@ -56,7 +53,22 @@ internal class RegexPatternDetector
var methodNamesOfInterest = GetMethodNamesOfInterest(regexType, syntaxFacts);
return new RegexPatternDetector(
semanticModel, syntaxFacts, semanticFacts,
regexType, methodNamesOfInterest, cancellationToken);
regexType, methodNamesOfInterest);
}
/// <summary>
/// Cheap, purely syntactic pre-filter. Returns <c>true</c> when <paramref name="token"/>
/// cannot be a regex pattern, i.e. when it is not a string literal whose parent is a
/// literal expression that in turn sits directly inside an argument.
/// </summary>
public static bool IsDefinitelyNotPattern(SyntaxToken token, ISyntaxFactsService syntaxFacts)
{
    // We only support string literals passed in arguments to something.
    // In the future we could support any string literal, as long as it has
    // some marker (like a comment on it) stating it's a regex.
    //
    // The checks run in order and stop at the first failure, so the parent
    // lookups only happen once the earlier checks have passed.
    var isStringLiteralArgument =
        syntaxFacts.IsStringLiteral(token) &&
        syntaxFacts.IsLiteralExpression(token.Parent) &&
        syntaxFacts.IsArgument(token.Parent.Parent);

    return !isStringLiteralArgument;
}
private static HashSet<string> GetMethodNamesOfInterest(INamedTypeSymbol regexType, ISyntaxFactsService syntaxFacts)
......@@ -76,27 +88,18 @@ where method.Parameters.Any(p => p.Name == _patternName)
return result;
}
public bool IsRegexPattern(SyntaxToken token, out RegexOptions options)
public bool IsRegexPattern(SyntaxToken token, CancellationToken cancellationToken, out RegexOptions options)
{
options = default;
if (!_syntaxFacts.IsStringLiteral(token))
if (IsDefinitelyNotPattern(token, _syntaxFacts))
{
return false;
}
return Analyze(token, out options);
}
private bool Analyze(SyntaxToken stringLiteral, out RegexOptions options)
{
options = default;
var stringLiteral = token;
var literalNode = stringLiteral.Parent;
var argumentNode = literalNode.Parent;
if (!_syntaxFacts.IsArgument(argumentNode))
{
return false;
}
Debug.Assert(_syntaxFacts.IsArgument(argumentNode));
var argumentList = argumentNode.Parent;
var invocationOrCreation = argumentList.Parent;
......@@ -108,10 +111,13 @@ private bool Analyze(SyntaxToken stringLiteral, out RegexOptions options)
{
// Is a string argument to a method that looks like it could be a Regex method.
// Need to do deeper analysis
var method = _semanticModel.GetSymbolInfo(invocationOrCreation, _cancellationToken).GetAnySymbol();
if (method?.ContainingType == _regexType)
var method = _semanticModel.GetSymbolInfo(invocationOrCreation, cancellationToken).GetAnySymbol();
if (method.DeclaredAccessibility == Accessibility.Public &&
method.IsStatic &&
_regexType.Equals(method?.ContainingType))
{
return AnalyzeStringLiteral(stringLiteral, argumentNode, out options);
return AnalyzeStringLiteral(
stringLiteral, argumentNode, cancellationToken, out options);
}
}
}
......@@ -123,8 +129,13 @@ private bool Analyze(SyntaxToken stringLiteral, out RegexOptions options)
{
if (_syntaxFacts.StringComparer.Compare(nameof(Regex), name) == 0)
{
// Argument to "new Regex". Need to do deeper analysis
return AnalyzeStringLiteral(stringLiteral, argumentNode, out options);
var typeSymbol = _semanticModel.GetTypeInfo(typeNode, cancellationToken).Type;
if (_regexType.Equals(typeSymbol))
{
// Argument to "new Regex". Need to do deeper analysis
return AnalyzeStringLiteral(
stringLiteral, argumentNode, cancellationToken, out options);
}
}
}
}
......@@ -133,21 +144,22 @@ private bool Analyze(SyntaxToken stringLiteral, out RegexOptions options)
}
private bool AnalyzeStringLiteral(
SyntaxToken stringLiteral, SyntaxNode argumentNode, out RegexOptions options)
SyntaxToken stringLiteral, SyntaxNode argumentNode,
CancellationToken cancellationToken, out RegexOptions options)
{
options = default;
var parameter = _semanticFacts.FindParameterForArgument(_semanticModel, argumentNode, _cancellationToken);
var parameter = _semanticFacts.FindParameterForArgument(_semanticModel, argumentNode, cancellationToken);
if (parameter?.Name != _patternName)
{
return false;
}
options = GetRegexOptions(argumentNode);
options = GetRegexOptions(argumentNode, cancellationToken);
return true;
}
private RegexOptions GetRegexOptions(SyntaxNode argumentNode)
private RegexOptions GetRegexOptions(SyntaxNode argumentNode, CancellationToken cancellationToken)
{
var argumentList = argumentNode.Parent;
var arguments = _syntaxFacts.GetArgumentsOfArgumentList(argumentList);
......@@ -158,10 +170,10 @@ private RegexOptions GetRegexOptions(SyntaxNode argumentNode)
var expr = _syntaxFacts.GetExpressionOfArgument(siblingArg);
if (expr != null)
{
var exprType = _semanticModel.GetTypeInfo(expr, _cancellationToken);
var exprType = _semanticModel.GetTypeInfo(expr, cancellationToken);
if (exprType.Type?.Name == nameof(RegexOptions))
{
var constVal = _semanticModel.GetConstantValue(expr, _cancellationToken);
var constVal = _semanticModel.GetConstantValue(expr, cancellationToken);
if (constVal.HasValue)
{
return (RegexOptions)(int)constVal.Value;
......
......@@ -290,5 +290,9 @@ Namespace Microsoft.CodeAnalysis.VisualBasic
Return {semanticModel.GetDeclaredSymbol(memberDeclaration, cancellationToken)}
End Function
''' <summary>
''' Finds the parameter that <paramref name="argumentNode"/> maps to. The node is cast to
''' <see cref="ArgumentSyntax"/> and the lookup is delegated to its DetermineParameter
''' helper with allowParamArray set to False.
''' </summary>
Public Function FindParameterForArgument(semanticModel As SemanticModel, argumentNode As SyntaxNode, cancellationToken As CancellationToken) As IParameterSymbol Implements ISemanticFactsService.FindParameterForArgument
    Dim argument As ArgumentSyntax = DirectCast(argumentNode, ArgumentSyntax)
    Return argument.DetermineParameter(semanticModel, allowParamArray:=False, cancellationToken)
End Function
End Class
End Namespace
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册