// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System.Collections.Immutable;
using System.Composition;
using System.Diagnostics;
using Microsoft.CodeAnalysis.Host.Mef;
using Microsoft.CodeAnalysis.PooledObjects;
using Microsoft.CodeAnalysis.Text;
using Microsoft.CodeAnalysis.VirtualChars;
namespace Microsoft.CodeAnalysis.CSharp.VirtualChars
{
    /// <summary>
    /// C# implementation of <see cref="IVirtualCharService"/>.  Handles both regular string
    /// literals (with backslash escapes) and verbatim string literals (where only a doubled
    /// quote is an escape).
    /// </summary>
    [ExportLanguageService(typeof(IVirtualCharService), LanguageNames.CSharp), Shared]
    internal class CSharpVirtualCharService : AbstractVirtualCharService
    {
        private const string UnreachableMessage = "This should not be reachable as long as the compiler added no diagnostics.";

        public static readonly IVirtualCharService Instance = new CSharpVirtualCharService();

        /// <summary>
        /// Converts <paramref name="token"/> to virtual chars, or returns <see langword="default"/>
        /// if it is not a string literal token or contains an escape this service cannot represent.
        /// </summary>
        protected override ImmutableArray<VirtualChar> TryConvertToVirtualCharsWorker(SyntaxToken token)
        {
            if (token.Kind() != SyntaxKind.StringLiteralToken)
            {
                return default;
            }

            return token.IsVerbatimStringLiteral()
                ? TryConvertVerbatimStringToVirtualChars(token)
                : TryConvertStringToVirtualChars(token);
        }

        /// <summary>
        /// Verbatim strings only escape the double quote (by doubling it), which the shared
        /// base-class helper already knows how to process.
        /// </summary>
        private ImmutableArray<VirtualChar> TryConvertVerbatimStringToVirtualChars(SyntaxToken token)
            => TryConvertSimpleDoubleQuoteString(token, "@\"");

        /// <summary>
        /// Converts a regular (non-verbatim) string literal.  Each backslash escape becomes one
        /// <see cref="VirtualChar"/> spanning the whole escape; every other character becomes a
        /// <see cref="VirtualChar"/> with a span of length one.
        /// </summary>
        private ImmutableArray<VirtualChar> TryConvertStringToVirtualChars(SyntaxToken token)
        {
            const string StartDelimiter = "\"";
            const string EndDelimiter = "\"";

            var tokenText = token.Text;

            // Ordinal comparisons: the delimiters are syntax, not linguistic text.
            if (tokenText.Length < StartDelimiter.Length + EndDelimiter.Length ||
                !tokenText.StartsWith(StartDelimiter, System.StringComparison.Ordinal) ||
                !tokenText.EndsWith(EndDelimiter, System.StringComparison.Ordinal))
            {
                Debug.Fail(UnreachableMessage);
                return default;
            }

            var startIndexInclusive = StartDelimiter.Length;
            var endIndexExclusive = tokenText.Length - EndDelimiter.Length;
            var offset = token.SpanStart;

            var result = ArrayBuilder<VirtualChar>.GetInstance();
            try
            {
                for (var index = startIndexInclusive; index < endIndexExclusive;)
                {
                    if (tokenText[index] == '\\')
                    {
                        if (!TryAddEscape(result, tokenText, offset, index))
                        {
                            return default;
                        }

                        // Advance past the entire escape sequence that was just added.
                        index += result.Last().Span.Length;
                    }
                    else
                    {
                        result.Add(new VirtualChar(tokenText[index], new TextSpan(offset + index, 1)));
                        index++;
                    }
                }

                return result.ToImmutable();
            }
            finally
            {
                // Return the pooled builder on every path, including the early failure return.
                result.Free();
            }
        }

        /// <summary>
        /// Adds the escape sequence starting at <paramref name="index"/> (which must be a
        /// backslash) to <paramref name="result"/>.  Returns <see langword="false"/> if the
        /// escape could not be converted to a single char.
        /// </summary>
        private bool TryAddEscape(
            ArrayBuilder<VirtualChar> result, string tokenText, int offset, int index)
        {
            // Copied from Lexer.ScanEscapeSequence.
            Debug.Assert(tokenText[index] == '\\');

            return TryAddSingleCharacterEscape(result, tokenText, offset, index) ||
                   TryAddMultiCharacterEscape(result, tokenText, offset, index);
        }

        /// <summary>
        /// Handles two-character escapes such as <c>\n</c> or <c>\\</c>.  Returns
        /// <see langword="false"/> without adding anything if the character after the backslash
        /// is not one of them.
        /// </summary>
        private bool TryAddSingleCharacterEscape(
            ArrayBuilder<VirtualChar> result, string tokenText, int offset, int index)
        {
            // Copied from Lexer.ScanEscapeSequence.
            Debug.Assert(tokenText[index] == '\\');

            var ch = tokenText[index + 1];
            switch (ch)
            {
                // escaped characters that translate to themselves
                case '\'':
                case '"':
                case '\\':
                    break;

                // translate escapes as per C# spec
                case '0': ch = '\0'; break;
                case 'a': ch = '\a'; break;
                case 'b': ch = '\b'; break;
                case 'f': ch = '\f'; break;
                case 'n': ch = '\n'; break;
                case 'r': ch = '\r'; break;
                case 't': ch = '\t'; break;
                case 'v': ch = '\v'; break;

                default:
                    return false;
            }

            result.Add(new VirtualChar(ch, new TextSpan(offset + index, 2)));
            return true;
        }

        /// <summary>
        /// Handles the hexadecimal escapes <c>\x</c>, <c>\u</c> and <c>\U</c> by dispatching on
        /// the character following the backslash.
        /// </summary>
        private bool TryAddMultiCharacterEscape(
            ArrayBuilder<VirtualChar> result, string tokenText, int offset, int index)
        {
            // Copied from Lexer.ScanEscapeSequence.
            Debug.Assert(tokenText[index] == '\\');

            var ch = tokenText[index + 1];
            switch (ch)
            {
                case 'x':
                case 'u':
                case 'U':
                    return TryAddMultiCharacterEscape(result, tokenText, offset, index, ch);

                default:
                    Debug.Fail(UnreachableMessage);
                    return false;
            }
        }

        /// <summary>
        /// Adds the hexadecimal escape of kind <paramref name="character"/> (<c>'x'</c>,
        /// <c>'u'</c> or <c>'U'</c>) that starts at <paramref name="index"/>.
        /// </summary>
        private bool TryAddMultiCharacterEscape(
            ArrayBuilder<VirtualChar> result, string tokenText, int offset, int index, char character)
        {
            Debug.Assert(tokenText[index] == '\\');

            // Skip past the backslash and the escape kind to reach the first hex digit.
            var firstDigitIndex = index + 2;
            if (firstDigitIndex >= tokenText.Length || !IsHexDigit(tokenText[firstDigitIndex]))
            {
                Debug.Fail(UnreachableMessage);
                return false;
            }

            switch (character)
            {
                case 'U':
                    // 8 character escape.  May represent 1 or 2 actual chars.  In the case of
                    // 2 chars, we will fail out as that isn't supported in this system (currently).
                    return TryAddFixedLengthHexEscape(result, tokenText, offset, index, digitCount: 8);

                case 'u':
                    // 4 character escape representing one char.
                    return TryAddFixedLengthHexEscape(result, tokenText, offset, index, digitCount: 4);

                default:
                    Debug.Assert(character == 'x');
                    return AddVariableLengthHexEscape(result, tokenText, offset, index);
            }
        }

        /// <summary>
        /// Reads exactly <paramref name="digitCount"/> hex digits following the two-character
        /// escape prefix at <paramref name="index"/> and adds the resulting char.
        /// </summary>
        private static bool TryAddFixedLengthHexEscape(
            ArrayBuilder<VirtualChar> result, string tokenText, int offset, int index, int digitCount)
        {
            const int PrefixLength = 2;

            var firstDigitIndex = index + PrefixLength;
            if (firstDigitIndex + digitCount > tokenText.Length)
            {
                Debug.Fail(UnreachableMessage);
                return false;
            }

            uint value = 0;
            for (var i = 0; i < digitCount; i++)
            {
                var digit = tokenText[firstDigitIndex + i];
                if (!IsHexDigit(digit))
                {
                    Debug.Fail(UnreachableMessage);
                    return false;
                }

                value = (value << 4) + (uint)HexValue(digit);
            }

            if (value > 0x0010FFFF)
            {
                Debug.Fail(UnreachableMessage);
                return false;
            }

            // Surrogate characters aren't supported here.
            if (value >= 0x00010000)
            {
                // This is possible.  It's a legal C# escape, but we don't support it here because it
                // would need two chars to encode.
                return false;
            }

            result.Add(new VirtualChar((char)value, new TextSpan(offset + index, PrefixLength + digitCount)));
            return true;
        }

        /// <summary>
        /// Reads a <c>\x</c> escape, which is followed by one to four hex digits.  The caller has
        /// already verified that at least one hex digit is present.
        /// </summary>
        private static bool AddVariableLengthHexEscape(
            ArrayBuilder<VirtualChar> result, string tokenText, int offset, int index)
        {
            const int MaxDigits = 4;

            var firstDigitIndex = index + 2;
            var endIndex = firstDigitIndex;
            var value = 0;

            // Stopping early is expected here: these escape sequences are variable length.
            while (endIndex < tokenText.Length &&
                   endIndex - firstDigitIndex < MaxDigits &&
                   IsHexDigit(tokenText[endIndex]))
            {
                value = (value << 4) + HexValue(tokenText[endIndex]);
                endIndex++;
            }

            result.Add(new VirtualChar((char)value, TextSpan.FromBounds(offset + index, offset + endIndex)));
            return true;
        }

        /// <summary>
        /// Numeric value of a hex digit.  Only meaningful when <see cref="IsHexDigit"/> is true
        /// for <paramref name="c"/>.
        /// </summary>
        private static int HexValue(char c)
        {
            // Masking with 0xdf clears the lowercase bit so 'a'-'f' map onto 'A'-'F'.
            return (c >= '0' && c <= '9') ? c - '0' : (c & 0xdf) - 'A' + 10;
        }

        /// <summary>Whether <paramref name="c"/> is one of 0-9, A-F or a-f.</summary>
        private static bool IsHexDigit(char c)
        {
            return (c >= '0' && c <= '9') ||
                   (c >= 'A' && c <= 'F') ||
                   (c >= 'a' && c <= 'f');
        }
    }
}
// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System.Collections.Immutable;
using System.Linq;
using Microsoft.CodeAnalysis.CSharp.VirtualChars;
using Microsoft.CodeAnalysis.VirtualChars;
using Xunit;
namespace Microsoft.CodeAnalysis.CSharp.UnitTests.VirtualChars
{
    /// <summary>
    /// Tests for <see cref="CSharpVirtualCharService"/>.  Expected results are written as a
    /// sequence of <c>[char,[start,end]]</c> entries where the positions are relative to the
    /// start of the string token (i.e. with the statement prefix subtracted).
    /// </summary>
    public class CSharpVirtualCharServiceTests
    {
        private const string _statementPrefix = "var v = ";

        /// <summary>
        /// Parses <c>var v = &lt;text&gt;</c> and returns the token in the initializer position,
        /// asserting that it is a string literal token.
        /// </summary>
        private SyntaxToken GetStringToken(string text)
        {
            var statement = _statementPrefix + text;
            var parsedStatement = SyntaxFactory.ParseStatement(statement);

            // Tokens are: 'var', 'v', '=', <literal>, ...
            var token = parsedStatement.DescendantTokens().ToArray()[3];
            Assert.Equal(SyntaxKind.StringLiteralToken, token.Kind());

            return token;
        }

        /// <summary>Asserts that converting <paramref name="stringText"/> produces <paramref name="expected"/>.</summary>
        private void Test(string stringText, string expected)
        {
            var token = GetStringToken(stringText);
            var virtualChars = CSharpVirtualCharService.Instance.TryConvertToVirtualChars(token);
            var actual = ConvertToString(virtualChars);
            Assert.Equal(expected, actual);
        }

        /// <summary>Asserts that converting <paramref name="stringText"/> yields no result.</summary>
        private void TestFailure(string stringText)
        {
            var token = GetStringToken(stringText);
            var virtualChars = CSharpVirtualCharService.Instance.TryConvertToVirtualChars(token);
            Assert.True(virtualChars.IsDefault);
        }

        [Fact]
        public void TestEmptyString()
        {
            Test("\"\"", "");
        }

        [Fact]
        public void TestEmptyVerbatimString()
        {
            Test("@\"\"", "");
        }

        [Fact]
        public void TestSimpleString()
        {
            Test("\"a\"", "['a',[1,2]]");
        }

        [Fact]
        public void TestSimpleVerbatimString()
        {
            Test("@\"a\"", "['a',[2,3]]");
        }

        [Fact]
        public void TestUnterminatedString()
        {
            TestFailure("\"");
        }

        [Fact]
        public void TestUnterminatedVerbatimString()
        {
            TestFailure("@\"");
        }

        [Fact]
        public void TestSimpleEscape()
        {
            Test(@"""a\ta""", "['a',[1,2]]['\\u0009',[2,4]]['a',[4,5]]");
        }

        [Fact]
        public void TestMultipleSimpleEscape()
        {
            Test(@"""a\t\ta""", "['a',[1,2]]['\\u0009',[2,4]]['\\u0009',[4,6]]['a',[6,7]]");
        }

        [Fact]
        public void TestNonEscapeInVerbatim()
        {
            Test(@"@""a\ta""", "['a',[2,3]]['\\u005C',[3,4]]['t',[4,5]]['a',[5,6]]");
        }

        [Fact]
        public void TestInvalidHexEscape()
        {
            TestFailure(@"""\xZ""");
        }

        [Fact]
        public void TestValidHex1Escape()
        {
            Test(@"""\xa""", @"['\u000A',[1,4]]");
        }

        [Fact]
        public void TestValidHex2Escape()
        {
            Test(@"""\xaa""", @"['\u00AA',[1,5]]");
        }

        [Fact]
        public void TestValidHex3Escape()
        {
            Test(@"""\xaaa""", @"['\u0AAA',[1,6]]");
        }

        [Fact]
        public void TestValidHex4Escape()
        {
            Test(@"""\xaaaa""", @"['\uAAAA',[1,7]]");
        }

        [Fact]
        public void TestValidHex5Escape()
        {
            // Only the first four hex digits belong to the escape; the fifth is a plain 'a'.
            Test(@"""\xaaaaa""", @"['\uAAAA',[1,7]]['a',[7,8]]");
        }

        [Fact]
        public void TestValidHex6Escape()
        {
            Test(@"""a\xaaaaa""", @"['a',[1,2]]['\uAAAA',[2,8]]['a',[8,9]]");
        }

        [Fact]
        public void TestInvalidUnicodeEscape()
        {
            TestFailure(@"""\u000""");
        }

        [Fact]
        public void TestValidUnicodeEscape1()
        {
            Test(@"""\u0000""", @"['\u0000',[1,7]]");
        }

        [Fact]
        public void TestValidUnicodeEscape2()
        {
            Test(@"""a\u0000a""", @"['a',[1,2]]['\u0000',[2,8]]['a',[8,9]]");
        }

        [Fact]
        public void TestInvalidLongUnicodeEscape1()
        {
            TestFailure(@"""\U0000""");
        }

        [Fact]
        public void TestInvalidLongUnicodeEscape2()
        {
            TestFailure(@"""\U10000000""");
        }

        [Fact]
        public void TestValidLongEscape1()
        {
            Test(@"""\U00000000""", @"['\u0000',[1,11]]");
        }

        [Fact]
        public void TestValidLongEscape2()
        {
            Test(@"""\U0000ffff""", @"['\uFFFF',[1,11]]");
        }

        [Fact]
        public void TestValidLongEscape3()
        {
            Test(@"""a\U00000000a""", @"['a',[1,2]]['\u0000',[2,12]]['a',[12,13]]");
        }

        [Fact]
        public void TestValidButUnsupportedLongEscape1()
        {
            // The escape is legal C#, so the compiler reports nothing, but it needs two chars
            // to encode and the service therefore declines to convert it.
            var token = GetStringToken(@"""\U00010000""");
            Assert.False(token.ContainsDiagnostics);
            TestFailure(@"""\U00010000""");
        }

        [Fact]
        public void TestEscapedQuoteInVerbatimString()
        {
            Test("@\"a\"\"a\"", @"['a',[2,3]]['\u0022',[3,5]]['a',[5,6]]");
        }

        private string ConvertToString(ImmutableArray<VirtualChar> virtualChars)
            => string.Join("", virtualChars.Select(ConvertToString));

        private string ConvertToString(VirtualChar vc)
            => $"[{ConvertToString(vc.Char)},[{vc.Span.Start - _statementPrefix.Length},{vc.Span.End - _statementPrefix.Length}]]";

        // ASCII letters and digits are shown as-is; everything else as a \uXXXX escape so that
        // expected strings stay readable and unambiguous.
        private string ConvertToString(char c)
            => char.IsLetterOrDigit(c) && c < 127 ? $"'{c}'" : $"'\\u{((int)c).ToString("X4")}'";
    }
}
// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System.Collections.Immutable;
using System.Diagnostics;
using System.Linq;
using Microsoft.CodeAnalysis.PooledObjects;
using Microsoft.CodeAnalysis.Text;
namespace Microsoft.CodeAnalysis.VirtualChars
{
    /// <summary>
    /// Language-independent base for <see cref="IVirtualCharService"/> implementations.  It
    /// rejects tokens with diagnostics, delegates the conversion to the language-specific
    /// worker, and (in debug builds) validates the worker's result.
    /// </summary>
    internal abstract class AbstractVirtualCharService : IVirtualCharService
    {
        private const string UnreachableMessage = "This should not be reachable as long as the compiler added no diagnostics.";

        /// <summary>
        /// Language-specific conversion.  Only called for tokens that contain no diagnostics.
        /// Returns <see langword="default"/> when the token cannot be converted.
        /// </summary>
        protected abstract ImmutableArray<VirtualChar> TryConvertToVirtualCharsWorker(SyntaxToken token);

        public ImmutableArray<VirtualChar> TryConvertToVirtualChars(SyntaxToken token)
        {
            // We don't process any strings that contain diagnostics in it.  That means that we can
            // trust that all the string's contents (most importantly, the escape sequences) are well
            // formed.
            if (token.ContainsDiagnostics)
            {
                return default;
            }

            var result = TryConvertToVirtualCharsWorker(token);
            CheckInvariants(token, result);
            return result;
        }

        /// <summary>
        /// Does some invariant checking to make sure we processed the string token the same way
        /// the C# and VB compilers did.  Compiled away entirely outside of debug builds so the
        /// string it builds for comparison is never allocated there.
        /// </summary>
        [Conditional("DEBUG")]
        private static void CheckInvariants(SyntaxToken token, ImmutableArray<VirtualChar> result)
        {
            if (result.IsDefault)
            {
                return;
            }

            // Ensure that we properly broke up the token into a sequence of characters that
            // matches what the compiler did.
            var expectedValueText = token.ValueText;
            var actualValueText = result.CreateString();
            Debug.Assert(expectedValueText == actualValueText);

            if (result.Length == 0)
            {
                return;
            }

            var currentVC = result[0];
            Debug.Assert(currentVC.Span.Start > token.SpanStart, "First span has to start after the start of the string token (including its delimeter)");
            Debug.Assert(currentVC.Span.Start == token.SpanStart + 1 || currentVC.Span.Start == token.SpanStart + 2, "First span should start on the second or third char of the string.");

            for (var i = 1; i < result.Length; i++)
            {
                var nextVC = result[i];
                Debug.Assert(currentVC.Span.End == nextVC.Span.Start, "Virtual character spans have to be touching.");
                currentVC = nextVC;
            }

            var lastVC = result.Last();
            Debug.Assert(lastVC.Span.End == token.Span.End - 1, "Last span has to end right before the end of the string token (including its trailing delimeter).");
        }

        /// <summary>
        /// Helper to convert simple string literals that escape quotes by doubling them.  This is
        /// how normal VB literals and c# verbatim string literals work.
        /// </summary>
        /// <param name="token">The string literal token whose text is converted.</param>
        /// <param name="startDelimiter">The start characters string.  " in VB and @" in C#</param>
        /// <returns>
        /// One <see cref="VirtualChar"/> per character between the delimiters (a doubled quote
        /// counts as one character with a span of length two), or <see langword="default"/> if
        /// the token text does not have the expected delimiters.
        /// </returns>
        protected static ImmutableArray<VirtualChar> TryConvertSimpleDoubleQuoteString(
            SyntaxToken token, string startDelimiter)
        {
            const string endDelimiter = "\"";

            var tokenText = token.Text;

            // Ordinal comparisons: the delimiters are syntax, not linguistic text.
            if (tokenText.Length < startDelimiter.Length + endDelimiter.Length ||
                !tokenText.StartsWith(startDelimiter, System.StringComparison.Ordinal) ||
                !tokenText.EndsWith(endDelimiter, System.StringComparison.Ordinal))
            {
                Debug.Fail(UnreachableMessage);
                return default;
            }

            var startIndexInclusive = startDelimiter.Length;
            var endIndexExclusive = tokenText.Length - endDelimiter.Length;

            var result = ArrayBuilder<VirtualChar>.GetInstance();
            var offset = token.SpanStart;

            for (var index = startIndexInclusive; index < endIndexExclusive;)
            {
                // index + 1 is always in range here because the end delimiter follows the content.
                if (tokenText[index] == '"' &&
                    tokenText[index + 1] == '"')
                {
                    result.Add(new VirtualChar('"', new TextSpan(offset + index, 2)));
                    index += 2;
                }
                else
                {
                    result.Add(new VirtualChar(tokenText[index], new TextSpan(offset + index, 1)));
                    index++;
                }
            }

            return result.ToImmutableAndFree();
        }
    }
}
// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System.Collections.Immutable;
using Microsoft.CodeAnalysis.Host;
using Microsoft.CodeAnalysis.Text;
namespace Microsoft.CodeAnalysis.VirtualChars
{
    /// <summary>
    /// Helper service that takes the raw text of a string token and produces the individual
    /// characters that raw string token represents (i.e. with escapes collapsed).  The difference
    /// between this and the result from token.ValueText is that for each collapsed character
    /// returned the original span of text in the original token can be found.  i.e. if you had the
    /// following in C#:
    ///
    /// "G\u006fo"
    ///
    /// Then you'd get back:
    ///
    /// 'G' -> [0, 1) 'o' -> [1, 7) 'o' -> [7, 8)
    ///
    /// This allows for embedded language processing that can refer back to the users' original code
    /// instead of the escaped value we're processing.
    /// </summary>
    internal interface IVirtualCharService : ILanguageService
    {
        /// <summary>
        /// Takes in a string token and return the <see cref="VirtualChar"/>s corresponding to each
        /// char of the tokens <see cref="SyntaxToken.ValueText"/>.  In other words, for each char
        /// in ValueText there will be a VirtualChar in the resultant array.  Each VirtualChar will
        /// specify what char the language considers them to represent, as well as the span of text
        /// in the original <see cref="SourceText"/> that the language created that char from.
        ///
        /// For most chars this will be a single character span.  i.e. 'c' -> 'c'.  However, for
        /// escapes this may be a multi character span.  i.e. 'c' -> '\u0063'
        ///
        /// If the token is not a string literal token, or the string literal has any diagnostics on
        /// it, then <see langword="default"/> will be returned.  Additionally, because a
        /// VirtualChar can only represent a single char, while some escape sequences represent
        /// multiple chars, <see langword="default"/> will also be returned in those cases.  All
        /// these cases could be relaxed in the future.  But they greatly simplify the
        /// implementation.
        ///
        /// If this function succeeds, certain invariants will hold.  First, each character in the
        /// sequence of characters in <paramref name="token"/>.ValueText will become a single
        /// VirtualChar in the result array with a matching <see cref="VirtualChar.Char"/> property.
        /// Similarly, each VirtualChar's <see cref="VirtualChar.Span"/> will abut each other, and
        /// the union of all of them will cover the span of the token's <see
        /// cref="SyntaxToken.Text"/>
        /// *not* including the start and end quotes.
        ///
        /// In essence the VirtualChar array acts as the information explaining how the <see
        /// cref="SyntaxToken.Text"/> of the token between the quotes maps to each character in the
        /// token's <see cref="SyntaxToken.ValueText"/>.
        /// </summary>
        ImmutableArray<VirtualChar> TryConvertToVirtualChars(SyntaxToken token);
    }
}
// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
using Microsoft.CodeAnalysis.Text;
namespace Microsoft.CodeAnalysis.VirtualChars
{
    /// <summary>
    /// The Regex and Json parsers wants to work over an array of characters, however this array of
    /// characters is not the same as the array of characters a user types into a string in C# or
    /// VB.  For example In C# someone may write: @"\z".  This should appear to the user the same as
    /// if they wrote "\\z" and the same as "\\\u007a".  However, as these all have wildly different
    /// presentations for the user, there needs to be a way to map back the characters it sees ( '\'
    /// and 'z' ) back to the ranges of characters the user wrote.
    ///
    /// VirtualChar serves this purpose.  It contains the interpreted value of any language
    /// character/character-escape-sequence, as well as the original SourceText span where that
    /// interpreted character was created from.  This allows the regex and json parsers to both
    /// process input from any language uniformly, but then also produce trees and diagnostics that
    /// map back properly to the original source text locations that make sense to the user.
    /// </summary>
    internal struct VirtualChar : IEquatable<VirtualChar>
    {
        /// <summary>The interpreted character value.</summary>
        public readonly char Char;

        /// <summary>The span of source text that <see cref="Char"/> was created from.</summary>
        public readonly TextSpan Span;

        /// <summary>Creates a virtual char covering <paramref name="span"/>.</summary>
        /// <exception cref="ArgumentException"><paramref name="span"/> is empty.</exception>
        public VirtualChar(char @char, TextSpan span)
        {
            if (span.IsEmpty)
            {
                throw new ArgumentException("Span should not be empty.", nameof(span));
            }

            Char = @char;
            Span = span;
        }

        public override bool Equals(object obj)
            => obj is VirtualChar vc && Equals(vc);

        public bool Equals(VirtualChar other)
            => Char == other.Char &&
               Span == other.Span;

        public override int GetHashCode()
        {
            // The multiplications are expected to wrap around; make that explicit so this does
            // not throw when the assembly is compiled with overflow checking enabled.
            unchecked
            {
                var hashCode = 244102310;
                hashCode = hashCode * -1521134295 + Char.GetHashCode();
                hashCode = hashCode * -1521134295 + Span.GetHashCode();
                return hashCode;
            }
        }

        public static bool operator ==(VirtualChar char1, VirtualChar char2)
            => char1.Equals(char2);

        public static bool operator !=(VirtualChar char1, VirtualChar char2)
            => !(char1 == char2);

        /// <summary>Yields the interpreted character, discarding the span.</summary>
        public static implicit operator char(VirtualChar vc) => vc.Char;
    }
}
// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System.Collections.Immutable;
using Microsoft.CodeAnalysis.PooledObjects;
namespace Microsoft.CodeAnalysis.VirtualChars
{
    internal static class VirtualCharExtensions
    {
        /// <summary>
        /// Concatenates the <see cref="VirtualChar.Char"/> of every element of
        /// <paramref name="chars"/>, in order, into a string.
        /// </summary>
        public static string CreateString(this ImmutableArray<VirtualChar> chars)
        {
            var builder = PooledStringBuilder.GetInstance();

            foreach (var vc in chars)
            {
                builder.Builder.Append(vc.Char);
            }

            // Produces the string and hands the builder back to the pool.
            return builder.ToStringAndFree();
        }
    }
}
' Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
Imports System.Collections.Immutable
Imports System.Composition
Imports Microsoft.CodeAnalysis.Host.Mef
Imports Microsoft.CodeAnalysis.VirtualChars
Namespace Microsoft.CodeAnalysis.VisualBasic.VirtualChars
    ''' <summary>
    ''' Visual Basic implementation of <see cref="IVirtualCharService"/>.  VB string literals
    ''' only escape the double quote (by doubling it), so the shared base-class helper does all
    ''' of the work.
    ''' </summary>
    <ExportLanguageService(GetType(IVirtualCharService), LanguageNames.VisualBasic), [Shared]>
    Friend Class VisualBasicVirtualCharService
        Inherits AbstractVirtualCharService

        Public Shared ReadOnly Instance As IVirtualCharService = New VisualBasicVirtualCharService()

        Protected Overrides Function TryConvertToVirtualCharsWorker(token As SyntaxToken) As ImmutableArray(Of VirtualChar)
            Debug.Assert(Not token.ContainsDiagnostics)

            ' Tokens that are not string literals yield no result, matching the C# service,
            ' rather than tripping the delimiter assertion in the shared helper.
            If token.Kind() <> SyntaxKind.StringLiteralToken Then
                Return Nothing
            End If

            Const startDelimiter As String = """"
            Return TryConvertSimpleDoubleQuoteString(token, startDelimiter)
        End Function
    End Class
End Namespace
' Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
Imports System.Collections.Immutable
Imports Microsoft.CodeAnalysis.VirtualChars
Imports Microsoft.CodeAnalysis.VisualBasic.VirtualChars
Imports Xunit
Namespace Microsoft.CodeAnalysis.VisualBasic.UnitTests.VirtualChars
    ''' <summary>
    ''' Tests for <see cref="VisualBasicVirtualCharService"/>.  Expected results are written as a
    ''' sequence of <c>['char',[start,end]]</c> entries where the positions are relative to the
    ''' start of the string token (i.e. with the statement prefix subtracted).
    ''' </summary>
    Public Class VisualBasicVirtualCharServiceTests
        Private Const _statementPrefix As String = "dim v = "

        ''' <summary>
        ''' Parses <c>dim v = &lt;text&gt;</c> and returns the token in the initializer position,
        ''' asserting that it is a string literal token.
        ''' </summary>
        Private Function GetStringToken(text As String) As SyntaxToken
            Dim statement = _statementPrefix + text
            Dim parsedStatement = SyntaxFactory.ParseExecutableStatement(statement)

            ' Tokens are: 'dim', 'v', '=', <literal>, ...
            Dim token = parsedStatement.DescendantTokens().ToArray()(3)
            Assert.Equal(SyntaxKind.StringLiteralToken, token.Kind())

            Return token
        End Function

        ''' <summary>Asserts that converting <paramref name="stringText"/> produces <paramref name="expected"/>.</summary>
        Private Sub Test(stringText As String, expected As String)
            Dim token = GetStringToken(stringText)
            Dim virtualChars = VisualBasicVirtualCharService.Instance.TryConvertToVirtualChars(token)
            Dim actual = ConvertToString(virtualChars)
            Assert.Equal(expected, actual)
        End Sub

        ''' <summary>Asserts that converting <paramref name="stringText"/> yields no result.</summary>
        Private Sub TestFailure(stringText As String)
            Dim token = GetStringToken(stringText)
            Dim virtualChars = VisualBasicVirtualCharService.Instance.TryConvertToVirtualChars(token)
            Assert.True(virtualChars.IsDefault)
        End Sub

        <Fact>
        Public Sub TestEmptyString()
            Test("""""", "")
        End Sub

        <Fact>
        Public Sub TestSimpleString()
            Test("""a""", "['a',[1,2]]")
        End Sub

        <Fact>
        Public Sub TestStringWithDoubleQuoteInIt()
            ' The doubled quote is a single virtual char that spans two source characters.
            Test("""a""""b""", "['a',[1,2]]['""',[2,4]]['b',[4,5]]")
        End Sub

        Private Function ConvertToString(virtualChars As ImmutableArray(Of VirtualChar)) As String
            Return String.Join("", virtualChars.Select(AddressOf ConvertToString))
        End Function

        Private Function ConvertToString(vc As VirtualChar) As String
            Return $"[{ConvertToString(vc.Char)},[{vc.Span.Start - _statementPrefix.Length},{vc.Span.End - _statementPrefix.Length}]]"
        End Function

        Private Function ConvertToString(c As Char) As String
            Return "'" + c + "'"
        End Function
    End Class
End Namespace
