提交 92476dc1 编写于 作者: P Paul Harrington

Merge pull request #569 from pharring/Fix516

Improve LargeEncodedText performance.

Includes the following public API changes:

Source
 1. Source that extends SourceText and attempts to override the Lines property will now fail to compile.
 2. Source that called SourceText.From(Stream) will continue to compile but will silently call a new overload with a default value which preserves the current behavior.

Binaries
 1. Binaries that extend SourceText and override the Lines property will now fail at runtime when running against the next version of Microsoft.CodeAnalysis.dll
 2. Binaries that call SourceText.From(Stream) will fail at runtime with MissingMethodException when running against the next version of Microsoft.CodeAnalysis.dll
......@@ -93,43 +93,6 @@ public void CheckSum_SHA256()
Assert.Equal("f1945cd6 c19e56b3 c1c78943 ef5ec181 16907a4c a1efc40a 57d48ab1 db7adfc5", StringTextTest.ChecksumToHexQuads(checksum));
}
[Fact]
public void TryReadByteOrderMark()
{
Assert.Null(EncodedStringText.TryReadByteOrderMark(new MemoryStream(new byte[0])));
Assert.Null(EncodedStringText.TryReadByteOrderMark(new MemoryStream(new byte[] { 0xef })));
Assert.Null(EncodedStringText.TryReadByteOrderMark(new MemoryStream(new byte[] { 0xef, 0xbb })));
Assert.Equal(Encoding.UTF8, EncodedStringText.TryReadByteOrderMark(new MemoryStream(new byte[] { 0xef, 0xBB, 0xBF })));
Assert.Null(EncodedStringText.TryReadByteOrderMark(new MemoryStream(new byte[] { 0xff })));
Assert.Equal(Encoding.Unicode, EncodedStringText.TryReadByteOrderMark(new MemoryStream(new byte[] { 0xff, 0xfe })));
Assert.Null(EncodedStringText.TryReadByteOrderMark(new MemoryStream(new byte[] { 0xfe })));
Assert.Equal(Encoding.BigEndianUnicode, EncodedStringText.TryReadByteOrderMark(new MemoryStream(new byte[] { 0xfe, 0xff })));
}
[Fact]
public void IsBinary()
{
Assert.False(EncodedStringText.IsBinary(""));
Assert.False(EncodedStringText.IsBinary("\0abc"));
Assert.False(EncodedStringText.IsBinary("a\0bc"));
Assert.False(EncodedStringText.IsBinary("abc\0"));
Assert.False(EncodedStringText.IsBinary("a\0b\0c"));
Assert.True(EncodedStringText.IsBinary("\0\0abc"));
Assert.True(EncodedStringText.IsBinary("a\0\0bc"));
Assert.True(EncodedStringText.IsBinary("abc\0\0"));
var encoding = Encoding.GetEncoding(1252);
Assert.False(EncodedStringText.IsBinary(encoding.GetString(new byte[] { 0x81, 0x8D, 0x8F, 0x90, 0x9D })));
// Unicode string: äëïöüû
Assert.False(EncodedStringText.IsBinary("abc def baz aeiouy \u00E4\u00EB\u00EF\u00F6\u00FC\u00FB"));
Assert.True(EncodedStringText.IsBinary(encoding.GetString(ProprietaryTestResources.NetFX.v4_0_30319.System)));
}
[Fact]
public void Decode_NonUtf8()
{
......
......@@ -109,7 +109,8 @@ public void CopyToLargeTest()
}
}
var text = CreateSourceText(stream);
var text = SourceText.From(stream);
Assert.IsType(typeof(LargeEncodedText), text);
char[] buffer = new char[HelloWorld.Length];
for (int start = 0; start < text.Length; start += HelloWorld.Length)
......@@ -119,5 +120,196 @@ public void CopyToLargeTest()
}
}
}
private static void CheckEqualLine(TextLine first, TextLine second)
{
Assert.Equal(first, second);
#if false
// We do not guarantee either identity or Equals!
Assert.Equal(first.Extent, second.Extent);
Assert.Equal(first.ExtentIncludingLineBreak, second.ExtentIncludingLineBreak);
#endif
}
private static void CheckNotEqualLine(TextLine first, TextLine second)
{
Assert.NotEqual(first, second);
#if false
Assert.NotEqual(first, second);
Assert.NotEqual(first.Extent, second.Extent);
Assert.NotEqual(first.ExtentIncludingLineBreak, second.ExtentIncludingLineBreak);
#endif
}
/// <summary>
/// Asserts that line <paramref name="lineNumber"/> of <paramref name="text"/> has the expected
/// start, length, line-break length and content, and that position-based lookups on
/// <c>text.Lines</c> agree with it.
/// </summary>
/// <param name="text">The text whose line collection is being checked.</param>
/// <param name="lineNumber">Zero-based index of the line to check.</param>
/// <param name="start">Expected start position of the line.</param>
/// <param name="length">Expected length of the line, excluding its line break.</param>
/// <param name="newlineLength">Expected length of the line break that ends the line (0 if none).</param>
/// <param name="lineText">Expected content of the line, excluding its line break.</param>
private static void CheckLine(SourceText text, int lineNumber, int start, int length, int newlineLength, string lineText)
{
var textLine = text.Lines[lineNumber];
// Positions and spans reported by the line itself.
Assert.Equal(start, textLine.Start);
Assert.Equal(start + length, textLine.End);
Assert.Equal(start + length + newlineLength, textLine.EndIncludingLineBreak);
Assert.Equal(start, textLine.Span.Start);
Assert.Equal(length, textLine.Span.Length);
Assert.Equal(start, textLine.SpanIncludingLineBreak.Start);
Assert.Equal(length + newlineLength, textLine.SpanIncludingLineBreak.Length);
Assert.Equal(lineNumber, textLine.LineNumber);
// The line's content must match both the expected text and the corresponding slice of the full text.
Assert.Equal(lineText, textLine.ToString());
Assert.Equal(text.ToString().Substring(start, length), textLine.ToString());
// Fetching the same line a second time must yield an equal value.
CheckEqualLine(textLine, text.Lines[lineNumber]);
// Every position inside the line, including its line break, must map back to this line.
for (int p = textLine.Start; p < textLine.EndIncludingLineBreak; ++p)
{
CheckEqualLine(textLine, text.Lines.GetLineFromPosition(p));
Assert.Equal(lineNumber, text.Lines.IndexOf(p));
Assert.Equal(lineNumber, text.Lines.GetLinePosition(p).Line);
Assert.Equal(p - start, text.Lines.GetLinePosition(p).Character);
}
// The position just before this line must belong to the previous line.
if (start != 0)
{
CheckNotEqualLine(textLine, text.Lines.GetLineFromPosition(start - 1));
Assert.Equal(lineNumber - 1, text.Lines.IndexOf(start - 1));
Assert.Equal(lineNumber - 1, text.Lines.GetLinePosition(start - 1).Line);
}
// The first position after this line's break, if still inside the text, must belong to the next line.
int nextPosition = start + length + newlineLength;
if (nextPosition < text.Length)
{
CheckNotEqualLine(textLine, text.Lines.GetLineFromPosition(nextPosition));
Assert.Equal(lineNumber + 1, text.Lines.IndexOf(nextPosition));
Assert.Equal(lineNumber + 1, text.Lines.GetLinePosition(nextPosition).Line);
}
}
/// <summary>
/// Two lines separated by the platform line break (<see cref="Environment.NewLine"/>).
/// </summary>
[Fact]
public void NewLines1()
{
    // Environment.NewLine is not the same length on every platform, so the expected
    // offsets are derived from it instead of assuming a two-character CR+LF.
    string newLine = Environment.NewLine;
    var data = CreateSourceText("foo" + newLine + " bar");
    Assert.Equal(2, data.Lines.Count);
    CheckLine(data, lineNumber: 0, start: 0, length: 3, newlineLength: newLine.Length, lineText: "foo");
    CheckLine(data, lineNumber: 1, start: 3 + newLine.Length, length: 4, newlineLength: 0, lineText: " bar");
}
[Fact]
public void NewLines2()
{
var text =
@"foo
bar
baz";
var data = CreateSourceText(text);
Assert.Equal(3, data.Lines.Count);
CheckLine(data, lineNumber: 0, start: 0, length: 3, newlineLength: 2, lineText: "foo");
CheckLine(data, lineNumber: 1, start: 5, length: 3, newlineLength: 2, lineText: "bar");
CheckLine(data, lineNumber: 2, start: 10, length: 3, newlineLength: 0, lineText: "baz");
}
[Fact]
public void NewLines3()
{
var data = CreateSourceText("foo\r\nbar");
Assert.Equal(2, data.Lines.Count);
CheckLine(data, lineNumber: 0, start: 0, length: 3, newlineLength: 2, lineText: "foo");
CheckLine(data, lineNumber: 1, start: 5, length: 3, newlineLength: 0, lineText: "bar");
}
[Fact]
public void NewLines4()
{
var data = CreateSourceText("foo\n\rbar\u2028");
Assert.Equal(4, data.Lines.Count);
CheckLine(data, lineNumber: 0, start: 0, length: 3, newlineLength: 1, lineText: "foo");
CheckLine(data, lineNumber: 1, start: 4, length: 0, newlineLength: 1, lineText: "");
CheckLine(data, lineNumber: 2, start: 5, length: 3, newlineLength: 1, lineText: "bar");
CheckLine(data, lineNumber: 3, start: 9, length: 0, newlineLength: 0, lineText: "");
}
[Fact]
public void NewLines5()
{
// Trailing CR
var data = CreateSourceText("foo\r");
Assert.Equal(2, data.Lines.Count);
CheckLine(data, lineNumber: 0, start: 0, length: 3, newlineLength: 1, lineText: "foo");
CheckLine(data, lineNumber: 1, start: 4, length: 0, newlineLength: 0, lineText: "");
}
[Fact]
public void NewLines6()
{
// Trailing CR+LF
var data = CreateSourceText("foo\r\n");
Assert.Equal(2, data.Lines.Count);
CheckLine(data, lineNumber: 0, start: 0, length: 3, newlineLength: 2, lineText: "foo");
CheckLine(data, lineNumber: 1, start: 5, length: 0, newlineLength: 0, lineText: "");
}
[Fact]
public void NewLines7()
{
// Consecutive CR
var data = CreateSourceText("foo\r\rbar");
Assert.Equal(3, data.Lines.Count);
CheckLine(data, lineNumber: 0, start: 0, length: 3, newlineLength: 1, lineText: "foo");
CheckLine(data, lineNumber: 1, start: 4, length: 0, newlineLength: 1, lineText: "");
CheckLine(data, lineNumber: 2, start: 5, length: 3, newlineLength: 0, lineText: "bar");
}
[Fact]
public void NewLines8()
{
// Mix CR with CR+LF
const string cr = "\r";
const string crLf = "\r\n";
var data = CreateSourceText("foo" + cr + crLf + cr + "bar");
Assert.Equal(4, data.Lines.Count);
CheckLine(data, lineNumber: 0, start: 0, length: 3, newlineLength: 1, lineText: "foo");
CheckLine(data, lineNumber: 1, start: 4, length: 0, newlineLength: 2, lineText: "");
CheckLine(data, lineNumber: 2, start: 6, length: 0, newlineLength: 1, lineText: "");
CheckLine(data, lineNumber: 3, start: 7, length: 3, newlineLength: 0, lineText: "bar");
}
[Fact]
public void NewLines9()
{
// Mix CR, CR+LF and LF: "foo" CR, empty line CR+LF, empty line LF, "bar"
const string cr = "\r";
const string crLf = "\r\n";
const string lf = "\n";
var data = CreateSourceText("foo" + cr + crLf + lf + "bar");
Assert.Equal(4, data.Lines.Count);
CheckLine(data, lineNumber: 0, start: 0, length: 3, newlineLength: 1, lineText: "foo");
CheckLine(data, lineNumber: 1, start: 4, length: 0, newlineLength: 2, lineText: "");
CheckLine(data, lineNumber: 2, start: 6, length: 0, newlineLength: 1, lineText: "");
CheckLine(data, lineNumber: 3, start: 7, length: 3, newlineLength: 0, lineText: "bar");
}
[Fact]
public void Empty()
{
var data = CreateSourceText("");
Assert.Equal(1, data.Lines.Count);
CheckLine(data, lineNumber: 0, start: 0, length: 0, newlineLength: 0, lineText: "");
}
[Fact]
public void LinesGetText1()
{
var text =
@"foo
bar baz";
var data = CreateSourceText(text);
Assert.Equal(2, data.Lines.Count);
Assert.Equal("foo", data.Lines[0].ToString());
Assert.Equal("bar baz", data.Lines[1].ToString());
}
[Fact]
public void LinesGetText2()
{
var text = "foo";
var data = CreateSourceText(text);
Assert.Equal("foo", data.Lines[0].ToString());
}
}
}
\ No newline at end of file
// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
using System.Collections.Immutable;
using System.IO;
using System.Linq;
using System.Text;
using Microsoft.CodeAnalysis.Text;
using Xunit;
using ProprietaryTestResources = Microsoft.CodeAnalysis.Test.Resources.Proprietary;
namespace Microsoft.CodeAnalysis.UnitTests.Text
{
......@@ -15,31 +14,60 @@ public class SourceTextTests
private static readonly Encoding s_utf8 = Encoding.UTF8;
private static readonly Encoding s_utf8Bom = new UTF8Encoding(encoderShouldEmitUTF8Identifier: true);
private static readonly Encoding s_unicode = Encoding.Unicode;
private const string HelloWorld = "Hello, World!";
[Fact]
public void Empty()
{
TestIsEmpty(SourceText.From(string.Empty));
TestIsEmpty(SourceText.From(new byte[0], 0));
TestIsEmpty(SourceText.From(new MemoryStream()));
}
private static void TestIsEmpty(SourceText text)
{
Assert.Equal(0, text.Length);
Assert.Same(string.Empty, text.ToString());
Assert.Equal(1, text.Lines.Count);
Assert.Equal(0, text.Lines[0].Span.Length);
}
[Fact]
public void Encoding1()
{
Assert.Same(s_utf8, SourceText.From("foo", s_utf8).Encoding);
Assert.Same(s_unicode, SourceText.From("foo", s_unicode).Encoding);
Assert.Same(s_unicode, SourceText.From(new MemoryStream(s_unicode.GetBytes("foo")), s_unicode).Encoding);
Assert.Same(s_utf8, SourceText.From(HelloWorld, s_utf8).Encoding);
Assert.Same(s_unicode, SourceText.From(HelloWorld, s_unicode).Encoding);
var bytes = s_unicode.GetBytes(HelloWorld);
Assert.Same(s_unicode, SourceText.From(bytes, bytes.Length, s_unicode).Encoding);
var stream = new MemoryStream(bytes);
Assert.Same(s_unicode, SourceText.From(stream, s_unicode).Encoding);
}
[Fact]
public void EncodingBOM()
{
var stream = new MemoryStream(s_utf8Bom.GetPreamble().Concat(s_utf8Bom.GetBytes("abc")).ToArray());
var bytes = s_utf8Bom.GetPreamble().Concat(s_utf8Bom.GetBytes("abc")).ToArray();
Assert.Equal(s_utf8.EncodingName, SourceText.From(bytes, bytes.Length, s_unicode).Encoding.EncodingName);
var stream = new MemoryStream(bytes);
Assert.Equal(s_utf8.EncodingName, SourceText.From(stream, s_unicode).Encoding.EncodingName);
}
[Fact]
public void ChecksumAlgorithm1()
{
Assert.Equal(SourceHashAlgorithm.Sha1, SourceText.From("foo").ChecksumAlgorithm);
Assert.Equal(SourceHashAlgorithm.Sha1, SourceText.From("foo", checksumAlgorithm: SourceHashAlgorithm.Sha1).ChecksumAlgorithm);
Assert.Equal(SourceHashAlgorithm.Sha256, SourceText.From("foo", checksumAlgorithm: SourceHashAlgorithm.Sha256).ChecksumAlgorithm);
Assert.Equal(SourceHashAlgorithm.Sha1, SourceText.From(HelloWorld).ChecksumAlgorithm);
Assert.Equal(SourceHashAlgorithm.Sha1, SourceText.From(HelloWorld, checksumAlgorithm: SourceHashAlgorithm.Sha1).ChecksumAlgorithm);
Assert.Equal(SourceHashAlgorithm.Sha256, SourceText.From(HelloWorld, checksumAlgorithm: SourceHashAlgorithm.Sha256).ChecksumAlgorithm);
var stream = new MemoryStream(s_unicode.GetBytes("foo"));
var bytes = s_unicode.GetBytes(HelloWorld);
Assert.Equal(SourceHashAlgorithm.Sha1, SourceText.From(bytes, bytes.Length).ChecksumAlgorithm);
Assert.Equal(SourceHashAlgorithm.Sha1, SourceText.From(bytes, bytes.Length, checksumAlgorithm: SourceHashAlgorithm.Sha1).ChecksumAlgorithm);
Assert.Equal(SourceHashAlgorithm.Sha256, SourceText.From(bytes, bytes.Length, checksumAlgorithm: SourceHashAlgorithm.Sha256).ChecksumAlgorithm);
var stream = new MemoryStream(bytes);
Assert.Equal(SourceHashAlgorithm.Sha1, SourceText.From(stream).ChecksumAlgorithm);
Assert.Equal(SourceHashAlgorithm.Sha1, SourceText.From(stream, checksumAlgorithm: SourceHashAlgorithm.Sha1).ChecksumAlgorithm);
Assert.Equal(SourceHashAlgorithm.Sha256, SourceText.From(stream, checksumAlgorithm: SourceHashAlgorithm.Sha256).ChecksumAlgorithm);
......@@ -48,14 +76,14 @@ public void ChecksumAlgorithm1()
[Fact]
public void ContentEquals()
{
var f = SourceText.From("foo", s_utf8);
var f = SourceText.From(HelloWorld, s_utf8);
Assert.True(f.ContentEquals(SourceText.From("foo", s_utf8)));
Assert.False(f.ContentEquals(SourceText.From("fooo", s_utf8)));
Assert.True(SourceText.From("foo", s_utf8).ContentEquals(SourceText.From("foo", s_utf8)));
Assert.True(f.ContentEquals(SourceText.From(HelloWorld, s_utf8)));
Assert.False(f.ContentEquals(SourceText.From(HelloWorld + "o", s_utf8)));
Assert.True(SourceText.From(HelloWorld, s_utf8).ContentEquals(SourceText.From(HelloWorld, s_utf8)));
var e1 = EncodedStringText.Create(new MemoryStream(s_unicode.GetBytes("foo")), s_unicode);
var e2 = EncodedStringText.Create(new MemoryStream(s_utf8.GetBytes("foo")), s_utf8);
var e1 = EncodedStringText.Create(new MemoryStream(s_unicode.GetBytes(HelloWorld)), s_unicode);
var e2 = EncodedStringText.Create(new MemoryStream(s_utf8.GetBytes(HelloWorld)), s_utf8);
Assert.True(e1.ContentEquals(e1));
Assert.True(f.ContentEquals(e1));
......@@ -65,5 +93,73 @@ public void ContentEquals()
Assert.True(e1.ContentEquals(e2));
Assert.True(e2.ContentEquals(e1));
}
[Fact]
public void IsBinary()
{
Assert.False(SourceText.IsBinary(""));
Assert.False(SourceText.IsBinary("\0abc"));
Assert.False(SourceText.IsBinary("a\0bc"));
Assert.False(SourceText.IsBinary("abc\0"));
Assert.False(SourceText.IsBinary("a\0b\0c"));
Assert.True(SourceText.IsBinary("\0\0abc"));
Assert.True(SourceText.IsBinary("a\0\0bc"));
Assert.True(SourceText.IsBinary("abc\0\0"));
var encoding = Encoding.GetEncoding(1252);
Assert.False(SourceText.IsBinary(encoding.GetString(new byte[] { 0x81, 0x8D, 0x8F, 0x90, 0x9D })));
// Unicode string: äëïöüû
Assert.False(SourceText.IsBinary("abc def baz aeiouy \u00E4\u00EB\u00EF\u00F6\u00FC\u00FB"));
Assert.True(SourceText.IsBinary(encoding.GetString(ProprietaryTestResources.NetFX.v4_0_30319.System)));
}
[Fact]
public void FromThrowsIfBinary()
{
var bytes = ProprietaryTestResources.NetFX.v4_0_30319.System;
Assert.Throws<InvalidDataException>(() => SourceText.From(bytes, bytes.Length, throwIfBinaryDetected: true));
var stream = new MemoryStream(bytes);
Assert.Throws<InvalidDataException>(() => SourceText.From(stream, throwIfBinaryDetected: true));
}
/// <summary>
/// Convenience overload that treats the whole of <paramref name="data"/> as valid
/// (or zero bytes when <paramref name="data"/> is null).
/// </summary>
private static void TestTryReadByteOrderMark(Encoding expectedEncoding, int expectedPreambleLength, byte[] data)
{
    int validLength = 0;
    if (data != null)
    {
        validLength = data.Length;
    }

    TestTryReadByteOrderMark(expectedEncoding, expectedPreambleLength, data, validLength);
}
/// <summary>
/// Runs <c>SourceText.TryReadByteOrderMark</c> over the first <paramref name="validLength"/>
/// bytes of <paramref name="data"/> and asserts the detected encoding and preamble length.
/// </summary>
/// <param name="expectedEncoding">The encoding expected to be detected, or null if none is expected.</param>
/// <param name="expectedPreambleLength">The expected length of the detected byte order mark.</param>
/// <param name="data">The bytes to inspect.</param>
/// <param name="validLength">The count of bytes in <paramref name="data"/> to treat as valid.</param>
private static void TestTryReadByteOrderMark(Encoding expectedEncoding, int expectedPreambleLength, byte[] data, int validLength)
{
    int detectedPreambleLength;
    Encoding detectedEncoding = SourceText.TryReadByteOrderMark(data, validLength, out detectedPreambleLength);

    if (expectedEncoding != null)
    {
        Assert.Equal(expectedEncoding, detectedEncoding);
    }
    else
    {
        Assert.Null(detectedEncoding);
    }

    Assert.Equal(expectedPreambleLength, detectedPreambleLength);
}
[Fact]
public void TryReadByteOrderMark()
{
TestTryReadByteOrderMark(expectedEncoding: null, expectedPreambleLength: 0, data: new byte[0]);
TestTryReadByteOrderMark(expectedEncoding: null, expectedPreambleLength: 0, data: new byte[] { 0xef });
TestTryReadByteOrderMark(expectedEncoding: null, expectedPreambleLength: 0, data: new byte[] { 0xef, 0xbb });
TestTryReadByteOrderMark(expectedEncoding: null, expectedPreambleLength: 0, data: new byte[] { 0xef, 0xBB, 0xBF }, validLength: 2);
TestTryReadByteOrderMark(expectedEncoding: Encoding.UTF8, expectedPreambleLength: 3, data: new byte[] { 0xef, 0xBB, 0xBF });
TestTryReadByteOrderMark(expectedEncoding: null, expectedPreambleLength: 0, data: new byte[] { 0xff });
TestTryReadByteOrderMark(expectedEncoding: Encoding.Unicode, expectedPreambleLength: 2, data: new byte[] { 0xff, 0xfe });
TestTryReadByteOrderMark(expectedEncoding: null, expectedPreambleLength: 0, data: new byte[] { 0xfe });
TestTryReadByteOrderMark(expectedEncoding: Encoding.BigEndianUnicode, expectedPreambleLength: 2, data: new byte[] { 0xfe, 0xff });
}
}
}
......@@ -236,6 +236,9 @@ public void FromStream_CheckSum_BOM()
[Fact]
public void FromStream_CheckSum_NoBOM()
{
// Note: The 0x95 is outside the ASCII range, so a question mark will
// be substituted in decoded text. Note, however, that the checksum
// should be derived from the original input.
var bytes = new byte[] { 0x61, 0x62, 0x95 };
var source = SourceText.From(new MemoryStream(bytes), Encoding.ASCII);
......
......@@ -91,7 +91,6 @@
<Compile Include="Interop\IClrStrongName.cs" />
<Compile Include="Interop\SuppressUnmanagedCodeSecurityAttribute.cs" />
<Compile Include="IVsSQM.cs" />
<Compile Include="LargeEncodedText.cs" />
<Compile Include="MetadataCache.cs" />
<Compile Include="AnalyzerFileReference.cs" />
<Compile Include="MetadataFileReference.cs" />
......
// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
using System.Collections.Immutable;
using System.Diagnostics;
using System.IO;
using System.Text;
using System.Threading;
using Roslyn.Utilities;
namespace Microsoft.CodeAnalysis.Text
{
internal sealed class EncodedStringText : SourceText
internal static class EncodedStringText
{
/// <summary>
/// Underlying string on which this SourceText instance is based
/// </summary>
private readonly string _source;
private readonly Encoding _encoding;
private const int LargeObjectHeapLimit = 80 * 1024; // 80KB
/// <summary>
/// Creates a text backed by the already-decoded <paramref name="source"/> string.
/// </summary>
/// <param name="source">The decoded text; must not be null.</param>
/// <param name="encoding">The encoding reported by <see cref="Encoding"/>; must not be null.</param>
/// <param name="checksum">Checksum forwarded to the base constructor.</param>
/// <param name="checksumAlgorithm">Checksum algorithm forwarded to the base constructor.</param>
/// <param name="throwIfBinary">If true, <paramref name="source"/> is checked with <see cref="IsBinary"/> first.</param>
/// <exception cref="InvalidDataException"><paramref name="throwIfBinary"/> is true and <paramref name="source"/> looks like binary data.</exception>
private EncodedStringText(string source, Encoding encoding, ImmutableArray<byte> checksum, SourceHashAlgorithm checksumAlgorithm, bool throwIfBinary)
    : base(checksum: checksum, checksumAlgorithm: checksumAlgorithm)
{
    // Check the arguments before IsBinary dereferences source, so a null
    // argument trips the assertion rather than a NullReferenceException.
    Debug.Assert(source != null);
    Debug.Assert(encoding != null);

    if (throwIfBinary && IsBinary(source))
    {
        throw new InvalidDataException();
    }

    _source = source;
    _encoding = encoding;
}
private const int LargeObjectHeapLimitInChars = 40 * 1024; // 40KB
/// <summary>
/// Encoding to use when there is no byte order mark (BOM) on the stream. This encoder may throw a <see cref="DecoderFallbackException"/>
......@@ -42,9 +19,13 @@ private EncodedStringText(string source, Encoding encoding, ImmutableArray<byte>
private static readonly Encoding FallbackEncoding = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);
/// <summary>
/// Initializes an instance of <see cref="EncodedStringText"/> with provided bytes.
/// Initializes an instance of <see cref="SourceText"/> from the provided stream. This version differs
/// from <see cref="SourceText.From(Stream, Encoding, SourceHashAlgorithm, bool)"/> in two ways:
/// 1. It attempts to minimize allocations by trying to read the stream into a byte array.
/// 2. If <paramref name="defaultEncoding"/> is null, it will first try UTF8 and, if that fails, it will
/// try <see cref="Encoding.Default"/>.
/// </summary>
/// <param name="stream"></param>
/// <param name="stream">The stream containing encoded text.</param>
/// <param name="defaultEncoding">
/// Specifies an encoding to be used if the actual encoding can't be determined from the stream content (the stream doesn't start with Byte Order Mark).
/// If not specified auto-detect heuristics are used to determine the encoding. If these heuristics fail the decoding is assumed to be <see cref="Encoding.Default"/>.
......@@ -84,156 +65,36 @@ internal static SourceText Create(Stream stream, Encoding defaultEncoding = null
}
}
public override Encoding Encoding
{
get { return _encoding; }
}
/// <summary>
/// Underlying string which is the source of this SourceText instance
/// </summary>
public string Source
{
get { return _source; }
}
/// <summary>
/// The length of the text represented by <see cref="EncodedStringText"/>.
/// </summary>
public override int Length
{
get { return this.Source.Length; }
}
/// <summary>
/// Returns a character at given position.
/// </summary>
/// <param name="position">The position to get the character from.</param>
/// <returns>The character.</returns>
/// <exception cref="ArgumentOutOfRangeException">When position is negative or
/// greater than <see cref="Length"/>.</exception>
public override char this[int position]
{
get
{
// NOTE: we are not validating position here as that would not
// add any value to the range check that string accessor performs anyways.
return _source[position];
}
}
/// <summary>
/// Provides a string representation of the StringText located within given span.
/// </summary>
/// <exception cref="ArgumentOutOfRangeException">When given span is outside of the text range.</exception>
public override string ToString(TextSpan span)
{
    string source = this.Source;
    if (span.End > source.Length)
    {
        throw new ArgumentOutOfRangeException("span");
    }

    // A span covering the whole text can hand back the underlying string without copying.
    bool coversWholeText = span.Start == 0 && span.Length == this.Length;
    return coversWholeText
        ? source
        : source.Substring(span.Start, span.Length);
}
public override void CopyTo(int sourceIndex, char[] destination, int destinationIndex, int count)
{
this.Source.CopyTo(sourceIndex, destination, destinationIndex, count);
}
public override void Write(TextWriter textWriter, TextSpan span, CancellationToken cancellationToken = default(CancellationToken))
{
    bool coversWholeText = span.Start == 0 && span.End == this.Length;
    if (!coversWholeText)
    {
        // Partial spans go through the base implementation.
        base.Write(textWriter, span, cancellationToken);
        return;
    }

    // The whole text is written straight from the underlying string.
    textWriter.Write(this.Source);
}
#region Encoding Detection
/// <summary>
/// Check for occurrence of two consecutive NUL (U+0000) characters.
/// This is unlikely to appear in genuine text, so it's a good heuristic
/// to detect binary files.
/// </summary>
/// <remarks>
/// internal for unit testing
/// </remarks>
internal static bool IsBinary(string text)
{
    int length = text.Length;
    int index = 1;

    while (index < length)
    {
        if (text[index] != '\0')
        {
            // A non-NUL here rules out a NUL pair ending at index or at index + 1,
            // so the scan can skip ahead two characters.
            index += 2;
            continue;
        }

        if (text[index - 1] == '\0')
        {
            return true;
        }

        index++;
    }

    return false;
}
/// <summary>
/// Try to create a <see cref="EncodedStringText"/> from the given stream using the given encoding.
/// Try to create a <see cref="SourceText"/> from the given stream using the given encoding.
/// </summary>
/// <param name="data">The input stream containing the encoded text. The stream will not be closed.</param>
/// <param name="encoding">The expected encoding of the stream. The actual encoding used may be different if byte order marks are detected.</param>
/// <param name="checksumAlgorithm">The checksum algorithm to use.</param>
/// <param name="throwIfBinaryDetected">Throw <see cref="InvalidDataException"/> if binary (non-text) data is detected.</param>
/// <returns>The <see cref="EncodedStringText"/> decoded from the stream.</returns>
/// <returns>The <see cref="SourceText"/> decoded from the stream.</returns>
/// <exception cref="DecoderFallbackException">The decoder was unable to decode the stream with the given encoding.</exception>
/// <remarks>
/// internal for unit testing
/// </remarks>
internal static SourceText Decode(Stream data, Encoding encoding, SourceHashAlgorithm checksumAlgorithm, bool throwIfBinaryDetected = false)
{
data.Seek(0, SeekOrigin.Begin);
if (data.Length > LargeObjectHeapLimit)
{
return LargeEncodedText.Decode(data, encoding, checksumAlgorithm, throwIfBinaryDetected);
}
Encoding actualEncoding;
ImmutableArray<byte> checksum = default(ImmutableArray<byte>);
string text;
Debug.Assert(data != null);
Debug.Assert(encoding != null);
byte[] buffer = TryGetByteArrayFromStream(data);
if (buffer != null)
{
text = Decode(buffer, (int)data.Length, encoding, out actualEncoding);
data.Seek(0, SeekOrigin.Begin);
// Since we have the buffer, compute the checksum here. This saves allocations if we later
// need to write out debugging information.
checksum = CalculateChecksum(buffer, offset: 0, count: (int)data.Length, algorithmId: checksumAlgorithm);
}
else
// For small streams, see if we can read the byte buffer directly.
if (encoding.GetMaxCharCount((int)data.Length) < LargeObjectHeapLimitInChars)
{
text = Decode(data, encoding, out actualEncoding);
byte[] buffer = TryGetByteArrayFromStream(data);
if (buffer != null)
{
return SourceText.From(buffer, (int)data.Length, encoding, checksumAlgorithm, throwIfBinaryDetected);
}
}
return new EncodedStringText(text, actualEncoding, checksum, checksumAlgorithm, throwIfBinary: throwIfBinaryDetected);
return SourceText.From(data, encoding, checksumAlgorithm, throwIfBinaryDetected);
}
/// <summary>
......@@ -265,40 +126,6 @@ private static byte[] TryGetByteArrayFromStream(Stream data)
return null;
}
/// <summary>
/// Decode the given stream using the given encoding. Does not
/// close the stream afterwards.
/// </summary>
/// <param name="data">Data stream</param>
/// <param name="encoding">Default encoding to use for decoding.</param>
/// <param name="actualEncoding">Actual encoding used to read the text.</param>
/// <exception cref="DecoderFallbackException">If the given encoding is set to use <see cref="DecoderExceptionFallback"/> as its fallback decoder.</exception>
/// <returns>Decoded stream as a text string</returns>
private static string Decode(Stream data, Encoding encoding, out Encoding actualEncoding)
{
    data.Seek(0, SeekOrigin.Begin);

    int byteCount = (int)data.Length;
    if (byteCount != 0)
    {
        // Use up to 4KB for the reader's buffer rather than the smaller default: the
        // underlying stream may be a FileStream opened with a deliberately tiny buffer,
        // and 4KB keeps the file I/O in chunks of the usual FileStream size.
        int readerBufferSize = Math.Min(4096, byteCount);

        using (var reader = new StreamReader(data, encoding, detectEncodingFromByteOrderMarks: true, bufferSize: readerBufferSize, leaveOpen: true))
        {
            string decoded = reader.ReadToEnd();
            actualEncoding = reader.CurrentEncoding;
            return decoded;
        }
    }

    // Nothing to decode: report the caller's encoding unchanged.
    actualEncoding = encoding;
    return string.Empty;
}
/// <summary>
/// If the MemoryStream was created with publiclyVisible=true, then we can access its buffer
/// directly and save allocations in StreamReader. The input MemoryStream is not closed on exit.
......@@ -353,117 +180,5 @@ private static bool TryGetByteArrayFromFileStream(FileStream stream, out byte[]
// line compiler actually specifies a very small buffer size.
return stream.Read(buffer, 0, length) == length;
}
/// <summary>
/// Decode text from a byte array.
/// </summary>
/// <param name="buffer">The byte array containing encoded text.</param>
/// <param name="length">The count of valid bytes in <paramref name="buffer"/>.</param>
/// <param name="encoding">The encoding to use if an encoding cannot be determined from the byte order mark.</param>
/// <param name="actualEncoding">The actual encoding used.</param>
/// <returns>The decoded text.</returns>
/// <exception cref="DecoderFallbackException">If the given encoding is set to use <see cref="DecoderExceptionFallback"/>
/// as its fallback decoder.</exception>
private static string Decode(byte[] buffer, int length, Encoding encoding, out Encoding actualEncoding)
{
    int bomLength;
    Encoding detected = TryReadByteOrderMark(buffer, length, out bomLength);

    // A recognized byte order mark overrides the caller-supplied encoding.
    actualEncoding = detected != null ? detected : encoding;

    // Decode only the bytes that follow the byte order mark.
    int payloadLength = length - bomLength;
    return actualEncoding.GetString(buffer, bomLength, payloadLength);
}
/// <summary>
/// Detect an encoding by looking for byte order marks.
/// </summary>
/// <param name="source">A buffer containing the encoded text.</param>
/// <param name="length">The length of valid data in the buffer.</param>
/// <param name="preambleLength">The length of any detected byte order marks.</param>
/// <returns>The detected encoding or null if no recognized byte order mark was present.</returns>
private static Encoding TryReadByteOrderMark(byte[] source, int length, out int preambleLength)
{
    Debug.Assert(source != null);
    Debug.Assert(length <= source.Length);

    // Every recognized byte order mark is at least two bytes long.
    if (length < 2)
    {
        preambleLength = 0;
        return null;
    }

    byte lead = source[0];

    // FE FF: UTF-16, big endian.
    if (lead == 0xFE && source[1] == 0xFF)
    {
        preambleLength = 2;
        return Encoding.BigEndianUnicode;
    }

    // FF FE: UTF-16, little endian.
    if (lead == 0xFF && source[1] == 0xFE)
    {
        preambleLength = 2;
        return Encoding.Unicode;
    }

    // EF BB BF: UTF-8 (needs a third valid byte).
    if (lead == 0xEF && source[1] == 0xBB && length >= 3 && source[2] == 0xBF)
    {
        preambleLength = 3;
        return Encoding.UTF8;
    }

    preambleLength = 0;
    return null;
}
[ThreadStatic]
private static byte[] t_bomBytes;
/// <summary>
/// Detect an encoding by looking for byte order marks at the beginning of the stream.
/// </summary>
/// <param name="data">The stream containing encoded text.</param>
/// <returns>The detected encoding or null if no recognized byte order mark was present.</returns>
/// <remarks>
/// On exit, the stream's position is set to the first position after any decoded byte order
/// mark or rewound to the start if no byte order mark was detected.
/// </remarks>
internal static Encoding TryReadByteOrderMark(Stream data)
{
    Debug.Assert(data != null);
    data.Seek(0, SeekOrigin.Begin);

    if (data.Length < 2)
    {
        // Not long enough for any valid BOM prefix
        return null;
    }

    // PERF: Avoid repeated calls to Stream.ReadByte since that method allocates a 1-byte array on each call.
    // Instead, use a thread local byte array.
    if (t_bomBytes == null)
    {
        t_bomBytes = new byte[3];
    }

    // Take the minimum in 64-bit arithmetic so a very long stream cannot wrap to a negative int.
    int wanted = (int)Math.Min(data.Length, (long)t_bomBytes.Length);

    // Stream.Read may return fewer bytes than requested, so keep reading until the
    // buffer is filled or the stream ends. Only the bytes actually read are treated
    // as valid; the rest of the reused buffer may hold bytes from an earlier call.
    int validLength = 0;
    while (validLength < wanted)
    {
        int read = data.Read(t_bomBytes, validLength, wanted - validLength);
        if (read == 0)
        {
            break;
        }

        validLength += read;
    }

    int preambleLength;
    Encoding detectedEncoding = TryReadByteOrderMark(t_bomBytes, validLength, out preambleLength);

    // Leave the stream just past the BOM, or back at the start when none was found.
    if (preambleLength != validLength)
    {
        data.Seek(preambleLength, SeekOrigin.Begin);
    }

    return detectedEncoding;
}
#endregion
}
}
......@@ -562,6 +562,7 @@
<Compile Include="Syntax\TranslationSyntaxReference.cs" />
<Compile Include="Text\ChangedText.cs" />
<Compile Include="Text\CompositeText.cs" />
<Compile Include="Text\LargeEncodedText.cs" />
<Compile Include="Text\LinePosition.cs" />
<Compile Include="Text\LinePositionSpan.cs" />
<Compile Include="Text\SourceHashAlgorithm.cs" />
......
......@@ -1327,6 +1327,7 @@ Microsoft.CodeAnalysis.Text.SourceText
Microsoft.CodeAnalysis.Text.SourceText.ChecksumAlgorithm.get
Microsoft.CodeAnalysis.Text.SourceText.ContentEquals(Microsoft.CodeAnalysis.Text.SourceText other)
Microsoft.CodeAnalysis.Text.SourceText.GetSubText(int start)
Microsoft.CodeAnalysis.Text.SourceText.Lines.get
Microsoft.CodeAnalysis.Text.SourceText.Replace(Microsoft.CodeAnalysis.Text.TextSpan span, string newText)
Microsoft.CodeAnalysis.Text.SourceText.Replace(int start, int length, string newText)
Microsoft.CodeAnalysis.Text.SourceText.SourceText(System.Collections.Immutable.ImmutableArray<byte> checksum = default(System.Collections.Immutable.ImmutableArray<byte>), Microsoft.CodeAnalysis.Text.SourceHashAlgorithm checksumAlgorithm = Microsoft.CodeAnalysis.Text.SourceHashAlgorithm.Sha1, Microsoft.CodeAnalysis.Text.SourceTextContainer container = null)
......@@ -2011,7 +2012,8 @@ static Microsoft.CodeAnalysis.Text.LinePosition.operator >=(Microsoft.CodeAnalys
static Microsoft.CodeAnalysis.Text.LinePositionSpan.operator !=(Microsoft.CodeAnalysis.Text.LinePositionSpan left, Microsoft.CodeAnalysis.Text.LinePositionSpan right)
static Microsoft.CodeAnalysis.Text.LinePositionSpan.operator ==(Microsoft.CodeAnalysis.Text.LinePositionSpan left, Microsoft.CodeAnalysis.Text.LinePositionSpan right)
static Microsoft.CodeAnalysis.Text.SourceText.CalculateChecksum(byte[] buffer, int offset, int count, Microsoft.CodeAnalysis.Text.SourceHashAlgorithm algorithmId)
static Microsoft.CodeAnalysis.Text.SourceText.From(System.IO.Stream stream, System.Text.Encoding encoding = null, Microsoft.CodeAnalysis.Text.SourceHashAlgorithm checksumAlgorithm = Microsoft.CodeAnalysis.Text.SourceHashAlgorithm.Sha1)
static Microsoft.CodeAnalysis.Text.SourceText.From(byte[] buffer, int length, System.Text.Encoding encoding = null, Microsoft.CodeAnalysis.Text.SourceHashAlgorithm checksumAlgorithm = Microsoft.CodeAnalysis.Text.SourceHashAlgorithm.Sha1, bool throwIfBinaryDetected = false)
static Microsoft.CodeAnalysis.Text.SourceText.From(System.IO.Stream stream, System.Text.Encoding encoding = null, Microsoft.CodeAnalysis.Text.SourceHashAlgorithm checksumAlgorithm = Microsoft.CodeAnalysis.Text.SourceHashAlgorithm.Sha1, bool throwIfBinaryDetected = false)
static Microsoft.CodeAnalysis.Text.SourceText.From(string text, System.Text.Encoding encoding = null, Microsoft.CodeAnalysis.Text.SourceHashAlgorithm checksumAlgorithm = Microsoft.CodeAnalysis.Text.SourceHashAlgorithm.Sha1)
static Microsoft.CodeAnalysis.Text.TextChange.implicit operator Microsoft.CodeAnalysis.Text.TextChangeRange(Microsoft.CodeAnalysis.Text.TextChange change)
static Microsoft.CodeAnalysis.Text.TextChange.operator !=(Microsoft.CodeAnalysis.Text.TextChange left, Microsoft.CodeAnalysis.Text.TextChange right)
......@@ -2110,9 +2112,9 @@ virtual Microsoft.CodeAnalysis.SyntaxWalker.VisitTrivia(Microsoft.CodeAnalysis.S
virtual Microsoft.CodeAnalysis.Text.SourceText.Container.get
virtual Microsoft.CodeAnalysis.Text.SourceText.ContentEqualsImpl(Microsoft.CodeAnalysis.Text.SourceText other)
virtual Microsoft.CodeAnalysis.Text.SourceText.GetChangeRanges(Microsoft.CodeAnalysis.Text.SourceText oldText)
virtual Microsoft.CodeAnalysis.Text.SourceText.GetLinesCore()
virtual Microsoft.CodeAnalysis.Text.SourceText.GetSubText(Microsoft.CodeAnalysis.Text.TextSpan span)
virtual Microsoft.CodeAnalysis.Text.SourceText.GetTextChanges(Microsoft.CodeAnalysis.Text.SourceText oldText)
virtual Microsoft.CodeAnalysis.Text.SourceText.Lines.get
virtual Microsoft.CodeAnalysis.Text.SourceText.ToString(Microsoft.CodeAnalysis.Text.TextSpan span)
virtual Microsoft.CodeAnalysis.Text.SourceText.WithChanges(System.Collections.Generic.IEnumerable<Microsoft.CodeAnalysis.Text.TextChange> changes)
virtual Microsoft.CodeAnalysis.Text.SourceText.Write(System.IO.TextWriter writer, Microsoft.CodeAnalysis.Text.TextSpan span, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken))
......
using System;
// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
using System.Collections.Immutable;
using System.IO;
using System.Text;
......@@ -123,21 +125,9 @@ private int GetIndexFromPosition(int position)
}
}
public override Encoding Encoding
{
get
{
return _encoding;
}
}
public override Encoding Encoding => _encoding;
public override int Length
{
get
{
return _length;
}
}
public override int Length => _length;
public override void CopyTo(int sourceIndex, char[] destination, int destinationIndex, int count)
{
......@@ -196,5 +186,70 @@ public override void Write(TextWriter writer, TextSpan span, CancellationToken c
chunkIndex++;
}
}
/// <summary>
/// Builds the <see cref="TextLineCollection"/> for this text by scanning its chunks for line breaks.
/// Invoked by <see cref="SourceText.Lines"/>, which caches the returned collection.
/// </summary>
/// <returns>A freshly created <see cref="TextLineCollection"/> describing each line of the text.</returns>
protected override TextLineCollection GetLinesCore() => new LineInfo(this, ParseLineStarts());
/// <summary>
/// Walks every character of every chunk in <c>_chunks</c> and collects the start
/// position of each line. Line breaks recognized are CR, LF, CR+LF (counted as a single
/// break), U+0085, U+2028 and U+2029.
/// </summary>
/// <returns>
/// The start positions of all lines, in increasing order. The array always contains at
/// least one entry, because the start of the final line is appended after the scan.
/// </returns>
private int[] ParseLineStarts()
{
// Start position of the line currently being scanned.
var position = 0;
// Number of characters consumed so far, i.e. the position just past the current character.
var index = 0;
// Value of 'index' recorded for the most recent '\r'; -1 while no '\r' has been seen.
var lastCr = -1;
var arrayBuilder = ArrayBuilder<int>.GetInstance();
// The following loop goes through every character in the text. It is highly
// performance critical, and thus inlines knowledge about common line breaks
// and non-line breaks.
foreach (var chunk in _chunks)
{
foreach (var c in chunk)
{
// Incremented before the character is examined, so 'index' already points past 'c' below.
index++;
// Common case - ASCII & not a line break
// A single unsigned comparison accepts exactly the characters in ['\r' + 1, 127]:
// anything at or below '\r' wraps around to a large value, anything above 127 is too big.
const uint bias = '\r' + 1;
if (unchecked(c - bias) <= (127 - bias))
{
continue;
}
switch(c)
{
case '\r':
// Remember where this CR ended so that an immediately following LF can be paired with it.
lastCr = index;
goto line_break;
case '\n':
// Assumes that the only 2-char line break sequence is CR+LF
if (lastCr == (index - 1))
{
// LF directly after CR: the break was already recorded for the CR, so only move
// the start of the next line past the LF.
position = index;
break;
}
goto line_break;
case '\u0085':
case '\u2028':
case '\u2029':
line_break:
// Record the start of the line that just ended; the next line begins after this character.
arrayBuilder.Add(position);
position = index;
break;
}
}
}
// Create a start for the final line.
arrayBuilder.Add(position);
return arrayBuilder.ToArrayAndFree();
}
}
}
......@@ -6,7 +6,6 @@
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Reflection;
using System.Text;
using System.Threading;
using Roslyn.Utilities;
......@@ -20,14 +19,17 @@ public abstract class SourceText
{
private const int CharBufferSize = 32 * 1024;
private const int CharBufferCount = 5;
private const int LargeObjectHeapLimitInChars = 40 * 1024; // 40KB
private static readonly ObjectPool<char[]> s_charArrayPool = new ObjectPool<char[]>(() => new char[CharBufferSize], CharBufferCount);
private readonly SourceHashAlgorithm _checksumAlgorithm;
private SourceTextContainer _lazyContainer;
private LineInfo _lazyLineInfo;
private TextLineCollection _lazyLineInfo;
private ImmutableArray<byte> _lazyChecksum;
private static readonly Encoding Utf8EncodingWithNoBOM = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: false);
protected SourceText(ImmutableArray<byte> checksum = default(ImmutableArray<byte>), SourceHashAlgorithm checksumAlgorithm = SourceHashAlgorithm.Sha1, SourceTextContainer container = null)
{
ValidateChecksumAlgorithm(checksumAlgorithm);
......@@ -78,7 +80,7 @@ public static SourceText From(string text, Encoding encoding = null, SourceHashA
/// <summary>
/// Constructs a <see cref="SourceText"/> from stream content.
/// </summary>
/// <param name="stream">Stream.</param>
/// <param name="stream">Stream. The stream must be seekable.</param>
/// <param name="encoding">
/// Data encoding to use if the stream doesn't start with Byte Order Mark specifying the encoding.
/// <see cref="Encoding.UTF8"/> if not specified.
......@@ -86,14 +88,18 @@ public static SourceText From(string text, Encoding encoding = null, SourceHashA
/// <param name="checksumAlgorithm">
/// Hash algorithm to use to calculate checksum of the text that's saved to PDB.
/// </param>
/// <param name="throwIfBinaryDetected">If the decoded text contains at least two consecutive NUL
/// characters, then an <see cref="InvalidDataException"/> is thrown.</param>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
/// <exception cref="ArgumentException">
/// <paramref name="stream"/> doesn't support reading or seeking.
/// <paramref name="checksumAlgorithm"/> is not supported.
/// </exception>
/// <exception cref="DecoderFallbackException">If the given encoding is set to use a throwing decoder as a fallback</exception>
/// <exception cref="InvalidDataException">Two consecutive NUL characters were detected in the decoded text and <paramref name="throwIfBinaryDetected"/> was true.</exception>
/// <exception cref="IOException">An I/O error occurs.</exception>
/// <remarks>Reads from the beginning of the stream. Leaves the stream open.</remarks>
public static SourceText From(Stream stream, Encoding encoding = null, SourceHashAlgorithm checksumAlgorithm = SourceHashAlgorithm.Sha1)
public static SourceText From(Stream stream, Encoding encoding = null, SourceHashAlgorithm checksumAlgorithm = SourceHashAlgorithm.Sha1, bool throwIfBinaryDetected = false)
{
if (stream == null)
{
......@@ -107,29 +113,158 @@ public static SourceText From(Stream stream, Encoding encoding = null, SourceHas
ValidateChecksumAlgorithm(checksumAlgorithm);
encoding = encoding ?? Encoding.UTF8;
encoding = encoding ?? Utf8EncodingWithNoBOM;
// TODO: unify encoding detection with EncodedStringText
// If the resulting string would end up on the large object heap, then use LargeEncodedText.
if (encoding.GetMaxCharCount((int)stream.Length) >= LargeObjectHeapLimitInChars)
{
return LargeEncodedText.Decode(stream, encoding, checksumAlgorithm, throwIfBinaryDetected);
}
stream.Seek(0, SeekOrigin.Begin);
string text;
using (var reader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks: true, bufferSize: 1024, leaveOpen: true))
string text = Decode(stream, encoding, out encoding);
if (throwIfBinaryDetected && IsBinary(text))
{
text = reader.ReadToEnd();
encoding = reader.CurrentEncoding;
throw new InvalidDataException();
}
return new StringText(text, encoding, CalculateChecksum(stream, checksumAlgorithm), checksumAlgorithm);
var checksum = CalculateChecksum(stream, checksumAlgorithm);
return new StringText(text, encoding, checksum, checksumAlgorithm);
}
/// <summary>
/// Creates a <see cref="SourceText"/> by decoding the leading bytes of a byte array.
/// </summary>
/// <param name="buffer">The array holding the encoded source.</param>
/// <param name="length">How many bytes, counted from the start of <paramref name="buffer"/>, to decode.</param>
/// <param name="encoding">
/// Fallback encoding used when the data does not begin with a recognized byte order mark.
/// When null, a UTF-8 encoding that emits no byte order mark is used.
/// </param>
/// <param name="checksumAlgorithm">
/// Hash algorithm to use to calculate checksum of the text that's saved to PDB.
/// </param>
/// <param name="throwIfBinaryDetected">
/// When true, an <see cref="InvalidDataException"/> is thrown if the decoded text contains
/// two consecutive NUL characters.
/// </param>
/// <returns>The decoded text.</returns>
/// <exception cref="ArgumentNullException"><paramref name="buffer"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="length"/> is negative or exceeds the length of <paramref name="buffer"/>.</exception>
/// <exception cref="ArgumentException"><paramref name="checksumAlgorithm"/> is not supported.</exception>
/// <exception cref="DecoderFallbackException">If the given encoding is set to use a throwing decoder as a fallback</exception>
/// <exception cref="InvalidDataException">Two consecutive NUL characters were detected in the decoded text and <paramref name="throwIfBinaryDetected"/> was true.</exception>
public static SourceText From(byte[] buffer, int length, Encoding encoding = null, SourceHashAlgorithm checksumAlgorithm = SourceHashAlgorithm.Sha1, bool throwIfBinaryDetected = false)
{
    if (buffer == null)
    {
        throw new ArgumentNullException(nameof(buffer));
    }

    bool lengthInRange = length >= 0 && length <= buffer.Length;
    if (!lengthInRange)
    {
        throw new ArgumentOutOfRangeException(nameof(length));
    }

    ValidateChecksumAlgorithm(checksumAlgorithm);

    // A byte order mark in the buffer takes precedence over the requested encoding.
    Encoding fallbackEncoding = encoding ?? Utf8EncodingWithNoBOM;
    Encoding actualEncoding;
    string decodedText = Decode(buffer, length, fallbackEncoding, out actualEncoding);

    if (throwIfBinaryDetected)
    {
        if (IsBinary(decodedText))
        {
            throw new InvalidDataException();
        }
    }

    // The raw bytes are already in memory, so the checksum is computed directly from them.
    return new StringText(
        decodedText,
        actualEncoding,
        CalculateChecksum(buffer, 0, length, checksumAlgorithm),
        checksumAlgorithm);
}
/// <summary>
/// Reads a whole stream from its beginning and decodes it to a string.
/// </summary>
/// <param name="stream">The stream holding the encoded text. It is rewound to position 0 and left open.</param>
/// <param name="encoding">Fallback encoding used when no byte order mark identifies the encoding.</param>
/// <param name="actualEncoding">Receives the encoding that was used to decode the text.</param>
/// <returns>The decoded text, or <see cref="string.Empty"/> for a zero-length stream.</returns>
/// <exception cref="DecoderFallbackException">If the given encoding is set to use a throwing decoder as a fallback</exception>
private static string Decode(Stream stream, Encoding encoding, out Encoding actualEncoding)
{
    Debug.Assert(stream != null);
    Debug.Assert(encoding != null);

    stream.Seek(0, SeekOrigin.Begin);

    int byteCount = (int)stream.Length;
    if (byteCount != 0)
    {
        // The reader buffer is capped at 4KB rather than the 1KB default. FileStreams can
        // reach this path, and callers may deliberately open them with a tiny (1 byte) buffer
        // to avoid FileStream buffer allocations for small files. 4KB matches FileStream's
        // default buffer size, so file I/O still happens in 4KB chunks.
        int readBufferSize = Math.Min(4096, byteCount);

        using (var streamReader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks: true, bufferSize: readBufferSize, leaveOpen: true))
        {
            string decoded = streamReader.ReadToEnd();
            actualEncoding = streamReader.CurrentEncoding;
            return decoded;
        }
    }

    // Nothing to decode: report the fallback encoding unchanged.
    actualEncoding = encoding;
    return string.Empty;
}
/// <summary>
/// Decode text from a byte array.
/// </summary>
public SourceHashAlgorithm ChecksumAlgorithm
/// <param name="buffer">The byte array containing encoded text.</param>
/// <param name="length">The count of valid bytes in <paramref name="buffer"/>.</param>
/// <param name="encoding">The encoding to use if an encoding cannot be determined from the byte order mark.</param>
/// <param name="actualEncoding">The actual encoding used.</param>
/// <returns>The decoded text.</returns>
/// <exception cref="DecoderFallbackException">If the given encoding is set to use a throwing decoder as a fallback</exception>
private static string Decode(byte[] buffer, int length, Encoding encoding, out Encoding actualEncoding)
{
get { return _checksumAlgorithm; }
Debug.Assert(buffer != null);
Debug.Assert(encoding != null);
int preambleLength;
actualEncoding = TryReadByteOrderMark(buffer, length, out preambleLength) ?? encoding;
return actualEncoding.GetString(buffer, preambleLength, length - preambleLength);
}
/// <summary>
/// Reports whether the text contains two adjacent NUL (U+0000) characters.
/// Genuine text is unlikely to contain such a pair, so its presence is used as a
/// heuristic for detecting binary files.
/// </summary>
/// <remarks>
/// internal for unit testing
/// </remarks>
internal static bool IsBinary(string text)
{
    // PERF: Only every second character has to be inspected. If the character at 'cursor'
    // is not NUL, no NUL pair can include it, so the next candidate is two positions on.
    int cursor = 1;
    while (cursor < text.Length)
    {
        if (text[cursor] != '\0')
        {
            cursor += 2;
            continue;
        }

        if (text[cursor - 1] == '\0')
        {
            return true;
        }

        // A lone NUL: its right-hand neighbour might still complete a pair.
        cursor++;
    }

    return false;
}
/// <summary>
/// Hash algorithm to use to calculate checksum of the text that's saved to PDB.
/// </summary>
public SourceHashAlgorithm ChecksumAlgorithm => _checksumAlgorithm;
/// <summary>
/// Encoding of the file that the text was read from or is going to be saved to.
/// <c>null</c> if the encoding is unspecified.
......@@ -479,21 +614,26 @@ public virtual IReadOnlyList<TextChange> GetTextChanges(SourceText oldText)
/// <summary>
/// The collection of individual text lines.
/// </summary>
public virtual TextLineCollection Lines
public TextLineCollection Lines
{
get
{
if (_lazyLineInfo == null)
{
var info = new LineInfo(this, this.ParseLineStarts());
Interlocked.CompareExchange(ref _lazyLineInfo, info, null);
}
return _lazyLineInfo;
var info = _lazyLineInfo;
return info ?? Interlocked.CompareExchange(ref _lazyLineInfo, info = GetLinesCore(), null) ?? info;
}
}
private class LineInfo : TextLineCollection
/// <summary>
/// Creates the <see cref="TextLineCollection"/> for this text. <see cref="Lines"/> calls this
/// when no collection has been cached yet and then caches the result.
/// </summary>
/// <returns>A freshly created <see cref="TextLineCollection"/> describing each line of the text.</returns>
protected virtual TextLineCollection GetLinesCore() => new LineInfo(this, ParseLineStarts());
internal sealed class LineInfo : TextLineCollection
{
private readonly SourceText _text;
private readonly int[] _lineStarts;
......@@ -505,10 +645,7 @@ public LineInfo(SourceText text, int[] lineStarts)
_lineStarts = lineStarts;
}
public override int Count
{
get { return _lineStarts.Length; }
}
public override int Count => _lineStarts.Length;
public override TextLine this[int index]
{
......@@ -701,6 +838,55 @@ protected virtual bool ContentEqualsImpl(SourceText other)
#endregion
/// <summary>
/// Inspects the start of a buffer for a byte order mark and maps it to an encoding.
/// Recognized marks are FE FF (<see cref="Encoding.BigEndianUnicode"/>),
/// FF FE (<see cref="Encoding.Unicode"/>) and EF BB BF (<see cref="Encoding.UTF8"/>).
/// </summary>
/// <param name="source">The buffer holding the encoded text.</param>
/// <param name="length">The number of valid bytes at the start of <paramref name="source"/>.</param>
/// <param name="preambleLength">Receives the size in bytes of the detected mark, or 0 when none was found.</param>
/// <returns>The encoding identified by the mark, or null when no recognized mark is present.</returns>
internal static Encoding TryReadByteOrderMark(byte[] source, int length, out int preambleLength)
{
    Debug.Assert(source != null);
    Debug.Assert(length <= source.Length);

    // Every recognized mark is at least two bytes long.
    if (length >= 2)
    {
        byte lead = source[0];

        if (lead == 0xFE && source[1] == 0xFF)
        {
            preambleLength = 2;
            return Encoding.BigEndianUnicode;
        }
        else if (lead == 0xFF && source[1] == 0xFE)
        {
            preambleLength = 2;
            return Encoding.Unicode;
        }
        else if (lead == 0xEF && source[1] == 0xBB && length >= 3 && source[2] == 0xBF)
        {
            preambleLength = 3;
            return Encoding.UTF8;
        }
    }

    preambleLength = 0;
    return null;
}
private class StaticContainer : SourceTextContainer
{
private readonly SourceText _text;
......@@ -710,10 +896,7 @@ public StaticContainer(SourceText text)
_text = text;
}
public override SourceText CurrentText
{
get { return _text; }
}
public override SourceText CurrentText => _text;
public override event EventHandler<TextChangeEventArgs> TextChanged
{
......
......@@ -4,10 +4,8 @@
using System.Collections.Immutable;
using System.Diagnostics;
using System.IO;
using System.Reflection;
using System.Text;
using System.Threading;
using Roslyn.Utilities;
namespace Microsoft.CodeAnalysis.Text
{
......@@ -28,26 +26,17 @@ internal StringText(string source, Encoding encodingOpt, ImmutableArray<byte> ch
_encodingOpt = encodingOpt;
}
public override Encoding Encoding
{
get { return _encodingOpt; }
}
public override Encoding Encoding => _encodingOpt;
/// <summary>
/// Underlying string which is the source of this <see cref="StringText"/>instance
/// </summary>
public string Source
{
get { return _source; }
}
public string Source => _source;
/// <summary>
/// The length of the text represented by <see cref="StringText"/>.
/// </summary>
public override int Length
{
get { return this.Source.Length; }
}
public override int Length => _source.Length;
/// <summary>
/// Returns a character at given position.
......
......@@ -26,7 +26,7 @@ private class SnapshotSourceText : SourceText
/// <summary>
/// Use a separate class for closed files to simplify memory leak investigations
/// </summary>
internal class ClosedSnapshotSourceText : SnapshotSourceText
internal sealed class ClosedSnapshotSourceText : SnapshotSourceText
{
public ClosedSnapshotSourceText(ITextSnapshot roslynSnapshot, Encoding encodingOpt)
: base(roslynSnapshot, encodingOpt, containerOpt: null)
......@@ -43,7 +43,6 @@ public ClosedSnapshotSourceText(ITextSnapshot roslynSnapshot, Encoding encodingO
private readonly Encoding _encodingOpt;
private readonly TextBufferContainer _containerOpt;
private readonly int _reiteratedVersion;
private LineInfo _lineInfo;
private SnapshotSourceText(ITextSnapshot editorSnapshot, Encoding encodingOpt)
{
......@@ -145,17 +144,9 @@ public override int Length
}
#region Lines
public override TextLineCollection Lines
protected override TextLineCollection GetLinesCore()
{
get
{
if (_lineInfo == null)
{
System.Threading.Interlocked.CompareExchange(ref _lineInfo, new LineInfo(this), null);
}
return _lineInfo;
}
return new LineInfo(this);
}
private class LineInfo : TextLineCollection
......
......@@ -30,7 +30,7 @@ Namespace Microsoft.VisualStudio.LanguageServices.VisualBasic.ProjectSystemShim
Function CompileAndGetErrorCount(metadataService As IMetadataService) As Integer
Dim trees = _files.Select(Function(path)
Using stream = FileUtilities.OpenRead(path)
Return SyntaxFactory.ParseSyntaxTree(EncodedStringText.From(stream), options:=_compilerOptions.ParseOptions, path:=path)
Return SyntaxFactory.ParseSyntaxTree(SourceText.From(stream), options:=_compilerOptions.ParseOptions, path:=path)
End Using
End Function)
......
......@@ -61,9 +61,6 @@
<Compile Include="..\..\..\Compilers\Core\Desktop\AssemblyReferenceResolver.cs">
<Link>InternalUtilities\AssemblyReferenceResolver.cs</Link>
</Compile>
<Compile Include="..\..\..\Compilers\Core\Desktop\LargeEncodedText.cs">
<Link>InternalUtilities\LargeEncodedText.cs</Link>
</Compile>
<Compile Include="..\..\..\Compilers\Core\Desktop\MetadataFileReferenceResolver.cs">
<Link>InternalUtilities\MetadataFileReferenceResolver.cs</Link>
</Compile>
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册