diff --git a/src/Compilers/Core/CodeAnalysisTest/Text/EncodedStringTextTests.cs b/src/Compilers/Core/CodeAnalysisTest/Text/EncodedStringTextTests.cs index 3a34c44262b459647317171533258ba61a629c04..f6e1c250dfb8686f48be30cbd8b586d930e43d2d 100644 --- a/src/Compilers/Core/CodeAnalysisTest/Text/EncodedStringTextTests.cs +++ b/src/Compilers/Core/CodeAnalysisTest/Text/EncodedStringTextTests.cs @@ -93,43 +93,6 @@ public void CheckSum_SHA256() Assert.Equal("f1945cd6 c19e56b3 c1c78943 ef5ec181 16907a4c a1efc40a 57d48ab1 db7adfc5", StringTextTest.ChecksumToHexQuads(checksum)); } - [Fact] - public void TryReadByteOrderMark() - { - Assert.Null(EncodedStringText.TryReadByteOrderMark(new MemoryStream(new byte[0]))); - - Assert.Null(EncodedStringText.TryReadByteOrderMark(new MemoryStream(new byte[] { 0xef }))); - Assert.Null(EncodedStringText.TryReadByteOrderMark(new MemoryStream(new byte[] { 0xef, 0xbb }))); - Assert.Equal(Encoding.UTF8, EncodedStringText.TryReadByteOrderMark(new MemoryStream(new byte[] { 0xef, 0xBB, 0xBF }))); - - Assert.Null(EncodedStringText.TryReadByteOrderMark(new MemoryStream(new byte[] { 0xff }))); - Assert.Equal(Encoding.Unicode, EncodedStringText.TryReadByteOrderMark(new MemoryStream(new byte[] { 0xff, 0xfe }))); - - Assert.Null(EncodedStringText.TryReadByteOrderMark(new MemoryStream(new byte[] { 0xfe }))); - Assert.Equal(Encoding.BigEndianUnicode, EncodedStringText.TryReadByteOrderMark(new MemoryStream(new byte[] { 0xfe, 0xff }))); - } - - [Fact] - public void IsBinary() - { - Assert.False(EncodedStringText.IsBinary("")); - - Assert.False(EncodedStringText.IsBinary("\0abc")); - Assert.False(EncodedStringText.IsBinary("a\0bc")); - Assert.False(EncodedStringText.IsBinary("abc\0")); - Assert.False(EncodedStringText.IsBinary("a\0b\0c")); - - Assert.True(EncodedStringText.IsBinary("\0\0abc")); - Assert.True(EncodedStringText.IsBinary("a\0\0bc")); - Assert.True(EncodedStringText.IsBinary("abc\0\0")); - - var encoding = Encoding.GetEncoding(1252); - 
Assert.False(EncodedStringText.IsBinary(encoding.GetString(new byte[] { 0x81, 0x8D, 0x8F, 0x90, 0x9D }))); - // Unicode string: äëïöüû - Assert.False(EncodedStringText.IsBinary("abc def baz aeiouy \u00E4\u00EB\u00EF\u00F6\u00FC\u00FB")); - Assert.True(EncodedStringText.IsBinary(encoding.GetString(ProprietaryTestResources.NetFX.v4_0_30319.System))); - } - [Fact] public void Decode_NonUtf8() { diff --git a/src/Compilers/Core/CodeAnalysisTest/Text/LargeEncodedTextTests.cs b/src/Compilers/Core/CodeAnalysisTest/Text/LargeEncodedTextTests.cs index 419239d3ff1a028191239183d39070ac575fd72f..f143b8e87402a2fde6e8d21b863fb368e56ec8b8 100644 --- a/src/Compilers/Core/CodeAnalysisTest/Text/LargeEncodedTextTests.cs +++ b/src/Compilers/Core/CodeAnalysisTest/Text/LargeEncodedTextTests.cs @@ -109,7 +109,8 @@ public void CopyToLargeTest() } } - var text = CreateSourceText(stream); + var text = SourceText.From(stream); + Assert.IsType(typeof(LargeEncodedText), text); char[] buffer = new char[HelloWorld.Length]; for (int start = 0; start < text.Length; start += HelloWorld.Length) @@ -119,5 +120,196 @@ public void CopyToLargeTest() } } } + + private static void CheckEqualLine(TextLine first, TextLine second) + { + Assert.Equal(first, second); +#if false + // We do not guarantee either identity or Equals! 
+ Assert.Equal(first.Extent, second.Extent); + Assert.Equal(first.ExtentIncludingLineBreak, second.ExtentIncludingLineBreak); +#endif + } + + private static void CheckNotEqualLine(TextLine first, TextLine second) + { + Assert.NotEqual(first, second); +#if false + Assert.NotEqual(first, second); + Assert.NotEqual(first.Extent, second.Extent); + Assert.NotEqual(first.ExtentIncludingLineBreak, second.ExtentIncludingLineBreak); +#endif + } + + private static void CheckLine(SourceText text, int lineNumber, int start, int length, int newlineLength, string lineText) + { + var textLine = text.Lines[lineNumber]; + + Assert.Equal(start, textLine.Start); + Assert.Equal(start + length, textLine.End); + Assert.Equal(start + length + newlineLength, textLine.EndIncludingLineBreak); + Assert.Equal(start, textLine.Span.Start); + Assert.Equal(length, textLine.Span.Length); + Assert.Equal(start, textLine.SpanIncludingLineBreak.Start); + Assert.Equal(length + newlineLength, textLine.SpanIncludingLineBreak.Length); + Assert.Equal(lineNumber, textLine.LineNumber); + Assert.Equal(lineText, textLine.ToString()); + Assert.Equal(text.ToString().Substring(start, length), textLine.ToString()); + + CheckEqualLine(textLine, text.Lines[lineNumber]); + for (int p = textLine.Start; p < textLine.EndIncludingLineBreak; ++p) + { + CheckEqualLine(textLine, text.Lines.GetLineFromPosition(p)); + Assert.Equal(lineNumber, text.Lines.IndexOf(p)); + Assert.Equal(lineNumber, text.Lines.GetLinePosition(p).Line); + Assert.Equal(p - start, text.Lines.GetLinePosition(p).Character); + } + + if (start != 0) + { + CheckNotEqualLine(textLine, text.Lines.GetLineFromPosition(start - 1)); + Assert.Equal(lineNumber - 1, text.Lines.IndexOf(start - 1)); + Assert.Equal(lineNumber - 1, text.Lines.GetLinePosition(start - 1).Line); + } + + int nextPosition = start + length + newlineLength; + if (nextPosition < text.Length) + { + CheckNotEqualLine(textLine, text.Lines.GetLineFromPosition(nextPosition)); + 
Assert.Equal(lineNumber + 1, text.Lines.IndexOf(nextPosition)); + Assert.Equal(lineNumber + 1, text.Lines.GetLinePosition(nextPosition).Line); + } + } + + [Fact] + public void NewLines1() + { + var data = CreateSourceText("foo" + Environment.NewLine + " bar"); + Assert.Equal(2, data.Lines.Count); + CheckLine(data, lineNumber: 0, start: 0, length: 3, newlineLength: 2, lineText: "foo"); + CheckLine(data, lineNumber: 1, start: 5, length: 4, newlineLength: 0, lineText: " bar"); + } + + [Fact] + public void NewLines2() + { + var text = +@"foo +bar +baz"; + var data = CreateSourceText(text); + Assert.Equal(3, data.Lines.Count); + CheckLine(data, lineNumber: 0, start: 0, length: 3, newlineLength: 2, lineText: "foo"); + CheckLine(data, lineNumber: 1, start: 5, length: 3, newlineLength: 2, lineText: "bar"); + CheckLine(data, lineNumber: 2, start: 10, length: 3, newlineLength: 0, lineText: "baz"); + } + + [Fact] + public void NewLines3() + { + var data = CreateSourceText("foo\r\nbar"); + Assert.Equal(2, data.Lines.Count); + CheckLine(data, lineNumber: 0, start: 0, length: 3, newlineLength: 2, lineText: "foo"); + CheckLine(data, lineNumber: 1, start: 5, length: 3, newlineLength: 0, lineText: "bar"); + } + + [Fact] + public void NewLines4() + { + var data = CreateSourceText("foo\n\rbar\u2028"); + Assert.Equal(4, data.Lines.Count); + CheckLine(data, lineNumber: 0, start: 0, length: 3, newlineLength: 1, lineText: "foo"); + CheckLine(data, lineNumber: 1, start: 4, length: 0, newlineLength: 1, lineText: ""); + CheckLine(data, lineNumber: 2, start: 5, length: 3, newlineLength: 1, lineText: "bar"); + CheckLine(data, lineNumber: 3, start: 9, length: 0, newlineLength: 0, lineText: ""); + } + + [Fact] + public void NewLines5() + { + // Trailing CR + var data = CreateSourceText("foo\r"); + Assert.Equal(2, data.Lines.Count); + CheckLine(data, lineNumber: 0, start: 0, length: 3, newlineLength: 1, lineText: "foo"); + CheckLine(data, lineNumber: 1, start: 4, length: 0, newlineLength: 0, 
lineText: ""); + } + + [Fact] + public void NewLines6() + { + // Trailing CR+LF + var data = CreateSourceText("foo\r\n"); + Assert.Equal(2, data.Lines.Count); + CheckLine(data, lineNumber: 0, start: 0, length: 3, newlineLength: 2, lineText: "foo"); + CheckLine(data, lineNumber: 1, start: 5, length: 0, newlineLength: 0, lineText: ""); + } + + [Fact] + public void NewLines7() + { + // Consecutive CR + var data = CreateSourceText("foo\r\rbar"); + Assert.Equal(3, data.Lines.Count); + CheckLine(data, lineNumber: 0, start: 0, length: 3, newlineLength: 1, lineText: "foo"); + CheckLine(data, lineNumber: 1, start: 4, length: 0, newlineLength: 1, lineText: ""); + CheckLine(data, lineNumber: 2, start: 5, length: 3, newlineLength: 0, lineText: "bar"); + } + + [Fact] + public void NewLines8() + { + // Mix CR with CR+LF + const string cr = "\r"; + const string crLf = "\r\n"; + var data = CreateSourceText("foo" + cr + crLf + cr + "bar"); + Assert.Equal(4, data.Lines.Count); + CheckLine(data, lineNumber: 0, start: 0, length: 3, newlineLength: 1, lineText: "foo"); + CheckLine(data, lineNumber: 1, start: 4, length: 0, newlineLength: 2, lineText: ""); + CheckLine(data, lineNumber: 2, start: 6, length: 0, newlineLength: 1, lineText: ""); + CheckLine(data, lineNumber: 3, start: 7, length: 3, newlineLength: 0, lineText: "bar"); + } + + [Fact] + public void NewLines9() + { + // Mix CR with CR+LF and LF + const string cr = "\r"; + const string crLf = "\r\n"; + const string lf = "\n"; + var data = CreateSourceText("foo" + cr + crLf + lf + "bar"); + Assert.Equal(4, data.Lines.Count); + CheckLine(data, lineNumber: 0, start: 0, length: 3, newlineLength: 1, lineText: "foo"); + CheckLine(data, lineNumber: 1, start: 4, length: 0, newlineLength: 2, lineText: ""); + CheckLine(data, lineNumber: 2, start: 6, length: 0, newlineLength: 1, lineText: ""); + CheckLine(data, lineNumber: 3, start: 7, length: 3, newlineLength: 0, lineText: "bar"); + } + + [Fact] + public void Empty() + { + var data = 
CreateSourceText(""); + Assert.Equal(1, data.Lines.Count); + CheckLine(data, lineNumber: 0, start: 0, length: 0, newlineLength: 0, lineText: ""); + } + + [Fact] + public void LinesGetText1() + { + var text = +@"foo +bar baz"; + var data = CreateSourceText(text); + Assert.Equal(2, data.Lines.Count); + Assert.Equal("foo", data.Lines[0].ToString()); + Assert.Equal("bar baz", data.Lines[1].ToString()); + } + + [Fact] + public void LinesGetText2() + { + var text = "foo"; + var data = CreateSourceText(text); + Assert.Equal("foo", data.Lines[0].ToString()); + } } } \ No newline at end of file diff --git a/src/Compilers/Core/CodeAnalysisTest/Text/SourceTextTests.cs b/src/Compilers/Core/CodeAnalysisTest/Text/SourceTextTests.cs index b898ed7488e551ef278081b671fde6ef2c46e367..5568694430374868f10cff64f67e35aef7ffd1aa 100644 --- a/src/Compilers/Core/CodeAnalysisTest/Text/SourceTextTests.cs +++ b/src/Compilers/Core/CodeAnalysisTest/Text/SourceTextTests.cs @@ -1,12 +1,11 @@ // Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information. 
-using System; -using System.Collections.Immutable; using System.IO; using System.Linq; using System.Text; using Microsoft.CodeAnalysis.Text; using Xunit; +using ProprietaryTestResources = Microsoft.CodeAnalysis.Test.Resources.Proprietary; namespace Microsoft.CodeAnalysis.UnitTests.Text { @@ -15,31 +14,60 @@ public class SourceTextTests private static readonly Encoding s_utf8 = Encoding.UTF8; private static readonly Encoding s_utf8Bom = new UTF8Encoding(encoderShouldEmitUTF8Identifier: true); private static readonly Encoding s_unicode = Encoding.Unicode; + private const string HelloWorld = "Hello, World!"; + + [Fact] + public void Empty() + { + TestIsEmpty(SourceText.From(string.Empty)); + TestIsEmpty(SourceText.From(new byte[0], 0)); + TestIsEmpty(SourceText.From(new MemoryStream())); + } + + private static void TestIsEmpty(SourceText text) + { + Assert.Equal(0, text.Length); + Assert.Same(string.Empty, text.ToString()); + Assert.Equal(1, text.Lines.Count); + Assert.Equal(0, text.Lines[0].Span.Length); + } [Fact] public void Encoding1() { - Assert.Same(s_utf8, SourceText.From("foo", s_utf8).Encoding); - Assert.Same(s_unicode, SourceText.From("foo", s_unicode).Encoding); - Assert.Same(s_unicode, SourceText.From(new MemoryStream(s_unicode.GetBytes("foo")), s_unicode).Encoding); + Assert.Same(s_utf8, SourceText.From(HelloWorld, s_utf8).Encoding); + Assert.Same(s_unicode, SourceText.From(HelloWorld, s_unicode).Encoding); + + var bytes = s_unicode.GetBytes(HelloWorld); + Assert.Same(s_unicode, SourceText.From(bytes, bytes.Length, s_unicode).Encoding); + + var stream = new MemoryStream(bytes); + Assert.Same(s_unicode, SourceText.From(stream, s_unicode).Encoding); } [Fact] public void EncodingBOM() { - var stream = new MemoryStream(s_utf8Bom.GetPreamble().Concat(s_utf8Bom.GetBytes("abc")).ToArray()); + var bytes = s_utf8Bom.GetPreamble().Concat(s_utf8Bom.GetBytes("abc")).ToArray(); + Assert.Equal(s_utf8.EncodingName, SourceText.From(bytes, bytes.Length, 
s_unicode).Encoding.EncodingName); + + var stream = new MemoryStream(bytes); Assert.Equal(s_utf8.EncodingName, SourceText.From(stream, s_unicode).Encoding.EncodingName); } [Fact] public void ChecksumAlgorithm1() { - Assert.Equal(SourceHashAlgorithm.Sha1, SourceText.From("foo").ChecksumAlgorithm); - Assert.Equal(SourceHashAlgorithm.Sha1, SourceText.From("foo", checksumAlgorithm: SourceHashAlgorithm.Sha1).ChecksumAlgorithm); - Assert.Equal(SourceHashAlgorithm.Sha256, SourceText.From("foo", checksumAlgorithm: SourceHashAlgorithm.Sha256).ChecksumAlgorithm); + Assert.Equal(SourceHashAlgorithm.Sha1, SourceText.From(HelloWorld).ChecksumAlgorithm); + Assert.Equal(SourceHashAlgorithm.Sha1, SourceText.From(HelloWorld, checksumAlgorithm: SourceHashAlgorithm.Sha1).ChecksumAlgorithm); + Assert.Equal(SourceHashAlgorithm.Sha256, SourceText.From(HelloWorld, checksumAlgorithm: SourceHashAlgorithm.Sha256).ChecksumAlgorithm); - var stream = new MemoryStream(s_unicode.GetBytes("foo")); + var bytes = s_unicode.GetBytes(HelloWorld); + Assert.Equal(SourceHashAlgorithm.Sha1, SourceText.From(bytes, bytes.Length).ChecksumAlgorithm); + Assert.Equal(SourceHashAlgorithm.Sha1, SourceText.From(bytes, bytes.Length, checksumAlgorithm: SourceHashAlgorithm.Sha1).ChecksumAlgorithm); + Assert.Equal(SourceHashAlgorithm.Sha256, SourceText.From(bytes, bytes.Length, checksumAlgorithm: SourceHashAlgorithm.Sha256).ChecksumAlgorithm); + var stream = new MemoryStream(bytes); Assert.Equal(SourceHashAlgorithm.Sha1, SourceText.From(stream).ChecksumAlgorithm); Assert.Equal(SourceHashAlgorithm.Sha1, SourceText.From(stream, checksumAlgorithm: SourceHashAlgorithm.Sha1).ChecksumAlgorithm); Assert.Equal(SourceHashAlgorithm.Sha256, SourceText.From(stream, checksumAlgorithm: SourceHashAlgorithm.Sha256).ChecksumAlgorithm); @@ -48,14 +76,14 @@ public void ChecksumAlgorithm1() [Fact] public void ContentEquals() { - var f = SourceText.From("foo", s_utf8); + var f = SourceText.From(HelloWorld, s_utf8); - 
Assert.True(f.ContentEquals(SourceText.From("foo", s_utf8))); - Assert.False(f.ContentEquals(SourceText.From("fooo", s_utf8))); - Assert.True(SourceText.From("foo", s_utf8).ContentEquals(SourceText.From("foo", s_utf8))); + Assert.True(f.ContentEquals(SourceText.From(HelloWorld, s_utf8))); + Assert.False(f.ContentEquals(SourceText.From(HelloWorld + "o", s_utf8))); + Assert.True(SourceText.From(HelloWorld, s_utf8).ContentEquals(SourceText.From(HelloWorld, s_utf8))); - var e1 = EncodedStringText.Create(new MemoryStream(s_unicode.GetBytes("foo")), s_unicode); - var e2 = EncodedStringText.Create(new MemoryStream(s_utf8.GetBytes("foo")), s_utf8); + var e1 = EncodedStringText.Create(new MemoryStream(s_unicode.GetBytes(HelloWorld)), s_unicode); + var e2 = EncodedStringText.Create(new MemoryStream(s_utf8.GetBytes(HelloWorld)), s_utf8); Assert.True(e1.ContentEquals(e1)); Assert.True(f.ContentEquals(e1)); @@ -65,5 +93,73 @@ public void ContentEquals() Assert.True(e1.ContentEquals(e2)); Assert.True(e2.ContentEquals(e1)); } + + [Fact] + public void IsBinary() + { + Assert.False(SourceText.IsBinary("")); + + Assert.False(SourceText.IsBinary("\0abc")); + Assert.False(SourceText.IsBinary("a\0bc")); + Assert.False(SourceText.IsBinary("abc\0")); + Assert.False(SourceText.IsBinary("a\0b\0c")); + + Assert.True(SourceText.IsBinary("\0\0abc")); + Assert.True(SourceText.IsBinary("a\0\0bc")); + Assert.True(SourceText.IsBinary("abc\0\0")); + + var encoding = Encoding.GetEncoding(1252); + Assert.False(SourceText.IsBinary(encoding.GetString(new byte[] { 0x81, 0x8D, 0x8F, 0x90, 0x9D }))); + // Unicode string: äëïöüû + Assert.False(SourceText.IsBinary("abc def baz aeiouy \u00E4\u00EB\u00EF\u00F6\u00FC\u00FB")); + Assert.True(SourceText.IsBinary(encoding.GetString(ProprietaryTestResources.NetFX.v4_0_30319.System))); + } + + [Fact] + public void FromThrowsIfBinary() + { + var bytes = ProprietaryTestResources.NetFX.v4_0_30319.System; + Assert.Throws(() => SourceText.From(bytes, bytes.Length, 
throwIfBinaryDetected: true)); + + var stream = new MemoryStream(bytes); + Assert.Throws(() => SourceText.From(stream, throwIfBinaryDetected: true)); + } + + private static void TestTryReadByteOrderMark(Encoding expectedEncoding, int expectedPreambleLength, byte[] data) + { + TestTryReadByteOrderMark(expectedEncoding, expectedPreambleLength, data, data == null ? 0 : data.Length); + } + + private static void TestTryReadByteOrderMark(Encoding expectedEncoding, int expectedPreambleLength, byte[] data, int validLength) + { + int actualPreambleLength; + Encoding actualEncoding = SourceText.TryReadByteOrderMark(data, validLength, out actualPreambleLength); + if (expectedEncoding == null) + { + Assert.Null(actualEncoding); + } + else + { + Assert.Equal(expectedEncoding, actualEncoding); + } + + Assert.Equal(expectedPreambleLength, actualPreambleLength); + } + + [Fact] + public void TryReadByteOrderMark() + { + TestTryReadByteOrderMark(expectedEncoding: null, expectedPreambleLength: 0, data: new byte[0]); + TestTryReadByteOrderMark(expectedEncoding: null, expectedPreambleLength: 0, data: new byte[] { 0xef }); + TestTryReadByteOrderMark(expectedEncoding: null, expectedPreambleLength: 0, data: new byte[] { 0xef, 0xbb }); + TestTryReadByteOrderMark(expectedEncoding: null, expectedPreambleLength: 0, data: new byte[] { 0xef, 0xBB, 0xBF }, validLength: 2); + TestTryReadByteOrderMark(expectedEncoding: Encoding.UTF8, expectedPreambleLength: 3, data: new byte[] { 0xef, 0xBB, 0xBF }); + + TestTryReadByteOrderMark(expectedEncoding: null, expectedPreambleLength: 0, data: new byte[] { 0xff }); + TestTryReadByteOrderMark(expectedEncoding: Encoding.Unicode, expectedPreambleLength: 2, data: new byte[] { 0xff, 0xfe }); + + TestTryReadByteOrderMark(expectedEncoding: null, expectedPreambleLength: 0, data: new byte[] { 0xfe }); + TestTryReadByteOrderMark(expectedEncoding: Encoding.BigEndianUnicode, expectedPreambleLength: 2, data: new byte[] { 0xfe, 0xff }); + } } } diff --git 
a/src/Compilers/Core/CodeAnalysisTest/Text/StringTextTest.cs b/src/Compilers/Core/CodeAnalysisTest/Text/StringTextTest.cs index b88e422059156890944418e2caa15302924b6764..317fd6544bebc77d0dd2954e357c3cc0dbcc67c3 100644 --- a/src/Compilers/Core/CodeAnalysisTest/Text/StringTextTest.cs +++ b/src/Compilers/Core/CodeAnalysisTest/Text/StringTextTest.cs @@ -236,6 +236,9 @@ public void FromStream_CheckSum_BOM() [Fact] public void FromStream_CheckSum_NoBOM() { + // Note: The 0x95 is outside the ASCII range, so a question mark will + // be substituted in decoded text. Note, however, that the checksum + // should be derived from the original input. var bytes = new byte[] { 0x61, 0x62, 0x95 }; var source = SourceText.From(new MemoryStream(bytes), Encoding.ASCII); diff --git a/src/Compilers/Core/Desktop/CodeAnalysis.Desktop.csproj b/src/Compilers/Core/Desktop/CodeAnalysis.Desktop.csproj index 38846f3d9f83933fff58fda23a4805eef5d16eee..5172ece203f534e71b7fdf256c7dc4f20beee5c3 100644 --- a/src/Compilers/Core/Desktop/CodeAnalysis.Desktop.csproj +++ b/src/Compilers/Core/Desktop/CodeAnalysis.Desktop.csproj @@ -91,7 +91,6 @@ - diff --git a/src/Compilers/Core/Desktop/EncodedStringText.cs b/src/Compilers/Core/Desktop/EncodedStringText.cs index 06eae252d7037d9f43097b05a4ec4f1b6ed21133..6d135262adba60a5874878114fcfedf2e66a3674 100644 --- a/src/Compilers/Core/Desktop/EncodedStringText.cs +++ b/src/Compilers/Core/Desktop/EncodedStringText.cs @@ -1,39 +1,16 @@ // Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information. 
using System; -using System.Collections.Immutable; using System.Diagnostics; using System.IO; using System.Text; -using System.Threading; using Roslyn.Utilities; namespace Microsoft.CodeAnalysis.Text { - internal sealed class EncodedStringText : SourceText + internal static class EncodedStringText { - /// - /// Underlying string on which this SourceText instance is based - /// - private readonly string _source; - - private readonly Encoding _encoding; - - private const int LargeObjectHeapLimit = 80 * 1024; // 80KB - - private EncodedStringText(string source, Encoding encoding, ImmutableArray checksum, SourceHashAlgorithm checksumAlgorithm, bool throwIfBinary) - : base(checksum: checksum, checksumAlgorithm: checksumAlgorithm) - { - if (throwIfBinary && IsBinary(source)) - { - throw new InvalidDataException(); - } - - Debug.Assert(source != null); - Debug.Assert(encoding != null); - _source = source; - _encoding = encoding; - } + private const int LargeObjectHeapLimitInChars = 40 * 1024; // 40K chars (80KB as UTF-16) /// /// Encoding to use when there is no byte order mark (BOM) on the stream. This encoder may throw a @@ -42,9 +19,13 @@ private EncodedStringText(string source, Encoding encoding, ImmutableArray private static readonly Encoding FallbackEncoding = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true); /// - /// Initializes an instance of with provided bytes. + /// Initializes an instance of from the provided stream. This version differs + /// from in two ways: + /// 1. It attempts to minimize allocations by trying to read the stream into a byte array. + /// 2. If is null, it will first try UTF8 and, if that fails, it will + /// try . /// - /// + /// The stream containing encoded text. /// /// Specifies an encoding to be used if the actual encoding can't be determined from the stream content (the stream doesn't start with Byte Order Mark). /// If not specified auto-detect heuristics are used to determine the encoding. 
If these heuristics fail the decoding is assumed to be . @@ -84,156 +65,36 @@ internal static SourceText Create(Stream stream, Encoding defaultEncoding = null } } - public override Encoding Encoding - { - get { return _encoding; } - } - - /// - /// Underlying string which is the source of this SourceText instance - /// - public string Source - { - get { return _source; } - } - - /// - /// The length of the text represented by . - /// - public override int Length - { - get { return this.Source.Length; } - } - - /// - /// Returns a character at given position. - /// - /// The position to get the character from. - /// The character. - /// When position is negative or - /// greater than . - public override char this[int position] - { - get - { - // NOTE: we are not validating position here as that would not - // add any value to the range check that string accessor performs anyways. - - return _source[position]; - } - } - - /// - /// Provides a string representation of the StringText located within given span. - /// - /// When given span is outside of the text range. - public override string ToString(TextSpan span) - { - if (span.End > this.Source.Length) - { - throw new ArgumentOutOfRangeException("span"); - } - - if (span.Start == 0 && span.Length == this.Length) - { - return this.Source; - } - else - { - return this.Source.Substring(span.Start, span.Length); - } - } - - public override void CopyTo(int sourceIndex, char[] destination, int destinationIndex, int count) - { - this.Source.CopyTo(sourceIndex, destination, destinationIndex, count); - } - - public override void Write(TextWriter textWriter, TextSpan span, CancellationToken cancellationToken = default(CancellationToken)) - { - if (span.Start == 0 && span.End == this.Length) - { - textWriter.Write(this.Source); - } - else - { - base.Write(textWriter, span, cancellationToken); - } - } - - #region Encoding Detection - - /// - /// Check for occurrence of two consecutive NUL (U+0000) characters. 
- /// This is unlikely to appear in genuine text, so it's a good heuristic - /// to detect binary files. - /// - /// - /// internal for unit testing - /// - internal static bool IsBinary(string text) - { - // PERF: We can advance two chars at a time unless we find a NUL. - for (int i = 1; i < text.Length;) - { - if (text[i] == '\0') - { - if (text[i - 1] == '\0') - { - return true; - } - - i += 1; - } - else - { - i += 2; - } - } - - return false; - } - /// - /// Try to create a from the given stream using the given encoding. + /// Try to create a from the given stream using the given encoding. /// /// The input stream containing the encoded text. The stream will not be closed. /// The expected encoding of the stream. The actual encoding used may be different if byte order marks are detected. /// The checksum algorithm to use. /// Throw if binary (non-text) data is detected. - /// The decoded from the stream. + /// The decoded from the stream. /// The decoder was unable to decode the stream with the given encoding. /// /// internal for unit testing /// internal static SourceText Decode(Stream data, Encoding encoding, SourceHashAlgorithm checksumAlgorithm, bool throwIfBinaryDetected = false) { - data.Seek(0, SeekOrigin.Begin); - - if (data.Length > LargeObjectHeapLimit) - { - return LargeEncodedText.Decode(data, encoding, checksumAlgorithm, throwIfBinaryDetected); - } - - Encoding actualEncoding; - ImmutableArray checksum = default(ImmutableArray); - string text; + Debug.Assert(data != null); + Debug.Assert(encoding != null); - byte[] buffer = TryGetByteArrayFromStream(data); - if (buffer != null) - { - text = Decode(buffer, (int)data.Length, encoding, out actualEncoding); + data.Seek(0, SeekOrigin.Begin); - // Since we have the buffer, compute the checksum here. This saves allocations if we later - // need to write out debugging information. 
- checksum = CalculateChecksum(buffer, offset: 0, count: (int)data.Length, algorithmId: checksumAlgorithm); - } - else + // For small streams, see if we can read the byte buffer directly. + if (encoding.GetMaxCharCount((int)data.Length) < LargeObjectHeapLimitInChars) { - text = Decode(data, encoding, out actualEncoding); + byte[] buffer = TryGetByteArrayFromStream(data); + if (buffer != null) + { + return SourceText.From(buffer, (int)data.Length, encoding, checksumAlgorithm, throwIfBinaryDetected); + } } - return new EncodedStringText(text, actualEncoding, checksum, checksumAlgorithm, throwIfBinary: throwIfBinaryDetected); + return SourceText.From(data, encoding, checksumAlgorithm, throwIfBinaryDetected); } /// @@ -265,40 +126,6 @@ private static byte[] TryGetByteArrayFromStream(Stream data) return null; } - /// - /// Decode the given stream using the given encoding. Does not - /// close the stream afterwards. - /// - /// Data stream - /// Default encoding to use for decoding. - /// Actual encoding used to read the text. - /// If the given encoding is set to use as its fallback decoder. - /// Decoded stream as a text string - private static string Decode(Stream data, Encoding encoding, out Encoding actualEncoding) - { - data.Seek(0, SeekOrigin.Begin); - - int length = (int)data.Length; - if (length == 0) - { - actualEncoding = encoding; - return string.Empty; - } - - // Note: We are setting the buffer size to 4KB instead of the default 1KB. That's - // because we can reach this code path for FileStreams that are larger than 80KB - // and, to avoid FileStream buffer allocations for small files, we may intentionally - // be using a FileStream with a very small (1 byte) buffer. Using 4KB here matches - // the default buffer size for FileStream and means we'll still be doing file I/O - // in 4KB chunks. 
- using (var reader = new StreamReader(data, encoding, detectEncodingFromByteOrderMarks: true, bufferSize: Math.Min(4096, length), leaveOpen: true)) - { - string text = reader.ReadToEnd(); - actualEncoding = reader.CurrentEncoding; - return text; - } - } - /// /// If the MemoryStream was created with publiclyVisible=true, then we can access its buffer /// directly and save allocations in StreamReader. The input MemoryStream is not closed on exit. @@ -353,117 +180,5 @@ private static bool TryGetByteArrayFromFileStream(FileStream stream, out byte[] // line compiler actually specifies a very small buffer size. return stream.Read(buffer, 0, length) == length; } - - /// - /// Decode text from a byte array. - /// - /// The byte array containing encoded text. - /// The count of valid bytes in . - /// The encoding to use if an encoding cannot be determined from the byte order mark. - /// The actual encoding used. - /// The decoded text. - /// If the given encoding is set to use - /// as its fallback decoder. - private static string Decode(byte[] buffer, int length, Encoding encoding, out Encoding actualEncoding) - { - int preambleLength; - actualEncoding = TryReadByteOrderMark(buffer, length, out preambleLength) ?? encoding; - return actualEncoding.GetString(buffer, preambleLength, length - preambleLength); - } - - /// - /// Detect an encoding by looking for byte order marks. - /// - /// A buffer containing the encoded text. - /// The length of valid data in the buffer. - /// The length of any detected byte order marks. - /// The detected encoding or null if no recognized byte order mark was present. 
- private static Encoding TryReadByteOrderMark(byte[] source, int length, out int preambleLength) - { - Debug.Assert(source != null); - Debug.Assert(length <= source.Length); - - if (length >= 2) - { - switch (source[0]) - { - case 0xFE: - if (source[1] == 0xFF) - { - preambleLength = 2; - return Encoding.BigEndianUnicode; - } - - break; - - case 0xFF: - if (source[1] == 0xFE) - { - preambleLength = 2; - return Encoding.Unicode; - } - - break; - - case 0xEF: - if (source[1] == 0xBB && length >= 3 && source[2] == 0xBF) - { - preambleLength = 3; - return Encoding.UTF8; - } - - break; - } - } - - preambleLength = 0; - return null; - } - - [ThreadStatic] - private static byte[] t_bomBytes; - - /// - /// Detect an encoding by looking for byte order marks at the beginning of the stream. - /// - /// The stream containing encoded text. - /// The detected encoding or null if no recognized byte order mark was present. - /// - /// On exit, the stream's position is set to the first position after any decoded byte order - /// mark or rewound to the start if no byte order mark was detected. - /// - internal static Encoding TryReadByteOrderMark(Stream data) - { - Debug.Assert(data != null); - data.Seek(0, SeekOrigin.Begin); - - if (data.Length < 2) - { - // Not long enough for any valid BOM prefix - return null; - } - - // PERF: Avoid repeated calls to Stream.ReadByte since that method allocates a 1-byte array on each call. - // Instead, using a thread local byte array. 
- if (t_bomBytes == null) - { - t_bomBytes = new byte[3]; - } - - int validLength = Math.Min((int)data.Length, t_bomBytes.Length); - data.Read(t_bomBytes, 0, validLength); - - int preambleLength; - Encoding detectedEncoding = TryReadByteOrderMark(t_bomBytes, validLength, out preambleLength); - - if (preambleLength != validLength) - { - data.Seek(preambleLength, SeekOrigin.Begin); - } - - return detectedEncoding; - } - - #endregion } } diff --git a/src/Compilers/Core/Portable/CodeAnalysis.csproj b/src/Compilers/Core/Portable/CodeAnalysis.csproj index af3056229e3c18fc1069aafa37dd63a97038d99f..3ed19e5a47c151321f5398bf8c630616e377df5f 100644 --- a/src/Compilers/Core/Portable/CodeAnalysis.csproj +++ b/src/Compilers/Core/Portable/CodeAnalysis.csproj @@ -562,6 +562,7 @@ + diff --git a/src/Compilers/Core/Portable/PublicAPI.txt b/src/Compilers/Core/Portable/PublicAPI.txt index 0f16dd5ddecabb2f6070414b9924a4fe6080065e..c7ddc770368753959370322113f46bf7e71fab21 100644 --- a/src/Compilers/Core/Portable/PublicAPI.txt +++ b/src/Compilers/Core/Portable/PublicAPI.txt @@ -1327,6 +1327,7 @@ Microsoft.CodeAnalysis.Text.SourceText Microsoft.CodeAnalysis.Text.SourceText.ChecksumAlgorithm.get Microsoft.CodeAnalysis.Text.SourceText.ContentEquals(Microsoft.CodeAnalysis.Text.SourceText other) Microsoft.CodeAnalysis.Text.SourceText.GetSubText(int start) +Microsoft.CodeAnalysis.Text.SourceText.Lines.get Microsoft.CodeAnalysis.Text.SourceText.Replace(Microsoft.CodeAnalysis.Text.TextSpan span, string newText) Microsoft.CodeAnalysis.Text.SourceText.Replace(int start, int length, string newText) Microsoft.CodeAnalysis.Text.SourceText.SourceText(System.Collections.Immutable.ImmutableArray checksum = default(System.Collections.Immutable.ImmutableArray), Microsoft.CodeAnalysis.Text.SourceHashAlgorithm checksumAlgorithm = Microsoft.CodeAnalysis.Text.SourceHashAlgorithm.Sha1, Microsoft.CodeAnalysis.Text.SourceTextContainer container = null) @@ -2011,7 +2012,8 @@ static 
Microsoft.CodeAnalysis.Text.LinePosition.operator >=(Microsoft.CodeAnalys static Microsoft.CodeAnalysis.Text.LinePositionSpan.operator !=(Microsoft.CodeAnalysis.Text.LinePositionSpan left, Microsoft.CodeAnalysis.Text.LinePositionSpan right) static Microsoft.CodeAnalysis.Text.LinePositionSpan.operator ==(Microsoft.CodeAnalysis.Text.LinePositionSpan left, Microsoft.CodeAnalysis.Text.LinePositionSpan right) static Microsoft.CodeAnalysis.Text.SourceText.CalculateChecksum(byte[] buffer, int offset, int count, Microsoft.CodeAnalysis.Text.SourceHashAlgorithm algorithmId) -static Microsoft.CodeAnalysis.Text.SourceText.From(System.IO.Stream stream, System.Text.Encoding encoding = null, Microsoft.CodeAnalysis.Text.SourceHashAlgorithm checksumAlgorithm = Microsoft.CodeAnalysis.Text.SourceHashAlgorithm.Sha1) +static Microsoft.CodeAnalysis.Text.SourceText.From(byte[] buffer, int length, System.Text.Encoding encoding = null, Microsoft.CodeAnalysis.Text.SourceHashAlgorithm checksumAlgorithm = Microsoft.CodeAnalysis.Text.SourceHashAlgorithm.Sha1, bool throwIfBinaryDetected = false) +static Microsoft.CodeAnalysis.Text.SourceText.From(System.IO.Stream stream, System.Text.Encoding encoding = null, Microsoft.CodeAnalysis.Text.SourceHashAlgorithm checksumAlgorithm = Microsoft.CodeAnalysis.Text.SourceHashAlgorithm.Sha1, bool throwIfBinaryDetected = false) static Microsoft.CodeAnalysis.Text.SourceText.From(string text, System.Text.Encoding encoding = null, Microsoft.CodeAnalysis.Text.SourceHashAlgorithm checksumAlgorithm = Microsoft.CodeAnalysis.Text.SourceHashAlgorithm.Sha1) static Microsoft.CodeAnalysis.Text.TextChange.implicit operator Microsoft.CodeAnalysis.Text.TextChangeRange(Microsoft.CodeAnalysis.Text.TextChange change) static Microsoft.CodeAnalysis.Text.TextChange.operator !=(Microsoft.CodeAnalysis.Text.TextChange left, Microsoft.CodeAnalysis.Text.TextChange right) @@ -2110,9 +2112,9 @@ virtual Microsoft.CodeAnalysis.SyntaxWalker.VisitTrivia(Microsoft.CodeAnalysis.S virtual 
Microsoft.CodeAnalysis.Text.SourceText.Container.get virtual Microsoft.CodeAnalysis.Text.SourceText.ContentEqualsImpl(Microsoft.CodeAnalysis.Text.SourceText other) virtual Microsoft.CodeAnalysis.Text.SourceText.GetChangeRanges(Microsoft.CodeAnalysis.Text.SourceText oldText) +virtual Microsoft.CodeAnalysis.Text.SourceText.GetLinesCore() virtual Microsoft.CodeAnalysis.Text.SourceText.GetSubText(Microsoft.CodeAnalysis.Text.TextSpan span) virtual Microsoft.CodeAnalysis.Text.SourceText.GetTextChanges(Microsoft.CodeAnalysis.Text.SourceText oldText) -virtual Microsoft.CodeAnalysis.Text.SourceText.Lines.get virtual Microsoft.CodeAnalysis.Text.SourceText.ToString(Microsoft.CodeAnalysis.Text.TextSpan span) virtual Microsoft.CodeAnalysis.Text.SourceText.WithChanges(System.Collections.Generic.IEnumerable changes) virtual Microsoft.CodeAnalysis.Text.SourceText.Write(System.IO.TextWriter writer, Microsoft.CodeAnalysis.Text.TextSpan span, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)) diff --git a/src/Compilers/Core/Desktop/LargeEncodedText.cs b/src/Compilers/Core/Portable/Text/LargeEncodedText.cs similarity index 71% rename from src/Compilers/Core/Desktop/LargeEncodedText.cs rename to src/Compilers/Core/Portable/Text/LargeEncodedText.cs index a1daf50e4b54ccbbffa9b7c3a10816a58d049c7b..2ca6b7393de7b1036ede09d10cbeb92e4bd12a85 100644 --- a/src/Compilers/Core/Desktop/LargeEncodedText.cs +++ b/src/Compilers/Core/Portable/Text/LargeEncodedText.cs @@ -1,4 +1,6 @@ -using System; +// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information. 
+ +using System; using System.Collections.Immutable; using System.IO; using System.Text; @@ -123,21 +125,9 @@ private int GetIndexFromPosition(int position) } } - public override Encoding Encoding - { - get - { - return _encoding; - } - } + public override Encoding Encoding => _encoding; - public override int Length - { - get - { - return _length; - } - } + public override int Length => _length; public override void CopyTo(int sourceIndex, char[] destination, int destinationIndex, int count) { @@ -196,5 +186,70 @@ public override void Write(TextWriter writer, TextSpan span, CancellationToken c chunkIndex++; } } + + /// + /// Called from to initialize the . Thereafter, + /// the collection is cached. + /// + /// A new representing the individual text lines. + protected override TextLineCollection GetLinesCore() + { + return new LineInfo(this, ParseLineStarts()); + } + + private int[] ParseLineStarts() + { + var position = 0; + var index = 0; + var lastCr = -1; + var arrayBuilder = ArrayBuilder.GetInstance(); + + // The following loop goes through every character in the text. It is highly + // performance critical, and thus inlines knowledge about common line breaks + // and non-line breaks. + foreach (var chunk in _chunks) + { + foreach (var c in chunk) + { + index++; + + // Common case - ASCII & not a line break + const uint bias = '\r' + 1; + if (unchecked(c - bias) <= (127 - bias)) + { + continue; + } + + switch(c) + { + case '\r': + lastCr = index; + goto line_break; + + case '\n': + // Assumes that the only 2-char line break sequence is CR+LF + if (lastCr == (index - 1)) + { + position = index; + break; + } + + goto line_break; + + case '\u0085': + case '\u2028': + case '\u2029': + line_break: + arrayBuilder.Add(position); + position = index; + break; + } + } + } + + // Create a start for the final line. 
+ arrayBuilder.Add(position); + return arrayBuilder.ToArrayAndFree(); + } } } diff --git a/src/Compilers/Core/Portable/Text/SourceText.cs b/src/Compilers/Core/Portable/Text/SourceText.cs index f244b73862a2be5df2f2e08d5c804c194b43c829..bf935ea98efa80cd1450679faa0fd160731f7fd6 100644 --- a/src/Compilers/Core/Portable/Text/SourceText.cs +++ b/src/Compilers/Core/Portable/Text/SourceText.cs @@ -6,7 +6,6 @@ using System.Diagnostics; using System.IO; using System.Linq; -using System.Reflection; using System.Text; using System.Threading; using Roslyn.Utilities; @@ -20,14 +19,17 @@ public abstract class SourceText { private const int CharBufferSize = 32 * 1024; private const int CharBufferCount = 5; + private const int LargeObjectHeapLimitInChars = 40 * 1024; // 40KB private static readonly ObjectPool s_charArrayPool = new ObjectPool(() => new char[CharBufferSize], CharBufferCount); private readonly SourceHashAlgorithm _checksumAlgorithm; private SourceTextContainer _lazyContainer; - private LineInfo _lazyLineInfo; + private TextLineCollection _lazyLineInfo; private ImmutableArray _lazyChecksum; + private static readonly Encoding Utf8EncodingWithNoBOM = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: false); + protected SourceText(ImmutableArray checksum = default(ImmutableArray), SourceHashAlgorithm checksumAlgorithm = SourceHashAlgorithm.Sha1, SourceTextContainer container = null) { ValidateChecksumAlgorithm(checksumAlgorithm); @@ -78,7 +80,7 @@ public static SourceText From(string text, Encoding encoding = null, SourceHashA /// /// Constructs a from stream content. /// - /// Stream. + /// Stream. The stream must be seekable. /// /// Data encoding to use if the stream doesn't start with Byte Order Mark specifying the encoding. /// if not specified. @@ -86,14 +88,18 @@ public static SourceText From(string text, Encoding encoding = null, SourceHashA /// /// Hash algorithm to use to calculate checksum of the text that's saved to PDB. 
/// + /// If the decoded text contains at least two consecutive NUL + /// characters, then an is thrown. /// is null. /// /// doesn't support reading or seeking. /// is not supported. /// + /// If the given encoding is set to use a throwing decoder as a fallback + /// Two consecutive NUL characters were detected in the decoded text and was true. /// An I/O error occurs. /// Reads from the beginning of the stream. Leaves the stream open. - public static SourceText From(Stream stream, Encoding encoding = null, SourceHashAlgorithm checksumAlgorithm = SourceHashAlgorithm.Sha1) + public static SourceText From(Stream stream, Encoding encoding = null, SourceHashAlgorithm checksumAlgorithm = SourceHashAlgorithm.Sha1, bool throwIfBinaryDetected = false) { if (stream == null) { @@ -107,29 +113,158 @@ public static SourceText From(Stream stream, Encoding encoding = null, SourceHas ValidateChecksumAlgorithm(checksumAlgorithm); - encoding = encoding ?? Encoding.UTF8; + encoding = encoding ?? Utf8EncodingWithNoBOM; - // TODO: unify encoding detection with EncodedStringText + // If the resulting string would end up on the large object heap, then use LargeEncodedText. 
+ if (encoding.GetMaxCharCount((int)stream.Length) >= LargeObjectHeapLimitInChars) + { + return LargeEncodedText.Decode(stream, encoding, checksumAlgorithm, throwIfBinaryDetected); + } - stream.Seek(0, SeekOrigin.Begin); - string text; - using (var reader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks: true, bufferSize: 1024, leaveOpen: true)) + string text = Decode(stream, encoding, out encoding); + if (throwIfBinaryDetected && IsBinary(text)) { - text = reader.ReadToEnd(); - encoding = reader.CurrentEncoding; + throw new InvalidDataException(); } - return new StringText(text, encoding, CalculateChecksum(stream, checksumAlgorithm), checksumAlgorithm); + var checksum = CalculateChecksum(stream, checksumAlgorithm); + return new StringText(text, encoding, checksum, checksumAlgorithm); } /// + /// Constructs a from a byte array. + /// + /// The encoded source buffer. + /// The number of bytes to read from the buffer. + /// + /// Data encoding to use if the encoded buffer doesn't start with Byte Order Mark. + /// if not specified. + /// + /// /// Hash algorithm to use to calculate checksum of the text that's saved to PDB. + /// + /// If the decoded text contains at least two consecutive NUL + /// characters, then an is thrown. + /// The decoded text. + /// The is null. + /// The is negative or longer than the . + /// is not supported. + /// If the given encoding is set to use a throwing decoder as a fallback + /// Two consecutive NUL characters were detected in the decoded text and was true. 
+ public static SourceText From(byte[] buffer, int length, Encoding encoding = null, SourceHashAlgorithm checksumAlgorithm = SourceHashAlgorithm.Sha1, bool throwIfBinaryDetected = false) + { + if (buffer == null) + { + throw new ArgumentNullException(nameof(buffer)); + } + + if (length < 0 || length > buffer.Length) + { + throw new ArgumentOutOfRangeException(nameof(length)); + } + + ValidateChecksumAlgorithm(checksumAlgorithm); + + string text = Decode(buffer, length, encoding ?? Utf8EncodingWithNoBOM, out encoding); + if (throwIfBinaryDetected && IsBinary(text)) + { + throw new InvalidDataException(); + } + + // Since we have the bytes in hand, it's easy to compute the checksum. + var checksum = CalculateChecksum(buffer, 0, length, checksumAlgorithm); + return new StringText(text, encoding, checksum, checksumAlgorithm); + } + + /// + /// Decode text from a stream. + /// + /// The stream containing encoded text. + /// The encoding to use if an encoding cannot be determined from the byte order mark. + /// The actual encoding used. + /// The decoded text. + /// If the given encoding is set to use a throwing decoder as a fallback + private static string Decode(Stream stream, Encoding encoding, out Encoding actualEncoding) + { + Debug.Assert(stream != null); + Debug.Assert(encoding != null); + + stream.Seek(0, SeekOrigin.Begin); + + int length = (int)stream.Length; + if (length == 0) + { + actualEncoding = encoding; + return string.Empty; + } + + // Note: We are setting the buffer size to 4KB instead of the default 1KB. That's + // because we can reach this code path for FileStreams and, to avoid FileStream + // buffer allocations for small files, we may intentionally be using a FileStream + // with a very small (1 byte) buffer. Using 4KB here matches the default buffer + // size for FileStream and means we'll still be doing file I/O in 4KB chunks. 
+ using (var reader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks: true, bufferSize: Math.Min(4096, length), leaveOpen: true)) + { + string text = reader.ReadToEnd(); + actualEncoding = reader.CurrentEncoding; + return text; + } + } + + /// + /// Decode text from a byte array. /// - public SourceHashAlgorithm ChecksumAlgorithm + /// The byte array containing encoded text. + /// The count of valid bytes in . + /// The encoding to use if an encoding cannot be determined from the byte order mark. + /// The actual encoding used. + /// The decoded text. + /// If the given encoding is set to use a throwing decoder as a fallback + private static string Decode(byte[] buffer, int length, Encoding encoding, out Encoding actualEncoding) { - get { return _checksumAlgorithm; } + Debug.Assert(buffer != null); + Debug.Assert(encoding != null); + int preambleLength; + actualEncoding = TryReadByteOrderMark(buffer, length, out preambleLength) ?? encoding; + return actualEncoding.GetString(buffer, preambleLength, length - preambleLength); } + /// + /// Check for occurrence of two consecutive NUL (U+0000) characters. + /// This is unlikely to appear in genuine text, so it's a good heuristic + /// to detect binary files. + /// + /// + /// internal for unit testing + /// + internal static bool IsBinary(string text) + { + // PERF: We can advance two chars at a time unless we find a NUL. + for (int i = 1; i < text.Length;) + { + if (text[i] == '\0') + { + if (text[i - 1] == '\0') + { + return true; + } + + i += 1; + } + else + { + i += 2; + } + } + + return false; + } + + /// + /// Hash algorithm to use to calculate checksum of the text that's saved to PDB. + /// + public SourceHashAlgorithm ChecksumAlgorithm => _checksumAlgorithm; + /// /// Encoding of the file that the text was read from or is going to be saved to. /// null if the encoding is unspecified. 
@@ -479,21 +614,26 @@ public virtual IReadOnlyList GetTextChanges(SourceText oldText) /// /// The collection of individual text lines. /// - public virtual TextLineCollection Lines + public TextLineCollection Lines { get { - if (_lazyLineInfo == null) - { - var info = new LineInfo(this, this.ParseLineStarts()); - Interlocked.CompareExchange(ref _lazyLineInfo, info, null); - } - - return _lazyLineInfo; + var info = _lazyLineInfo; + return info ?? Interlocked.CompareExchange(ref _lazyLineInfo, info = GetLinesCore(), null) ?? info; } } - private class LineInfo : TextLineCollection + /// + /// Called from to initialize the . Thereafter, + /// the collection is cached. + /// + /// A new representing the individual text lines. + protected virtual TextLineCollection GetLinesCore() + { + return new LineInfo(this, ParseLineStarts()); + } + + internal sealed class LineInfo : TextLineCollection { private readonly SourceText _text; private readonly int[] _lineStarts; @@ -505,10 +645,7 @@ public LineInfo(SourceText text, int[] lineStarts) _lineStarts = lineStarts; } - public override int Count - { - get { return _lineStarts.Length; } - } + public override int Count => _lineStarts.Length; public override TextLine this[int index] { @@ -701,6 +838,55 @@ protected virtual bool ContentEqualsImpl(SourceText other) #endregion + /// + /// Detect an encoding by looking for byte order marks. + /// + /// A buffer containing the encoded text. + /// The length of valid data in the buffer. + /// The length of any detected byte order marks. + /// The detected encoding or null if no recognized byte order mark was present. 
+ internal static Encoding TryReadByteOrderMark(byte[] source, int length, out int preambleLength) + { + Debug.Assert(source != null); + Debug.Assert(length <= source.Length); + + if (length >= 2) + { + switch (source[0]) + { + case 0xFE: + if (source[1] == 0xFF) + { + preambleLength = 2; + return Encoding.BigEndianUnicode; + } + + break; + + case 0xFF: + if (source[1] == 0xFE) + { + preambleLength = 2; + return Encoding.Unicode; + } + + break; + + case 0xEF: + if (source[1] == 0xBB && length >= 3 && source[2] == 0xBF) + { + preambleLength = 3; + return Encoding.UTF8; + } + + break; + } + } + + preambleLength = 0; + return null; + } + private class StaticContainer : SourceTextContainer { private readonly SourceText _text; @@ -710,10 +896,7 @@ public StaticContainer(SourceText text) _text = text; } - public override SourceText CurrentText - { - get { return _text; } - } + public override SourceText CurrentText => _text; public override event EventHandler TextChanged { diff --git a/src/Compilers/Core/Portable/Text/StringText.cs b/src/Compilers/Core/Portable/Text/StringText.cs index e35d18cb9abce8e2bddb7d1802269209e35978a4..6040fc591e3e4c0f4353460c25f7e0f72a846af1 100644 --- a/src/Compilers/Core/Portable/Text/StringText.cs +++ b/src/Compilers/Core/Portable/Text/StringText.cs @@ -4,10 +4,8 @@ using System.Collections.Immutable; using System.Diagnostics; using System.IO; -using System.Reflection; using System.Text; using System.Threading; -using Roslyn.Utilities; namespace Microsoft.CodeAnalysis.Text { @@ -28,26 +26,17 @@ internal StringText(string source, Encoding encodingOpt, ImmutableArray ch _encodingOpt = encodingOpt; } - public override Encoding Encoding - { - get { return _encodingOpt; } - } + public override Encoding Encoding => _encodingOpt; /// /// Underlying string which is the source of this instance /// - public string Source - { - get { return _source; } - } + public string Source => _source; /// /// The length of the text represented by . 
/// - public override int Length - { - get { return this.Source.Length; } - } + public override int Length => _source.Length; /// /// Returns a character at given position. diff --git a/src/EditorFeatures/Text/Extensions.SnapshotSourceText.cs b/src/EditorFeatures/Text/Extensions.SnapshotSourceText.cs index 1fb9efe6f2adf2690c16b4772de3884549b476e3..a27e9937f377c528c4f3b1d8fd97e78c2ccf799d 100644 --- a/src/EditorFeatures/Text/Extensions.SnapshotSourceText.cs +++ b/src/EditorFeatures/Text/Extensions.SnapshotSourceText.cs @@ -26,7 +26,7 @@ private class SnapshotSourceText : SourceText /// /// Use a separate class for closed files to simplify memory leak investigations /// - internal class ClosedSnapshotSourceText : SnapshotSourceText + internal sealed class ClosedSnapshotSourceText : SnapshotSourceText { public ClosedSnapshotSourceText(ITextSnapshot roslynSnapshot, Encoding encodingOpt) : base(roslynSnapshot, encodingOpt, containerOpt: null) @@ -43,7 +43,6 @@ public ClosedSnapshotSourceText(ITextSnapshot roslynSnapshot, Encoding encodingO private readonly Encoding _encodingOpt; private readonly TextBufferContainer _containerOpt; private readonly int _reiteratedVersion; - private LineInfo _lineInfo; private SnapshotSourceText(ITextSnapshot editorSnapshot, Encoding encodingOpt) { @@ -145,17 +144,9 @@ public override int Length } #region Lines - public override TextLineCollection Lines + protected override TextLineCollection GetLinesCore() { - get - { - if (_lineInfo == null) - { - System.Threading.Interlocked.CompareExchange(ref _lineInfo, new LineInfo(this), null); - } - - return _lineInfo; - } + return new LineInfo(this); } private class LineInfo : TextLineCollection diff --git a/src/VisualStudio/VisualBasic/Impl/ProjectSystemShim/TempPECompiler.TempPEProject.vb b/src/VisualStudio/VisualBasic/Impl/ProjectSystemShim/TempPECompiler.TempPEProject.vb index 7ad976d9b25954a1100788e50c1e0a8685e625ef..33d2712bcac1ba91073c42654205472ba20cfaee 100644 --- 
a/src/VisualStudio/VisualBasic/Impl/ProjectSystemShim/TempPECompiler.TempPEProject.vb +++ b/src/VisualStudio/VisualBasic/Impl/ProjectSystemShim/TempPECompiler.TempPEProject.vb @@ -30,7 +30,7 @@ Namespace Microsoft.VisualStudio.LanguageServices.VisualBasic.ProjectSystemShim Function CompileAndGetErrorCount(metadataService As IMetadataService) As Integer Dim trees = _files.Select(Function(path) Using stream = FileUtilities.OpenRead(path) - Return SyntaxFactory.ParseSyntaxTree(EncodedStringText.From(stream), options:=_compilerOptions.ParseOptions, path:=path) + Return SyntaxFactory.ParseSyntaxTree(SourceText.From(stream), options:=_compilerOptions.ParseOptions, path:=path) End Using End Function) diff --git a/src/Workspaces/Core/Desktop/Workspaces.Desktop.csproj b/src/Workspaces/Core/Desktop/Workspaces.Desktop.csproj index c3ae26410c624a614f4483fa3df7a865704300bc..6ca437c7d79cd554e4d46762f72aaa6e7f890d09 100644 --- a/src/Workspaces/Core/Desktop/Workspaces.Desktop.csproj +++ b/src/Workspaces/Core/Desktop/Workspaces.Desktop.csproj @@ -61,9 +61,6 @@ InternalUtilities\AssemblyReferenceResolver.cs - - InternalUtilities\LargeEncodedText.cs - InternalUtilities\MetadataFileReferenceResolver.cs