提交 779bc9a7 编写于 作者: H Heejae Chang 提交者: GitHub

Merge pull request #14531 from heejaechang/sourcetextreader

Reduce string allocation from TextFactory
......@@ -10,8 +10,6 @@
using System.IO.Compression;
using Roslyn.Test.Utilities;
using System.Linq;
using System.Collections.Immutable;
using System.Reflection;
namespace Microsoft.CodeAnalysis.UnitTests
{
......@@ -194,6 +192,36 @@ public void FromSource_Large()
AssertEx.Equal(Encoding.Unicode.GetPreamble().Concat(Encoding.Unicode.GetBytes(LargeSource)), Decompress(text.Blob.Skip(4)));
}
/// <summary>
/// Builds embedded text for <c>SmallSource</c> twice -- once from the string and once through
/// the <see cref="TextReader"/> overload of <c>SourceText.From</c> -- and checks that both
/// results carry the same path, checksum algorithm, checksum and blob.
/// </summary>
[Fact]
public void FromTextReader_Small()
{
    // Baseline: source text created directly from the string.
    var textFromString = SourceText.From(SmallSource, Encoding.UTF8, SourceHashAlgorithm.Sha1);
    var embeddedFromString = EmbeddedText.FromSource("pathToSmall", textFromString);

    // Same content, fed through the TextReader overload.
    var reader = new StringReader(SmallSource);
    var textFromReader = SourceText.From(reader, SmallSource.Length, Encoding.UTF8, SourceHashAlgorithm.Sha1);
    var embeddedFromReader = EmbeddedText.FromSource(embeddedFromString.FilePath, textFromReader);

    Assert.Equal(embeddedFromString.FilePath, embeddedFromReader.FilePath);
    Assert.Equal(embeddedFromString.ChecksumAlgorithm, embeddedFromReader.ChecksumAlgorithm);
    AssertEx.Equal(embeddedFromString.Checksum, embeddedFromReader.Checksum);
    AssertEx.Equal(embeddedFromString.Blob, embeddedFromReader.Blob);
}
/// <summary>
/// Builds embedded text for <c>LargeSource</c> twice -- once from the string and once through
/// the <see cref="TextReader"/> overload of <c>SourceText.From</c> -- and checks that both
/// results carry the same path, checksum algorithm, checksum and blob.
/// </summary>
[Fact]
public void FromTextReader_Large()
{
    // Baseline: source text created directly from the string.
    var textFromString = SourceText.From(LargeSource, Encoding.UTF8, SourceHashAlgorithm.Sha1);
    var embeddedFromString = EmbeddedText.FromSource("pathToSmall", textFromString);

    // Same content, fed through the TextReader overload.
    var reader = new StringReader(LargeSource);
    var textFromReader = SourceText.From(reader, LargeSource.Length, Encoding.UTF8, SourceHashAlgorithm.Sha1);
    var embeddedFromReader = EmbeddedText.FromSource(embeddedFromString.FilePath, textFromReader);

    Assert.Equal(embeddedFromString.FilePath, embeddedFromReader.FilePath);
    Assert.Equal(embeddedFromString.ChecksumAlgorithm, embeddedFromReader.ChecksumAlgorithm);
    AssertEx.Equal(embeddedFromString.Checksum, embeddedFromReader.Checksum);
    AssertEx.Equal(embeddedFromString.Blob, embeddedFromReader.Blob);
}
[Fact]
public void FromSource_Precomputed()
{
......
......@@ -29,6 +29,11 @@ private static SourceText CreateSourceText(Stream stream, Encoding encoding = nu
return LargeText.Decode(stream, encoding ?? Encoding.UTF8, SourceHashAlgorithm.Sha1, throwIfBinaryDetected: true, canBeEmbedded: false);
}
/// <summary>
/// Test helper: decodes <paramref name="reader"/> through <c>LargeText.Decode</c> using SHA-1.
/// </summary>
/// <param name="reader">Reader that supplies the characters.</param>
/// <param name="length">Length value forwarded to <c>LargeText.Decode</c>.</param>
/// <param name="encoding">Encoding to attach; UTF-8 is substituted when <c>null</c>.</param>
private static SourceText CreateSourceText(TextReader reader, int length, Encoding encoding = null)
{
    var effectiveEncoding = encoding ?? Encoding.UTF8;
    return LargeText.Decode(reader, length, effectiveEncoding, SourceHashAlgorithm.Sha1);
}
private const string HelloWorld = "Hello, world!";
[Fact]
......@@ -313,5 +318,18 @@ public void LinesGetText2()
var data = CreateSourceText(text);
Assert.Equal("foo", data.Lines[0].ToString());
}
/// <summary>
/// Text created from a <see cref="StringReader"/> must expose the same first line and the
/// same checksum as text created through the other <c>CreateSourceText</c> helper.
/// </summary>
[Fact]
public void FromTextReader()
{
    var content = "foo";

    // Baseline built by the non-reader helper.
    var baseline = CreateSourceText(content);

    // Same content routed through the TextReader helper.
    var reader = new StringReader(content);
    var fromReader = CreateSourceText(reader, content.Length);

    Assert.Equal("foo", fromReader.Lines[0].ToString());
    Assert.Equal<byte>(baseline.GetChecksum(), fromReader.GetChecksum());
}
}
}
\ No newline at end of file
......@@ -229,6 +229,41 @@ public void FromThrowsIfBinary()
Assert.Throws<InvalidDataException>(() => SourceText.From(stream, throwIfBinaryDetected: true));
}
/// <summary>
/// Verifies that <c>SourceText.From(TextReader, int, ...)</c> produces the same checksum as
/// <c>SourceText.From(string)</c> and reports the encoding it was given (including <c>null</c>).
/// </summary>
[Fact]
public void FromTextReader()
{
    var expected = "Text reader source text test";
    var expectedSourceText = SourceText.From(expected);

    var actual = new StringReader(expected);
    var actualSourceText = SourceText.From(actual, expected.Length);

    Assert.Equal<byte>(expectedSourceText.GetChecksum(), actualSourceText.GetChecksum());

    // The reader above has been consumed by SourceText.From, so each encoding check below
    // gets its own fresh reader; otherwise the checks would run over empty content.
    Assert.Same(s_utf8, SourceText.From(new StringReader(expected), expected.Length, s_utf8).Encoding);
    Assert.Same(s_unicode, SourceText.From(new StringReader(expected), expected.Length, s_unicode).Encoding);
    Assert.Null(SourceText.From(new StringReader(expected), expected.Length, null).Encoding);
}
/// <summary>
/// Verifies that <c>SourceText.From(TextReader, int, ...)</c>, given content whose length equals
/// <c>SourceText.LargeObjectHeapLimitInChars</c>, returns a <c>LargeText</c> with the same
/// checksum as <c>SourceText.From(string)</c> and reports the encoding it was given
/// (including <c>null</c>).
/// </summary>
[Fact]
public void FromTextReader_Large()
{
    var expected = new string('l', SourceText.LargeObjectHeapLimitInChars);
    var expectedSourceText = SourceText.From(expected);

    var actual = new StringReader(expected);
    var actualSourceText = SourceText.From(actual, expected.Length);

    Assert.IsType<LargeText>(actualSourceText);
    Assert.Equal<byte>(expectedSourceText.GetChecksum(), actualSourceText.GetChecksum());

    // The reader above has been consumed by SourceText.From, so each encoding check below
    // gets its own fresh reader; otherwise the checks would run over empty content.
    Assert.Same(s_utf8, SourceText.From(new StringReader(expected), expected.Length, s_utf8).Encoding);
    Assert.Same(s_unicode, SourceText.From(new StringReader(expected), expected.Length, s_unicode).Encoding);
    Assert.Null(SourceText.From(new StringReader(expected), expected.Length, null).Encoding);
}
private static void TestTryReadByteOrderMark(Encoding expectedEncoding, int expectedPreambleLength, byte[] data)
{
TestTryReadByteOrderMark(expectedEncoding, expectedPreambleLength, data, data == null ? 0 : data.Length);
......
......@@ -11,5 +11,6 @@
// required parameter type is different for all overloads and so there is no ambiguity.
[assembly: System.Diagnostics.CodeAnalysis.SuppressMessage("ApiDesign", "RS0026:Do not add multiple public overloads with optional parameters", Justification = "<Pending>", Scope = "member", Target = "~M:Microsoft.CodeAnalysis.Text.SourceText.From(System.IO.Stream,System.Text.Encoding,Microsoft.CodeAnalysis.Text.SourceHashAlgorithm,System.Boolean,System.Boolean)~Microsoft.CodeAnalysis.Text.SourceText")]
[assembly: System.Diagnostics.CodeAnalysis.SuppressMessage("ApiDesign", "RS0026:Do not add multiple public overloads with optional parameters", Justification = "<Pending>", Scope = "member", Target = "~M:Microsoft.CodeAnalysis.Text.SourceText.From(System.Byte[],System.Int32,System.Text.Encoding,Microsoft.CodeAnalysis.Text.SourceHashAlgorithm,System.Boolean,System.Boolean)~Microsoft.CodeAnalysis.Text.SourceText")]
[assembly: System.Diagnostics.CodeAnalysis.SuppressMessage("ApiDesign", "RS0026:Do not add multiple public overloads with optional parameters", Justification = "<Pending>", Scope = "member", Target = "~M:Microsoft.CodeAnalysis.Text.SourceText.From(System.IO.TextReader,System.Int32,System.Text.Encoding,Microsoft.CodeAnalysis.Text.SourceHashAlgorithm)~Microsoft.CodeAnalysis.Text.SourceText")]
[assembly: System.Diagnostics.CodeAnalysis.SuppressMessage("ApiDesign", "RS0027:Public API with optional parameter(s) should have the most parameters amongst its public overloads.", Justification = "<Pending>", Scope = "member", Target = "~M:Microsoft.CodeAnalysis.Text.SourceText.From(System.String,System.Text.Encoding,Microsoft.CodeAnalysis.Text.SourceHashAlgorithm)~Microsoft.CodeAnalysis.Text.SourceText")]
......@@ -19,11 +19,29 @@ internal static int GetMaxCharCountOrThrowIfHuge(this Encoding encoding, Stream
Debug.Assert(stream.CanSeek);
long length = stream.Length;
int maxCharCount;
if (encoding.TryGetMaxCharCount(length, out maxCharCount))
{
return maxCharCount;
}
#if WORKSPACE_DESKTOP
throw new IOException(WorkspacesResources.Stream_is_too_long);
#else
throw new IOException(CodeAnalysisResources.StreamIsTooLong);
#endif
}
internal static bool TryGetMaxCharCount(this Encoding encoding, long length, out int maxCharCount)
{
maxCharCount = 0;
if (length <= int.MaxValue)
{
try
{
return encoding.GetMaxCharCount((int)length);
maxCharCount = encoding.GetMaxCharCount((int)length);
return true;
}
catch (ArgumentOutOfRangeException)
{
......@@ -33,11 +51,7 @@ internal static int GetMaxCharCountOrThrowIfHuge(this Encoding encoding, Stream
}
}
#if WORKSPACE_DESKTOP
throw new IOException(WorkspacesResources.Stream_is_too_long);
#else
throw new IOException(CodeAnalysisResources.StreamIsTooLong);
#endif
return false;
}
}
}
......@@ -815,6 +815,7 @@ static Microsoft.CodeAnalysis.SeparatedSyntaxList<TNode>.implicit operator Micro
static Microsoft.CodeAnalysis.SeparatedSyntaxList<TNode>.implicit operator Microsoft.CodeAnalysis.SeparatedSyntaxList<TNode>(Microsoft.CodeAnalysis.SeparatedSyntaxList<Microsoft.CodeAnalysis.SyntaxNode> nodes) -> Microsoft.CodeAnalysis.SeparatedSyntaxList<TNode>
static Microsoft.CodeAnalysis.Text.SourceText.From(System.IO.Stream stream, System.Text.Encoding encoding = null, Microsoft.CodeAnalysis.Text.SourceHashAlgorithm checksumAlgorithm = Microsoft.CodeAnalysis.Text.SourceHashAlgorithm.Sha1, bool throwIfBinaryDetected = false, bool canBeEmbedded = false) -> Microsoft.CodeAnalysis.Text.SourceText
static Microsoft.CodeAnalysis.Text.SourceText.From(System.IO.Stream stream, System.Text.Encoding encoding, Microsoft.CodeAnalysis.Text.SourceHashAlgorithm checksumAlgorithm, bool throwIfBinaryDetected) -> Microsoft.CodeAnalysis.Text.SourceText
static Microsoft.CodeAnalysis.Text.SourceText.From(System.IO.TextReader reader, int length, System.Text.Encoding encoding = null, Microsoft.CodeAnalysis.Text.SourceHashAlgorithm checksumAlgorithm = Microsoft.CodeAnalysis.Text.SourceHashAlgorithm.Sha1) -> Microsoft.CodeAnalysis.Text.SourceText
static Microsoft.CodeAnalysis.Text.SourceText.From(byte[] buffer, int length, System.Text.Encoding encoding = null, Microsoft.CodeAnalysis.Text.SourceHashAlgorithm checksumAlgorithm = Microsoft.CodeAnalysis.Text.SourceHashAlgorithm.Sha1, bool throwIfBinaryDetected = false, bool canBeEmbedded = false) -> Microsoft.CodeAnalysis.Text.SourceText
static Microsoft.CodeAnalysis.Text.SourceText.From(byte[] buffer, int length, System.Text.Encoding encoding, Microsoft.CodeAnalysis.Text.SourceHashAlgorithm checksumAlgorithm, bool throwIfBinaryDetected) -> Microsoft.CodeAnalysis.Text.SourceText
virtual Microsoft.CodeAnalysis.Diagnostics.AnalysisContext.RegisterOperationAction(System.Action<Microsoft.CodeAnalysis.Diagnostics.OperationAnalysisContext> action, System.Collections.Immutable.ImmutableArray<Microsoft.CodeAnalysis.OperationKind> operationKinds) -> void
......
......@@ -24,14 +24,15 @@ internal sealed class LargeText : SourceText
private readonly ImmutableArray<char[]> _chunks;
private readonly int[] _chunkStartOffsets;
private readonly int _length;
private readonly Encoding _encoding;
private readonly Encoding _encodingOpt;
internal LargeText(ImmutableArray<char[]> chunks, Encoding encoding, ImmutableArray<byte> checksum, SourceHashAlgorithm checksumAlgorithm, ImmutableArray<byte> embeddedTextBlob)
internal LargeText(ImmutableArray<char[]> chunks, Encoding encodingOpt, ImmutableArray<byte> checksum, SourceHashAlgorithm checksumAlgorithm, ImmutableArray<byte> embeddedTextBlob)
: base(checksum, checksumAlgorithm, embeddedTextBlob)
{
_chunks = chunks;
_encoding = encoding;
_encodingOpt = encodingOpt;
_chunkStartOffsets = new int[chunks.Length];
int offset = 0;
for (int i = 0; i < chunks.Length; i++)
{
......@@ -42,6 +43,11 @@ internal LargeText(ImmutableArray<char[]> chunks, Encoding encoding, ImmutableAr
_length = offset;
}
/// <summary>
/// Creates a <see cref="LargeText"/> over <paramref name="chunks"/> without a precomputed
/// checksum or embedded-text blob; both are forwarded to the main constructor as default
/// (uninitialized) immutable arrays.
/// </summary>
/// <param name="chunks">Character chunks that make up the text.</param>
/// <param name="encodingOpt">Encoding to report for the text; may be <c>null</c>.</param>
/// <param name="checksumAlgorithm">Hash algorithm forwarded to the main constructor.</param>
internal LargeText(ImmutableArray<char[]> chunks, Encoding encodingOpt, SourceHashAlgorithm checksumAlgorithm)
    : this(
        chunks,
        encodingOpt,
        checksum: default(ImmutableArray<byte>),
        checksumAlgorithm: checksumAlgorithm,
        embeddedTextBlob: default(ImmutableArray<byte>))
{
}
internal static SourceText Decode(Stream stream, Encoding encoding, SourceHashAlgorithm checksumAlgorithm, bool throwIfBinaryDetected, bool canBeEmbedded)
{
stream.Seek(0, SeekOrigin.Begin);
......@@ -58,48 +64,69 @@ internal static SourceText Decode(Stream stream, Encoding encoding, SourceHashAl
using (var reader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks: true, bufferSize: Math.Min(length, 4096), leaveOpen: true))
{
ArrayBuilder<char[]> chunks = ArrayBuilder<char[]>.GetInstance(1 + maxCharRemainingGuess / ChunkSize);
while (!reader.EndOfStream)
{
var nextChunkSize = ChunkSize;
if (maxCharRemainingGuess < ChunkSize)
{
// maxCharRemainingGuess typically overestimates a little
// so we will first fill a slightly smaller (maxCharRemainingGuess - 64) chunk
// and then use 64 char tail, which is likely to be resized.
nextChunkSize = Math.Max(maxCharRemainingGuess - 64, 64);
}
var chunks = ReadChunksFromTextReader(reader, maxCharRemainingGuess, throwIfBinaryDetected);
char[] chunk = new char[nextChunkSize];
// We must compute the checksum and embedded text blob now while we still have the original bytes in hand.
// We cannot re-encode to obtain checksum and blob as the encoding is not guaranteed to round-trip.
var checksum = CalculateChecksum(stream, checksumAlgorithm);
var embeddedTextBlob = canBeEmbedded ? EmbeddedText.CreateBlob(stream) : default(ImmutableArray<byte>);
return new LargeText(chunks, reader.CurrentEncoding, checksum, checksumAlgorithm, embeddedTextBlob);
}
}
int charsRead = reader.ReadBlock(chunk, 0, chunk.Length);
if (charsRead == 0)
{
break;
}
internal static SourceText Decode(TextReader reader, int length, Encoding encodingOpt, SourceHashAlgorithm checksumAlgorithm)
{
if (length == 0)
{
return SourceText.From(string.Empty, encodingOpt, checksumAlgorithm);
}
maxCharRemainingGuess -= charsRead;
// throwIfBinaryDetected == false since we are given text reader from the beginning
var chunks = ReadChunksFromTextReader(reader, length, throwIfBinaryDetected: false);
if (charsRead < chunk.Length)
{
Array.Resize(ref chunk, charsRead);
}
return new LargeText(chunks, encodingOpt, checksumAlgorithm);
}
// Check for binary files
if (throwIfBinaryDetected && IsBinary(chunk))
{
throw new InvalidDataException();
}
private static ImmutableArray<char[]> ReadChunksFromTextReader(TextReader reader, int maxCharRemainingGuess, bool throwIfBinaryDetected)
{
var chunks = ArrayBuilder<char[]>.GetInstance(1 + maxCharRemainingGuess / ChunkSize);
chunks.Add(chunk);
while (reader.Peek() != -1)
{
var nextChunkSize = ChunkSize;
if (maxCharRemainingGuess < ChunkSize)
{
// maxCharRemainingGuess typically overestimates a little
// so we will first fill a slightly smaller (maxCharRemainingGuess - 64) chunk
// and then use 64 char tail, which is likely to be resized.
nextChunkSize = Math.Max(maxCharRemainingGuess - 64, 64);
}
// We must compute the checksum and embedded text blob now while we still have the original bytes in hand.
// We cannot re-encode to obtain checksum and blob as the encoding is not guaranteed to round-trip.
var checksum = CalculateChecksum(stream, checksumAlgorithm);
var embeddedTextBlob = canBeEmbedded ? EmbeddedText.CreateBlob(stream) : default(ImmutableArray<byte>);
return new LargeText(chunks.ToImmutableAndFree(), reader.CurrentEncoding, checksum, checksumAlgorithm, embeddedTextBlob);
char[] chunk = new char[nextChunkSize];
int charsRead = reader.ReadBlock(chunk, 0, chunk.Length);
if (charsRead == 0)
{
break;
}
maxCharRemainingGuess -= charsRead;
if (charsRead < chunk.Length)
{
Array.Resize(ref chunk, charsRead);
}
// Check for binary files
if (throwIfBinaryDetected && IsBinary(chunk))
{
throw new InvalidDataException();
}
chunks.Add(chunk);
}
return chunks.ToImmutableAndFree();
}
/// <summary>
......@@ -146,7 +173,7 @@ private int GetIndexFromPosition(int position)
}
}
public override Encoding Encoding => _encoding;
public override Encoding Encoding => _encodingOpt;
public override int Length => _length;
......@@ -272,7 +299,7 @@ private int[] ParseLineStarts()
case '\u0085':
case '\u2028':
case '\u2029':
line_break:
line_break:
arrayBuilder.Add(position);
position = index;
break;
......
......@@ -29,7 +29,7 @@ public abstract class SourceText
private TextLineCollection _lazyLineInfo;
private ImmutableArray<byte> _lazyChecksum;
private ImmutableArray<byte> _precomputedEmbeddedTextBlob;
private static readonly Encoding s_utf8EncodingWithNoBOM = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: false);
protected SourceText(ImmutableArray<byte> checksum = default(ImmutableArray<byte>), SourceHashAlgorithm checksumAlgorithm = SourceHashAlgorithm.Sha1, SourceTextContainer container = null)
......@@ -97,6 +97,43 @@ public static SourceText From(string text, Encoding encoding = null, SourceHashA
return new StringText(text, encoding, checksumAlgorithm: checksumAlgorithm);
}
/// <summary>
/// Constructs a <see cref="SourceText"/> from the characters supplied by a <see cref="TextReader"/>.
/// </summary>
/// <param name="reader">Reader that supplies the text.</param>
/// <param name="length">
/// Length of the content of <paramref name="reader"/>. It selects the storage strategy:
/// when it is at least <c>LargeObjectHeapLimitInChars</c> the reader is handed to
/// <c>LargeText.Decode</c>; otherwise the reader is read to its end into a single string.
/// </param>
/// <param name="encoding">
/// Encoding of the file that the <paramref name="reader"/> was read from or is going to be saved to.
/// <c>null</c> if the encoding is unspecified.
/// If the encoding is not specified the resulting <see cref="SourceText"/> isn't debuggable.
/// If an encoding-less <see cref="SourceText"/> is written to a file a <see cref="Encoding.UTF8"/> shall be used as a default.
/// </param>
/// <param name="checksumAlgorithm">
/// Hash algorithm to use to calculate checksum of the text that's saved to PDB.
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="checksumAlgorithm"/> is not supported.</exception>
public static SourceText From(
    TextReader reader,
    int length,
    Encoding encoding = null,
    SourceHashAlgorithm checksumAlgorithm = SourceHashAlgorithm.Sha1)
{
    if (reader == null)
    {
        throw new ArgumentNullException(nameof(reader));
    }

    // Below the limit a single string is fine: read everything and reuse the string overload.
    if (length < LargeObjectHeapLimitInChars)
    {
        return From(reader.ReadToEnd(), encoding, checksumAlgorithm);
    }

    // A single string of this size would end up on the large object heap, so use LargeText.
    return LargeText.Decode(reader, length, encoding, checksumAlgorithm);
}
// 1.0 BACKCOMPAT OVERLOAD - DO NOT TOUCH
[EditorBrowsable(EditorBrowsableState.Never)]
public static SourceText From(Stream stream, Encoding encoding, SourceHashAlgorithm checksumAlgorithm, bool throwIfBinaryDetected)
......@@ -192,9 +229,9 @@ public static SourceText From(byte[] buffer, int length, Encoding encoding, Sour
/// <exception cref="DecoderFallbackException">If the given encoding is set to use a throwing decoder as a fallback</exception>
/// <exception cref="InvalidDataException">Two consecutive NUL characters were detected in the decoded text and <paramref name="throwIfBinaryDetected"/> was true.</exception>
public static SourceText From(
byte[] buffer,
int length,
Encoding encoding = null,
byte[] buffer,
int length,
Encoding encoding = null,
SourceHashAlgorithm checksumAlgorithm = SourceHashAlgorithm.Sha1,
bool throwIfBinaryDetected = false,
bool canBeEmbedded = false)
......
......@@ -169,58 +169,6 @@ private unsafe TextReader CreateTextReaderFromTemporaryStorage(ISupportDirectMem
return new DirectMemoryAccessStreamReader(src + 1, streamLength / sizeof(char) - 1);
}
/// <summary>
/// A <see cref="TextReader"/> that reads characters straight from a raw <c>char*</c> range.
/// Nothing in this class pins or releases that memory, so the caller must keep it valid
/// for as long as the reader is used.
/// </summary>
private unsafe class DirectMemoryAccessStreamReader : TextReader
{
    // Next character to hand out; advances as characters are consumed.
    private char* _position;

    // One past the last readable character.
    private readonly char* _end;

    /// <summary>Creates a reader over <paramref name="length"/> characters starting at <paramref name="src"/>.</summary>
    public DirectMemoryAccessStreamReader(char* src, int length)
    {
        Debug.Assert(src != null);
        Debug.Assert(length >= 0);

        _position = src;
        _end = _position + length;
    }

    /// <summary>Returns the next character and advances, or -1 when the range is exhausted.</summary>
    public override int Read()
    {
        if (_position >= _end)
        {
            return -1;
        }

        return *_position++;
    }

    /// <summary>
    /// Copies up to <paramref name="count"/> characters into <paramref name="buffer"/> starting at
    /// <paramref name="index"/> and returns the number copied (0 when the range is exhausted).
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="buffer"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="index"/> or <paramref name="count"/> is negative or does not fit in <paramref name="buffer"/>.
    /// </exception>
    public override int Read(char[] buffer, int index, int count)
    {
        if (buffer == null)
        {
            throw new ArgumentNullException(nameof(buffer));
        }

        // index == buffer.Length is allowed so that a zero-length read (e.g. into an empty
        // buffer) succeeds instead of throwing.
        if (index < 0 || index > buffer.Length)
        {
            throw new ArgumentOutOfRangeException(nameof(index));
        }

        // Compare against the remaining space rather than computing index + count,
        // which can overflow and wrongly pass the check.
        if (count < 0 || count > buffer.Length - index)
        {
            throw new ArgumentOutOfRangeException(nameof(count));
        }

        count = Math.Min(count, (int)(_end - _position));
        if (count > 0)
        {
            Marshal.Copy((IntPtr)_position, buffer, index, count);
            _position += count;
        }

        return count;
    }
}
}
private class TemporaryStreamStorage : ITemporaryStreamStorage, ITemporaryStorageWithName
......@@ -337,6 +285,72 @@ private async Task WriteStreamMaybeAsync(Stream stream, bool useAsync, Cancellat
}
}
}
/// <summary>
/// A <see cref="TextReader"/> that reads characters straight from a raw <c>char*</c> range.
/// Nothing in this class pins or releases that memory, so the caller must keep it valid
/// for as long as the reader is used.
/// </summary>
internal unsafe class DirectMemoryAccessStreamReader : TextReader
{
    // Next character to hand out; advances as characters are consumed.
    private char* _position;

    // One past the last readable character.
    private readonly char* _end;

    /// <summary>Creates a reader over <paramref name="length"/> characters starting at <paramref name="src"/>.</summary>
    public DirectMemoryAccessStreamReader(char* src, int length)
    {
        Debug.Assert(src != null);
        Debug.Assert(length >= 0);

        _position = src;
        _end = _position + length;

        Length = length;
    }

    /// <summary>Total number of characters the reader was created over (not the number remaining).</summary>
    public int Length { get; }

    /// <summary>Returns the next character without consuming it, or -1 when the range is exhausted.</summary>
    public override int Peek()
    {
        if (_position >= _end)
        {
            return -1;
        }

        return *_position;
    }

    /// <summary>Returns the next character and advances, or -1 when the range is exhausted.</summary>
    public override int Read()
    {
        if (_position >= _end)
        {
            return -1;
        }

        return *_position++;
    }

    /// <summary>
    /// Copies up to <paramref name="count"/> characters into <paramref name="buffer"/> starting at
    /// <paramref name="index"/> and returns the number copied (0 when the range is exhausted).
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="buffer"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="index"/> or <paramref name="count"/> is negative or does not fit in <paramref name="buffer"/>.
    /// </exception>
    public override int Read(char[] buffer, int index, int count)
    {
        if (buffer == null)
        {
            throw new ArgumentNullException(nameof(buffer));
        }

        // index == buffer.Length is allowed so that a zero-length read (e.g. into an empty
        // buffer) succeeds instead of throwing.
        if (index < 0 || index > buffer.Length)
        {
            throw new ArgumentOutOfRangeException(nameof(index));
        }

        // Compare against the remaining space rather than computing index + count,
        // which can overflow and wrongly pass the check.
        if (count < 0 || count > buffer.Length - index)
        {
            throw new ArgumentOutOfRangeException(nameof(count));
        }

        count = Math.Min(count, (int)(_end - _position));
        if (count > 0)
        {
            Marshal.Copy((IntPtr)_position, buffer, index, count);
            _position += count;
        }

        return count;
    }
}
}
}
......@@ -21,6 +21,13 @@ public SourceText CreateText(Stream stream, Encoding defaultEncoding, Cancellati
/// <summary>
/// Creates a <see cref="SourceText"/> from <paramref name="reader"/>.
/// A <c>DirectMemoryAccessStreamReader</c> is passed to the reader-based
/// <c>SourceText.From</c> overload together with its <c>Length</c>; any other reader is
/// read to the end and the resulting string is used.
/// </summary>
/// <param name="reader">Reader that supplies the text.</param>
/// <param name="encoding">Encoding forwarded to <c>SourceText.From</c>.</param>
/// <param name="cancellationToken">Checked once, before any reading starts.</param>
public SourceText CreateText(TextReader reader, Encoding encoding, CancellationToken cancellationToken = default(CancellationToken))
{
    cancellationToken.ThrowIfCancellationRequested();

    var directReader = reader as TemporaryStorageServiceFactory.DirectMemoryAccessStreamReader;
    if (directReader == null)
    {
        // Length is not known up front for an arbitrary reader: materialize the text.
        return SourceText.From(reader.ReadToEnd(), encoding);
    }

    return SourceText.From(directReader, directReader.Length, encoding);
}
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册