提交 6313aa3c 编写于 作者: J Jared Parsons

Read logic must consider unicode character lengths

The SourceTextStream type was operating under the assumption that
Encoder.Convert was a non-throwing method so long as it was passed a
destination buffer with at least one byte available for writing.  The
actual contract for Convert is it will not throw so long as it is able
to write the result of converting at least one character to the
destination buffer (or there is nothing to convert).  Otherwise it
will throw an ArgumentException indicating it attempted to do work but
was unable to do so.

The SourceTextStream type processes the characters in chunks according
to the count passed into Read.  This caused a bug when a character which
was represented with more than one byte value was at the end of a
logical chunk of text.  The Encoder would convert all the chars except
the last one.  But SourceTextStream continued processing because there
was at least one byte left in the destination buffer and hence an
exception was thrown.

The fix is to not check for count > 0 when processing but instead count
>= the maximum number of bytes the encoding could produce for a single
character.

Note: I did consider calling GetByteCount here instead but decided
against it.  It essentially forces the encoder to do the work of
decoding the lead byte twice on every iteration of the loop. Seemed
better to keep the simple worst case check here.

closes #1197
closes #1221
上级 d26da4e6
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
<Compile Include="InternalUtilities\StringExtensionsTests.cs" /> <Compile Include="InternalUtilities\StringExtensionsTests.cs" />
<Compile Include="PEWriter\BinaryWriterTests.cs" /> <Compile Include="PEWriter\BinaryWriterTests.cs" />
<Compile Include="Text\LargeEncodedTextTests.cs" /> <Compile Include="Text\LargeEncodedTextTests.cs" />
<Compile Include="Text\SourceTextStreamTests.cs" />
<Compile Include="XmlDocumentationCommentTextReaderTests.cs" /> <Compile Include="XmlDocumentationCommentTextReaderTests.cs" />
</ItemGroup> </ItemGroup>
<ItemGroup Label="Project References"> <ItemGroup Label="Project References">
......
// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
using System.Linq;
using System.Text;
using Microsoft.CodeAnalysis.Text;
using Xunit;
namespace Microsoft.CodeAnalysis.UnitTests.Text
{
/// <summary>
/// Tests for how <see cref="SourceTextStream.Read"/> behaves when the destination
/// byte count is small relative to the encoded size of a single character.
/// </summary>
public sealed class SourceTextStreamTests
{
    private static readonly Encoding s_utf8NoBom = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false);

    /// <summary>
    /// When the requested count is smaller than the encoding's maximum byte count for a
    /// single character, Read is expected to throw <see cref="ArgumentException"/>.
    /// Returning 0 would be wrong because 0 signals end of stream, not insufficient
    /// space in the destination buffer.
    /// </summary>
    [Fact]
    public void MinimumLength()
    {
        var sourceText = SourceText.From("hello world", s_utf8NoBom);
        using (var stream = new SourceTextStream(sourceText))
        {
            var buffer = new byte[100];
            var max = s_utf8NoBom.GetMaxByteCount(charCount: 1);

            // Every count strictly below the single-character maximum must be rejected,
            // including a count of zero.
            for (int i = 0; i < max; i++)
            {
                var count = i;
                Assert.Throws<ArgumentException>(() => stream.Read(buffer, 0, count));
            }
        }
    }

    /// <summary>
    /// When the remaining space in the destination buffer cannot hold the next character,
    /// the read should complete with the data already produced instead of throwing.
    /// </summary>
    [Fact]
    public void Issue1197()
    {
        var baseText = "food time";
        var lastChar = '\u2019';
        var text = baseText + lastChar;
        var encoding = s_utf8NoBom;
        var sourceText = SourceText.From(text, encoding);
        using (var stream = new SourceTextStream(sourceText, bufferSize: text.Length * 2))
        {
            // One byte more than the single-byte prefix: after the prefix is written there
            // is exactly one byte of room left, which is not enough for the multi-byte
            // trailing character.
            var buffer = new byte[baseText.Length + 1];

            // First read: only the prefix fits.
            Assert.Equal(baseText.Length, stream.Read(buffer, 0, buffer.Length));
            Assert.Equal(encoding.GetBytes(baseText), buffer.Take(baseText.Length).ToArray());

            // Second read: the trailing character arrives on its own as three bytes.
            Assert.Equal(3, stream.Read(buffer, 0, buffer.Length));
            Assert.Equal(encoding.GetBytes(new[] { lastChar }), buffer.Take(3).ToArray());
        }
    }
}
}
...@@ -14,6 +14,7 @@ internal sealed class SourceTextStream : Stream ...@@ -14,6 +14,7 @@ internal sealed class SourceTextStream : Stream
private readonly SourceText _source; private readonly SourceText _source;
private readonly Encoder _encoder; private readonly Encoder _encoder;
private int _minimumTargetBufferCount;
private int _position; private int _position;
private int _sourceOffset; private int _sourceOffset;
private readonly char[] _charBuffer; private readonly char[] _charBuffer;
...@@ -25,6 +26,7 @@ public SourceTextStream(SourceText source, int bufferSize = 2048) ...@@ -25,6 +26,7 @@ public SourceTextStream(SourceText source, int bufferSize = 2048)
{ {
_source = source; _source = source;
_encoder = source.Encoding.GetEncoder(); _encoder = source.Encoding.GetEncoder();
_minimumTargetBufferCount = source.Encoding.GetMaxByteCount(charCount: 1);
_sourceOffset = 0; _sourceOffset = 0;
_position = 0; _position = 0;
_charBuffer = new char[Math.Min(bufferSize, _source.Length)]; _charBuffer = new char[Math.Min(bufferSize, _source.Length)];
...@@ -60,18 +62,20 @@ public override long Length ...@@ -60,18 +62,20 @@ public override long Length
public override long Position public override long Position
{ {
get get { return _position; }
{ set { throw new NotSupportedException(); }
return _position;
}
set
{
throw new NotSupportedException();
}
} }
public override int Read(byte[] buffer, int offset, int count) public override int Read(byte[] buffer, int offset, int count)
{ {
if (count < _minimumTargetBufferCount)
{
// The buffer must be able to hold at least one character from the
// SourceText stream. Returning 0 for that case isn't correct because
// that indicates end of stream vs. insufficient buffer.
throw new ArgumentException(nameof(count));
}
int originalCount = count; int originalCount = count;
if (!_preambleWritten) if (!_preambleWritten)
...@@ -81,7 +85,7 @@ public override int Read(byte[] buffer, int offset, int count) ...@@ -81,7 +85,7 @@ public override int Read(byte[] buffer, int offset, int count)
count -= bytesWritten; count -= bytesWritten;
} }
while (count > 0 && _position < _source.Length) while (count >= _minimumTargetBufferCount && _position < _source.Length)
{ {
if (_bufferUnreadChars == 0) if (_bufferUnreadChars == 0)
{ {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册