Read logic must consider unicode character lengths

The SourceTextStream type was operating under the assumption that Encoder.Convert was a non-throwing method so long as it was passed a destination buffer with at least one byte available for writing. The actual contract for Convert is that it will not throw so long as it is able to write the result of converting at least one character to the destination buffer (or there is nothing to convert). Otherwise it will throw an ArgumentException indicating that it attempted to do work but was unable to do so. The SourceTextStream type processes the characters in chunks according to the count passed into Read. This caused a bug when a character which is represented with more than one byte was at the end of a logical chunk of text. The encoder would convert all the chars except the last one. But SourceTextStream continued processing because there was at least one byte left in the destination buffer, and hence an exception was thrown. The fix is to not check for count > 0 when processing but instead count >= the maximum number of bytes the encoding could produce for a single character. Note: I did consider calling GetByteCount here instead but decided against it. It essentially forces the encoder to do the work of decoding the lead byte twice on every iteration of the loop. It seemed better to keep the simple worst case check here. closes #1197 closes #1221

Read logic must consider unicode character lengths
The SourceTextStream type was operating under the assumption that Encoder.Convert was a non-throwing method so long as it was passed a destination buffer with at least one byte available for writing. The actual contract for Convert is that it will not throw so long as it is able to write the result of converting at least one character to the destination buffer (or there is nothing to convert). Otherwise it will throw an ArgumentException indicating that it attempted to do work but was unable to do so. The SourceTextStream type processes the characters in chunks according to the count passed into Read. This caused a bug when a character which is represented with more than one byte was at the end of a logical chunk of text. The encoder would convert all the chars except the last one. But SourceTextStream continued processing because there was at least one byte left in the destination buffer, and hence an exception was thrown. The fix is to not check for count > 0 when processing but instead count >= the maximum number of bytes the encoding could produce for a single character. Note: I did consider calling GetByteCount here instead but decided against it. It essentially forces the encoder to do the work of decoding the lead byte twice on every iteration of the loop. It seemed better to keep the simple worst case check here. closes #1197 closes #1221
6313aa3c · Jared Parsons · d26da4e6 · 6313aa3c · 6313aa3c · 6313aa3c
3 changed files
--- a/src/Compilers/Core/CodeAnalysisTest/CodeAnalysisTest.csproj
+++ b/src/Compilers/Core/CodeAnalysisTest/CodeAnalysisTest.csproj
@@ -30,6 +30,7 @@
    <Compile Include="InternalUtilities\StringExtensionsTests.cs" />
    <Compile Include="PEWriter\BinaryWriterTests.cs" />
    <Compile Include="Text\LargeEncodedTextTests.cs" />
+    <Compile Include="Text\SourceTextStreamTests.cs" />
    <Compile Include="XmlDocumentationCommentTextReaderTests.cs" />
  </ItemGroup>
  <ItemGroup Label="Project References">

--- a/src/Compilers/Core/CodeAnalysisTest/Text/SourceTextStreamTests.cs
+++ b/src/Compilers/Core/CodeAnalysisTest/Text/SourceTextStreamTests.cs
+// Copyright (c) Microsoft.  All Rights Reserved.  Licensed under the Apache License, Version 2.0.  See License.txt in the project root for license information.
+using System;
+using System.Linq;
+using System.Text;
+using Microsoft.CodeAnalysis.Text;
+using Xunit;
+namespace Microsoft.CodeAnalysis.UnitTests.Text
+{
+    public sealed class SourceTextStreamTests // Tests for SourceTextStream.Read when the requested count is too small or nearly exhausted.
+    {
+        private static readonly Encoding s_utf8NoBom = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false); // UTF-8 configured not to emit the UTF-8 identifier; used by both tests.
+        /// <summary>
+        /// Read must throw an ArgumentException when the requested count is smaller than the encoding's
+        /// maximum byte count for a single character.  Returning 0 would be wrong here because 0 signals
+        /// end of stream, not insufficient space in the destination buffer.
+        /// </summary>
+        [Fact]
+        public void MinimumLength()
+        {
+            var sourceText = SourceText.From("hello world", s_utf8NoBom);
+            using (var stream = new SourceTextStream(sourceText))
+            {
+                var buffer = new byte[100]; // 100-byte array; the loop below varies only the count argument.
+                var max = s_utf8NoBom.GetMaxByteCount(charCount: 1); // Maximum bytes the encoding may produce for one char.
+                for (int i = 0; i < max; i++) // Every count in [0, max) is expected to be rejected.
+                {
+                    var local = i; // Copy the loop variable so the lambda captures this iteration's value.
+                    Assert.Throws(typeof(ArgumentException), () => stream.Read(buffer, 0, local));
+                }
+            }
+        }
+        /// <summary>
+        /// When the remaining space is too small to hold the next character, Read should return the
+        /// bytes it has already produced instead of throwing; the character is delivered by the next Read.
+        /// </summary>
+        [Fact]
+        public void Issue1197()
+        {
+            var baseText = "food time";
+            var text = string.Format("{0}{1}", baseText, '\u2019'); // Append U+2019, which the assertions below expect to encode as 3 bytes.
+            var encoding = s_utf8NoBom;
+            var sourceText = SourceText.From(text, encoding);
+            using (var stream = new SourceTextStream(sourceText, bufferSize: text.Length * 2))
+            {
+                var buffer = new byte[baseText.Length + 1]; // Room for the prefix plus one byte: too little left over for the trailing character.
+                Assert.Equal(baseText.Length, stream.Read(buffer, 0, buffer.Length)); // First read stops after the prefix rather than throwing.
+                Assert.True(buffer.Take(baseText.Length).SequenceEqual(encoding.GetBytes(baseText)));
+                Assert.Equal(3, stream.Read(buffer, 0, buffer.Length)); // Second read returns the 3 bytes of the remaining character.
+                Assert.True(buffer.Take(3).SequenceEqual(encoding.GetBytes(new[] { '\u2019' })));
+            }
+        }
+    }
+}
--- a/src/Compilers/Core/Portable/Text/SourceTextStream.cs
+++ b/src/Compilers/Core/Portable/Text/SourceTextStream.cs
@@ -14,6 +14,7 @@ internal sealed class SourceTextStream : Stream
        private readonly SourceText _source;
        private readonly Encoder _encoder;
+        private int _minimumTargetBufferCount;
        private int _position;
        private int _sourceOffset;
        private readonly char[] _charBuffer;
@@ -25,6 +26,7 @@ public SourceTextStream(SourceText source, int bufferSize = 2048)
        {
            _source = source;
            _encoder = source.Encoding.GetEncoder();
+            _minimumTargetBufferCount = source.Encoding.GetMaxByteCount(charCount: 1);
            _sourceOffset = 0;
            _position = 0;
            _charBuffer = new char[Math.Min(bufferSize, _source.Length)];
@@ -60,18 +62,20 @@ public override long Length
        public override long Position
        {
-            get
+            get { return _position; }
-            {
+            set { throw new NotSupportedException(); }
-                return _position;
-            }
-            set
-            {
-                throw new NotSupportedException();
-            }
        }
        public override int Read(byte[] buffer, int offset, int count)
        {
+            if (count < _minimumTargetBufferCount)
+            {
+                // The buffer must be able to hold at least one character from the 
+                // SourceText stream.  Returning 0 for that case isn't correct because
+                // that indicates end of stream vs. insufficient buffer. 
+                throw new ArgumentException(nameof(count));
+            }
            int originalCount = count;
            if (!_preambleWritten)
@@ -81,7 +85,7 @@ public override int Read(byte[] buffer, int offset, int count)
                count -= bytesWritten;
            }
-            while (count > 0 && _position < _source.Length)
+            while (count >= _minimumTargetBufferCount && _position < _source.Length)
            {
                if (_bufferUnreadChars == 0)
                {