From 6313aa3c90e7425942fb12b51582cef2c619acb7 Mon Sep 17 00:00:00 2001
From: Jared Parsons <jaredpparsons@gmail.com>
Date: Thu, 12 Mar 2015 10:45:30 -0700
Subject: [PATCH] Read logic must consider unicode character lengths

The SourceTextStream type was operating under the assumption that
Encoder.Convert was a non-throwing method so long as it was passed a
destination buffer with at least one byte available for writing.  The
actual contract for Convert is it will not throw so long as it is able
to write the result of converting at least one character to the
destination buffer (or there is nothing to convert).  In that case it
will throw an ArgumentException indicating it attempting to do work but
was unable to do so.

The SourceTextStream type processes the characters in chunks according
to the count passed into Read.  This caused a bug when a character which
was represented with more than one byte value was at the end of a
logical chunk of text.  The Converter would convert all the chars except
the last one.  But SourceTextStream continued processing because there
was at least one byte left in the destination buffer and hence an
exception was thrown.

The fix is to not check for count > 0 when processing but instead count
>= the maximum number of bytes the encoding could produce for a single
character.

Note: I did consider calling GetByteCount here instead but decided
against it.  It essentially forces the encoder to do the work of
decoding the lead byte twice on every iteration of the loop. Seemed
better to keep the simple worst case check here.

closes #1197
closes #1221
---
 .../CodeAnalysisTest/CodeAnalysisTest.csproj  |  1 +
 .../Text/SourceTextStreamTests.cs             | 58 +++++++++++++++++++
 .../Core/Portable/Text/SourceTextStream.cs    | 22 ++++---
 3 files changed, 72 insertions(+), 9 deletions(-)
 create mode 100644 src/Compilers/Core/CodeAnalysisTest/Text/SourceTextStreamTests.cs
diff --git a/src/Compilers/Core/CodeAnalysisTest/CodeAnalysisTest.csproj b/src/Compilers/Core/CodeAnalysisTest/CodeAnalysisTest.csproj
index 0a211286f41..b2d348c1935 100644
--- a/src/Compilers/Core/CodeAnalysisTest/CodeAnalysisTest.csproj
+++ b/src/Compilers/Core/CodeAnalysisTest/CodeAnalysisTest.csproj
@@ -30,6 +30,7 @@
     <Compile Include="InternalUtilities\StringExtensionsTests.cs" />
     <Compile Include="PEWriter\BinaryWriterTests.cs" />
     <Compile Include="Text\LargeEncodedTextTests.cs" />
+    <Compile Include="Text\SourceTextStreamTests.cs" />
     <Compile Include="XmlDocumentationCommentTextReaderTests.cs" />
   </ItemGroup>
   <ItemGroup Label="Project References">
diff --git a/src/Compilers/Core/CodeAnalysisTest/Text/SourceTextStreamTests.cs b/src/Compilers/Core/CodeAnalysisTest/Text/SourceTextStreamTests.cs
new file mode 100644
index 00000000000..26c2ef067d9
--- /dev/null
+++ b/src/Compilers/Core/CodeAnalysisTest/Text/SourceTextStreamTests.cs
@@ -0,0 +1,58 @@
+﻿// Copyright (c) Microsoft.  All Rights Reserved.  Licensed under the Apache License, Version 2.0.  See License.txt in the project root for license information.
+
+using System;
+using System.Linq;
+using System.Text;
+using Microsoft.CodeAnalysis.Text;
+using Xunit;
+
+namespace Microsoft.CodeAnalysis.UnitTests.Text
+{
+    public sealed class SourceTextStreamTests
+    {
+        private static readonly Encoding s_utf8NoBom = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false);
+
+        /// <summary>
+        /// In the case the destination buffer is of insufficient length to store the reading of a single 
+        /// character we will throw.  Returning 0 is not correct here as that indicates end of stream
+        /// not insufficient space in destination buffer.
+        /// </summary>
+        [Fact]
+        public void MinimumLength()
+        {
+            var sourceText = SourceText.From("hello world", s_utf8NoBom);
+            using (var stream = new SourceTextStream(sourceText))
+            {
+                var buffer = new byte[100];
+                var max = s_utf8NoBom.GetMaxByteCount(charCount: 1);
+                for (int i = 0; i < max; i++)
+                {
+                    var local = i;
+                    Assert.Throws(typeof(ArgumentException), () => stream.Read(buffer, 0, local));
+                }
+            }
+        }
+
+        /// <summary>
+        /// In the case there is insufficient number of bytes to store the next character the read should
+        /// complete with the already read data vs. throwing.
+        /// </summary>
+        [Fact]
+        public void Issue1197()
+        {
+            var baseText = "food time";
+            var text = string.Format("{0}{1}", baseText, '\u2019');
+            var encoding = s_utf8NoBom;
+            var sourceText = SourceText.From(text, encoding);
+            using (var stream = new SourceTextStream(sourceText, bufferSize: text.Length * 2))
+            {
+                var buffer = new byte[baseText.Length + 1];
+                Assert.Equal(baseText.Length, stream.Read(buffer, 0, buffer.Length));
+                Assert.True(buffer.Take(baseText.Length).SequenceEqual(encoding.GetBytes(baseText)));
+
+                Assert.Equal(3, stream.Read(buffer, 0, buffer.Length));
+                Assert.True(buffer.Take(3).SequenceEqual(encoding.GetBytes(new[] { '\u2019' })));
+            }
+        }
+    }
+}
diff --git a/src/Compilers/Core/Portable/Text/SourceTextStream.cs b/src/Compilers/Core/Portable/Text/SourceTextStream.cs
index f715f3da86e..6e75b22f355 100644
--- a/src/Compilers/Core/Portable/Text/SourceTextStream.cs
+++ b/src/Compilers/Core/Portable/Text/SourceTextStream.cs
@@ -14,6 +14,7 @@ internal sealed class SourceTextStream : Stream
         private readonly SourceText _source;
         private readonly Encoder _encoder;
 
+        private int _minimumTargetBufferCount;
         private int _position;
         private int _sourceOffset;
         private readonly char[] _charBuffer;
@@ -25,6 +26,7 @@ public SourceTextStream(SourceText source, int bufferSize = 2048)
         {
             _source = source;
             _encoder = source.Encoding.GetEncoder();
+            _minimumTargetBufferCount = source.Encoding.GetMaxByteCount(charCount: 1);
             _sourceOffset = 0;
             _position = 0;
             _charBuffer = new char[Math.Min(bufferSize, _source.Length)];
@@ -60,18 +62,20 @@ public override long Length
 
         public override long Position
         {
-            get
-            {
-                return _position;
-            }
-            set
-            {
-                throw new NotSupportedException();
-            }
+            get { return _position; }
+            set { throw new NotSupportedException(); }
         }
 
         public override int Read(byte[] buffer, int offset, int count)
         {
+            if (count < _minimumTargetBufferCount)
+            {
+                // The buffer must be able to hold at least one character from the 
+                // SourceText stream.  Returning 0 for that case isn't correct because
+                // that indicates end of stream vs. insufficient buffer. 
+                throw new ArgumentException(nameof(count));
+            }
+
             int originalCount = count;
 
             if (!_preambleWritten)
@@ -81,7 +85,7 @@ public override int Read(byte[] buffer, int offset, int count)
                 count -= bytesWritten;
             }
 
-            while (count > 0 && _position < _source.Length)
+            while (count >= _minimumTargetBufferCount && _position < _source.Length)
             {
                 if (_bufferUnreadChars == 0)
                 {
-- 
GitLab