EncodedStringText.cs 10.3 KB
Newer Older
1
// Copyright (c) Microsoft.  All Rights Reserved.  Licensed under the Apache License, Version 2.0.  See License.txt in the project root for license information.
P
Pilchie 已提交
2 3 4 5 6

using System;
using System.Diagnostics;
using System.IO;
using System.Text;
P
Pharring 已提交
7
using Roslyn.Utilities;
P
Pilchie 已提交
8 9 10

namespace Microsoft.CodeAnalysis.Text
{
11
    internal static class EncodedStringText
P
Pilchie 已提交
12
    {
13
        private const int LargeObjectHeapLimitInChars = 40 * 1024; // 40KB
P
Pilchie 已提交
14

P
Pharring 已提交
15 16 17 18
        /// <summary>
        /// Encoding to use when there is no byte order mark (BOM) on the stream. This encoder may throw a <see cref="DecoderFallbackException"/>
        /// if the stream contains invalid UTF-8 bytes.
        /// </summary>
A
Andy Gocke 已提交
19
        private static readonly Encoding s_utf8Encoding = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);
P
Pharring 已提交
20

21
        /// <summary>
A
Andy Gocke 已提交
22 23 24 25
        /// Encoding to use when UTF-8 fails. We try to find the following, in order, if available:
        ///     1. The default ANSI codepage (desktop framework only; CoreCLR has no default codepage).
        ///     2. CodePage 1252.
        ///     3. Latin1.
26
        /// </summary>
A
Andy Gocke 已提交
27
        private static readonly Encoding s_fallbackEncoding = GetFallbackEncoding();
28

A
Andy Gocke 已提交
29
        /// <summary>
        /// Selects the encoding used to decode a stream once UTF-8 decoding has failed.
        /// </summary>
        /// <returns>
        /// On the desktop framework, the encoding for code page 0 or, when that lookup yields null,
        /// code page 1252. On CoreCLR, code page 1252 after registering the code pages provider.
        /// Latin1 if any of those lookups throws <see cref="NotSupportedException"/>.
        /// </returns>
        private static Encoding GetFallbackEncoding()
        {
            try
            {
                if (!CoreClrShim.IsCoreClr)
                {
                    // Desktop framework: code page 0 asks for the default ANSI code page from the
                    // operating system's regional and language settings.
                    Encoding ansiEncoding = PortableShim.Encoding.GetEncoding(0);
                    if (ansiEncoding != null)
                    {
                        return ansiEncoding;
                    }

                    return PortableShim.Encoding.GetEncoding(1252);
                }

                // CoreCLR has no "default" code page, but once the provider from
                // System.Text.Encoding.CodePages is registered, 1252 should be available.
                CoreClrShim.Encoding.RegisterProvider(CoreClrShim.CodePagesEncodingProvider.Instance);
                return PortableShim.Encoding.GetEncoding(1252);
            }
            catch (NotSupportedException)
            {
                // Neither the ANSI code page nor 1252 is supported here; use Latin1 as the last resort.
                return Encoding.GetEncoding("Latin1");
            }
        }

P
Pilchie 已提交
56
        /// <summary>
        /// Initializes an instance of <see cref="SourceText"/> from the provided stream. This version differs
        /// from <see cref="SourceText.From(Stream, Encoding, SourceHashAlgorithm, bool)"/> in two ways:
        /// 1. It attempts to minimize allocations by trying to read the stream into a byte array.
        /// 2. If <paramref name="defaultEncoding"/> is null, it will first try UTF8 and, if that fails, it will
        ///    retry with <see cref="s_fallbackEncoding"/> (the default ANSI code page, CodePage 1252 or Latin1,
        ///    whichever is available first).
        /// </summary>
        /// <param name="stream">The stream containing encoded text. Must be readable and seekable.</param>
        /// <param name="defaultEncoding">
        /// Specifies an encoding to be used if the actual encoding can't be determined from the stream content (the stream doesn't start with Byte Order Mark).
        /// If not specified, strict UTF-8 decoding is attempted first; if the stream is not valid UTF-8 the fallback encoding is used.
        /// Note that if the stream starts with Byte Order Mark the value of <paramref name="defaultEncoding"/> is ignored.
        /// </param>
        /// <param name="checksumAlgorithm">Hash algorithm used to calculate document checksum.</param>
        /// <returns>The <see cref="SourceText"/> decoded from the stream.</returns>
        /// <exception cref="InvalidDataException">
        /// The stream content can't be decoded using the specified <paramref name="defaultEncoding"/>, or
        /// <paramref name="defaultEncoding"/> is null and the stream appears to be a binary file.
        /// </exception>
        /// <exception cref="IOException">An IO error occurred while reading from the stream.</exception>
        internal static SourceText Create(Stream stream, Encoding defaultEncoding = null, SourceHashAlgorithm checksumAlgorithm = SourceHashAlgorithm.Sha1)
        {
            Debug.Assert(stream != null);
            Debug.Assert(stream.CanRead && stream.CanSeek);

            bool detectEncoding = defaultEncoding == null;
            if (detectEncoding)
            {
                try
                {
                    return Decode(stream, s_utf8Encoding, checksumAlgorithm, throwIfBinaryDetected: false);
                }
                catch (DecoderFallbackException)
                {
                    // The stream is not valid UTF-8: fall through and retry with s_fallbackEncoding.
                }
            }

            try
            {
                // Binary detection is only requested when we are guessing the encoding ourselves.
                return Decode(stream, defaultEncoding ?? s_fallbackEncoding, checksumAlgorithm, throwIfBinaryDetected: detectEncoding);
            }
            catch (DecoderFallbackException e)
            {
                // Surface the failure as InvalidDataException, keeping the decoder exception as the
                // inner exception so its details are not lost.
                throw new InvalidDataException(e.Message, e);
            }
        }

P
Pilchie 已提交
103
        /// <summary>
        /// Try to create a <see cref="SourceText"/> from the given stream using the given encoding.
        /// </summary>
        /// <param name="data">The input stream containing the encoded text. The stream will not be closed.</param>
        /// <param name="encoding">The expected encoding of the stream. The actual encoding used may be different if byte order marks are detected.</param>
        /// <param name="checksumAlgorithm">The checksum algorithm to use.</param>
        /// <param name="throwIfBinaryDetected">Throw <see cref="InvalidDataException"/> if binary (non-text) data is detected.</param>
        /// <returns>The <see cref="SourceText"/> decoded from the stream.</returns>
        /// <exception cref="DecoderFallbackException">The decoder was unable to decode the stream with the given encoding.</exception>
        /// <remarks>
        /// internal for unit testing
        /// </remarks>
        internal static SourceText Decode(Stream data, Encoding encoding, SourceHashAlgorithm checksumAlgorithm, bool throwIfBinaryDetected = false)
        {
            Debug.Assert(data != null);
            Debug.Assert(encoding != null);

            data.Seek(0, SeekOrigin.Begin);

            // For small streams, see if we can read the byte buffer directly. Streams whose length
            // does not fit in an Int32 (or whose maximum char count cannot be computed) are never
            // "small", so they skip the fast path instead of going through a wrapping cast.
            long length = data.Length;
            int maxCharCount;
            if (TryGetMaxCharCount(encoding, length, out maxCharCount) && maxCharCount < LargeObjectHeapLimitInChars)
            {
                byte[] buffer = TryGetByteArrayFromStream(data);
                if (buffer != null)
                {
                    // The cast is safe: TryGetMaxCharCount only succeeds when length <= int.MaxValue.
                    return SourceText.From(buffer, (int)length, encoding, checksumAlgorithm, throwIfBinaryDetected);
                }

                // The fast path may have consumed part of the stream before giving up;
                // rewind so the streaming path below starts from the beginning.
                data.Seek(0, SeekOrigin.Begin);
            }

            return SourceText.From(data, encoding, checksumAlgorithm, throwIfBinaryDetected);
        }

        /// <summary>
        /// Computes the maximum number of characters <paramref name="encoding"/> can produce from
        /// <paramref name="byteCount"/> bytes, without throwing when the answer is not representable.
        /// </summary>
        /// <param name="encoding">The encoding to query.</param>
        /// <param name="byteCount">The number of bytes to decode.</param>
        /// <param name="maxCharCount">The maximum char count on success; 0 otherwise.</param>
        /// <returns>
        /// False if <paramref name="byteCount"/> exceeds <see cref="int.MaxValue"/> or the encoding
        /// reports the count as out of range; true otherwise.
        /// </returns>
        private static bool TryGetMaxCharCount(Encoding encoding, long byteCount, out int maxCharCount)
        {
            maxCharCount = 0;
            if (byteCount > int.MaxValue)
            {
                return false;
            }

            try
            {
                maxCharCount = encoding.GetMaxCharCount((int)byteCount);
                return true;
            }
            catch (ArgumentOutOfRangeException)
            {
                // Encoding offers no way to predict that the result would overflow Int32,
                // so the exception is the only signal available.
                return false;
            }
        }

        /// <summary>
        /// Attempts to obtain the stream's bytes as an array via one of the cheap, type-specific paths.
        /// </summary>
        /// <param name="data">The stream</param>
        /// <returns>
        /// The contents of <paramref name="data"/> as a byte array, or null when the stream is neither
        /// a <see cref="MemoryStream"/> nor exactly the shimmed FileStream type, or when the
        /// type-specific read did not succeed.
        /// </returns>
        private static byte[] TryGetByteArrayFromStream(Stream data)
        {
            byte[] contents;

            // PERF: A MemoryStream may let us reach its backing buffer without copying.
            var inMemory = data as MemoryStream;
            if (inMemory != null)
            {
                if (TryGetByteArrayFromMemoryStream(inMemory, out contents))
                {
                    return contents;
                }
            }

            // PERF: A FileStream can be read in one go, which keeps allocations down.
            // The type comparison is exact, so subclasses do not take this path.
            bool isFileStream = data.GetType() == PortableShim.FileStream.Type;
            if (isFileStream && TryGetByteArrayFromFileStream(data, out contents))
            {
                return contents;
            }

            return null;
        }

        /// <summary>
        /// Attempts to reach the backing buffer of a <see cref="MemoryStream"/> directly, which is possible
        /// when the stream was created with publiclyVisible=true, and saves allocations in StreamReader.
        /// The input MemoryStream is not closed on exit.
        /// </summary>
        /// <param name="data">The memory stream, positioned at its start.</param>
        /// <param name="buffer">The stream's buffer on success; null otherwise.</param>
        /// <returns>True if a byte array could be created.</returns>
        /// <remarks>
        /// NOTE(review): presumably the shim wraps MemoryStream.GetBuffer, which returns the whole
        /// underlying array rather than a copy of the stream's contents; confirm that callers bound
        /// their reads by the stream length and that streams with a non-zero origin are not passed here.
        /// </remarks>
        private static bool TryGetByteArrayFromMemoryStream(MemoryStream data, out byte[] buffer)
        {
            Debug.Assert(data.Position == 0);

            try
            {
                // The accessor is absent on platforms where the shim could not locate it.
                if (PortableShim.MemoryStream.GetBuffer == null)
                {
                    buffer = null;
                    return false;
                }

                buffer = (byte[])PortableShim.MemoryStream.GetBuffer.Invoke(data, null);
                return true;
            }
            catch (Exception)
            {
                // Best effort: any failure to reach the buffer simply disables this fast path.
                buffer = null;
                return false;
            }
        }

P
Pharring 已提交
191
        /// <summary>
        /// Read the contents of a FileStream into a byte array.
        /// </summary>
        /// <param name="stream">The FileStream with encoded text, positioned at its start.</param>
        /// <param name="buffer">
        /// A byte array filled with the contents of the file on success; null when the stream is too
        /// long for a single array or could not be read completely.
        /// </param>
        /// <returns>True if a byte array could be created.</returns>
        private static bool TryGetByteArrayFromFileStream(Stream stream, out byte[] buffer)
        {
            Debug.Assert(stream != null);
            Debug.Assert(stream.Position == 0);

            // Guard the narrowing conversion: a length above int.MaxValue would otherwise wrap
            // and produce a bogus (possibly negative) array size.
            long streamLength = stream.Length;
            if (streamLength > int.MaxValue)
            {
                buffer = null;
                return false;
            }

            int length = (int)streamLength;
            if (length == 0)
            {
                buffer = SpecializedCollections.EmptyBytes;
                return true;
            }

            // PERF: While this is an obvious byte array allocation, it is still cheaper than
            // using StreamReader.ReadToEnd. The alternative allocates:
            // 1. A 1KB byte array in the StreamReader for buffered reads
            // 2. A 4KB byte array in the FileStream for buffered reads
            // 3. A StringBuilder and its associated char arrays (enough to represent the final decoded string)

            // TODO: Can this allocation be pooled?
            buffer = new byte[length];

            // Note: FileStream.Read may still allocate its internal buffer if length is less
            // than the buffer size. The default buffer size is 4KB, so this will incur a 4KB
            // allocation for any files less than 4KB. That's why, for example, the command
            // line compiler actually specifies a very small buffer size.
            if (stream.TryReadAll(buffer, 0, length) == length)
            {
                return true;
            }

            // Short read: don't hand a partially filled array back to the caller.
            buffer = null;
            return false;
        }
P
Pilchie 已提交
224 225
    }
}