Improve XmlDictionaryWriter UTF8 encoding performance (#73336)

* Speed up text encoding
* Update implementation
* Add tests for binary xml strings
* limit counting code to 256-bit vectors
* reword comment
* rename test
* move bytesMax
* Fix bytesMax after moving variable initialization
* use unicode escape value in test
* fix test typo "*" -> "+"
* Update src/libraries/System.Private.DataContractSerialization/src/System/Xml/XmlStreamNodeWriter.cs
  Co-authored-by: N Stephen Toub <stoub@microsoft.com>
* Remove vectorized code from UnsafeGetUTF8Length
* Fix overflow
* use for loop which seems faster
* remove vector loop
* make sealed encoding to allow devirtualisation
* back out some changes
* use uint for UnsafeGetUTF8Chars comparison
* revert more changes
* Fix cutoff based on new measurements
* use BinaryPrimitives.ReverseEndianness as suggested
* Update cutoff from 24 to 32 chars before calling, due to regression for text based DataContractSerializer
* Remove sealed encoding since it only improves XmlConvert
---------
Co-authored-by: N Stephen Toub <stoub@microsoft.com>

Improve XmlDictionaryWriter UTF8 encoding performance (#73336)
* Speed up text encoding
* Update implementation
* Add tests for binary xml strings
* limit counting code to 256-bit vectors
* reword comment
* rename test
* move bytesMax
* Fix bytesMax after moving variable initialization
* use unicode escape value in test
* fix test typo "*" -> "+"
* Update src/libraries/System.Private.DataContractSerialization/src/System/Xml/XmlStreamNodeWriter.cs
  Co-authored-by: N Stephen Toub <stoub@microsoft.com>
* Remove vectorized code from UnsafeGetUTF8Length
* Fix overflow
* use for loop which seems faster
* remove vector loop
* make sealed encoding to allow devirtualisation
* back out some changes
* use uint for UnsafeGetUTF8Chars comparison
* revert more changes
* Fix cutoff based on new measurements
* use BinaryPrimitives.ReverseEndianness as suggested
* Update cutoff from 24 to 32 chars before calling, due to regression for text based DataContractSerializer
* Remove sealed encoding since it only improves XmlConvert
---------
Co-authored-by: N Stephen Toub <stoub@microsoft.com>
e0c94f84 · Daniel Svensson · GitHub · b54d6ef1 · e0c94f84 · e0c94f84
2 changed files
--- a/src/libraries/System.Private.DataContractSerialization/src/System/Xml/XmlStreamNodeWriter.cs
+++ b/src/libraries/System.Private.DataContractSerialization/src/System/Xml/XmlStreamNodeWriter.cs
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.

+using System.Buffers.Binary;
 using System.IO;
 using System.Text;
+using System.Runtime.InteropServices;
 using System.Runtime.Serialization;
 using System.Threading.Tasks;
 using System.Diagnostics;
@@ -330,34 +332,26 @@ protected unsafe void UnsafeWriteUnicodeChars(char* chars, int charCount)
            }
        }

-        protected unsafe int UnsafeGetUnicodeChars(char* chars, int charCount, byte[] buffer, int offset)
+        protected static unsafe int UnsafeGetUnicodeChars(char* chars, int charCount, byte[] buffer, int offset)
        {
-            char* charsMax = chars + charCount;
-            while (chars < charsMax)
+            if (BitConverter.IsLittleEndian)
            {
-                char value = *chars++;
-                buffer[offset++] = (byte)value;
-                value >>= 8;
-                buffer[offset++] = (byte)value;
+                new ReadOnlySpan<char>(chars, charCount)
+                    .CopyTo(MemoryMarshal.Cast<byte, char>(buffer.AsSpan(offset)));
            }
+            else
+            {
+                BinaryPrimitives.ReverseEndianness(new ReadOnlySpan<short>(chars, charCount),
+                    MemoryMarshal.Cast<byte, short>(buffer.AsSpan(offset)));
+            }
+
            return charCount * 2;
        }

        protected unsafe int UnsafeGetUTF8Length(char* chars, int charCount)
        {
-            char* charsMax = chars + charCount;
-            while (chars < charsMax)
-            {
-                if (*chars >= 0x80)
-                    break;
-
-                chars++;
-            }
-
-            if (chars == charsMax)
-                return charCount;
-
-            return (int)(chars - (charsMax - charCount)) + (_encoding ?? DataContractSerializer.ValidatingUTF8).GetByteCount(chars, (int)(charsMax - chars));
+            // Length will always be at least ( 128 / maxBytesPerChar) = 42
+            return (_encoding ?? DataContractSerializer.ValidatingUTF8).GetByteCount(chars, charCount);
        }

        protected unsafe int UnsafeGetUTF8Chars(char* chars, int charCount, byte[] buffer, int offset)
@@ -366,39 +360,32 @@ protected unsafe int UnsafeGetUTF8Chars(char* chars, int charCount, byte[] buffe
            {
                fixed (byte* _bytes = &buffer[offset])
                {
-                    byte* bytes = _bytes;
-                    byte* bytesMax = &bytes[buffer.Length - offset];
-                    char* charsMax = &chars[charCount];
-
-                    while (true)
+                    // Fast path for small strings, use Encoding.GetBytes for larger strings since it is faster when vectorization is possible
+                    if ((uint)charCount < 32)
                    {
+                        byte* bytes = _bytes;
+                        char* charsMax = &chars[charCount];
+
                        while (chars < charsMax)
                        {
                            char t = *chars;
                            if (t >= 0x80)
-                                break;
+                                goto NonAscii;

                            *bytes = (byte)t;
                            bytes++;
                            chars++;
                        }
+                        return charCount;

-                        if (chars >= charsMax)
-                            break;
-
-                        char* charsStart = chars;
-                        while (chars < charsMax && *chars >= 0x80)
-                        {
-                            chars++;
-                        }
-
-                        bytes += (_encoding ?? DataContractSerializer.ValidatingUTF8).GetBytes(charsStart, (int)(chars - charsStart), bytes, (int)(bytesMax - bytes));
-
-                        if (chars >= charsMax)
-                            break;
+                    NonAscii:
+                        byte* bytesMax = _bytes + buffer.Length - offset;
+                        return (int)(bytes - _bytes) + (_encoding ?? DataContractSerializer.ValidatingUTF8).GetBytes(chars, (int)(charsMax - chars), bytes, (int)(bytesMax - bytes));
+                    }
+                    else
+                    {
+                        return (_encoding ?? DataContractSerializer.ValidatingUTF8).GetBytes(chars, charCount, _bytes, buffer.Length - offset);
                    }
-
-                    return (int)(bytes - _bytes);
                }
            }
            return 0;

--- a/src/libraries/System.Runtime.Serialization.Xml/tests/XmlDictionaryWriterTest.cs
+++ b/src/libraries/System.Runtime.Serialization.Xml/tests/XmlDictionaryWriterTest.cs
@@ -494,6 +494,71 @@ void AssertBytesWritten(Action<XmlDictionaryWriter> action, XmlBinaryNodeType no
        }
    }

+    [Fact]
+    public static void XmlBaseWriter_WriteString()
+    {
+        const byte Chars8Text = 152;
+        const byte Chars16Text = 154;
+        MemoryStream ms = new MemoryStream();
+        XmlDictionaryWriter writer = (XmlDictionaryWriter)XmlDictionaryWriter.CreateBinaryWriter(ms);
+        writer.WriteStartElement("root");
+
+        int[] lengths = new[] { 7, 8, 9, 15, 16, 17, 31, 32, 36, 258 };
+        byte[] buffer = new byte[lengths.Max() + 1];
+
+        foreach (var length in lengths)
+        {
+            string allAscii = string.Create(length, null, (Span<char> chars, object _) =>
+            {
+                for (int i = 0; i < chars.Length; ++i)
+                    chars[i] = (char)(i % 128);
+            });
+            string multiByteLast = string.Create(length, null, (Span<char> chars, object _) =>
+            {
+                for (int i = 0; i < chars.Length; ++i)
+                    chars[i] = (char)(i % 128);
+                chars[^1] = '\u00E4'; // 'ä' - Latin Small Letter a with Diaeresis. Latin-1 Supplement.
+            });
+
+            int numBytes = Encoding.UTF8.GetBytes(allAscii, buffer);
+            Assert.True(numBytes == length, "Test setup wrong - allAscii");
+            ValidateWriteText(ms, writer, allAscii, expected: buffer.AsSpan(0, numBytes));
+
+            numBytes = Encoding.UTF8.GetBytes(multiByteLast, buffer);
+            Assert.True(numBytes == length + 1, "Test setup wrong - multiByte");
+            ValidateWriteText(ms, writer, multiByteLast, expected: buffer.AsSpan(0, numBytes));
+        }
+
+        static void ValidateWriteText(MemoryStream ms, XmlDictionaryWriter writer, string text, ReadOnlySpan<byte> expected)
+        {
+            writer.Flush();
+            ms.Seek(0, SeekOrigin.Begin);
+            ms.SetLength(0);
+            writer.WriteString(text);
+            writer.Flush();
+
+            ms.TryGetBuffer(out ArraySegment<byte> arraySegment);
+            ReadOnlySpan<byte> buffer = arraySegment;
+
+            if (expected.Length <= byte.MaxValue)
+            {
+                Assert.Equal(Chars8Text, buffer[0]);
+                Assert.Equal(expected.Length, buffer[1]);
+                buffer = buffer.Slice(2);
+            }
+            else if (expected.Length <= ushort.MaxValue)
+            {
+                Assert.Equal(Chars16Text, buffer[0]);
+                Assert.Equal(expected.Length, (int)(buffer[1]) | ((int)buffer[2] << 8));
+                buffer = buffer.Slice(3);
+            }
+            else
+                Assert.Fail("test use to long length");
+
+            AssertExtensions.SequenceEqual(expected, buffer);
+        }
+    }
+
    private static bool ReadTest(MemoryStream ms, Encoding encoding, ReaderWriterFactory.ReaderWriterType rwType, byte[] byteArray)
    {
        ms.Position = 0;