Commit c8633d8b authored by Igor Chevdar, committed by Vasily Levchenko

[runtime] Vectorized polynomial hash for strings

This is for consistency with Kotlin/JVM
Parent 95a92d86
......@@ -28,6 +28,8 @@
#include "utf8.h"
#include "polyhash/PolyHash.h"
namespace {
typedef std::back_insert_iterator<KStdString> KStdStringInserter;
......@@ -1165,10 +1167,7 @@ KInt Kotlin_String_lastIndexOfString(KString thiz, KString other, KInt fromIndex
KInt Kotlin_String_hashCode(KString thiz) {
  // TODO: consider caching strings hashes.
  // Polynomial hash with base 31 over the UTF-16 code units, matching
  // java.lang.String::hashCode, for consistency with Kotlin/JVM.
  // (The unreachable CityHash64-based return preceding this call was stale
  // dead code from the previous implementation and is removed.)
  return polyHash(thiz->count_, CharArrayAddressOfElementAt(thiz, 0));
}
const KChar* Kotlin_String_utf16pointer(KString message) {
......
/*
* Copyright 2010-2021 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license
* that can be found in the LICENSE file.
*/
#include "polyhash/PolyHash.h"
#include "polyhash/naive.h"
#include "polyhash/x86.h"
#include "polyhash/arm.h"
// Public entry point: route to the fastest implementation available for the
// compilation target. The per-architecture variants do their own runtime
// CPU-feature detection and fall back to the naive version when needed.
int polyHash(int length, uint16_t const* str) {
#if defined(__i386__) || defined(__x86_64__)
    return polyHash_x86(length, str);
#elif defined(__aarch64__) || defined(__arm__)
    return polyHash_arm(length, str);
#else
    // No vectorized implementation for this architecture.
    return polyHash_naive(length, str);
#endif
}
\ No newline at end of file
/*
* Copyright 2010-2021 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license
* that can be found in the LICENSE file.
*/
#ifndef RUNTIME_POLYHASH_H
#define RUNTIME_POLYHASH_H

#include <stdint.h>

// Computes polynomial hash with base = 31 over `length` UTF-16 code units,
// i.e. hash = str[0]*31^(length-1) + ... + str[length-1], in wrapping 32-bit
// arithmetic (same scheme as java.lang.String::hashCode).
// Internally dispatches to a SIMD implementation (SSE/AVX2/NEON) when the
// target architecture and CPU support one.
int polyHash(int length, uint16_t const* str);

#endif // RUNTIME_POLYHASH_H
/*
* Copyright 2010-2021 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license
* that can be found in the LICENSE file.
*/
#include "polyhash/PolyHash.h"
#include "polyhash/naive.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
namespace {

// Cross-checks the dispatching polyHash against the scalar reference
// implementation on strings of every length up to maxLength, shifting the
// start pointer to exercise unaligned vector loads.
TEST(PolyHashTest, Correctness) {
    const int maxLength = 10000;
    uint16_t buffer[maxLength + 100];
    for (int len = 1; len <= maxLength; ++len) {
        // Fill with a length-dependent pattern (values truncate to 16 bits).
        for (int pos = 0; pos < len; ++pos)
            buffer[pos] = len * maxLength + pos;
        buffer[len] = 0;
        for (int shift = 0; shift < 8 && len - shift > 0; ++shift)
            EXPECT_EQ(polyHash_naive(len - shift, buffer + shift), polyHash(len - shift, buffer + shift));
    }
}

}
\ No newline at end of file
/*
* Copyright 2010-2021 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license
* that can be found in the LICENSE file.
*/
#include "polyhash/common.h"
#include "polyhash/arm.h"
#if defined(__arm__) or defined(__aarch64__)

#ifndef __ARM_NEON

// Compiled without NEON support: always use the scalar reference version.
int polyHash_arm(int length, uint16_t const* str) {
    return polyHash_naive(length, str);
}

#else

#include <arm_neon.h>

namespace {

// Precomputed powers of the base (31), 32-byte aligned so the generic loops
// in polyhash/common.h can load them straight into vector registers.
alignas(32) constexpr auto p32 = DecreasingPowers<32>(31);   // [base^31, base^30, .., base^2, base, 1]
alignas(32) constexpr auto b32 = RepeatingPowers<8>(31, 32); // [base^32, base^32, .., base^32] (8)
alignas(32) constexpr auto b16 = RepeatingPowers<8>(31, 16); // [base^16, base^16, .., base^16] (8)
alignas(32) constexpr auto b8 = RepeatingPowers<8>(31, 8);   // [base^8, base^8, .., base^8 ] (8)
alignas(32) constexpr auto b4 = RepeatingPowers<8>(31, 4);   // [base^4, base^4, .., base^4 ] (8)

// Adapter mapping the generic trait interface of polyHashUnroll*/polyHashTail
// (polyhash/common.h) onto 128-bit NEON intrinsics.
struct NeonTraits {
    using VecType = uint32x4_t;    // working vector: 4 x u32 lanes
    using Vec128Type = uint32x4_t; // 128-bit hash accumulator
    using U16VecType = uint16x4_t; // raw load unit: 4 x u16 characters

    ALWAYS_INLINE static VecType initVec() { return vdupq_n_u32(0); }
    ALWAYS_INLINE static Vec128Type initVec128() { return vdupq_n_u32(0); }
    ALWAYS_INLINE static int vec128toInt(Vec128Type x) { return vgetq_lane_u32(x, 0); }
    // Widen 4 u16 characters to 4 u32 lanes.
    ALWAYS_INLINE static VecType u16Load(U16VecType x) { return vmovl_u16(x); }
    ALWAYS_INLINE static Vec128Type vec128Mul(Vec128Type x, Vec128Type y) { return vmulq_u32(x, y); }
    ALWAYS_INLINE static Vec128Type vec128Add(Vec128Type x, Vec128Type y) { return vaddq_u32(x, y); }
    ALWAYS_INLINE static VecType vecMul(VecType x, VecType y) { return vmulq_u32(x, y); }
    ALWAYS_INLINE static VecType vecAdd(VecType x, VecType y) { return vaddq_u32(x, y); }

    // Horizontal sum of two vectors, broadcast to all lanes of the result.
    ALWAYS_INLINE static Vec128Type squash2(VecType x, VecType y) {
        return squash1(vaddq_u32(x, y)); // [x0 + y0, x1 + y1, x2 + y2, x3 + y3]
    }

    // Horizontal sum of one vector, broadcast to all lanes of the result.
    ALWAYS_INLINE static uint32x4_t squash1(uint32x4_t z) {
#ifdef __aarch64__
        return vdupq_n_u32(vaddvq_u32(z)); // [z0..3, same, same, same]
#else
        // 32-bit ARM has no vaddvq: fold halves pairwise instead.
        uint32x2_t lo = vget_low_u32(z);   // [z0, z1]
        uint32x2_t hi = vget_high_u32(z);  // [z2, z3]
        uint32x2_t sum = vadd_u32(lo, hi); // [z0 + z2, z1 + z3]
        sum = vpadd_u32(sum, sum);         // [z0..3, same]
        return vcombine_u32(sum, sum);     // [z0..3, same, same, same]
#endif
    };

    // n counts remaining 4-character groups (callers pass length / 4).
    static int polyHashUnalignedUnrollUpTo16(int n, uint16_t const* str) {
        Vec128Type res = initVec128();
        polyHashUnroll4<NeonTraits>(n, str, res, &b16[0], &p32[16]);
        polyHashUnroll2<NeonTraits>(n, str, res, &b8[0], &p32[24]);
        polyHashTail<NeonTraits>(n, str, res, &b4[0], &p32[28]);
        return vec128toInt(res);
    }

    static int polyHashUnalignedUnrollUpTo32(int n, uint16_t const* str) {
        Vec128Type res = initVec128();
        polyHashUnroll8<NeonTraits>(n, str, res, &b32[0], &p32[0]);
        polyHashUnroll4<NeonTraits>(n, str, res, &b16[0], &p32[16]);
        polyHashUnroll2<NeonTraits>(n, str, res, &b8[0], &p32[24]);
        polyHashTail<NeonTraits>(n, str, res, &b4[0], &p32[28]);
        return vec128toInt(res);
    }
};

// Compile-time or runtime detection of NEON availability.
// NOTE(review): the #include directives below sit inside an anonymous
// namespace — fragile if those headers declare anything; consider hoisting
// them to the top of the file.
#if defined(__aarch64__)
const bool neonSupported = true; // AArch64 always supports Neon.
#elif defined(__ANDROID__)
#include <cpu-features.h>
const bool neonSupported = android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON;
#elif defined(__APPLE__)
const bool neonSupported = true; // It is supported starting from iPhone 3GS.
#elif defined(__linux__) or defined(__unix__)
#include <sys/auxv.h>
#include <asm/hwcap.h>
const bool neonSupported = getauxval(AT_HWCAP) & HWCAP_NEON;
#else
#error "Not supported"
#endif

}

// Polynomial hash (base 31) over `length` UTF-16 code units; same result as
// polyHash_naive for every input.
int polyHash_arm(int length, uint16_t const* str) {
    if (!neonSupported) {
        // Vectorization is not supported.
        return polyHash_naive(length, str);
    }
    int res;
    // 488 is the cutover between unroll variants — presumably chosen by
    // benchmarking; TODO confirm.
    if (length < 488)
        res = NeonTraits::polyHashUnalignedUnrollUpTo16(length / 4, str);
    else
        res = NeonTraits::polyHashUnalignedUnrollUpTo32(length / 4, str);
    // Handle the tail (the length % 4 trailing characters) naively.
    for (int i = length & 0xFFFFFFFC; i < length; ++i)
        res = res * 31 + str[i];
    return res;
}

#endif // __ARM_NEON

#endif // defined(__arm__) or defined(__aarch64__)
\ No newline at end of file
/*
* Copyright 2010-2021 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license
* that can be found in the LICENSE file.
*/
#ifndef RUNTIME_POLYHASH_ARM_H
#define RUNTIME_POLYHASH_ARM_H

// Needed for uint16_t below; previously this header did not compile
// standalone because it relied on the includer providing <cstdint>.
#include <cstdint>

// Polynomial string hash (base 31) over UTF-16 code units, NEON-accelerated
// when the CPU supports it; falls back to the naive implementation otherwise.
int polyHash_arm(int length, uint16_t const* str);

#endif // RUNTIME_POLYHASH_ARM_H
/*
* Copyright 2010-2021 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license
* that can be found in the LICENSE file.
*/
#ifndef RUNTIME_POLYHASH_COMMON_H
#define RUNTIME_POLYHASH_COMMON_H
#include <array>
#include <cstdint>
#include "polyhash/naive.h"
#include "../Common.h"
// Returns base^exponent in wrapping 32-bit arithmetic.
constexpr uint32_t Power(uint32_t base, uint8_t exponent) {
    uint32_t product = 1;
    for (uint8_t step = exponent; step > 0; --step)
        product *= base;
    return product;
}
// Builds [base^(Exponent-1), base^(Exponent-2), ..., base, 1]:
// highest power first, 1 last.
template <uint8_t Exponent>
constexpr std::array<uint32_t, Exponent> DecreasingPowers(uint32_t base) {
    std::array<uint32_t, Exponent> powers = {};
    uint32_t acc = 1;
    // Fill from the back so each slot holds one more factor of base.
    for (int idx = Exponent - 1; idx >= 0; --idx) {
        powers[idx] = acc;
        acc *= base;
    }
    return powers;
}
template <size_t Count>
constexpr std::array<uint32_t, Count> RepeatingPowers(uint32_t base, uint8_t exponent) {
std::array<uint32_t, Count> result = {};
uint32_t value = Power(base, exponent);
for (auto& element : result)
element = value;
return result;
}
#if defined(__x86_64__) or defined(__i386__)
// The templates below may be instantiated with AVX2 trait types on x86; the
// runtime dispatcher guarantees they only execute on capable CPUs.
#pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
#endif

// Consumes one final vector's worth of characters, if enough remain.
//   n   - remaining input in 4-character groups (updated in place)
//   str - current read position (updated in place)
//   p   - tail of the decreasing-powers table matching one vector
//   b   - per-lane constant base^(vecLength): rescales res for the characters
//         consumed here. Runs at most once (no loop).
// Effect: res = res * base^vecLength + sum(str[i] * p[i]).
template<typename Traits>
ALWAYS_INLINE void polyHashTail(int& n, uint16_t const*& str, typename Traits::Vec128Type& res, uint32_t const* b, uint32_t const* p) {
    using VecType = typename Traits::VecType;
    using Vec128Type = typename Traits::Vec128Type;
    using U16VecType = typename Traits::U16VecType;
    const int vecLength = sizeof(VecType) / 4; // u32 lanes per working vector

    if (n < vecLength / 4) return; // fewer characters left than one vector

    // NOTE(review): unaligned dereference via reinterpret_cast — relies on
    // the target tolerating unaligned vector loads ("Unaligned" callers).
    VecType x = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str));
    res = Traits::vec128Mul(res, *reinterpret_cast<Vec128Type const*>(b));
    VecType z = Traits::vecMul(x, *reinterpret_cast<VecType const*>(p));
    res = Traits::vec128Add(res, Traits::squash1(z));
    str += vecLength;
    n -= vecLength / 4;
}
// 2-way unrolled vector loop: consumes 2 vectors (2 * vecLength characters)
// per iteration using two independent accumulators so the multiply/add
// dependency chains can overlap. Runs while at least vecLength / 2
// four-character groups remain; folds the partial sums into res on exit.
// NOTE: res is rescaled by b only ONCE, on entry. This is correct because
// either res == 0 here (this is the first stage that runs), or the caller's
// length-based dispatch guarantees at most one loop iteration.
template<typename Traits>
ALWAYS_INLINE void polyHashUnroll2(int& n, uint16_t const*& str, typename Traits::Vec128Type& res, uint32_t const* b, uint32_t const* p) {
    using VecType = typename Traits::VecType;
    using Vec128Type = typename Traits::Vec128Type;
    using U16VecType = typename Traits::U16VecType;
    const int vecLength = sizeof(VecType) / 4; // u32 lanes per working vector

    if (n < vecLength / 2) return; // too little input for this unrolling

    res = Traits::vec128Mul(res, *reinterpret_cast<Vec128Type const*>(b));
    VecType res0 = Traits::initVec();
    VecType res1 = Traits::initVec();
    do {
        // Load and widen 2 vectors of characters.
        VecType x0 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str));
        VecType x1 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str + vecLength));
        // Scale previous partial sums by base^(2 * vecLength).
        res0 = Traits::vecMul(res0, *reinterpret_cast<VecType const*>(b));
        res1 = Traits::vecMul(res1, *reinterpret_cast<VecType const*>(b));
        // Weight the new characters by their decreasing powers of the base.
        VecType z0 = Traits::vecMul(x0, *reinterpret_cast<VecType const*>(p));
        VecType z1 = Traits::vecMul(x1, *reinterpret_cast<VecType const*>(p + vecLength));
        res0 = Traits::vecAdd(res0, z0);
        res1 = Traits::vecAdd(res1, z1);
        str += vecLength * 2;
        n -= vecLength / 2;
    } while (n >= vecLength / 2);
    res = Traits::vec128Add(res, Traits::squash2(res0, res1));
}
// 4-way unrolled vector loop: consumes 4 vectors (4 * vecLength characters)
// per iteration with four independent accumulators. Runs while at least
// vecLength four-character groups remain; folds the partials into res.
// NOTE: res is rescaled by b only ONCE, on entry — correct because either
// res == 0 here (first active stage) or the caller's dispatch guarantees at
// most one loop iteration (it runs after polyHashUnroll8 leaves
// n < 2 * vecLength).
template<typename Traits>
ALWAYS_INLINE void polyHashUnroll4(int& n, uint16_t const*& str, typename Traits::Vec128Type& res, uint32_t const* b, uint32_t const* p) {
    using VecType = typename Traits::VecType;
    using Vec128Type = typename Traits::Vec128Type;
    using U16VecType = typename Traits::U16VecType;
    const int vecLength = sizeof(VecType) / 4; // u32 lanes per working vector

    if (n < vecLength) return; // too little input for this unrolling

    res = Traits::vec128Mul(res, *reinterpret_cast<Vec128Type const*>(b));
    VecType res0 = Traits::initVec();
    VecType res1 = Traits::initVec();
    VecType res2 = Traits::initVec();
    VecType res3 = Traits::initVec();
    do {
        // Load and widen 4 vectors of characters.
        VecType x0 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str));
        VecType x1 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str + vecLength));
        VecType x2 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str + vecLength * 2));
        VecType x3 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str + vecLength * 3));
        // Scale previous partial sums by base^(4 * vecLength).
        res0 = Traits::vecMul(res0, *reinterpret_cast<VecType const*>(b));
        res1 = Traits::vecMul(res1, *reinterpret_cast<VecType const*>(b));
        res2 = Traits::vecMul(res2, *reinterpret_cast<VecType const*>(b));
        res3 = Traits::vecMul(res3, *reinterpret_cast<VecType const*>(b));
        // Weight the new characters by their decreasing powers of the base.
        VecType z0 = Traits::vecMul(x0, *reinterpret_cast<VecType const*>(p));
        VecType z1 = Traits::vecMul(x1, *reinterpret_cast<VecType const*>(p + vecLength));
        VecType z2 = Traits::vecMul(x2, *reinterpret_cast<VecType const*>(p + vecLength * 2));
        VecType z3 = Traits::vecMul(x3, *reinterpret_cast<VecType const*>(p + vecLength * 3));
        res0 = Traits::vecAdd(res0, z0);
        res1 = Traits::vecAdd(res1, z1);
        res2 = Traits::vecAdd(res2, z2);
        res3 = Traits::vecAdd(res3, z3);
        str += vecLength * 4;
        n -= vecLength;
    } while (n >= vecLength);
    res = Traits::vec128Add(res, Traits::vec128Add(Traits::squash2(res0, res1), Traits::squash2(res2, res3)));
}
// 8-way unrolled vector loop: consumes 8 vectors (8 * vecLength characters)
// per iteration with eight independent accumulators. Only ever used as the
// FIRST stage of a pipeline — note that, unlike the other loops, it never
// rescales res on entry, so res must be zero when it is called. The callers
// gate it so large unrolling only runs where enough vector registers exist
// (see the 64-bit-mode comment in polyHash_x86).
template<typename Traits>
ALWAYS_INLINE void polyHashUnroll8(int& n, uint16_t const*& str, typename Traits::Vec128Type& res, uint32_t const* b, uint32_t const* p) {
    using VecType = typename Traits::VecType;
    using Vec128Type = typename Traits::Vec128Type;
    using U16VecType = typename Traits::U16VecType;
    const int vecLength = sizeof(VecType) / 4; // u32 lanes per working vector

    if (n < vecLength * 2) return; // too little input for this unrolling

    VecType res0 = Traits::initVec();
    VecType res1 = Traits::initVec();
    VecType res2 = Traits::initVec();
    VecType res3 = Traits::initVec();
    VecType res4 = Traits::initVec();
    VecType res5 = Traits::initVec();
    VecType res6 = Traits::initVec();
    VecType res7 = Traits::initVec();
    do {
        // Load and widen 8 vectors of characters.
        VecType x0 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str));
        VecType x1 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str + vecLength));
        VecType x2 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str + vecLength * 2));
        VecType x3 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str + vecLength * 3));
        VecType x4 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str + vecLength * 4));
        VecType x5 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str + vecLength * 5));
        VecType x6 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str + vecLength * 6));
        VecType x7 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str + vecLength * 7));
        // Scale previous partial sums by base^(8 * vecLength).
        res0 = Traits::vecMul(res0, *reinterpret_cast<VecType const*>(b));
        res1 = Traits::vecMul(res1, *reinterpret_cast<VecType const*>(b));
        res2 = Traits::vecMul(res2, *reinterpret_cast<VecType const*>(b));
        res3 = Traits::vecMul(res3, *reinterpret_cast<VecType const*>(b));
        res4 = Traits::vecMul(res4, *reinterpret_cast<VecType const*>(b));
        res5 = Traits::vecMul(res5, *reinterpret_cast<VecType const*>(b));
        res6 = Traits::vecMul(res6, *reinterpret_cast<VecType const*>(b));
        res7 = Traits::vecMul(res7, *reinterpret_cast<VecType const*>(b));
        // Weight the new characters by their decreasing powers of the base.
        VecType z0 = Traits::vecMul(x0, *reinterpret_cast<VecType const*>(p));
        VecType z1 = Traits::vecMul(x1, *reinterpret_cast<VecType const*>(p + vecLength));
        VecType z2 = Traits::vecMul(x2, *reinterpret_cast<VecType const*>(p + vecLength * 2));
        VecType z3 = Traits::vecMul(x3, *reinterpret_cast<VecType const*>(p + vecLength * 3));
        VecType z4 = Traits::vecMul(x4, *reinterpret_cast<VecType const*>(p + vecLength * 4));
        VecType z5 = Traits::vecMul(x5, *reinterpret_cast<VecType const*>(p + vecLength * 5));
        VecType z6 = Traits::vecMul(x6, *reinterpret_cast<VecType const*>(p + vecLength * 6));
        VecType z7 = Traits::vecMul(x7, *reinterpret_cast<VecType const*>(p + vecLength * 7));
        res0 = Traits::vecAdd(res0, z0);
        res1 = Traits::vecAdd(res1, z1);
        res2 = Traits::vecAdd(res2, z2);
        res3 = Traits::vecAdd(res3, z3);
        res4 = Traits::vecAdd(res4, z4);
        res5 = Traits::vecAdd(res5, z5);
        res6 = Traits::vecAdd(res6, z6);
        res7 = Traits::vecAdd(res7, z7);
        str += vecLength * 8;
        n -= vecLength * 2;
    } while (n >= vecLength * 2);
    // Pairwise fold the eight accumulators into res.
    Vec128Type sum1 = Traits::vec128Add(Traits::squash2(res0, res1), Traits::squash2(res2, res3));
    Vec128Type sum2 = Traits::vec128Add(Traits::squash2(res4, res5), Traits::squash2(res6, res7));
    res = Traits::vec128Add(res, Traits::vec128Add(sum1, sum2));
}
#if defined(__x86_64__) or defined(__i386__)
#pragma clang attribute pop
#endif
#endif // RUNTIME_POLYHASH_COMMON_H
/*
* Copyright 2010-2021 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license
* that can be found in the LICENSE file.
*/
#ifndef RUNTIME_POLYHASH_NAIVE_H
#define RUNTIME_POLYHASH_NAIVE_H
#include <cstdint>
// Scalar reference implementation of the base-31 polynomial hash over UTF-16
// code units (the value java.lang.String::hashCode produces). Used directly
// on platforms without SIMD and as the correctness oracle for the vectorized
// versions.
inline int polyHash_naive(int length, uint16_t const* str) {
    int hash = 0;
    for (int idx = 0; idx < length; ++idx) {
        hash = 31 * hash + str[idx];
    }
    return hash;
}
#endif // RUNTIME_POLYHASH_NAIVE_H
/*
* Copyright 2010-2021 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license
* that can be found in the LICENSE file.
*/
#include "polyhash/common.h"
#include "polyhash/x86.h"
#if defined(__x86_64__) or defined(__i386__)

#include <immintrin.h>

// Everything below may emit AVX2 instructions; polyHash_x86 dispatches at
// runtime so these paths are only reached on CPUs that support them.
#pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)

namespace {

// Precomputed powers of the base (31), 32-byte aligned for vector loads.
alignas(32) constexpr auto p64 = DecreasingPowers<64>(31);   // [base^63, base^62, .., base^2, base, 1]
alignas(32) constexpr auto b64 = RepeatingPowers<8>(31, 64); // [base^64, base^64, .., base^64] (8)
alignas(32) constexpr auto b32 = RepeatingPowers<8>(31, 32); // [base^32, base^32, .., base^32] (8)
alignas(32) constexpr auto b16 = RepeatingPowers<8>(31, 16); // [base^16, base^16, .., base^16] (8)
alignas(32) constexpr auto b8 = RepeatingPowers<8>(31, 8);   // [base^8, base^8, .., base^8 ] (8)
alignas(32) constexpr auto b4 = RepeatingPowers<8>(31, 4);   // [base^4, base^4, .., base^4 ] (8)

// 128-bit adapter for the generic loops in polyhash/common.h.
// Requires SSE4.1 (_mm_cvtepu16_epi32, _mm_mullo_epi32); the dispatcher
// checks for it before calling.
struct SSETraits {
    using VecType = __m128i;    // working vector: 4 x u32 lanes
    using Vec128Type = __m128i; // 128-bit hash accumulator
    using U16VecType = __m128i; // raw character load unit

    ALWAYS_INLINE static VecType initVec() { return _mm_setzero_si128(); }
    ALWAYS_INLINE static Vec128Type initVec128() { return _mm_setzero_si128(); }
    ALWAYS_INLINE static int vec128toInt(Vec128Type x) { return _mm_cvtsi128_si32(x); }
    // Widens the low 4 u16 characters to 4 u32 lanes.
    // NOTE(review): the 128-bit source load reads 8 characters though only 4
    // are consumed — presumably relies on readable slack past the string
    // (the unit test allocates +100 extra elements); confirm allocator
    // guarantees.
    ALWAYS_INLINE static VecType u16Load(U16VecType x) { return _mm_cvtepu16_epi32(x); }
    ALWAYS_INLINE static Vec128Type vec128Mul(Vec128Type x, Vec128Type y) { return _mm_mullo_epi32(x, y); }
    ALWAYS_INLINE static Vec128Type vec128Add(Vec128Type x, Vec128Type y) { return _mm_add_epi32(x, y); }
    ALWAYS_INLINE static VecType vecMul(VecType x, VecType y) { return _mm_mullo_epi32(x, y); }
    ALWAYS_INLINE static VecType vecAdd(VecType x, VecType y) { return _mm_add_epi32(x, y); }

    // Horizontal sum of two vectors; total ends up in lane 0 (all lanes).
    ALWAYS_INLINE static Vec128Type squash2(VecType x, VecType y) {
        return squash1(_mm_hadd_epi32(x, y)); // [x0 + x1, x2 + x3, y0 + y1, y2 + y3]
    }

    // Horizontal sum of one vector.
    ALWAYS_INLINE static Vec128Type squash1(VecType z) {
        VecType sum = _mm_hadd_epi32(z, z); // [z0 + z1, z2 + z3, z0 + z1, z2 + z3]
        return _mm_hadd_epi32(sum, sum);    // [z0..3, same, same, same]
    }

    // n counts remaining 4-character groups (callers pass length / 4).
    static int polyHashUnalignedUnrollUpTo8(int n, uint16_t const* str) {
        Vec128Type res = initVec128();
        polyHashUnroll2<SSETraits>(n, str, res, &b8[0], &p64[56]);
        polyHashTail<SSETraits>(n, str, res, &b4[0], &p64[60]);
        return vec128toInt(res);
    }

    static int polyHashUnalignedUnrollUpTo16(int n, uint16_t const* str) {
        Vec128Type res = initVec128();
        polyHashUnroll4<SSETraits>(n, str, res, &b16[0], &p64[48]);
        polyHashUnroll2<SSETraits>(n, str, res, &b8[0], &p64[56]);
        polyHashTail<SSETraits>(n, str, res, &b4[0], &p64[60]);
        return vec128toInt(res);
    }
};

// 256-bit AVX2 adapter: 8 u32 lanes per working vector, 128-bit accumulator.
struct AVX2Traits {
    using VecType = __m256i;    // working vector: 8 x u32 lanes
    using Vec128Type = __m128i; // 128-bit hash accumulator
    using U16VecType = __m128i; // raw character load unit (8 x u16)

    ALWAYS_INLINE static VecType initVec() { return _mm256_setzero_si256(); }
    ALWAYS_INLINE static Vec128Type initVec128() { return _mm_setzero_si128(); }
    ALWAYS_INLINE static int vec128toInt(Vec128Type x) { return _mm_cvtsi128_si32(x); }
    // Widens 8 u16 characters to 8 u32 lanes.
    ALWAYS_INLINE static VecType u16Load(U16VecType x) { return _mm256_cvtepu16_epi32(x); }
    ALWAYS_INLINE static Vec128Type vec128Mul(Vec128Type x, Vec128Type y) { return _mm_mullo_epi32(x, y); }
    ALWAYS_INLINE static Vec128Type vec128Add(Vec128Type x, Vec128Type y) { return _mm_add_epi32(x, y); }
    ALWAYS_INLINE static VecType vecMul(VecType x, VecType y) { return _mm256_mullo_epi32(x, y); }
    ALWAYS_INLINE static VecType vecAdd(VecType x, VecType y) { return _mm256_add_epi32(x, y); }

    // Horizontal sum of two 256-bit vectors. _mm256_hadd_epi32 adds within
    // 128-bit halves, hence the cross-lane fixup in squash1.
    ALWAYS_INLINE static Vec128Type squash2(VecType x, VecType y) {
        return squash1(_mm256_hadd_epi32(x, y)); // [x0 + x1, x2 + x3, y0 + y1, y2 + y3, x4 + x5, x6 + x7, y4 + y5, y6 + y7]
    }

    // Horizontal sum of one 256-bit vector into a 128-bit result.
    ALWAYS_INLINE static Vec128Type squash1(VecType z) {
        VecType sum = _mm256_hadd_epi32(z, z);            // [z0 + z1, z2 + z3, z0 + z1, z2 + z3, z4 + z5, z6 + z7, z4 + z5, z6 + z7]
        sum = _mm256_hadd_epi32(sum, sum);                // [z0..3, z0..3, z0..3, z0..3, z4..7, z4..7, z4..7, z4..7]
        Vec128Type lo = _mm256_extracti128_si256(sum, 0); // [z0..3, same, same, same]
        Vec128Type hi = _mm256_extracti128_si256(sum, 1); // [z4..7, same, same, same]
        return _mm_add_epi32(lo, hi);                     // [z0..7, same, same, same]
    }

    // n counts remaining 4-character groups (callers pass length / 4).
    static int polyHashUnalignedUnrollUpTo16(int n, uint16_t const* str) {
        Vec128Type res = initVec128();
        polyHashUnroll2<AVX2Traits>(n, str, res, &b16[0], &p64[48]);
        polyHashTail<AVX2Traits>(n, str, res, &b8[0], &p64[56]);
        polyHashTail<SSETraits>(n, str, res, &b4[0], &p64[60]);
        return vec128toInt(res);
    }

    static int polyHashUnalignedUnrollUpTo32(int n, uint16_t const* str) {
        Vec128Type res = initVec128();
        polyHashUnroll4<AVX2Traits>(n, str, res, &b32[0], &p64[32]);
        polyHashUnroll2<AVX2Traits>(n, str, res, &b16[0], &p64[48]);
        polyHashTail<AVX2Traits>(n, str, res, &b8[0], &p64[56]);
        polyHashTail<SSETraits>(n, str, res, &b4[0], &p64[60]);
        return vec128toInt(res);
    }

    static int polyHashUnalignedUnrollUpTo64(int n, uint16_t const* str) {
        Vec128Type res = initVec128();
        polyHashUnroll8<AVX2Traits>(n, str, res, &b64[0], &p64[0]);
        polyHashUnroll4<AVX2Traits>(n, str, res, &b32[0], &p64[32]);
        polyHashUnroll2<AVX2Traits>(n, str, res, &b16[0], &p64[48]);
        polyHashTail<AVX2Traits>(n, str, res, &b8[0], &p64[56]);
        polyHashTail<SSETraits>(n, str, res, &b4[0], &p64[60]);
        return vec128toInt(res);
    }
};

// True when compiled for 64-bit x86: twice as many vector registers, which
// the largest unrolling needs.
#if defined(__x86_64__)
const bool x64 = true;
#else
const bool x64 = false;
#endif
bool initialized = false;
bool sseSupported;
bool avx2Supported;
}
// Polynomial hash (base 31) over `length` UTF-16 code units, dispatched at
// runtime to the widest vector ISA the CPU supports. Produces exactly the
// same value as polyHash_naive for every input.
int polyHash_x86(int length, uint16_t const* str) {
    // Magic statics make the one-time CPU-feature probe thread-safe. The
    // previous lazily-assigned namespace globals (guarded by a plain bool)
    // were a data race when the first calls came from several threads.
    static const bool sse41Supported = __builtin_cpu_supports("sse4.1");
    static const bool avx2Supported = __builtin_cpu_supports("avx2");

    if (length < 16 || (!sse41Supported && !avx2Supported)) {
        // Either vectorization is not supported or the string is too short to gain from it.
        return polyHash_naive(length, str);
    }
    // The vector kernels consume length / 4 four-character groups; thresholds
    // pick the unrolling depth that pays off for the given length.
    int res;
    if (length < 32)
        res = SSETraits::polyHashUnalignedUnrollUpTo8(length / 4, str);
    else if (!avx2Supported)
        res = SSETraits::polyHashUnalignedUnrollUpTo16(length / 4, str);
    else if (length < 128)
        res = AVX2Traits::polyHashUnalignedUnrollUpTo16(length / 4, str);
    else if (!x64 || length < 576)
        res = AVX2Traits::polyHashUnalignedUnrollUpTo32(length / 4, str);
    else // Such big unrolling requires 64-bit mode (in 32-bit mode there are only 8 vector registers)
        res = AVX2Traits::polyHashUnalignedUnrollUpTo64(length / 4, str);
    // Handle the tail (the length % 4 trailing characters) naively.
    for (int i = length & 0xFFFFFFFC; i < length; ++i)
        res = res * 31 + str[i];
    return res;
}
#pragma clang attribute pop
#endif
/*
* Copyright 2010-2021 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license
* that can be found in the LICENSE file.
*/
#ifndef RUNTIME_POLYHASH_X86_H
#define RUNTIME_POLYHASH_X86_H

// Needed for uint16_t below; previously this header did not compile
// standalone because it relied on the includer providing <cstdint>.
#include <cstdint>

// Polynomial string hash (base 31) over UTF-16 code units, accelerated with
// SSE4.1/AVX2 when the CPU supports them; falls back to the naive
// implementation otherwise.
int polyHash_x86(int length, uint16_t const* str);

#endif // RUNTIME_POLYHASH_X86_H
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register to comment