diff --git a/.circleci/config.yml b/.circleci/config.yml
index da4bb554126c44503dcd1c368ad1e34f2ee65651..412f6ef708d4b4ed7eae39fc4756f0e3e6f0e9ae 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -31,7 +31,7 @@ jobs:
       - checkout # check out the code in the project directory
       - run: pyenv global 3.5.2
       - run: sudo apt-get update -y && sudo apt-get install -y libgflags-dev
-      - run: SKIP_FORMAT_BUCK_CHECKS=1 PRINT_PARALLEL_OUTPUTS=1 ASSERT_STATUS_CHECKED=1 LIB_MODE=shared OPT="-DROCKSDB_NAMESPACE=alternative_rocksdb_ns" GTEST_THROW_ON_FAILURE=0 GTEST_OUTPUT="xml:/tmp/test-results/" make V=1 -j32 all check_some | .circleci/cat_ignore_eagain
+      - run: SKIP_FORMAT_BUCK_CHECKS=1 PRINT_PARALLEL_OUTPUTS=1 ASSERT_STATUS_CHECKED=1 TEST_UINT128_COMPAT=1 LIB_MODE=shared OPT="-DROCKSDB_NAMESPACE=alternative_rocksdb_ns" GTEST_THROW_ON_FAILURE=0 GTEST_OUTPUT="xml:/tmp/test-results/" make V=1 -j32 all check_some | .circleci/cat_ignore_eagain
       - store_test_results:
           path: /tmp/test-results
diff --git a/Makefile b/Makefile
index 7c1cc8f0561c2848ce52e090e331de7f12025175..a2b1c3f1b2c2358e58641ca5dce3cbd7a8681141 100644
--- a/Makefile
+++ b/Makefile
@@ -410,6 +410,10 @@ ifdef TEST_CACHE_LINE_SIZE
   PLATFORM_CCFLAGS += -DTEST_CACHE_LINE_SIZE=$(TEST_CACHE_LINE_SIZE)
   PLATFORM_CXXFLAGS += -DTEST_CACHE_LINE_SIZE=$(TEST_CACHE_LINE_SIZE)
 endif
+ifdef TEST_UINT128_COMPAT
+  PLATFORM_CCFLAGS += -DTEST_UINT128_COMPAT=1
+  PLATFORM_CXXFLAGS += -DTEST_UINT128_COMPAT=1
+endif
 
 # This (the first rule) must depend on "all".
 default: all
diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform
index 094d35e6b2d59cd0cd181f78937acfa74043836c..805b7eab503438f451b45d551b917121ffd84e30 100755
--- a/build_tools/build_detect_platform
+++ b/build_tools/build_detect_platform
@@ -665,11 +665,18 @@ if test "$TRY_SSE_ETC"; then
   # It doesn't even really check that your current CPU is compatible.
   #
   # SSE4.2 available since nehalem, ca. 2008-2010
+  # Includes POPCNT for BitsSetToOne, BitParity
   TRY_SSE42="-msse4.2"
   # PCLMUL available since westmere, ca. 2010-2011
   TRY_PCLMUL="-mpclmul"
   # AVX2 available since haswell, ca. 2013-2015
   TRY_AVX2="-mavx2"
+  # BMI available since haswell, ca. 2013-2015
+  # Primarily for TZCNT for CountTrailingZeroBits
+  TRY_BMI="-mbmi"
+  # LZCNT available since haswell, ca. 2013-2015
+  # For FloorLog2
+  TRY_LZCNT="-mlzcnt"
 fi
 
 $CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_SSE42 -x c++ - -o /dev/null 2>/dev/null <<EOF
@@ … @@
   echo "warning: USE_SSE specified but compiler could not use AVX2 intrinsics, disabling" >&2
 fi
 
+$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_BMI -x c++ - -o /dev/null 2>/dev/null <<EOF
+  #include <cstdint>
+  #include <immintrin.h>
+  int main(int argc, char *argv[]) {
+    (void)argv;
+    return (int)_tzcnt_u64((uint64_t)argc);
+  }
+EOF
+if [ "$?" = 0 ]; then
+  COMMON_FLAGS="$COMMON_FLAGS $TRY_BMI -DHAVE_BMI"
+elif test "$USE_SSE"; then
+  echo "warning: USE_SSE specified but compiler could not use BMI intrinsics, disabling" >&2
+fi
+
+$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_LZCNT -x c++ - -o /dev/null 2>/dev/null <<EOF
+  #include <cstdint>
+  #include <immintrin.h>
+  int main(int argc, char *argv[]) {
+    (void)argv;
+    return (int)_lzcnt_u64((uint64_t)argc);
+  }
+EOF
+if [ "$?" = 0 ]; then
+  COMMON_FLAGS="$COMMON_FLAGS $TRY_LZCNT -DHAVE_LZCNT"
+elif test "$USE_SSE"; then
+  echo "warning: USE_SSE specified but compiler could not use LZCNT intrinsics, disabling" >&2
+fi
+
 $CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
   #include <cstdint>
   int main() {
diff --git a/util/hash.h b/util/hash.h
index 17490a366c80112ba0d8bb972b4cc3e8db1576a3..5bf5ce85e84edc4cd786fd12c965effcd379c510 100644
--- a/util/hash.h
+++ b/util/hash.h
@@ -88,12 +88,16 @@ inline uint32_t fastrange32(uint32_t hash, uint32_t range) {
   return static_cast<uint32_t>(product >> 32);
 }
 
+#ifdef TEST_UINT128_COMPAT
+#undef HAVE_UINT128_EXTENSION
+#endif
+
 // An alternative to % for mapping a 64-bit hash value to an arbitrary range
 // that fits in size_t. See https://github.com/lemire/fastrange
 // We find size_t more convenient than uint64_t for the range, with side
 // benefit of better optimization on 32-bit platforms.
 inline size_t fastrange64(uint64_t hash, size_t range) {
-#if defined(HAVE_UINT128_EXTENSION)
+#ifdef HAVE_UINT128_EXTENSION
   // Can use compiler's 128-bit type. Trust it to do the right thing.
   __uint128_t wide = __uint128_t{range} * hash;
   return static_cast<size_t>(wide >> 64);
diff --git a/util/hash_test.cc b/util/hash_test.cc
index 9c3c6efe914964a19f31de00951e73bb491188b4..802fb27f3a81deba0bfa80cb43572480cf3b87c3 100644
--- a/util/hash_test.cc
+++ b/util/hash_test.cc
@@ -7,12 +7,14 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+#include "util/hash.h"
+
 #include <cstring>
 #include <vector>
 
 #include "test_util/testharness.h"
 #include "util/coding.h"
-#include "util/hash.h"
+#include "util/math128.h"
 
 using ROCKSDB_NAMESPACE::EncodeFixed32;
 using ROCKSDB_NAMESPACE::GetSliceHash64;
@@ -370,6 +372,159 @@ size_t fastrange64(uint64_t hash, size_t range) {
   return ROCKSDB_NAMESPACE::fastrange64(hash, range);
 }
 
+// Tests for math.h / math128.h (not worth a separate test binary)
+using ROCKSDB_NAMESPACE::BitParity;
+using ROCKSDB_NAMESPACE::BitsSetToOne;
+using ROCKSDB_NAMESPACE::CountTrailingZeroBits;
+using ROCKSDB_NAMESPACE::DecodeFixed128;
+using ROCKSDB_NAMESPACE::EncodeFixed128;
+using ROCKSDB_NAMESPACE::FloorLog2;
+using ROCKSDB_NAMESPACE::Lower64of128;
+using ROCKSDB_NAMESPACE::Multiply64to128;
+using ROCKSDB_NAMESPACE::Unsigned128;
+using ROCKSDB_NAMESPACE::Upper64of128;
+
+template <typename T>
+static void test_BitOps() {
+  // This complex code is to generalize to 128-bit values. Otherwise
+  // we could just use = static_cast<T>(0x5555555555555555ULL);
+  T everyOtherBit = 0;
+  for (unsigned i = 0; i < sizeof(T); ++i) {
+    everyOtherBit = (everyOtherBit << 8) | T{0x55};
+  }
+
+  // This one built using bit operations, as our 128-bit layer
+  // might not implement arithmetic such as subtraction.
+  T vm1 = 0;  // "v minus one"
+
+  for (int i = 0; i < int{8 * sizeof(T)}; ++i) {
+    T v = T{1} << i;
+    // If we could directly use arithmetic:
+    // T vm1 = static_cast<T>(v - 1);
+
+    // FloorLog2
+    if (v > 0) {
+      EXPECT_EQ(FloorLog2(v), i);
+    }
+    if (vm1 > 0) {
+      EXPECT_EQ(FloorLog2(vm1), i - 1);
+      EXPECT_EQ(FloorLog2(everyOtherBit & vm1), (i - 1) & ~1);
+    }
+
+    // CountTrailingZeroBits
+    if (v != 0) {
+      EXPECT_EQ(CountTrailingZeroBits(v), i);
+    }
+    if (vm1 != 0) {
+      EXPECT_EQ(CountTrailingZeroBits(vm1), 0);
+    }
+    if (i < int{8 * sizeof(T)} - 1) {
+      EXPECT_EQ(CountTrailingZeroBits(~vm1 & everyOtherBit), (i + 1) & ~1);
+    }
+
+    // BitsSetToOne
+    EXPECT_EQ(BitsSetToOne(v), 1);
+    EXPECT_EQ(BitsSetToOne(vm1), i);
+    EXPECT_EQ(BitsSetToOne(vm1 & everyOtherBit), (i + 1) / 2);
+
+    // BitParity
+    EXPECT_EQ(BitParity(v), 1);
+    EXPECT_EQ(BitParity(vm1), i & 1);
+    EXPECT_EQ(BitParity(vm1 & everyOtherBit), ((i + 1) / 2) & 1);
+
+    vm1 = (vm1 << 1) | 1;
+  }
+}
+
+TEST(MathTest, BitOps) {
+  test_BitOps<uint32_t>();
+  test_BitOps<uint64_t>();
+  test_BitOps<uint16_t>();
+  test_BitOps<uint8_t>();
+  test_BitOps<unsigned char>();
+  test_BitOps<unsigned short>();
+  test_BitOps<unsigned int>();
+  test_BitOps<unsigned long>();
+  test_BitOps<unsigned long long>();
+  test_BitOps<char>();
+  test_BitOps<size_t>();
+  test_BitOps<int32_t>();
+  test_BitOps<int64_t>();
+  test_BitOps<int16_t>();
+  test_BitOps<int8_t>();
+  test_BitOps<signed char>();
+  test_BitOps<short>();
+  test_BitOps<int>();
+  test_BitOps<long>();
+  test_BitOps<long long>();
+  test_BitOps<ptrdiff_t>();
+}
+
+TEST(MathTest, BitOps128) { test_BitOps<Unsigned128>(); }
+
+TEST(MathTest, Math128) {
+  const Unsigned128 sixteenHexOnes = 0x1111111111111111U;
+  const Unsigned128 thirtyHexOnes = (sixteenHexOnes << 56) | sixteenHexOnes;
+  const Unsigned128 sixteenHexTwos = 0x2222222222222222U;
+  const Unsigned128 thirtyHexTwos = (sixteenHexTwos << 56) | sixteenHexTwos;
+
+  // v will slide from all hex ones to all hex twos
+  Unsigned128 v = thirtyHexOnes;
+  for (int i = 0; i <= 30; ++i) {
+    // Test bitwise operations
+    EXPECT_EQ(BitsSetToOne(v), 30);
+    EXPECT_EQ(BitsSetToOne(~v), 128 - 30);
+    EXPECT_EQ(BitsSetToOne(v & thirtyHexOnes), 30 - i);
+    EXPECT_EQ(BitsSetToOne(v | thirtyHexOnes), 30 + i);
+    EXPECT_EQ(BitsSetToOne(v ^ thirtyHexOnes), 2 * i);
+    EXPECT_EQ(BitsSetToOne(v & thirtyHexTwos), i);
+    EXPECT_EQ(BitsSetToOne(v | thirtyHexTwos), 60 - i);
+    EXPECT_EQ(BitsSetToOne(v ^ thirtyHexTwos), 60 - 2 * i);
+
+    // Test comparisons
+    EXPECT_EQ(v == thirtyHexOnes, i == 0);
+    EXPECT_EQ(v == thirtyHexTwos, i == 30);
+    EXPECT_EQ(v > thirtyHexOnes, i > 0);
+    EXPECT_EQ(v > thirtyHexTwos, false);
+    EXPECT_EQ(v >= thirtyHexOnes, true);
+    EXPECT_EQ(v >= thirtyHexTwos, i == 30);
+    EXPECT_EQ(v < thirtyHexOnes, false);
+    EXPECT_EQ(v < thirtyHexTwos, i < 30);
+    EXPECT_EQ(v <= thirtyHexOnes, i == 0);
+    EXPECT_EQ(v <= thirtyHexTwos, true);
+
+    // Update v, clearing upper-most byte
+    v = ((v << 12) >> 8) | 0x2;
+  }
+
+  for (int i = 0; i < 128; ++i) {
+    // Test shifts
+    Unsigned128 sl = thirtyHexOnes << i;
+    Unsigned128 sr = thirtyHexOnes >> i;
+    EXPECT_EQ(BitsSetToOne(sl), std::min(30, 32 - i / 4));
+    EXPECT_EQ(BitsSetToOne(sr), std::max(0, 30 - (i + 3) / 4));
+    EXPECT_EQ(BitsSetToOne(sl & sr), i % 2 ? 0 : std::max(0, 30 - i / 2));
+  }
+
+  // Test 64x64->128 multiply
+  Unsigned128 product =
+      Multiply64to128(0x1111111111111111U, 0x2222222222222222U);
+  EXPECT_EQ(Lower64of128(product), 2295594818061633090U);
+  EXPECT_EQ(Upper64of128(product), 163971058432973792U);
+}
+
+TEST(MathTest, Coding128) {
+  const char *in = "_1234567890123456";
+  Unsigned128 decoded = DecodeFixed128(in + 1);
+  EXPECT_EQ(Lower64of128(decoded), 4050765991979987505U);
+  EXPECT_EQ(Upper64of128(decoded), 3906085646303834169U);
+  char out[18];
+  out[0] = '_';
+  EncodeFixed128(out + 1, decoded);
+  out[17] = '\0';
+  EXPECT_EQ(std::string(in), std::string(out));
+}
+
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
diff --git a/util/math.h b/util/math.h
index 64cdb2f44af3f4ab9a5722b1bff060a20ec127c8..2e57c1c0842baaca0d5e73103a490d358505da69 100644
--- a/util/math.h
+++ b/util/math.h
@@ -13,24 +13,112 @@
 
 namespace ROCKSDB_NAMESPACE {
 
+// Fast implementation of floor(log2(v)). Undefined for 0 or negative
+// numbers (in case of signed type).
 template <typename T>
-inline int BitsSetToOne(T v) {
+inline int FloorLog2(T v) {
   static_assert(std::is_integral<T>::value, "non-integral type");
+  assert(v > 0);
 #ifdef _MSC_VER
   static_assert(sizeof(T) <= sizeof(uint64_t), "type too big");
-  if (sizeof(T) > sizeof(uint32_t)) {
-    return static_cast<int>(__popcnt64(static_cast<uint64_t>(v)));
+  unsigned long lz = 0;
+  if (sizeof(T) <= sizeof(uint32_t)) {
+    _BitScanReverse(&lz, static_cast<uint32_t>(v));
+  } else {
+    _BitScanReverse64(&lz, static_cast<uint64_t>(v));
+  }
+  return static_cast<int>(lz);
+#else
+  static_assert(sizeof(T) <= sizeof(unsigned long long), "type too big");
+  if (sizeof(T) <= sizeof(unsigned int)) {
+    int lz = __builtin_clz(static_cast<unsigned int>(v));
+    return int{sizeof(unsigned int)} * 8 - 1 - lz;
+  } else if (sizeof(T) <= sizeof(unsigned long)) {
+    int lz = __builtin_clzl(static_cast<unsigned long>(v));
+    return int{sizeof(unsigned long)} * 8 - 1 - lz;
+  } else {
+    int lz = __builtin_clzll(static_cast<unsigned long long>(v));
+    return int{sizeof(unsigned long long)} * 8 - 1 - lz;
+  }
+#endif
+}
+
+// Number of low-order zero bits before the first 1 bit. Undefined for 0.
+template <typename T>
+inline int CountTrailingZeroBits(T v) {
+  static_assert(std::is_integral<T>::value, "non-integral type");
+  assert(v != 0);
+#ifdef _MSC_VER
+  static_assert(sizeof(T) <= sizeof(uint64_t), "type too big");
+  unsigned long tz = 0;
+  if (sizeof(T) <= sizeof(uint32_t)) {
+    _BitScanForward(&tz, static_cast<uint32_t>(v));
   } else {
+    _BitScanForward64(&tz, static_cast<uint64_t>(v));
+  }
+  return static_cast<int>(tz);
+#else
+  static_assert(sizeof(T) <= sizeof(unsigned long long), "type too big");
+  if (sizeof(T) <= sizeof(unsigned int)) {
+    return __builtin_ctz(static_cast<unsigned int>(v));
+  } else if (sizeof(T) <= sizeof(unsigned long)) {
+    return __builtin_ctzl(static_cast<unsigned long>(v));
+  } else {
+    return __builtin_ctzll(static_cast<unsigned long long>(v));
+  }
+#endif
+}
+
+// Number of bits set to 1. Also known as "population count".
+template <typename T>
+inline int BitsSetToOne(T v) {
+  static_assert(std::is_integral<T>::value, "non-integral type");
+#ifdef _MSC_VER
+  static_assert(sizeof(T) <= sizeof(uint64_t), "type too big");
+  if (sizeof(T) < sizeof(uint32_t)) {
+    // This bit mask is to avoid a compiler warning on unused path
+    constexpr auto mm = 8 * sizeof(uint32_t) - 1;
+    // The bit mask is to neutralize sign extension on small signed types
+    constexpr uint32_t m = (uint32_t{1} << ((8 * sizeof(T)) & mm)) - 1;
+    return static_cast<int>(__popcnt(static_cast<uint32_t>(v) & m));
+  } else if (sizeof(T) == sizeof(uint32_t)) {
     return static_cast<int>(__popcnt(static_cast<uint32_t>(v)));
+  } else {
+    return static_cast<int>(__popcnt64(static_cast<uint64_t>(v)));
   }
 #else
   static_assert(sizeof(T) <= sizeof(unsigned long long), "type too big");
-  if (sizeof(T) > sizeof(unsigned long)) {
-    return __builtin_popcountll(static_cast<unsigned long long>(v));
-  } else if (sizeof(T) > sizeof(unsigned int)) {
+  if (sizeof(T) < sizeof(unsigned int)) {
+    // This bit mask is to avoid a compiler warning on unused path
+    constexpr auto mm = 8 * sizeof(unsigned int) - 1;
+    // This bit mask is to neutralize sign extension on small signed types
+    constexpr unsigned int m = (1U << ((8 * sizeof(T)) & mm)) - 1;
+    return __builtin_popcount(static_cast<unsigned int>(v) & m);
+  } else if (sizeof(T) == sizeof(unsigned int)) {
+    return __builtin_popcount(static_cast<unsigned int>(v));
+  } else if (sizeof(T) <= sizeof(unsigned long)) {
     return __builtin_popcountl(static_cast<unsigned long>(v));
   } else {
-    return __builtin_popcount(static_cast<unsigned int>(v));
+    return __builtin_popcountll(static_cast<unsigned long long>(v));
+  }
+#endif
+}
+
+template <typename T>
+inline int BitParity(T v) {
+  static_assert(std::is_integral<T>::value, "non-integral type");
+#ifdef _MSC_VER
+  // bit parity == oddness of popcount
+  return BitsSetToOne(v) & 1;
+#else
+  static_assert(sizeof(T) <= sizeof(unsigned long long), "type too big");
+  if (sizeof(T) <= sizeof(unsigned int)) {
+    // On any sane system, potential sign extension here won't change parity
+    return __builtin_parity(static_cast<unsigned int>(v));
+  } else if (sizeof(T) <= sizeof(unsigned long)) {
+    return __builtin_parityl(static_cast<unsigned long>(v));
+  } else {
+    return __builtin_parityll(static_cast<unsigned long long>(v));
   }
 #endif
 }
diff --git a/util/math128.h b/util/math128.h
new file mode 100644
index 0000000000000000000000000000000000000000..10e99714e83a033abde077ddd4df506c8a2139da
--- /dev/null
+++ b/util/math128.h
@@ -0,0 +1,223 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "util/coding.h"
+#include "util/math.h"
+
+#ifdef TEST_UINT128_COMPAT
+#undef HAVE_UINT128_EXTENSION
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+// Unsigned128 is a 128 bit value supporting (at least) bitwise operators,
+// shifts, and comparisons. __uint128_t is not always available.
+
+#ifdef HAVE_UINT128_EXTENSION
+using Unsigned128 = __uint128_t;
+#else
+struct Unsigned128 {
+  uint64_t lo;
+  uint64_t hi;
+
+  inline Unsigned128() {
+    static_assert(sizeof(Unsigned128) == 2 * sizeof(uint64_t),
+                  "unexpected overhead in representation");
+    lo = 0;
+    hi = 0;
+  }
+
+  inline Unsigned128(uint64_t lower) {
+    lo = lower;
+    hi = 0;
+  }
+
+  inline Unsigned128(uint64_t lower, uint64_t upper) {
+    lo = lower;
+    hi = upper;
+  }
+};
+
+inline Unsigned128 operator<<(const Unsigned128& lhs, unsigned shift) {
+  shift &= 127;
+  Unsigned128 rv;
+  if (shift >= 64) {
+    rv.lo = 0;
+    rv.hi = lhs.lo << (shift & 63);
+  } else {
+    uint64_t tmp = lhs.lo;
+    rv.lo = tmp << shift;
+    // Ensure shift==0 shifts away everything. (This avoids another
+    // conditional branch on shift == 0.)
+    tmp = tmp >> 1 >> (63 - shift);
+    rv.hi = tmp | (lhs.hi << shift);
+  }
+  return rv;
+}
+
+inline Unsigned128& operator<<=(Unsigned128& lhs, unsigned shift) {
+  lhs = lhs << shift;
+  return lhs;
+}
+
+inline Unsigned128 operator>>(const Unsigned128& lhs, unsigned shift) {
+  shift &= 127;
+  Unsigned128 rv;
+  if (shift >= 64) {
+    rv.hi = 0;
+    rv.lo = lhs.hi >> (shift & 63);
+  } else {
+    uint64_t tmp = lhs.hi;
+    rv.hi = tmp >> shift;
+    // Ensure shift==0 shifts away everything
+    tmp = tmp << 1 << (63 - shift);
+    rv.lo = tmp | (lhs.lo >> shift);
+  }
+  return rv;
+}
+
+inline Unsigned128& operator>>=(Unsigned128& lhs, unsigned shift) {
+  lhs = lhs >> shift;
+  return lhs;
+}
+
+inline Unsigned128 operator&(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return Unsigned128(lhs.lo & rhs.lo, lhs.hi & rhs.hi);
+}
+
+inline Unsigned128& operator&=(Unsigned128& lhs, const Unsigned128& rhs) {
+  lhs = lhs & rhs;
+  return lhs;
+}
+
+inline Unsigned128 operator|(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return Unsigned128(lhs.lo | rhs.lo, lhs.hi | rhs.hi);
+}
+
+inline Unsigned128& operator|=(Unsigned128& lhs, const Unsigned128& rhs) {
+  lhs = lhs | rhs;
+  return lhs;
+}
+
+inline Unsigned128 operator^(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return Unsigned128(lhs.lo ^ rhs.lo, lhs.hi ^ rhs.hi);
+}
+
+inline Unsigned128& operator^=(Unsigned128& lhs, const Unsigned128& rhs) {
+  lhs = lhs ^ rhs;
+  return lhs;
+}
+
+inline Unsigned128 operator~(const Unsigned128& v) {
+  return Unsigned128(~v.lo, ~v.hi);
+}
+
+inline bool operator==(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return lhs.lo == rhs.lo && lhs.hi == rhs.hi;
+}
+
+inline bool operator!=(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return lhs.lo != rhs.lo || lhs.hi != rhs.hi;
+}
+
+inline bool operator>(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return lhs.hi > rhs.hi || (lhs.hi == rhs.hi && lhs.lo > rhs.lo);
+}
+
+inline bool operator<(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return lhs.hi < rhs.hi || (lhs.hi == rhs.hi && lhs.lo < rhs.lo);
+}
+
+inline bool operator>=(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return lhs.hi > rhs.hi || (lhs.hi == rhs.hi && lhs.lo >= rhs.lo);
+}
+
+inline bool operator<=(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return lhs.hi < rhs.hi || (lhs.hi == rhs.hi && lhs.lo <= rhs.lo);
+}
+#endif
+
+inline uint64_t Lower64of128(Unsigned128 v) {
+#ifdef HAVE_UINT128_EXTENSION
+  return static_cast<uint64_t>(v);
+#else
+  return v.lo;
+#endif
+}
+
+inline uint64_t Upper64of128(Unsigned128 v) {
+#ifdef HAVE_UINT128_EXTENSION
+  return static_cast<uint64_t>(v >> 64);
+#else
+  return v.hi;
+#endif
+}
+
+// This generally compiles down to a single fast instruction on 64-bit.
+// This doesn't really make sense as operator* because it's not a
+// general 128x128 multiply and provides more output than 64x64 multiply.
+inline Unsigned128 Multiply64to128(uint64_t a, uint64_t b) {
+#ifdef HAVE_UINT128_EXTENSION
+  return Unsigned128{a} * Unsigned128{b};
+#else
+  // Full decomposition
+  // NOTE: GCC seems to fully understand this code as 64-bit x 64-bit
+  // -> 128-bit multiplication and optimize it appropriately.
+  uint64_t tmp = uint64_t{b & 0xffffFFFF} * uint64_t{a & 0xffffFFFF};
+  uint64_t lower = tmp & 0xffffFFFF;
+  tmp >>= 32;
+  tmp += uint64_t{b & 0xffffFFFF} * uint64_t{a >> 32};
+  // Avoid overflow: first add lower 32 of tmp2, and later upper 32
+  uint64_t tmp2 = uint64_t{b >> 32} * uint64_t{a & 0xffffFFFF};
+  tmp += tmp2 & 0xffffFFFF;
+  lower |= tmp << 32;
+  tmp >>= 32;
+  tmp += tmp2 >> 32;
+  tmp += uint64_t{b >> 32} * uint64_t{a >> 32};
+  return Unsigned128(lower, tmp);
+#endif
+}
+
+template <>
+inline int FloorLog2(Unsigned128 v) {
+  if (Upper64of128(v) == 0) {
+    return FloorLog2(Lower64of128(v));
+  } else {
+    return FloorLog2(Upper64of128(v)) + 64;
+  }
+}
+
+template <>
+inline int CountTrailingZeroBits(Unsigned128 v) {
+  if (Lower64of128(v) != 0) {
+    return CountTrailingZeroBits(Lower64of128(v));
+  } else {
+    return CountTrailingZeroBits(Upper64of128(v)) + 64;
+  }
+}
+
+template <>
+inline int BitsSetToOne(Unsigned128 v) {
+  return BitsSetToOne(Lower64of128(v)) + BitsSetToOne(Upper64of128(v));
+}
+
+template <>
+inline int BitParity(Unsigned128 v) {
+  return BitParity(Lower64of128(v)) ^ BitParity(Upper64of128(v));
+}
+
+inline void EncodeFixed128(char* dst, Unsigned128 value) {
+  EncodeFixed64(dst, Lower64of128(value));
+  EncodeFixed64(dst + 8, Upper64of128(value));
+}
+
+inline Unsigned128 DecodeFixed128(const char* ptr) {
+  Unsigned128 rv = DecodeFixed64(ptr + 8);
+  return (rv << 64) | DecodeFixed64(ptr);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
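
Illustrative usage sketch (not part of the patch): a minimal standalone program exercising the helpers the patch introduces, assuming it is compiled inside a RocksDB source tree with this change applied so that "util/math128.h" and ROCKSDB_NAMESPACE resolve. It works with or without __uint128_t support, since Multiply64to128 falls back to the portable decomposition shown above.

    #include <cassert>
    #include <cstdint>

    #include "util/math128.h"

    using ROCKSDB_NAMESPACE::BitsSetToOne;
    using ROCKSDB_NAMESPACE::CountTrailingZeroBits;
    using ROCKSDB_NAMESPACE::FloorLog2;
    using ROCKSDB_NAMESPACE::Lower64of128;
    using ROCKSDB_NAMESPACE::Multiply64to128;
    using ROCKSDB_NAMESPACE::Unsigned128;
    using ROCKSDB_NAMESPACE::Upper64of128;

    int main() {
      // Bit ops are templates in util/math.h and work across integer widths.
      assert(BitsSetToOne(uint32_t{0xF0}) == 4);
      assert(FloorLog2(uint64_t{1} << 40) == 40);
      assert(CountTrailingZeroBits(uint64_t{8}) == 3);

      // 64x64 -> 128 multiply: 0xFFFFFFFFFFFFFFFF * 2 = 0x1FFFFFFFFFFFFFFFE.
      Unsigned128 p = Multiply64to128(0xFFFFFFFFFFFFFFFFULL, 2);
      assert(Upper64of128(p) == 1);
      assert(Lower64of128(p) == 0xFFFFFFFFFFFFFFFEULL);
      return 0;
    }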