Commit f4e4039f authored by Peter Dillinger, committed by Facebook GitHub Bot

Add some more bit operations to internal APIs (#11660)

Summary:
BottomNBits() - x86 has a single fast instruction for this (BZHI, since BMI2), but testing with godbolt indicates you need at least GCC 10 for the compiler to choose that instruction from the obvious C++ code: https://godbolt.org/z/5a7Ysd41h
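For illustration, here is a minimal standalone sketch of the portable fallback only (the real implementation, including the BMI2 `_bzhi` intrinsics, is in util/math.h in the diff below; `BottomNBitsSketch` is a hypothetical name used here to avoid clashing with the library function):

```cpp
#include <cassert>
#include <cstdint>

// Keep only the low `nbits` bits of `v`. As with the library version,
// behavior is undefined when nbits is the full bit width or more; that
// contract is what lets a compiler reduce this to a single BZHI.
template <typename T>
T BottomNBitsSketch(T v, int nbits) {
  assert(nbits >= 0 && nbits < static_cast<int>(8 * sizeof(T)));
  return static_cast<T>(v & ((T{1} << nbits) - 1));
}

int main() {
  // Low 12 bits of 0xDEADBEEF are 0xEEF.
  assert(BottomNBitsSketch(uint32_t{0xDEADBEEF}, 12) == 0xEEFu);
  return 0;
}
```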

BitwiseAnd() - this is a convenience function that works around a language flaw: the type of `x & y` is the larger of the two input types, when it should be the smaller. This saves some ugly static_casts.
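To make the flaw and the fix concrete, a minimal standalone sketch (it mirrors the helper added to util/math.h in the diff below; the values in `main` are illustrative only):

```cpp
#include <cstdint>
#include <type_traits>

// Same shape as the new helper: the result type is the smaller of the
// two operand types (or the right-hand type when the sizes match).
template <typename A, typename B>
inline std::conditional_t<sizeof(A) < sizeof(B), A, B> BitwiseAnd(A a, B b) {
  using Smaller = std::conditional_t<sizeof(A) < sizeof(B), A, B>;
  return static_cast<Smaller>(a & b);
}

int main() {
  uint64_t x = 0x123456789ABCDEF0;
  uint32_t mask = 0xFFFF;
  // Built-in & widens the result to the larger type...
  static_assert(std::is_same_v<decltype(x & mask), uint64_t>);
  // ...so storing the (necessarily small) result needs an ugly cast:
  uint32_t a = static_cast<uint32_t>(x & mask);
  // The helper picks the smaller type, so no cast is needed:
  uint32_t b = BitwiseAnd(x, mask);
  return a == b ? 0 : 1;
}
```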

I expect to use both of these in coming HyperClockCache developments, and have applied them in a couple of places in existing code.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11660

Test Plan: unit tests added

Reviewed By: jowlyzhang

Differential Revision: D47935531

Pulled By: pdillinger

fbshipit-source-id: d148c43a1e51df4a1c549b93aaf2725a3f8d3bd6
Parent 946d1009
@@ -24,6 +24,7 @@
#include "rocksdb/cache.h"
#include "rocksdb/secondary_cache.h"
#include "util/autovector.h"
+#include "util/math.h"
namespace ROCKSDB_NAMESPACE {
@@ -563,7 +564,7 @@ class HyperClockTable : public BaseClockTable {
private: // functions
// Returns x mod 2^{length_bits_}.
inline size_t ModTableSize(uint64_t x) {
-return static_cast<size_t>(x) & length_bits_mask_;
+return BitwiseAnd(x, length_bits_mask_);
}
// Returns the first slot in the probe sequence with a handle e such that
......
@@ -38,7 +38,7 @@ uint32_t DetermineSeed(int32_t hash_seed_option) {
return GetSliceHash(hostname) & kSeedMask;
} else {
// Fall back on something stable within the process.
-return static_cast<uint32_t>(gen.GetBaseUpper()) & kSeedMask;
+return BitwiseAnd(gen.GetBaseUpper(), kSeedMask);
}
} else {
// for kQuasiRandomHashSeed and fallback
......
@@ -13,6 +13,7 @@
#include "port/likely.h"
#include "port/port.h"
+#include "util/math.h"
#include "util/random.h"
namespace ROCKSDB_NAMESPACE {
@@ -70,7 +71,7 @@ std::pair<T*, size_t> CoreLocalArray<T>::AccessElementAndIndex() const {
// cpu id unavailable, just pick randomly
core_idx = Random::GetTLSInstance()->Uniform(1 << size_shift_);
} else {
-core_idx = static_cast<size_t>(cpuid & ((1 << size_shift_) - 1));
+core_idx = static_cast<size_t>(BottomNBits(cpuid, size_shift_));
}
return {AccessAtCore(core_idx), core_idx};
}
......
@@ -565,6 +565,8 @@ size_t FastRange64(uint64_t hash, size_t range) {
// Tests for math.h / math128.h (not worth a separate test binary)
using ROCKSDB_NAMESPACE::BitParity;
using ROCKSDB_NAMESPACE::BitsSetToOne;
+using ROCKSDB_NAMESPACE::BitwiseAnd;
+using ROCKSDB_NAMESPACE::BottomNBits;
using ROCKSDB_NAMESPACE::ConstexprFloorLog2;
using ROCKSDB_NAMESPACE::CountTrailingZeroBits;
using ROCKSDB_NAMESPACE::DecodeFixed128;
@@ -580,6 +582,19 @@ using ROCKSDB_NAMESPACE::Upper64of128;
int blah(int x) { return DownwardInvolution(x); }
+template <typename T1, typename T2>
+static void test_BitwiseAnd(T1 v1, T2 v2) {
+auto a = BitwiseAnd(v1, v2);
+// Essentially repeating the implementation :-/
+if constexpr (sizeof(T1) < sizeof(T2)) {
+static_assert(std::is_same_v<decltype(a), T1>);
+EXPECT_EQ(a, static_cast<T1>(v1 & v2));
+} else {
+static_assert(std::is_same_v<decltype(a), T2>);
+EXPECT_EQ(a, static_cast<T2>(v1 & v2));
+}
+}
template <typename T>
static void test_BitOps() {
// This complex code is to generalize to 128-bit values. Otherwise
@@ -598,6 +613,22 @@ static void test_BitOps() {
// If we could directly use arithmetic:
// T vm1 = static_cast<T>(v - 1);
+// BottomNBits
+{
+// An essentially full length value
+T x = everyOtherBit;
+if (i > 2) {
+// Make it slightly irregular
+x = x ^ (T{1} << (i / 2));
+}
+auto a = BottomNBits(x, i);
+auto b = BottomNBits(~x, i);
+EXPECT_EQ(x | a, x);
+EXPECT_EQ(a | b, vm1);
+EXPECT_EQ(a & b, T{0});
+EXPECT_EQ(BottomNBits(x ^ a, i), T{0});
+}
// FloorLog2
if (v > 0) {
EXPECT_EQ(FloorLog2(v), i);
@@ -707,9 +738,22 @@ static void test_BitOps() {
}
}
+// BitwiseAnd
+{
+test_BitwiseAnd(vm1, static_cast<char>(0x99));
+test_BitwiseAnd(v, static_cast<char>(0x99));
+test_BitwiseAnd(char{0x66}, vm1);
+test_BitwiseAnd(char{0x66}, v);
+test_BitwiseAnd(v, int16_t{0x6699});
+test_BitwiseAnd(v, uint16_t{0x9966});
+test_BitwiseAnd(int64_t{0x1234234534564567}, v);
+test_BitwiseAnd(uint64_t{0x9876876576545432}, v);
+}
vm1 = (vm1 << 1) | 1;
}
// ConstexprFloorLog2
EXPECT_EQ(ConstexprFloorLog2(T{1}), 0);
EXPECT_EQ(ConstexprFloorLog2(T{2}), 1);
EXPECT_EQ(ConstexprFloorLog2(T{3}), 1);
......
@@ -9,6 +9,9 @@
#ifdef _MSC_VER
#include <intrin.h>
#endif
+#ifdef __BMI2__
+#include <immintrin.h>
+#endif
#include <cstdint>
#include <type_traits>
@@ -20,11 +23,33 @@ ASSERT_FEATURE_COMPAT_HEADER();
namespace ROCKSDB_NAMESPACE {
+// Fast implementation of extracting the bottom n bits of an integer.
+// To ensure fast implementation, undefined if n bits is full width or more.
+template <typename T>
+inline T BottomNBits(T v, int nbits) {
+static_assert(std::is_integral_v<T>, "non-integral type");
+static_assert(!std::is_reference_v<T>, "use std::remove_reference_t");
+assert(nbits >= 0);
+assert(nbits < int{8 * sizeof(T)});
+#ifdef __BMI2__
+if constexpr (sizeof(T) <= 4) {
+return static_cast<T>(_bzhi_u32(static_cast<uint32_t>(v), nbits));
+}
+if constexpr (sizeof(T) <= 8) {
+return static_cast<T>(_bzhi_u64(static_cast<uint64_t>(v), nbits));
+}
+#endif
+// Newer compilers compile this down to bzhi on x86, but some older
+// ones don't, thus the need for the intrinsic above.
+return static_cast<T>(v & ((T{1} << nbits) - 1));
+}
// Fast implementation of floor(log2(v)). Undefined for 0 or negative
// numbers (in case of signed type).
template <typename T>
inline int FloorLog2(T v) {
-static_assert(std::is_integral<T>::value, "non-integral type");
+static_assert(std::is_integral_v<T>, "non-integral type");
+static_assert(!std::is_reference_v<T>, "use std::remove_reference_t");
assert(v > 0);
#ifdef _MSC_VER
static_assert(sizeof(T) <= sizeof(uint64_t), "type too big");
@@ -63,6 +88,8 @@ inline int FloorLog2(T v) {
// Constexpr version of FloorLog2
template <typename T>
constexpr int ConstexprFloorLog2(T v) {
+// NOTE: not checking is_integral so that this works with Unsigned128
+static_assert(!std::is_reference_v<T>, "use std::remove_reference_t");
int rv = 0;
while (v > T{1}) {
++rv;
@@ -74,7 +101,8 @@ constexpr int ConstexprFloorLog2(T v) {
// Number of low-order zero bits before the first 1 bit. Undefined for 0.
template <typename T>
inline int CountTrailingZeroBits(T v) {
-static_assert(std::is_integral<T>::value, "non-integral type");
+static_assert(std::is_integral_v<T>, "non-integral type");
+static_assert(!std::is_reference_v<T>, "use std::remove_reference_t");
assert(v != 0);
#ifdef _MSC_VER
static_assert(sizeof(T) <= sizeof(uint64_t), "type too big");
@@ -115,6 +143,9 @@ namespace detail {
template <typename T>
int BitsSetToOneFallback(T v) {
+static_assert(std::is_integral_v<T>, "non-integral type");
+static_assert(!std::is_reference_v<T>, "use std::remove_reference_t");
const int kBits = static_cast<int>(sizeof(T)) * 8;
static_assert((kBits & (kBits - 1)) == 0, "must be power of two bits");
// we static_cast these bit patterns in order to truncate them to the correct
@@ -140,7 +171,9 @@ int BitsSetToOneFallback(T v) {
// Number of bits set to 1. Also known as "population count".
template <typename T>
inline int BitsSetToOne(T v) {
-static_assert(std::is_integral<T>::value, "non-integral type");
+static_assert(std::is_integral_v<T>, "non-integral type");
+static_assert(!std::is_reference_v<T>, "use std::remove_reference_t");
#ifdef _MSC_VER
static_assert(sizeof(T) <= sizeof(uint64_t), "type too big");
if (sizeof(T) < sizeof(uint32_t)) {
@@ -192,7 +225,9 @@ inline int BitsSetToOne(T v) {
template <typename T>
inline int BitParity(T v) {
-static_assert(std::is_integral<T>::value, "non-integral type");
+static_assert(std::is_integral_v<T>, "non-integral type");
+static_assert(!std::is_reference_v<T>, "use std::remove_reference_t");
#ifdef _MSC_VER
// bit parity == oddness of popcount
return BitsSetToOne(v) & 1;
@@ -214,7 +249,8 @@ inline int BitParity(T v) {
// encode/decode big endian.
template <typename T>
inline T EndianSwapValue(T v) {
-static_assert(std::is_integral<T>::value, "non-integral type");
+static_assert(std::is_integral_v<T>, "non-integral type");
+static_assert(!std::is_reference_v<T>, "use std::remove_reference_t");
#ifdef _MSC_VER
if (sizeof(T) == 2) {
@@ -244,6 +280,9 @@ inline T EndianSwapValue(T v) {
// Reverses the order of bits in an integral value
template <typename T>
inline T ReverseBits(T v) {
+static_assert(std::is_integral_v<T>, "non-integral type");
+static_assert(!std::is_reference_v<T>, "use std::remove_reference_t");
T r = EndianSwapValue(v);
const T kHighestByte = T{1} << ((sizeof(T) - 1) * 8);
const T kEveryByte = kHighestByte | (kHighestByte / 255);
@@ -277,7 +316,8 @@ inline T ReverseBits(T v) {
// is that all square sub-matrices that include the top row are invertible.
template <typename T>
inline T DownwardInvolution(T v) {
-static_assert(std::is_integral<T>::value, "non-integral type");
+static_assert(std::is_integral_v<T>, "non-integral type");
+static_assert(!std::is_reference_v<T>, "use std::remove_reference_t");
static_assert(sizeof(T) <= 8, "only supported up to 64 bits");
uint64_t r = static_cast<uint64_t>(v);
@@ -296,4 +336,16 @@ inline T DownwardInvolution(T v) {
return static_cast<T>(r);
}
+// Bitwise-And with typing that allows you to avoid writing an explicit cast
+// to the smaller type, or the type of the right parameter if same size.
+template <typename A, typename B>
+inline std::conditional_t<sizeof(A) < sizeof(B), A, B> BitwiseAnd(A a, B b) {
+static_assert(std::is_integral_v<A>, "non-integral type");
+static_assert(std::is_integral_v<B>, "non-integral type");
+static_assert(!std::is_reference_v<A>, "use std::remove_reference_t");
+static_assert(!std::is_reference_v<B>, "use std::remove_reference_t");
+using Smaller = std::conditional_t<sizeof(A) < sizeof(B), A, B>;
+return static_cast<Smaller>(a & b);
+}
} // namespace ROCKSDB_NAMESPACE
@@ -41,13 +41,13 @@ struct Unsigned128 {
hi = upper;
}
-explicit operator uint64_t() { return lo; }
-explicit operator uint32_t() { return static_cast<uint32_t>(lo); }
-explicit operator uint16_t() { return static_cast<uint16_t>(lo); }
-explicit operator uint8_t() { return static_cast<uint8_t>(lo); }
+// Convert to any integer 64 bits or less.
+template <typename T,
+typename = std::enable_if_t<std::is_integral_v<T> &&
+sizeof(T) <= sizeof(uint64_t)> >
+explicit operator T() {
+return static_cast<T>(lo);
+}
};
inline Unsigned128 operator<<(const Unsigned128& lhs, unsigned shift) {
@@ -190,6 +190,16 @@ inline Unsigned128 Multiply64to128(uint64_t a, uint64_t b) {
#endif
}
+template <>
+inline Unsigned128 BottomNBits(Unsigned128 v, int nbits) {
+if (nbits < 64) {
+return BottomNBits(Lower64of128(v), nbits);
+} else {
+return (Unsigned128{BottomNBits(Upper64of128(v), nbits - 64)} << 64) |
+Lower64of128(v);
+}
+}
template <>
inline int FloorLog2(Unsigned128 v) {
if (Upper64of128(v) == 0) {
@@ -236,6 +246,18 @@ inline Unsigned128 DownwardInvolution(Unsigned128 v) {
DownwardInvolution(Upper64of128(v) ^ Lower64of128(v));
}
+template <typename A>
+inline std::remove_reference_t<A> BitwiseAnd(A a, Unsigned128 b) {
+static_assert(sizeof(A) <= sizeof(Unsigned128));
+return static_cast<A>(a & b);
+}
+template <typename B>
+inline std::remove_reference_t<B> BitwiseAnd(Unsigned128 a, B b) {
+static_assert(sizeof(B) <= sizeof(Unsigned128));
+return static_cast<B>(a & b);
+}
template <typename T>
struct IsUnsignedUpTo128
: std::integral_constant<bool, std::is_unsigned<T>::value ||
......