support avx512 (#1195)

* #1122 support AVX-512 for IVFFLAT distance calculation Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 support AVX-512 for IVFSQ8 Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 fix avx512 build option Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 remove global faiss compile option -march=skylake-avx512 Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 add InstructionSet and unittest Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 update InstructionSet unittest Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 add FaissHook Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 add distances_simd_avx512.cpp Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 fix build issue Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 add ScalarQuantizer_avx512.cpp and IndexScalarQuantizer_avx512.cpp Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 fix build issue Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 fix clang format Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 move test_instructionset.cpp to index/unittest Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 fix clang format Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 set hook default to AVX Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 add hidden config use_avx512 Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 fix faiss clone issue Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 fix faiss build issue Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 fix faiss ScalarQuantizer_avx512 Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 fix faiss ScalarQuantizer_avx512 Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 clean code Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 remove ScalarQuantizer_avx512 and IndexScalarQuantizer_avx512 Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> 
* #1122 change back index_factory.cpp/index_read.cpp/index_write.cpp Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 split ScalarQuantizer to ScalarQuantizerCodec and Similarity Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 #1122 split ScalarQuantizer to ScalarQuantizerCodec Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 split ScalarQuantizerCodec.cpp to ScalarQuantizerOp.cpp Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 ScalarQuantizer support avx512 Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 add hook sse for ScalarQuantizer Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 correct head file Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 add changelog Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 fix CodeFactor Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 fix unittest Signed-off-by: N yudong.cai <yudong.cai@zilliz.com>

support avx512 (#1195)
* #1122 support AVX-512 for IVFFLAT distance calculation Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 support AVX-512 for IVFSQ8 Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 fix avx512 build option Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 remove global faiss compile option -march=skylake-avx512 Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 add InstructionSet and unittest Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 update InstructionSet unittest Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 add FaissHook Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 add distances_simd_avx512.cpp Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 fix build issue Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 add ScalarQuantizer_avx512.cpp and IndexScalarQuantizer_avx512.cpp Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 fix build issue Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 fix clang format Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 move test_instructionset.cpp to index/unittest Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 fix clang format Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 set hook default to AVX Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 add hidden config use_avx512 Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 fix faiss clone issue Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 fix faiss build issue Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 fix faiss ScalarQuantizer_avx512 Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 fix faiss ScalarQuantizer_avx512 Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 clean code Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 remove ScalarQuantizer_avx512 and IndexScalarQuantizer_avx512 Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> 
* #1122 change back index_factory.cpp/index_read.cpp/index_write.cpp Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 split ScalarQuantizer to ScalarQuantizerCodec and Similarity Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 #1122 split ScalarQuantizer to ScalarQuantizerCodec Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 split ScalarQuantizerCodec.cpp to ScalarQuantizerOp.cpp Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 ScalarQuantizer support avx512 Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 add hook sse for ScalarQuantizer Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 correct head file Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 add changelog Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 fix CodeFactor Signed-off-by: N yudong.cai <yudong.cai@zilliz.com> * #1122 fix unittest Signed-off-by: N yudong.cai <yudong.cai@zilliz.com>
c3bf2704 · Cai Yudong · GitHub · 79ac5443 · c3bf2704 · c3bf2704
50 changed files
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -41,7 +41,9 @@ Please mark all change in change log and use the issue from GitHub
 - \#823 - Support binary vector tanimoto/jaccard/hamming metric
 - \#853 - Support HNSW
 - \#910 - Change Milvus c++ standard to c++17
+- \#1122 - Support AVX-512 in FAISS
 - \#1204 - Add api to get table data information
+- \#1250 - Support CPU profiling
 - \#1302 - Get all record IDs in a segment by given a segment id

 ## Improvement

--- a/core/src/index/thirdparty/faiss/FaissHook.cpp
+++ b/core/src/index/thirdparty/faiss/FaissHook.cpp
+
+// -*- c++ -*-
+
+#include <iostream>
+#include <mutex>
+
+#include <faiss/FaissHook.h>
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/impl/ScalarQuantizerDC.h>
+#include <faiss/impl/ScalarQuantizerDC_avx512.h>
+#include <faiss/utils/distances.h>
+#include <faiss/utils/distances_avx512.h>
+#include <faiss/utils/instruction_set.h>
+
+namespace faiss {
+
+// Switch read by support_avx512(): when false, AVX-512 is reported as
+// unsupported without querying the CPU features.
+bool faiss_use_avx512 = true;
+
+/* Every hook starts out pointing at the AVX variant; hook_init() reassigns
+ * them to the AVX-512, AVX or SSE variants. */
+// Float-vector distance hooks.
+fvec_func_ptr fvec_inner_product = fvec_inner_product_avx;
+fvec_func_ptr fvec_L2sqr = fvec_L2sqr_avx;
+fvec_func_ptr fvec_L1 = fvec_L1_avx;
+fvec_func_ptr fvec_Linf = fvec_Linf_avx;
+
+// Scalar-quantizer hooks (distance-computer factories and quantizer selector).
+sq_get_func_ptr sq_get_distance_computer_L2 = sq_get_distance_computer_L2_avx;
+sq_get_func_ptr sq_get_distance_computer_IP = sq_get_distance_computer_IP_avx;
+sq_sel_func_ptr sq_sel_quantizer = sq_select_quantizer_avx;
+
+
+/*****************************************************************************/
+
+// Reports whether the AVX-512 code paths may be used. This requires the
+// faiss_use_avx512 switch to be on and the InstructionSet instance to report
+// the AVX512F, AVX512DQ and AVX512BW features.
+bool support_avx512() {
+    if (faiss_use_avx512) {
+        InstructionSet& cpu = InstructionSet::GetInstance();
+        // Features are queried in the order F, DQ, BW and short-circuit on the first miss.
+        if (cpu.AVX512F() && cpu.AVX512DQ() && cpu.AVX512BW()) {
+            return true;
+        }
+    }
+    return false;
+}
+
+// True when the InstructionSet instance reports AVX2. Note that the feature
+// tested is AVX2 even though this function is named "avx".
+bool support_avx() {
+    return InstructionSet::GetInstance().AVX2();
+}
+
+// True when the InstructionSet instance reports SSE.
+bool support_sse() {
+    return InstructionSet::GetInstance().SSE();
+}
+
+// Points every FAISS hook at one SIMD implementation. Candidates are probed in
+// the order AVX-512, AVX, SSE; the first one that is supported sets all seven
+// hooks and prints the chosen variant to stdout. If none is supported,
+// FAISS_ASSERT_MSG(false, "CPU not supported!") is invoked.
+void hook_init() {
+    // Function-local mutex: concurrent callers rewrite the hooks one at a time.
+    static std::mutex hook_mutex;
+    std::lock_guard<std::mutex> guard(hook_mutex);
+
+    if (support_avx512()) {
+        /* distance hooks (IVFFLAT) */
+        fvec_inner_product = fvec_inner_product_avx512;
+        fvec_L2sqr = fvec_L2sqr_avx512;
+        fvec_L1 = fvec_L1_avx512;
+        fvec_Linf = fvec_Linf_avx512;
+
+        /* scalar-quantizer hooks (IVFSQ) */
+        sq_get_distance_computer_L2 = sq_get_distance_computer_L2_avx512;
+        sq_get_distance_computer_IP = sq_get_distance_computer_IP_avx512;
+        sq_sel_quantizer = sq_select_quantizer_avx512;
+
+        std::cout << "FAISS hook AVX512" << std::endl;
+        return;
+    }
+
+    if (support_avx()) {
+        /* distance hooks (IVFFLAT) */
+        fvec_inner_product = fvec_inner_product_avx;
+        fvec_L2sqr = fvec_L2sqr_avx;
+        fvec_L1 = fvec_L1_avx;
+        fvec_Linf = fvec_Linf_avx;
+
+        /* scalar-quantizer hooks (IVFSQ) */
+        sq_get_distance_computer_L2 = sq_get_distance_computer_L2_avx;
+        sq_get_distance_computer_IP = sq_get_distance_computer_IP_avx;
+        sq_sel_quantizer = sq_select_quantizer_avx;
+
+        std::cout << "FAISS hook AVX" << std::endl;
+        return;
+    }
+
+    if (!support_sse()) {
+        // None of the three instruction-set levels is available.
+        FAISS_ASSERT_MSG(false, "CPU not supported!");
+        return;
+    }
+
+    /* distance hooks (IVFFLAT) */
+    fvec_inner_product = fvec_inner_product_sse;
+    fvec_L2sqr = fvec_L2sqr_sse;
+    fvec_L1 = fvec_L1_sse;
+    fvec_Linf = fvec_Linf_sse;
+
+    /* scalar-quantizer hooks (IVFSQ) */
+    sq_get_distance_computer_L2 = sq_get_distance_computer_L2_sse;
+    sq_get_distance_computer_IP = sq_get_distance_computer_IP_sse;
+    sq_sel_quantizer = sq_select_quantizer_sse;
+
+    std::cout << "FAISS hook SSE" << std::endl;
+}
+
+} // namespace faiss
--- a/core/src/index/thirdparty/faiss/FaissHook.h
+++ b/core/src/index/thirdparty/faiss/FaissHook.h
+
+// -*- c++ -*-
+
+#pragma once
+
+#include <vector>
+#include <stddef.h>
+#include <faiss/impl/ScalarQuantizerOp.h>
+
+namespace faiss {
+
+// Signature shared by the float-vector distance hooks: two const float
+// buffers and a size_t, returning a float.
+// NOTE(review): presumably the two vectors and their dimension - confirm.
+typedef float (*fvec_func_ptr)(const float*, const float*, size_t);
+
+// Factory signatures for the scalar-quantizer hooks: the first returns an
+// SQDistanceComputer*, the second a Quantizer*.
+// NOTE(review): ownership of the returned pointer is not visible here - confirm with callers.
+typedef SQDistanceComputer* (*sq_get_func_ptr)(QuantizerType, size_t, const std::vector<float>&);
+typedef Quantizer* (*sq_sel_func_ptr)(QuantizerType, size_t, const std::vector<float>&);
+
+
+// Switch consulted by support_avx512(); defined in FaissHook.cpp.
+extern bool faiss_use_avx512;
+
+// Float-vector distance hooks, reassigned by hook_init().
+extern fvec_func_ptr fvec_inner_product;
+extern fvec_func_ptr fvec_L2sqr;
+extern fvec_func_ptr fvec_L1;
+extern fvec_func_ptr fvec_Linf;
+
+// Scalar-quantizer hooks, reassigned by hook_init().
+extern sq_get_func_ptr sq_get_distance_computer_L2;
+extern sq_get_func_ptr sq_get_distance_computer_IP;
+extern sq_sel_func_ptr sq_sel_quantizer;
+
+// Returns true when faiss_use_avx512 is set and the CPU reports AVX512F,
+// AVX512DQ and AVX512BW.
+extern bool support_avx512();
+
+// Points all hooks above at the AVX-512, AVX or SSE implementations, choosing
+// the first of those that the CPU supports.
+extern void hook_init();
+
+} // namespace faiss
--- a/core/src/index/thirdparty/faiss/Index.cpp
+++ b/core/src/index/thirdparty/faiss/Index.cpp
@@ -12,6 +12,7 @@
 #include <faiss/impl/AuxIndexStructures.h>
 #include <faiss/impl/FaissAssert.h>
 #include <faiss/utils/distances.h>
+#include <faiss/FaissHook.h>

 #include <cstring>


--- a/core/src/index/thirdparty/faiss/Index2Layer.cpp
+++ b/core/src/index/thirdparty/faiss/Index2Layer.cpp
@@ -27,7 +27,7 @@
 #include <faiss/impl/AuxIndexStructures.h>
 #include <faiss/IndexFlat.h>
 #include <faiss/utils/distances.h>
-
+#include <faiss/FaissHook.h>

 /*
 #include <faiss/utils/Heap.h>

--- a/core/src/index/thirdparty/faiss/IndexFlat.cpp
+++ b/core/src/index/thirdparty/faiss/IndexFlat.cpp
@@ -16,7 +16,7 @@
 #include <faiss/utils/Heap.h>
 #include <faiss/impl/FaissAssert.h>
 #include <faiss/impl/AuxIndexStructures.h>
-
+#include <faiss/FaissHook.h>

 namespace faiss {


--- a/core/src/index/thirdparty/faiss/IndexHNSW.cpp
+++ b/core/src/index/thirdparty/faiss/IndexHNSW.cpp
@@ -37,7 +37,7 @@
 #include <faiss/IndexIVFPQ.h>
 #include <faiss/Index2Layer.h>
 #include <faiss/impl/AuxIndexStructures.h>
-
+#include <faiss/FaissHook.h>

 extern "C" {

@@ -860,7 +860,7 @@ void IndexHNSWPQ::train(idx_t n, const float* x)
 **************************************************************/


-IndexHNSWSQ::IndexHNSWSQ(int d, ScalarQuantizer::QuantizerType qtype, int M):
+IndexHNSWSQ::IndexHNSWSQ(int d, QuantizerType qtype, int M):
    IndexHNSW (new IndexScalarQuantizer (d, qtype), M)
 {
    is_trained = false;

--- a/core/src/index/thirdparty/faiss/IndexHNSW.h
+++ b/core/src/index/thirdparty/faiss/IndexHNSW.h
@@ -149,7 +149,7 @@ struct IndexHNSWPQ : IndexHNSW {
 */
 struct IndexHNSWSQ : IndexHNSW {
    IndexHNSWSQ();
-    IndexHNSWSQ(int d, ScalarQuantizer::QuantizerType qtype, int M);
+    IndexHNSWSQ(int d, QuantizerType qtype, int M);
 };

 /** 2-level code structure with fast random access

--- a/core/src/index/thirdparty/faiss/IndexIVFFlat.cpp
+++ b/core/src/index/thirdparty/faiss/IndexIVFFlat.cpp
@@ -17,7 +17,7 @@
 #include <faiss/utils/utils.h>
 #include <faiss/impl/FaissAssert.h>
 #include <faiss/impl/AuxIndexStructures.h>
-
+#include <faiss/FaissHook.h>

 namespace faiss {


--- a/core/src/index/thirdparty/faiss/IndexIVFPQ.cpp
+++ b/core/src/index/thirdparty/faiss/IndexIVFPQ.cpp
@@ -22,6 +22,7 @@

 #include <faiss/Clustering.h>
 #include <faiss/IndexFlat.h>
+#include <faiss/FaissHook.h>

 #include <faiss/utils/hamming.h>


--- a/core/src/index/thirdparty/faiss/IndexIVFPQR.cpp
+++ b/core/src/index/thirdparty/faiss/IndexIVFPQR.cpp
@@ -14,7 +14,7 @@
 #include <faiss/utils/distances.h>

 #include <faiss/impl/FaissAssert.h>
-
+#include <faiss/FaissHook.h>

 namespace faiss {


--- a/core/src/index/thirdparty/faiss/IndexSQHybrid.cpp
+++ b/core/src/index/thirdparty/faiss/IndexSQHybrid.cpp
@@ -26,7 +26,7 @@ namespace faiss {

 IndexIVFSQHybrid::IndexIVFSQHybrid (
            Index *quantizer, size_t d, size_t nlist,
-            ScalarQuantizer::QuantizerType qtype,
+            QuantizerType qtype,
            MetricType metric, bool encode_residual)
    : IndexIVF(quantizer, d, nlist, 0, metric),
      sq(d, qtype),
@@ -54,7 +54,7 @@ void IndexIVFSQHybrid::encode_vectors(idx_t n, const float* x,
                                             uint8_t * codes,
                                             bool include_listnos) const
 {
-    std::unique_ptr<ScalarQuantizer::Quantizer> squant (sq.select_quantizer ());
+    std::unique_ptr<Quantizer> squant (sq.select_quantizer ());
    size_t coarse_size = include_listnos ? coarse_code_size () : 0;
    memset(codes, 0, (code_size + coarse_size) * n);

@@ -85,7 +85,7 @@ void IndexIVFSQHybrid::encode_vectors(idx_t n, const float* x,
 void IndexIVFSQHybrid::sa_decode (idx_t n, const uint8_t *codes,
                                                 float *x) const
 {
-    std::unique_ptr<ScalarQuantizer::Quantizer> squant (sq.select_quantizer ());
+    std::unique_ptr<Quantizer> squant (sq.select_quantizer ());
    size_t coarse_size = coarse_code_size ();

 #pragma omp parallel if(n > 1)
@@ -117,7 +117,7 @@ void IndexIVFSQHybrid::add_with_ids
    std::unique_ptr<int64_t []> idx (new int64_t [n]);
    quantizer->assign (n, x, idx.get());
    size_t nadd = 0;
-    std::unique_ptr<ScalarQuantizer::Quantizer> squant(sq.select_quantizer ());
+    std::unique_ptr<Quantizer> squant(sq.select_quantizer ());

 #pragma omp parallel reduction(+: nadd)
    {

--- a/core/src/index/thirdparty/faiss/IndexSQHybrid.h
+++ b/core/src/index/thirdparty/faiss/IndexSQHybrid.h
@@ -15,6 +15,7 @@

 #include <faiss/IndexIVF.h>
 #include <faiss/impl/ScalarQuantizer.h>
+#include <faiss/impl/ScalarQuantizerOp.h>


 namespace faiss {
@@ -30,7 +31,7 @@ struct IndexIVFSQHybrid: IndexIVF {
    bool by_residual;

    IndexIVFSQHybrid(Index *quantizer, size_t d, size_t nlist,
-                            ScalarQuantizer::QuantizerType qtype,
+                            QuantizerType qtype,
                            MetricType metric = METRIC_L2,
                            bool encode_residual = true);


--- a/core/src/index/thirdparty/faiss/IndexScalarQuantizer.cpp
+++ b/core/src/index/thirdparty/faiss/IndexScalarQuantizer.cpp
@@ -18,6 +18,7 @@
 #include <faiss/impl/FaissAssert.h>
 #include <faiss/impl/AuxIndexStructures.h>
 #include <faiss/impl/ScalarQuantizer.h>
+#include <faiss/impl/ScalarQuantizerOp.h>

 namespace faiss {

@@ -28,20 +29,20 @@ namespace faiss {
 ********************************************************************/

 IndexScalarQuantizer::IndexScalarQuantizer
-                      (int d, ScalarQuantizer::QuantizerType qtype,
+                      (int d, QuantizerType qtype,
                       MetricType metric):
          Index(d, metric),
          sq (d, qtype)
 {
    is_trained =
-        qtype == ScalarQuantizer::QT_fp16 ||
-        qtype == ScalarQuantizer::QT_8bit_direct;
+        qtype == QuantizerType::QT_fp16 ||
+        qtype == QuantizerType::QT_8bit_direct;
    code_size = sq.code_size;
 }


 IndexScalarQuantizer::IndexScalarQuantizer ():
-    IndexScalarQuantizer(0, ScalarQuantizer::QT_8bit)
+    IndexScalarQuantizer(0, QuantizerType::QT_8bit)
 {}

 void IndexScalarQuantizer::train(idx_t n, const float* x)
@@ -105,8 +106,7 @@ void IndexScalarQuantizer::search(

 DistanceComputer *IndexScalarQuantizer::get_distance_computer () const
 {
-    ScalarQuantizer::SQDistanceComputer *dc =
-        sq.get_distance_computer (metric_type);
+    SQDistanceComputer *dc = sq.get_distance_computer (metric_type);
    dc->code_size = sq.code_size;
    dc->codes = codes.data();
    return dc;
@@ -122,7 +122,7 @@ void IndexScalarQuantizer::reset()
 void IndexScalarQuantizer::reconstruct_n(
             idx_t i0, idx_t ni, float* recons) const
 {
-    std::unique_ptr<ScalarQuantizer::Quantizer> squant(sq.select_quantizer ());
+    std::unique_ptr<Quantizer> squant(sq.select_quantizer ());
    for (size_t i = 0; i < ni; i++) {
        squant->decode_vector(&codes[(i + i0) * code_size], recons + i * d);
    }
@@ -161,7 +161,7 @@ void IndexScalarQuantizer::sa_decode (idx_t n, const uint8_t *bytes,

 IndexIVFScalarQuantizer::IndexIVFScalarQuantizer (
            Index *quantizer, size_t d, size_t nlist,
-            ScalarQuantizer::QuantizerType qtype,
+            QuantizerType qtype,
            MetricType metric, bool encode_residual)
    : IndexIVF(quantizer, d, nlist, 0, metric),
      sq(d, qtype),
@@ -189,7 +189,7 @@ void IndexIVFScalarQuantizer::encode_vectors(idx_t n, const float* x,
                                             uint8_t * codes,
                                             bool include_listnos) const
 {
-    std::unique_ptr<ScalarQuantizer::Quantizer> squant (sq.select_quantizer ());
+    std::unique_ptr<Quantizer> squant (sq.select_quantizer ());
    size_t coarse_size = include_listnos ? coarse_code_size () : 0;
    memset(codes, 0, (code_size + coarse_size) * n);

@@ -220,7 +220,7 @@ void IndexIVFScalarQuantizer::encode_vectors(idx_t n, const float* x,
 void IndexIVFScalarQuantizer::sa_decode (idx_t n, const uint8_t *codes,
                                                 float *x) const
 {
-    std::unique_ptr<ScalarQuantizer::Quantizer> squant (sq.select_quantizer ());
+    std::unique_ptr<Quantizer> squant (sq.select_quantizer ());
    size_t coarse_size = coarse_code_size ();

 #pragma omp parallel if(n > 1)
@@ -252,7 +252,7 @@ void IndexIVFScalarQuantizer::add_with_ids
    std::unique_ptr<int64_t []> idx (new int64_t [n]);
    quantizer->assign (n, x, idx.get());
    size_t nadd = 0;
-    std::unique_ptr<ScalarQuantizer::Quantizer> squant(sq.select_quantizer ());
+    std::unique_ptr<Quantizer> squant(sq.select_quantizer ());

 #pragma omp parallel reduction(+: nadd)
    {

--- a/core/src/index/thirdparty/faiss/IndexScalarQuantizer.h
+++ b/core/src/index/thirdparty/faiss/IndexScalarQuantizer.h
@@ -15,6 +15,7 @@

 #include <faiss/IndexIVF.h>
 #include <faiss/impl/ScalarQuantizer.h>
+#include <faiss/impl/ScalarQuantizerOp.h>


 namespace faiss {
@@ -44,7 +45,7 @@ struct IndexScalarQuantizer: Index {
     * @param nbits  number of bit per subvector index
     */
    IndexScalarQuantizer (int d,
-                          ScalarQuantizer::QuantizerType qtype,
+                          QuantizerType qtype,
                          MetricType metric = METRIC_L2);

    IndexScalarQuantizer ();
@@ -93,7 +94,7 @@ struct IndexIVFScalarQuantizer: IndexIVF {
    bool by_residual;

    IndexIVFScalarQuantizer(Index *quantizer, size_t d, size_t nlist,
-                            ScalarQuantizer::QuantizerType qtype,
+                            QuantizerType qtype,
                            MetricType metric = METRIC_L2,
                            bool encode_residual = true);


--- a/core/src/index/thirdparty/faiss/Makefile
+++ b/core/src/index/thirdparty/faiss/Makefile
@@ -7,6 +7,7 @@

 HEADERS     = $(wildcard *.h impl/*.h utils/*.h)
 SRC         = $(wildcard *.cpp impl/*.cpp utils/*.cpp)
+AVX512_SRC  = $(wildcard *avx512.cpp impl/*avx512.cpp utils/*avx512.cpp)
 OBJ         = $(SRC:.cpp=.o)
 INSTALLDIRS = $(DESTDIR)$(libdir) $(DESTDIR)$(includedir)/faiss

@@ -41,6 +42,10 @@ libfaiss.$(SHAREDEXT): $(OBJ)
 %.o: %.cpp
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -c $< -o $@

+# AVX-512 objects: generic compile line plus -mavx512f -mavx512dq -mavx512bw. NOTE(review): the generic %.o rule also matches these targets - confirm make picks this rule.
+%avx512.o: %avx512.cpp
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -mavx512f -mavx512dq -mavx512bw -c $< -o $@
+
 %.o: %.cu
 	$(NVCC) $(NVCCFLAGS) -c $< -o $@


--- a/core/src/index/thirdparty/faiss/VectorTransform.cpp
+++ b/core/src/index/thirdparty/faiss/VectorTransform.cpp
@@ -19,6 +19,7 @@
 #include <faiss/utils/utils.h>
 #include <faiss/impl/FaissAssert.h>
 #include <faiss/IndexPQ.h>
+#include <faiss/FaissHook.h>

 using namespace faiss;


--- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.cu
+++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.cu
@@ -43,7 +43,7 @@ GpuIndexIVFSQHybrid::GpuIndexIVFSQHybrid(
  GpuResources* resources,
  int dims,
  int nlist,
-  faiss::ScalarQuantizer::QuantizerType qtype,
+  faiss::QuantizerType qtype,
  faiss::MetricType metric,
  bool encodeResidual,
  GpuIndexIVFSQHybridConfig config) :

--- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.h
+++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.h
@@ -37,7 +37,7 @@ class GpuIndexIVFSQHybrid : public GpuIndexIVF {
    GpuResources* resources,
    int dims,
    int nlist,
-    faiss::ScalarQuantizer::QuantizerType qtype,
+    faiss::QuantizerType qtype,
    faiss::MetricType metric = MetricType::METRIC_L2,
    bool encodeResidual = true,
    GpuIndexIVFSQHybridConfig config =

--- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu
+++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu
@@ -41,7 +41,7 @@ GpuIndexIVFScalarQuantizer::GpuIndexIVFScalarQuantizer(
  GpuResources* resources,
  int dims,
  int nlist,
-  faiss::ScalarQuantizer::QuantizerType qtype,
+  faiss::QuantizerType qtype,
  faiss::MetricType metric,
  bool encodeResidual,
  GpuIndexIVFScalarQuantizerConfig config) :

--- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.h
+++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.h
@@ -37,7 +37,7 @@ class GpuIndexIVFScalarQuantizer : public GpuIndexIVF {
    GpuResources* resources,
    int dims,
    int nlist,
-    faiss::ScalarQuantizer::QuantizerType qtype,
+    faiss::QuantizerType qtype,
    faiss::MetricType metric = MetricType::METRIC_L2,
    bool encodeResidual = true,
    GpuIndexIVFScalarQuantizerConfig config =

--- a/core/src/index/thirdparty/faiss/gpu/impl/GpuScalarQuantizer.cuh
+++ b/core/src/index/thirdparty/faiss/gpu/impl/GpuScalarQuantizer.cuh
@@ -14,14 +14,14 @@

 namespace faiss { namespace gpu {

-inline bool isSQSupported(ScalarQuantizer::QuantizerType qtype) {
+inline bool isSQSupported(QuantizerType qtype) {
  switch (qtype) {
-    case ScalarQuantizer::QuantizerType::QT_8bit:
-    case ScalarQuantizer::QuantizerType::QT_8bit_uniform:
-    case ScalarQuantizer::QuantizerType::QT_8bit_direct:
-    case ScalarQuantizer::QuantizerType::QT_4bit:
-    case ScalarQuantizer::QuantizerType::QT_4bit_uniform:
-    case ScalarQuantizer::QuantizerType::QT_fp16:
+    case QuantizerType::QT_8bit:
+    case QuantizerType::QT_8bit_uniform:
+    case QuantizerType::QT_8bit_direct:
+    case QuantizerType::QT_4bit:
+    case QuantizerType::QT_4bit_uniform:
+    case QuantizerType::QT_fp16:
      return true;
    default:
      return false;
@@ -107,7 +107,7 @@ struct CodecFloat {

 // Arbitrary dimension fp16
 template <>
-struct Codec<ScalarQuantizer::QuantizerType::QT_fp16, 1> {
+struct Codec<(int)QuantizerType::QT_fp16, 1> {
  /// How many dimensions per iteration we are handling for encoding or decoding
  static constexpr int kDimPerIter = 1;

@@ -145,7 +145,7 @@ struct Codec<ScalarQuantizer::QuantizerType::QT_fp16, 1> {

 // dim % 2 == 0, ensures uint32 alignment
 template <>
-struct Codec<ScalarQuantizer::QuantizerType::QT_fp16, 2> {
+struct Codec<(int)QuantizerType::QT_fp16, 2> {
  /// How many dimensions per iteration we are handling for encoding or decoding
  static constexpr int kDimPerIter = 2;

@@ -213,7 +213,7 @@ struct Get8BitType<4> { using T = uint32_t; };

 // Uniform quantization across all dimensions
 template <int DimMultiple>
-struct Codec<ScalarQuantizer::QuantizerType::QT_8bit_uniform, DimMultiple> {
+struct Codec<(int)QuantizerType::QT_8bit_uniform, DimMultiple> {
  /// How many dimensions per iteration we are handling for encoding or decoding
  static constexpr int kDimPerIter = DimMultiple;
  using MemT = typename Get8BitType<DimMultiple>::T;
@@ -307,7 +307,7 @@ struct Codec<ScalarQuantizer::QuantizerType::QT_8bit_uniform, DimMultiple> {

 // Uniform quantization per each dimension
 template <int DimMultiple>
-struct Codec<ScalarQuantizer::QuantizerType::QT_8bit, DimMultiple> {
+struct Codec<(int)QuantizerType::QT_8bit, DimMultiple> {
  /// How many dimensions per iteration we are handling for encoding or decoding
  static constexpr int kDimPerIter = DimMultiple;
  using MemT = typename Get8BitType<DimMultiple>::T;
@@ -421,7 +421,7 @@ struct Codec<ScalarQuantizer::QuantizerType::QT_8bit, DimMultiple> {
 };

 template <>
-struct Codec<ScalarQuantizer::QuantizerType::QT_8bit_direct, 1> {
+struct Codec<(int)QuantizerType::QT_8bit_direct, 1> {
  /// How many dimensions per iteration we are handling for encoding or decoding
  static constexpr int kDimPerIter = 1;

@@ -465,7 +465,7 @@ struct Codec<ScalarQuantizer::QuantizerType::QT_8bit_direct, 1> {

 // Uniform quantization across all dimensions
 template <>
-struct Codec<ScalarQuantizer::QuantizerType::QT_4bit_uniform, 1> {
+struct Codec<(int)QuantizerType::QT_4bit_uniform, 1> {
  /// How many dimensions per iteration we are handling for encoding or decoding
  static constexpr int kDimPerIter = 2;

@@ -525,7 +525,7 @@ struct Codec<ScalarQuantizer::QuantizerType::QT_4bit_uniform, 1> {
 };

 template <>
-struct Codec<ScalarQuantizer::QuantizerType::QT_4bit, 1> {
+struct Codec<(int)QuantizerType::QT_4bit, 1> {
  /// How many dimensions per iteration we are handling for encoding or decoding
  static constexpr int kDimPerIter = 2;


--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFAppend.cu
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFAppend.cu
@@ -286,17 +286,17 @@ runIVFFlatInvertedListAppend(Tensor<int, 1, true>& listIds,
    RUN_APPEND;
  } else {
    switch (scalarQ->qtype) {
-      case ScalarQuantizer::QuantizerType::QT_8bit:
+      case QuantizerType::QT_8bit:
      {
        if (false) {
 //        if (dim % 4 == 0) {
-          Codec<ScalarQuantizer::QuantizerType::QT_8bit, 4>
+          Codec<(int)QuantizerType::QT_8bit, 4>
            codec(scalarQ->code_size,
                  scalarQ->gpuTrained.data(),
                  scalarQ->gpuTrained.data() + dim);
          RUN_APPEND;
        } else {
-          Codec<ScalarQuantizer::QuantizerType::QT_8bit, 1>
+          Codec<(int)QuantizerType::QT_8bit, 1>
            codec(scalarQ->code_size,
                  scalarQ->gpuTrained.data(),
                  scalarQ->gpuTrained.data() + dim);
@@ -304,53 +304,53 @@ runIVFFlatInvertedListAppend(Tensor<int, 1, true>& listIds,
        }
      }
      break;
-      case ScalarQuantizer::QuantizerType::QT_8bit_uniform:
+      case QuantizerType::QT_8bit_uniform:
      {
 //        if (dim % 4 == 0) {
        if (false) {
-          Codec<ScalarQuantizer::QuantizerType::QT_8bit_uniform, 4>
+          Codec<(int)QuantizerType::QT_8bit_uniform, 4>
            codec(scalarQ->code_size, scalarQ->trained[0], scalarQ->trained[1]);
          RUN_APPEND;
        } else {
-          Codec<ScalarQuantizer::QuantizerType::QT_8bit_uniform, 1>
+          Codec<(int)QuantizerType::QT_8bit_uniform, 1>
            codec(scalarQ->code_size, scalarQ->trained[0], scalarQ->trained[1]);
          RUN_APPEND;
        }
      }
      break;
-      case ScalarQuantizer::QuantizerType::QT_fp16:
+      case QuantizerType::QT_fp16:
      {
 //        if (dim % 2 == 0) {
        if (false) {
-          Codec<ScalarQuantizer::QuantizerType::QT_fp16, 2>
+          Codec<(int)QuantizerType::QT_fp16, 2>
            codec(scalarQ->code_size);
          RUN_APPEND;
        } else {
-          Codec<ScalarQuantizer::QuantizerType::QT_fp16, 1>
+          Codec<(int)QuantizerType::QT_fp16, 1>
            codec(scalarQ->code_size);
          RUN_APPEND;
        }
      }
      break;
-      case ScalarQuantizer::QuantizerType::QT_8bit_direct:
+      case QuantizerType::QT_8bit_direct:
      {
-        Codec<ScalarQuantizer::QuantizerType::QT_8bit_direct, 1>
+        Codec<(int)QuantizerType::QT_8bit_direct, 1>
          codec(scalarQ->code_size);
        RUN_APPEND;
      }
      break;
-      case ScalarQuantizer::QuantizerType::QT_4bit:
+      case QuantizerType::QT_4bit:
      {
-        Codec<ScalarQuantizer::QuantizerType::QT_4bit, 1>
+        Codec<(int)QuantizerType::QT_4bit, 1>
          codec(scalarQ->code_size,
                scalarQ->gpuTrained.data(),
                scalarQ->gpuTrained.data() + dim);
        RUN_APPEND;
      }
      break;
-      case ScalarQuantizer::QuantizerType::QT_4bit_uniform:
+      case QuantizerType::QT_4bit_uniform:
      {
-        Codec<ScalarQuantizer::QuantizerType::QT_4bit_uniform, 1>
+        Codec<(int)QuantizerType::QT_4bit_uniform, 1>
          codec(scalarQ->code_size, scalarQ->trained[0], scalarQ->trained[1]);
        RUN_APPEND;
      }

--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cu
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cu
@@ -20,6 +20,7 @@
 #include <faiss/gpu/utils/PtxUtils.cuh>
 #include <faiss/gpu/utils/Reductions.cuh>
 #include <faiss/gpu/utils/StaticUtils.h>
+#include <faiss/impl/ScalarQuantizerOp.h>
 #include <thrust/host_vector.h>

 namespace faiss { namespace gpu {
@@ -181,8 +182,8 @@ runIVFFlatScanTile(Tensor<float, 2, true>& queries,
  // Check the amount of shared memory per block available based on our type is
  // sufficient
  if (scalarQ &&
-      (scalarQ->qtype == ScalarQuantizer::QuantizerType::QT_8bit ||
-       scalarQ->qtype == ScalarQuantizer::QuantizerType::QT_4bit)) {
+      (scalarQ->qtype == QuantizerType::QT_8bit ||
+       scalarQ->qtype == QuantizerType::QT_4bit)) {
    int maxDim = getMaxSharedMemPerBlockCurrentDevice() /
      (sizeof(float) * 2);

@@ -230,18 +231,18 @@ runIVFFlatScanTile(Tensor<float, 2, true>& queries,
    HANDLE_METRICS;
  } else {
    switch (scalarQ->qtype) {
-      case ScalarQuantizer::QuantizerType::QT_8bit:
+      case QuantizerType::QT_8bit:
      {
        // FIXME: investigate 32 bit load perf issues
 //        if (dim % 4 == 0) {
        if (false) {
-          Codec<ScalarQuantizer::QuantizerType::QT_8bit, 4>
+          Codec<(int)QuantizerType::QT_8bit, 4>
            codec(scalarQ->code_size,
                  scalarQ->gpuTrained.data(),
                  scalarQ->gpuTrained.data() + dim);
          HANDLE_METRICS;
        } else {
-          Codec<ScalarQuantizer::QuantizerType::QT_8bit, 1>
+          Codec<(int)QuantizerType::QT_8bit, 1>
            codec(scalarQ->code_size,
                  scalarQ->gpuTrained.data(),
                  scalarQ->gpuTrained.data() + dim);
@@ -249,55 +250,55 @@ runIVFFlatScanTile(Tensor<float, 2, true>& queries,
        }
      }
      break;
-      case ScalarQuantizer::QuantizerType::QT_8bit_uniform:
+      case QuantizerType::QT_8bit_uniform:
      {
        // FIXME: investigate 32 bit load perf issues
        if (false) {
 //        if (dim % 4 == 0) {
-          Codec<ScalarQuantizer::QuantizerType::QT_8bit_uniform, 4>
+          Codec<(int)QuantizerType::QT_8bit_uniform, 4>
            codec(scalarQ->code_size, scalarQ->trained[0], scalarQ->trained[1]);
          HANDLE_METRICS;
        } else {
-          Codec<ScalarQuantizer::QuantizerType::QT_8bit_uniform, 1>
+          Codec<(int)QuantizerType::QT_8bit_uniform, 1>
            codec(scalarQ->code_size, scalarQ->trained[0], scalarQ->trained[1]);
          HANDLE_METRICS;
        }
      }
      break;
-      case ScalarQuantizer::QuantizerType::QT_fp16:
+      case QuantizerType::QT_fp16:
      {
        if (false) {
          // FIXME: investigate 32 bit load perf issues
 //        if (dim % 2 == 0) {
-          Codec<ScalarQuantizer::QuantizerType::QT_fp16, 2>
+          Codec<(int)QuantizerType::QT_fp16, 2>
            codec(scalarQ->code_size);
          HANDLE_METRICS;
        } else {
-          Codec<ScalarQuantizer::QuantizerType::QT_fp16, 1>
+          Codec<(int)QuantizerType::QT_fp16, 1>
            codec(scalarQ->code_size);
          HANDLE_METRICS;
        }
      }
      break;
-      case ScalarQuantizer::QuantizerType::QT_8bit_direct:
+      case QuantizerType::QT_8bit_direct:
      {
-        Codec<ScalarQuantizer::QuantizerType::QT_8bit_direct, 1>
+        Codec<(int)QuantizerType::QT_8bit_direct, 1>
          codec(scalarQ->code_size);
        HANDLE_METRICS;
      }
      break;
-      case ScalarQuantizer::QuantizerType::QT_4bit:
+      case QuantizerType::QT_4bit:
      {
-        Codec<ScalarQuantizer::QuantizerType::QT_4bit, 1>
+        Codec<(int)QuantizerType::QT_4bit, 1>
          codec(scalarQ->code_size,
                scalarQ->gpuTrained.data(),
                scalarQ->gpuTrained.data() + dim);
        HANDLE_METRICS;
      }
      break;
-      case ScalarQuantizer::QuantizerType::QT_4bit_uniform:
+      case QuantizerType::QT_4bit_uniform:
      {
-        Codec<ScalarQuantizer::QuantizerType::QT_4bit_uniform, 1>
+        Codec<(int)QuantizerType::QT_4bit_uniform, 1>
          codec(scalarQ->code_size, scalarQ->trained[0], scalarQ->trained[1]);
        HANDLE_METRICS;
      }

--- a/core/src/index/thirdparty/faiss/impl/PolysemousTraining.cpp
+++ b/core/src/index/thirdparty/faiss/impl/PolysemousTraining.cpp
@@ -22,6 +22,7 @@
 #include <faiss/utils/hamming.h>

 #include <faiss/impl/FaissAssert.h>
+#include <faiss/FaissHook.h>

 /*****************************************
 * Mixed PQ / Hamming

--- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.cpp
+++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.cpp
--- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.h
+++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.h
@@ -10,7 +10,7 @@
 #pragma once

 #include <faiss/IndexIVF.h>
-#include <faiss/impl/AuxIndexStructures.h>
+#include <faiss/impl/ScalarQuantizerOp.h>


 namespace faiss {
@@ -23,30 +23,12 @@ namespace faiss {

 struct ScalarQuantizer {

-    enum QuantizerType {
-        QT_8bit,             ///< 8 bits per component
-        QT_4bit,             ///< 4 bits per component
-        QT_8bit_uniform,     ///< same, shared range for all dimensions
-        QT_4bit_uniform,
-        QT_fp16,
-        QT_8bit_direct,      /// fast indexing of uint8s
-        QT_6bit,             ///< 6 bits per component
-    };
-
    QuantizerType qtype;

    /** The uniform encoder can estimate the range of representable
     * values of the unform encoder using different statistics. Here
     * rs = rangestat_arg */

-    // rangestat_arg.
-    enum RangeStat {
-        RS_minmax,           ///< [min - rs*(max-min), max + rs*(max-min)]
-        RS_meanstd,          ///< [mean - std * rs, mean + std * rs]
-        RS_quantiles,        ///< [Q(rs), Q(1-rs)]
-        RS_optim,            ///< alternate optimization of reconstruction error
-    };
-
    RangeStat rangestat;
    float rangestat_arg;

@@ -85,27 +67,8 @@ struct ScalarQuantizer {
     * computation and inverted list scanning
     *****************************************************/

-    struct Quantizer {
-        // encodes one vector. Assumes code is filled with 0s on input!
-        virtual void encode_vector(const float *x, uint8_t *code) const = 0;
-        virtual void decode_vector(const uint8_t *code, float *x) const = 0;
-
-        virtual ~Quantizer() {}
-    };
-
    Quantizer * select_quantizer() const;

-    struct SQDistanceComputer: DistanceComputer {
-
-        const float *q;
-        const uint8_t *codes;
-        size_t code_size;
-
-        SQDistanceComputer (): q(nullptr), codes (nullptr), code_size (0)
-        {}
-
-    };
-
    SQDistanceComputer *get_distance_computer (MetricType metric = METRIC_L2)
        const;


--- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerCodec.h
+++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerCodec.h
--- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerCodec_avx512.h
+++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerCodec_avx512.h
--- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC.cpp
+++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC.cpp
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// -*- c++ -*-
+
+#include <faiss/impl/ScalarQuantizerDC.h>
+#include <faiss/impl/ScalarQuantizerCodec.h>
+
+namespace faiss {
+
+#ifdef __AVX__
+#define USE_AVX
+#endif
+
+
+/*******************************************************************
+ * ScalarQuantizer Distance Computer
+ ********************************************************************/
+
+/* AVX */
+/// Build the L2 distance computer for the AVX code path.
+/// With USE_AVX defined and `dim` a multiple of 8, SimilarityL2<8> is
+/// instantiated; in every other case SimilarityL2<1> is used.
+SQDistanceComputer *
+sq_get_distance_computer_L2_avx (QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
+#ifdef USE_AVX
+    const bool multiple_of_8 = (dim % 8 == 0);
+    if (multiple_of_8) {
+        return select_distance_computer<SimilarityL2<8>> (qtype, dim, trained);
+    }
+#endif
+    // Fallback: dim is not a multiple of 8, or AVX was not enabled at build time.
+    return select_distance_computer<SimilarityL2<1>> (qtype, dim, trained);
+}
+
+/// Build the inner-product distance computer for the AVX code path.
+/// With USE_AVX defined and `dim` a multiple of 8, SimilarityIP<8> is
+/// instantiated; in every other case SimilarityIP<1> is used.
+SQDistanceComputer *
+sq_get_distance_computer_IP_avx (QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
+#ifdef USE_AVX
+    const bool multiple_of_8 = (dim % 8 == 0);
+    if (multiple_of_8) {
+        return select_distance_computer<SimilarityIP<8>> (qtype, dim, trained);
+    }
+#endif
+    // Fallback: dim is not a multiple of 8, or AVX was not enabled at build time.
+    return select_distance_computer<SimilarityIP<1>> (qtype, dim, trained);
+}
+
+/// Select the quantizer implementation for the AVX code path.
+/// With USE_AVX defined and `dim` a multiple of 8, select_quantizer_1<8>
+/// is used; in every other case select_quantizer_1<1> is used.
+Quantizer *
+sq_select_quantizer_avx (QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
+#ifdef USE_AVX
+    const bool multiple_of_8 = (dim % 8 == 0);
+    if (multiple_of_8) {
+        return select_quantizer_1<8> (qtype, dim, trained);
+    }
+#endif
+    // Fallback: dim is not a multiple of 8, or AVX was not enabled at build time.
+    return select_quantizer_1<1> (qtype, dim, trained);
+}
+
+/* SSE */
+/// Build the L2 distance computer for the SSE code path; this always
+/// instantiates SimilarityL2<1>, regardless of `dim`.
+SQDistanceComputer *
+sq_get_distance_computer_L2_sse (QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
+    SQDistanceComputer *computer =
+        select_distance_computer<SimilarityL2<1>> (qtype, dim, trained);
+    return computer;
+}
+
+/// Build the inner-product distance computer for the SSE code path; this
+/// always instantiates SimilarityIP<1>, regardless of `dim`.
+SQDistanceComputer *
+sq_get_distance_computer_IP_sse (QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
+    SQDistanceComputer *computer =
+        select_distance_computer<SimilarityIP<1>> (qtype, dim, trained);
+    return computer;
+}
+
+/// Select the quantizer implementation for the SSE code path; this always
+/// uses select_quantizer_1<1>, regardless of `dim`.
+Quantizer *
+sq_select_quantizer_sse (QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
+    Quantizer *quantizer = select_quantizer_1<1> (qtype, dim, trained);
+    return quantizer;
+}
+
+} // namespace faiss
--- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC.h
+++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC.h
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// -*- c++ -*-
+
+#pragma once
+
+#include <faiss/impl/ScalarQuantizerOp.h>
+
+namespace faiss {
+
+
+/// AVX code path: L2 distance computer for the given quantizer type,
+/// dimension and trained parameters.
+SQDistanceComputer *
+sq_get_distance_computer_L2_avx (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
+
+/// AVX code path: inner-product distance computer.
+SQDistanceComputer *
+sq_get_distance_computer_IP_avx (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
+
+/// AVX code path: quantizer (encoder/decoder) selection.
+Quantizer *
+sq_select_quantizer_avx (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
+
+
+/// SSE code path: L2 distance computer.
+SQDistanceComputer *
+sq_get_distance_computer_L2_sse (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
+
+/// SSE code path: inner-product distance computer.
+SQDistanceComputer *
+sq_get_distance_computer_IP_sse (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
+
+/// SSE code path: quantizer (encoder/decoder) selection.
+Quantizer *
+sq_select_quantizer_sse (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
+
+} // namespace faiss
--- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx512.cpp
+++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx512.cpp
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// -*- c++ -*-
+
+#include <faiss/impl/ScalarQuantizerDC_avx512.h>
+#include <faiss/impl/ScalarQuantizerCodec_avx512.h>
+
+namespace faiss {
+
+#ifdef __AVX__
+#define USE_AVX
+#endif
+
+#if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__))
+#define USE_AVX_512
+#endif
+
+/*******************************************************************
+ * ScalarQuantizer Distance Computer
+ ********************************************************************/
+
+/// Build the L2 distance computer for the AVX-512 code path.
+/// Tries, in order: SimilarityL2_avx512<16> (USE_AVX_512 and dim % 16 == 0),
+/// SimilarityL2_avx512<8> (USE_AVX and dim % 8 == 0), then
+/// SimilarityL2_avx512<1> as the unconditional fallback.
+SQDistanceComputer *
+sq_get_distance_computer_L2_avx512 (QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
+#ifdef USE_AVX_512
+    if (dim % 16 == 0) {
+        return select_distance_computer_avx512<SimilarityL2_avx512<16>> (qtype, dim, trained);
+    }
+#endif
+#ifdef USE_AVX
+    if (dim % 8 == 0) {
+        return select_distance_computer_avx512<SimilarityL2_avx512<8>> (qtype, dim, trained);
+    }
+#endif
+    // Neither aligned variant applied (or was compiled in).
+    return select_distance_computer_avx512<SimilarityL2_avx512<1>> (qtype, dim, trained);
+}
+
+/// Build the inner-product distance computer for the AVX-512 code path.
+/// Tries, in order: SimilarityIP_avx512<16> (USE_AVX_512 and dim % 16 == 0),
+/// SimilarityIP_avx512<8> (USE_AVX and dim % 8 == 0), then
+/// SimilarityIP_avx512<1> as the unconditional fallback.
+SQDistanceComputer *
+sq_get_distance_computer_IP_avx512 (QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
+#ifdef USE_AVX_512
+    if (dim % 16 == 0) {
+        // The metric here is inner product, so the IP similarity must be
+        // instantiated, exactly like the 8- and 1-wide branches below.
+        return select_distance_computer_avx512<SimilarityIP_avx512<16>> (qtype, dim, trained);
+    } else
+#endif
+#ifdef USE_AVX
+    if (dim % 8 == 0) {
+        return select_distance_computer_avx512<SimilarityIP_avx512<8>> (qtype, dim, trained);
+    } else
+#endif
+    {
+        return select_distance_computer_avx512<SimilarityIP_avx512<1>> (qtype, dim, trained);
+    }
+}
+
+/// Select the quantizer implementation for the AVX-512 code path.
+/// Tries, in order: select_quantizer_1_avx512<16> (USE_AVX_512 and
+/// dim % 16 == 0), select_quantizer_1_avx512<8> (USE_AVX and dim % 8 == 0),
+/// then select_quantizer_1_avx512<1> as the unconditional fallback.
+Quantizer *
+sq_select_quantizer_avx512 (QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
+#ifdef USE_AVX_512
+    if (dim % 16 == 0) {
+        return select_quantizer_1_avx512<16> (qtype, dim, trained);
+    }
+#endif
+#ifdef USE_AVX
+    if (dim % 8 == 0) {
+        return select_quantizer_1_avx512<8> (qtype, dim, trained);
+    }
+#endif
+    // Neither aligned variant applied (or was compiled in).
+    return select_quantizer_1_avx512<1> (qtype, dim, trained);
+}
+
+
+} // namespace faiss
--- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx512.h
+++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx512.h
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// -*- c++ -*-
+
+#pragma once
+
+#include <vector>
+#include <faiss/impl/ScalarQuantizerOp.h>
+
+namespace faiss {
+
+
+/// AVX-512 code path: L2 distance computer for the given quantizer type,
+/// dimension and trained parameters.
+SQDistanceComputer *
+sq_get_distance_computer_L2_avx512 (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
+
+/// AVX-512 code path: inner-product distance computer.
+SQDistanceComputer *
+sq_get_distance_computer_IP_avx512 (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
+
+/// AVX-512 code path: quantizer (encoder/decoder) selection.
+Quantizer *
+sq_select_quantizer_avx512 (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
+
+
+} // namespace faiss
--- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerOp.cpp
+++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerOp.cpp
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// -*- c++ -*-
+
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <algorithm>
+
+#include <omp.h>
+
+#ifdef __SSE__
+#include <immintrin.h>
+#endif
+
+#include <faiss/utils/utils.h>
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/impl/ScalarQuantizerOp.h>
+
+namespace faiss {
+
+#ifdef __AVX__
+#define USE_AVX
+#endif
+
+
+#ifdef USE_AVX
+
+/// Convert a float to a 16-bit half-precision bit pattern with the
+/// _mm_cvtps_ph intrinsic (round-to-nearest, exceptions suppressed).
+uint16_t encode_fp16 (float x) {
+    const __m128 broadcast = _mm_set1_ps (x);
+    const __m128i halves = _mm_cvtps_ph (
+         broadcast, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+    // Only the low 16 bits of the first 32-bit lane are returned.
+    const int low32 = _mm_cvtsi128_si32 (halves);
+    return low32 & 0xffff;
+}
+
+
+/// Convert a 16-bit half-precision bit pattern back to float with the
+/// _mm_cvtph_ps intrinsic.
+float decode_fp16 (uint16_t x) {
+    const __m128i halves = _mm_set1_epi16 (x);
+    const __m128 singles = _mm_cvtph_ps (halves);
+    // The first lane carries the converted value.
+    const float first_lane = _mm_cvtss_f32 (singles);
+    return first_lane;
+}
+
+#else
+
+// non-intrinsic FP16 <-> FP32 code adapted from
+// https://github.com/ispc/ispc/blob/master/stdlib.ispc
+
+/// Reinterpret the 32 bits of `x` as a float (bit copy, no numeric conversion).
+float floatbits (uint32_t x) {
+    // memcpy is the well-defined way to type-pun: reading a uint32_t object
+    // through a float* breaks the strict-aliasing rule even via void*.
+    float f;
+    memcpy (&f, &x, sizeof (f));
+    return f;
+}
+
+/// Return the 32-bit pattern of `f` as an unsigned integer (bit copy, no
+/// numeric conversion).
+uint32_t intbits (float f) {
+    // memcpy is the well-defined way to type-pun: reading a float object
+    // through a uint32_t* breaks the strict-aliasing rule even via void*.
+    uint32_t bits;
+    memcpy (&bits, &f, sizeof (bits));
+    return bits;
+}
+
+
+// Portable (non-intrinsic) float -> half conversion, used when USE_AVX is
+// not defined. Returns the 16-bit pattern: magnitude bits from the code
+// below, with the input's sign bit moved from bit 31 down to bit 15.
+uint16_t encode_fp16 (float f) {
+    // via Fabian "ryg" Giesen.
+    // https://gist.github.com/2156668
+    uint32_t sign_mask = 0x80000000u;
+    int32_t o;
+
+    uint32_t fint = intbits(f);
+    uint32_t sign = fint & sign_mask;
+    fint ^= sign;
+
+    // NOTE all the integer compares in this function can be safely
+    // compiled into signed compares since all operands are below
+    // 0x80000000. Important if you want fast straight SSE2 code (since
+    // there's no unsigned PCMPGTD).
+
+    // Inf or NaN (all exponent bits set)
+    // NaN->qNaN and Inf->Inf
+    // unconditional assignment here, will override with right value for
+    // the regular case below.
+    uint32_t f32infty = 255u << 23;
+    o = (fint > f32infty) ? 0x7e00u : 0x7c00u;
+
+    // (De)normalized number or zero
+    // update fint unconditionally to save the blending; we don't need it
+    // anymore for the Inf/NaN case anyway.
+
+    const uint32_t round_mask = ~0xfffu;
+    const uint32_t magic = 15u << 23;
+
+    // Shift exponent down, denormalize if necessary.
+    // NOTE This represents half-float denormals using single
+    // precision denormals.  The main reason to do this is that
+    // there's no shift with per-lane variable shifts in SSE*, which
+    // we'd otherwise need. It has some funky side effects though:
+    // - This conversion will actually respect the FTZ (Flush To Zero)
+    //   flag in MXCSR - if it's set, no half-float denormals will be
+    //   generated. I'm honestly not sure whether this is good or
+    //   bad. It's definitely interesting.
+    // - If the underlying HW doesn't support denormals (not an issue
+    //   with Intel CPUs, but might be a problem on GPUs or PS3 SPUs),
+    //   you will always get flush-to-zero behavior. This is bad,
+    //   unless you're on a CPU where you don't care.
+    // - Denormals tend to be slow. FP32 denormals are rare in
+    //   practice outside of things like recursive filters in DSP -
+    //   not a typical half-float application. Whether FP16 denormals
+    //   are rare in practice, I don't know. Whatever slow path your
+    //   HW may or may not have for denormals, this may well hit it.
+    float fscale = floatbits(fint & round_mask) * floatbits(magic);
+    fscale = std::min(fscale, floatbits((31u << 23) - 0x1000u));
+    int32_t fint2 = intbits(fscale) - round_mask;
+
+    // Finite inputs replace the Inf/NaN default chosen above.
+    if (fint < f32infty)
+        o = fint2 >> 13; // Take the bits!
+
+    // Re-attach the sign in the half-precision sign position (bit 15).
+    return (o | (sign >> 16));
+}
+
+// Portable (non-intrinsic) half -> float conversion, used when USE_AVX is
+// not defined. Three candidate results are computed (Inf/NaN, zero/denormal,
+// regular) and one is selected from the half's exponent field; the sign bit
+// is then moved from bit 15 up to bit 31.
+float decode_fp16 (uint16_t h) {
+    // https://gist.github.com/2144712
+    // Fabian "ryg" Giesen.
+
+    const uint32_t shifted_exp = 0x7c00u << 13; // exponent mask after shift
+
+    int32_t o = ((int32_t)(h & 0x7fffu)) << 13;     // exponent/mantissa bits
+    int32_t exp = shifted_exp & o;   // just the exponent
+    o += (int32_t)(127 - 15) << 23;        // exponent adjust
+
+    int32_t infnan_val = o + ((int32_t)(128 - 16) << 23);
+    int32_t zerodenorm_val = intbits(
+                 floatbits(o + (1u<<23)) - floatbits(113u << 23));
+    // exp == 0 selects the zero/denormal result, otherwise the regular one.
+    int32_t reg_val = (exp == 0) ? zerodenorm_val : o;
+
+    int32_t sign_bit = ((int32_t)(h & 0x8000u)) << 16;
+    // An all-ones exponent selects the Inf/NaN result.
+    return floatbits(((exp == shifted_exp) ? infnan_val : reg_val) | sign_bit);
+}
+
+#endif
+
+
+/*******************************************************************
+ * Quantizer range training
+ */
+
+/// Square of `x`.
+static float sqr (float x) {
+    const float squared = x * x;
+    return squared;
+}
+
+
+/// Estimate one shared value range from the `n` scalar samples in `x`.
+///
+/// On return `trained` holds exactly two entries: trained[0] is the lower
+/// bound (vmin) and trained[1] is the range width (vmax - vmin).
+///
+/// @param rs       statistic used to derive the range
+/// @param rs_arg   parameter of that statistic (margin factor for
+///                 RS_minmax, number of standard deviations for RS_meanstd,
+///                 quantile fraction for RS_quantiles; unused by RS_optim)
+/// @param n        number of samples in `x`
+/// @param k        number of quantization levels; only read by RS_optim
+/// @param x        input samples, `n` floats
+/// @param trained  output vector, resized to 2
+/// @throws through FAISS_THROW_MSG when `rs` is not a known RangeStat
+void train_Uniform(RangeStat rs, float rs_arg,
+                   idx_t n, int k, const float *x,
+                   std::vector<float> & trained)
+{
+    trained.resize (2);
+    float & vmin = trained[0];
+    float & vmax = trained[1];
+
+    if (rs == RangeStat::RS_minmax) {
+        // Observed [min, max], widened on both sides by rs_arg * (max - min).
+        vmin = HUGE_VAL; vmax = -HUGE_VAL;
+        for (size_t i = 0; i < n; i++) {
+            if (x[i] < vmin) vmin = x[i];
+            if (x[i] > vmax) vmax = x[i];
+        }
+        float vexp = (vmax - vmin) * rs_arg;
+        vmin -= vexp;
+        vmax += vexp;
+    } else if (rs == RangeStat::RS_meanstd) {
+        // mean +/- rs_arg standard deviations.
+        double sum = 0, sum2 = 0;
+        for (size_t i = 0; i < n; i++) {
+            sum += x[i];
+            sum2 += x[i] * x[i];
+        }
+        float mean = sum / n;
+        float var = sum2 / n - mean * mean;
+        // A non-positive variance falls back to a unit standard deviation.
+        float std = var <= 0 ? 1.0 : sqrt(var);
+
+        vmin = mean - std * rs_arg ;
+        vmax = mean + std * rs_arg ;
+    } else if (rs == RangeStat::RS_quantiles) {
+        // Trim rs_arg * n samples from each end of the sorted data.
+        std::vector<float> x_copy(n);
+        memcpy(x_copy.data(), x, n * sizeof(*x));
+        // TODO just do a quickselect
+        std::sort(x_copy.begin(), x_copy.end());
+        int o = int(rs_arg * n);
+        if (o < 0) o = 0;
+        if (o > n - o) o = n / 2;
+        vmin = x_copy[o];
+        vmax = x_copy[n - 1 - o];
+
+    } else if (rs == RangeStat::RS_optim) {
+        // Alternate between assigning each sample to its nearest level
+        // (a * ni + b) and re-fitting a, b by least squares.
+        float a, b;
+        float sx = 0;
+        {
+            vmin = HUGE_VAL, vmax = -HUGE_VAL;
+            for (size_t i = 0; i < n; i++) {
+                if (x[i] < vmin) vmin = x[i];
+                if (x[i] > vmax) vmax = x[i];
+                sx += x[i];
+            }
+            b = vmin;
+            a = (vmax - vmin) / (k - 1);
+        }
+        int verbose = false;
+        int niter = 2000;
+        float last_err = -1;
+        int iter_last_err = 0;
+        for (int it = 0; it < niter; it++) {
+            float sn = 0, sn2 = 0, sxn = 0, err1 = 0;
+
+            for (idx_t i = 0; i < n; i++) {
+                float xi = x[i];
+                float ni = floor ((xi - b) / a + 0.5);
+                if (ni < 0) ni = 0;
+                if (ni >= k) ni = k - 1;
+                err1 += sqr (xi - (ni * a + b));
+                sn  += ni;
+                sn2 += ni * ni;
+                sxn += ni * xi;
+            }
+
+            // Stop once the error has been unchanged for 16 iterations.
+            if (err1 == last_err) {
+                iter_last_err ++;
+                if (iter_last_err == 16) break;
+            } else {
+                last_err = err1;
+                iter_last_err = 0;
+            }
+
+            float det = sqr (sn) - sn2 * n;
+
+            b = (sn * sxn - sn2 * sx) / det;
+            a = (sn * sx - n * sxn) / det;
+            if (verbose) {
+                printf ("it %d, err1=%g            \r", it, err1);
+                fflush(stdout);
+            }
+        }
+        if (verbose) printf("\n");
+
+        vmin = b;
+        vmax = b + a * (k - 1);
+
+    } else {
+        // The value being rejected is the range statistic, not a qtype.
+        FAISS_THROW_MSG ("Invalid range stat");
+    }
+    // Store the width rather than the upper bound.
+    vmax -= vmin;
+}
+
+/// Estimate a separate value range for each of the `d` dimensions.
+///
+/// `x` is read as `n` consecutive vectors of `d` floats. On return
+/// `trained` has 2 * d entries: trained[j] is the lower bound of
+/// dimension j and trained[d + j] is its range width.
+///
+/// @param rs       statistic used to derive each range
+/// @param rs_arg   parameter of that statistic
+/// @param n        number of training vectors
+/// @param d        vector dimension
+/// @param k        number of quantization levels, forwarded to train_Uniform
+/// @param x        training data, n * d floats
+/// @param trained  output vector, resized to 2 * d
+void train_NonUniform(RangeStat rs, float rs_arg,
+                      idx_t n, int d, int k, const float *x,
+                      std::vector<float> & trained)
+{
+    trained.resize (2 * d);
+    float * vmin = trained.data();
+    float * vmax = trained.data() + d;
+    if (rs == RangeStat::RS_minmax) {
+        // Seed min/max with vector 0, then scan the remaining vectors.
+        memcpy (vmin, x, sizeof(*x) * d);
+        memcpy (vmax, x, sizeof(*x) * d);
+        for (size_t i = 1; i < n; i++) {
+            const float *xi = x + i * d;
+            for (size_t j = 0; j < d; j++) {
+                if (xi[j] < vmin[j]) vmin[j] = xi[j];
+                if (xi[j] > vmax[j]) vmax[j] = xi[j];
+            }
+        }
+        // vdiff aliases vmax: the upper bound is overwritten by the width.
+        float *vdiff = vmax;
+        for (size_t j = 0; j < d; j++) {
+            float vexp = (vmax[j] - vmin[j]) * rs_arg;
+            vmin[j] -= vexp;
+            vmax[j] += vexp;
+            vdiff [j] = vmax[j] - vmin[j];
+        }
+    } else {
+        // Transpose so each dimension's samples are contiguous. The copy
+        // must start at vector 0; starting at 1 would leave the first
+        // sample of every dimension as a zero.
+        std::vector<float> xt(n * d);
+        for (size_t i = 0; i < n; i++) {
+            const float *xi = x + i * d;
+            for (size_t j = 0; j < d; j++) {
+                xt[j * n + i] = xi[j];
+            }
+        }
+#pragma omp parallel for
+        for (size_t j = 0; j < d; j++) {
+            // Scratch output is declared inside the loop so each thread
+            // has its own copy; a vector shared across threads would race.
+            std::vector<float> trained_d(2);
+            train_Uniform(rs, rs_arg,
+                          n, k, xt.data() + j * n,
+                          trained_d);
+            vmin[j] = trained_d[0];
+            vmax[j] = trained_d[1];
+        }
+    }
+}
+
+
+} // namespace faiss
--- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerOp.h
+++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerOp.h
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// -*- c++ -*-
+
+#pragma once
+
+#include <cstdio>
+#include <algorithm>
+
+#include <omp.h>
+
+#ifdef __SSE__
+#include <immintrin.h>
+#endif
+
+#include <faiss/utils/utils.h>
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/impl/AuxIndexStructures.h>
+
+namespace faiss {
+
+typedef Index::idx_t idx_t;
+
+/// Per-component encodings available to the scalar quantizer.
+enum class QuantizerType {
+    QT_8bit = 0,         ///< 8 bits per component
+    QT_4bit,             ///< 4 bits per component
+    QT_8bit_uniform,     ///< same, shared range for all dimensions
+    QT_4bit_uniform,     ///< presumably 4 bits per component with a shared range
+    QT_fp16,             ///< presumably a 16-bit float per component
+    QT_8bit_direct,      ///< fast indexing of uint8s
+    QT_6bit,             ///< 6 bits per component
+};
+
+/// Statistic used to estimate the range of representable values.
+/// In the formulas below, `rs` is the accompanying rangestat_arg value.
+enum class RangeStat {
+    RS_minmax = 0,       ///< [min - rs*(max-min), max + rs*(max-min)]
+    RS_meanstd,          ///< [mean - std * rs, mean + std * rs]
+    RS_quantiles,        ///< [Q(rs), Q(1-rs)]
+    RS_optim,            ///< alternate optimization of reconstruction error
+};
+
+/// Abstract interface for encoding a float vector to a byte code and
+/// decoding it back. Concrete implementations are defined elsewhere.
+struct Quantizer {
+    // encodes one vector. Assumes code is filled with 0s on input!
+    virtual void encode_vector(const float *x, uint8_t *code) const = 0;
+    // decodes one code back into the float vector x
+    virtual void decode_vector(const uint8_t *code, float *x) const = 0;
+
+    // virtual so implementations can be destroyed through a Quantizer*
+    virtual ~Quantizer() {}
+};
+
+/// DistanceComputer specialization for scalar-quantized codes. It only
+/// adds state; the distance methods come from subclasses defined elsewhere.
+struct SQDistanceComputer: DistanceComputer {
+    const float *q;          // presumably the current query vector -- confirm against users
+    const uint8_t *codes;    // presumably the encoded database vectors -- confirm against users
+    size_t code_size;        // presumably the size in bytes of one code -- confirm against users
+
+    // All members start out null / zero.
+    SQDistanceComputer (): q(nullptr), codes (nullptr), code_size (0)
+    {}
+};
+
+/// Convert a float to its 16-bit half-precision bit pattern.
+extern uint16_t encode_fp16 (float x);
+/// Convert a 16-bit half-precision bit pattern back to a float.
+extern float decode_fp16 (uint16_t x);
+
+/// Estimate one shared range from n scalar samples; `trained` receives
+/// two values: the lower bound and the range width.
+extern void train_Uniform(RangeStat rs, float rs_arg,
+                   idx_t n, int k, const float *x,
+                   std::vector<float> & trained);
+/// Estimate one range per dimension from n vectors of d floats; `trained`
+/// receives 2 * d values: d lower bounds followed by d range widths.
+extern void train_NonUniform(RangeStat rs, float rs_arg,
+                      idx_t n, int d, int k, const float *x,
+                      std::vector<float> & trained);
+
+
+} // namespace faiss
--- a/core/src/index/thirdparty/faiss/impl/lattice_Zn.cpp
+++ b/core/src/index/thirdparty/faiss/impl/lattice_Zn.cpp
@@ -20,6 +20,7 @@
 #include <algorithm>

 #include <faiss/utils/distances.h>
+#include <faiss/FaissHook.h>

 namespace faiss {


--- a/core/src/index/thirdparty/faiss/index_factory.cpp
+++ b/core/src/index/thirdparty/faiss/index_factory.cpp
@@ -195,12 +195,12 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
            }
        } else if (!index && (stok == "SQ8" || stok == "SQ4" || stok == "SQ6" ||
                              stok == "SQfp16")) {
-            ScalarQuantizer::QuantizerType qt =
-                stok == "SQ8" ? ScalarQuantizer::QT_8bit :
-                stok == "SQ6" ? ScalarQuantizer::QT_6bit :
-                stok == "SQ4" ? ScalarQuantizer::QT_4bit :
-                stok == "SQfp16" ? ScalarQuantizer::QT_fp16 :
-                ScalarQuantizer::QT_4bit;
+            QuantizerType qt =
+                stok == "SQ8" ? QuantizerType::QT_8bit :
+                stok == "SQ6" ? QuantizerType::QT_6bit :
+                stok == "SQ4" ? QuantizerType::QT_4bit :
+                stok == "SQfp16" ? QuantizerType::QT_fp16 :
+                QuantizerType::QT_4bit;
            if (coarse_quantizer) {
                FAISS_THROW_IF_NOT (!use_2layer);
                IndexIVFScalarQuantizer *index_ivf =
@@ -216,12 +216,12 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
            }
        } else if (!index && (stok == "SQ8Hybrid" || stok == "SQ4Hybrid" || stok == "SQ6Hybrid" ||
                              stok == "SQfp16Hybrid")) {
-            ScalarQuantizer::QuantizerType qt =
-                    stok == "SQ8Hybrid" ? ScalarQuantizer::QT_8bit :
-                    stok == "SQ6Hybrid" ? ScalarQuantizer::QT_6bit :
-                    stok == "SQ4Hybrid" ? ScalarQuantizer::QT_4bit :
-                    stok == "SQfp16Hybrid" ? ScalarQuantizer::QT_fp16 :
-                    ScalarQuantizer::QT_4bit;
+            QuantizerType qt =
+                    stok == "SQ8Hybrid" ? QuantizerType::QT_8bit :
+                    stok == "SQ6Hybrid" ? QuantizerType::QT_6bit :
+                    stok == "SQ4Hybrid" ? QuantizerType::QT_4bit :
+                    stok == "SQfp16Hybrid" ? QuantizerType::QT_fp16 :
+                    QuantizerType::QT_4bit;
            FAISS_THROW_IF_NOT_MSG(coarse_quantizer,
                                   "SQ Hybrid only with an IVF");
            FAISS_THROW_IF_NOT (!use_2layer);
@@ -299,7 +299,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
        } else if (!index &&
                   sscanf (tok, "HNSW%d_SQ%d", &M, &pq_m) == 2 &&
                   pq_m == 8) {
-            index_1 = new IndexHNSWSQ (d, ScalarQuantizer::QT_8bit, M);
+            index_1 = new IndexHNSWSQ (d, QuantizerType::QT_8bit, M);
        } else if (!index && (stok == "LSH" || stok == "LSHr" ||
                              stok == "LSHrt" || stok == "LSHt")) {
            bool rotate_data = strstr(tok, "r") != nullptr;

--- a/core/src/index/thirdparty/faiss/utils/distances.cpp
+++ b/core/src/index/thirdparty/faiss/utils/distances.cpp
@@ -16,6 +16,7 @@

 #include <omp.h>

+#include <faiss/FaissHook.h>
 #include <faiss/impl/AuxIndexStructures.h>
 #include <faiss/impl/FaissAssert.h>
 #include <faiss/utils/ConcurrentBitset.h>

--- a/core/src/index/thirdparty/faiss/utils/distances.h
+++ b/core/src/index/thirdparty/faiss/utils/distances.h
@@ -24,29 +24,52 @@ namespace faiss {
 * Optimized distance/norm/inner prod computations
 *********************************************************/

-
+#ifdef __AVX__
 /// Squared L2 distance between two vectors
-float fvec_L2sqr (
+float fvec_L2sqr_avx (
        const float * x,
        const float * y,
        size_t d);

 /// inner product
-float  fvec_inner_product (
+float  fvec_inner_product_avx (
        const float * x,
        const float * y,
        size_t d);

 /// L1 distance
-float fvec_L1 (
+float fvec_L1_avx (
+        const float * x,
+        const float * y,
+        size_t d);
+
+float fvec_Linf_avx (
+        const float * x,
+        const float * y,
+        size_t d);
+#endif
+
+#ifdef __SSE__
+float fvec_L2sqr_sse (
+        const float * x,
+        const float * y,
+        size_t d);
+
+float  fvec_inner_product_sse (
+        const float * x,
+        const float * y,
+        size_t d);
+
+float fvec_L1_sse (
        const float * x,
        const float * y,
        size_t d);

-float fvec_Linf (
+float fvec_Linf_sse (
        const float * x,
        const float * y,
        size_t d);
+#endif

 float fvec_jaccard (
        const float * x,

--- a/core/src/index/thirdparty/faiss/utils/distances_avx512.h
+++ b/core/src/index/thirdparty/faiss/utils/distances_avx512.h
--- a/core/src/index/thirdparty/faiss/utils/distances_simd.cpp
+++ b/core/src/index/thirdparty/faiss/utils/distances_simd.cpp
--- a/core/src/index/thirdparty/faiss/utils/distances_simd_avx512.cpp
+++ b/core/src/index/thirdparty/faiss/utils/distances_simd_avx512.cpp
--- a/core/src/index/thirdparty/faiss/utils/extra_distances.cpp
+++ b/core/src/index/thirdparty/faiss/utils/extra_distances.cpp
@@ -12,7 +12,7 @@
 #include <cmath>
 #include <omp.h>

-
+#include <faiss/FaissHook.h>
 #include <faiss/utils/utils.h>
 #include <faiss/impl/FaissAssert.h>
 #include <faiss/impl/AuxIndexStructures.h>

--- a/core/src/index/thirdparty/faiss/utils/instruction_set.h
+++ b/core/src/index/thirdparty/faiss/utils/instruction_set.h
--- a/core/src/index/unittest/CMakeLists.txt
+++ b/core/src/index/unittest/CMakeLists.txt
@@ -117,6 +117,12 @@ if (KNOWHERE_GPU_VERSION)
    target_link_libraries(test_customized_index ${depend_libs} ${unittest_libs} ${basic_libs})
 endif ()

+#<INSTRUCTIONSET-TEST>
+if (NOT TARGET test_instructionset)  # define the test executable only if it does not exist yet
+    add_executable(test_instructionset test_instructionset.cpp)
+endif ()
+target_link_libraries(test_instructionset ${depend_libs} ${unittest_libs})  # runs whether or not the target was just created
+
 if (NOT TARGET test_knowhere_common)
    add_executable(test_knowhere_common test_common.cpp ${util_srcs})
 endif ()

--- a/core/src/index/unittest/test_instructionset.cpp
+++ b/core/src/index/unittest/test_instructionset.cpp
--- a/core/src/server/Config.cpp
+++ b/core/src/server/Config.cpp
--- a/core/src/server/Config.h
+++ b/core/src/server/Config.h
--- a/core/src/wrapper/KnowhereResource.cpp
+++ b/core/src/wrapper/KnowhereResource.cpp
--- a/core/unittest/server/test_config.cpp
+++ b/core/unittest/server/test_config.cpp