Unverified commit ff1ec490, authored by Dong Daxiang, committed by GitHub

Merge pull request #327 from wangjiawei04/develop

Add quantization to General Dist KV Op
......@@ -56,6 +56,7 @@ message ResourceConf {
optional string general_model_file = 4;
optional string cube_config_path = 5;
optional string cube_config_file = 6;
optional int32 cube_quant_bits = 7; // set 0 if no quant.
};
// DAG node dependency info
......
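For reference, the new cube_quant_bits field is read from resource.prototxt. A minimal sketch, showing only the fields visible in this hunk (the paths and values here are illustrative, not part of this change):

general_model_file: "general_model.prototxt"
cube_config_path: "./cube_conf"
cube_config_file: "cube.conf"
cube_quant_bits: 8  # 0 disables quantization; 8 is currently the only quantized mode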
FILE(GLOB op_srcs ${CMAKE_CURRENT_LIST_DIR}/*.cpp)
FILE(GLOB op_srcs ${CMAKE_CURRENT_LIST_DIR}/*.cpp ${CMAKE_CURRENT_LIST_DIR}/../../predictor/tools/quant.cpp)
LIST(APPEND serving_srcs ${op_srcs})
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "core/general-server/op/general_dist_kv_quant_infer_op.h"
#include <algorithm>
#include <iostream>
#include <memory>
#include <sstream>
#include <unordered_map>
#include <utility>
#include "core/cube/cube-api/include/cube_api.h"
#include "core/predictor/framework/infer.h"
#include "core/predictor/framework/memory.h"
#include "core/predictor/framework/resource.h"
#include "core/predictor/tools/quant.h"
#include "core/util/include/timer.h"
namespace baidu {
namespace paddle_serving {
namespace serving {
using baidu::paddle_serving::Timer;
using baidu::paddle_serving::predictor::MempoolWrapper;
using baidu::paddle_serving::predictor::general_model::Tensor;
using baidu::paddle_serving::predictor::general_model::Response;
using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::FetchInst;
using baidu::paddle_serving::predictor::InferManager;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
int GeneralDistKVQuantInferOp::inference() {
VLOG(2) << "Going to run inference";
const GeneralBlob *input_blob = get_depend_argument<GeneralBlob>(pre_name());
VLOG(2) << "Get precedent op name: " << pre_name();
GeneralBlob *output_blob = mutable_data<GeneralBlob>();
if (!input_blob) {
LOG(ERROR) << "Failed mutable depended argument, op:" << pre_name();
return -1;
}
const TensorVector *in = &input_blob->tensor_vector;
TensorVector *out = &output_blob->tensor_vector;
int batch_size = input_blob->GetBatchSize();
VLOG(2) << "input batch size: " << batch_size;
std::vector<uint64_t> keys;
std::vector<rec::mcube::CubeValue> values;
int sparse_count = 0;
int dense_count = 0;
std::vector<std::pair<int64_t *, size_t>> dataptr_size_pairs;
size_t key_len = 0;
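// Partition inputs: INT64 tensors hold sparse feature ids (cube keys);
// everything else is dense and passes through to inference unchanged.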
for (size_t i = 0; i < in->size(); ++i) {
if (in->at(i).dtype != paddle::PaddleDType::INT64) {
++dense_count;
continue;
}
++sparse_count;
size_t elem_num = 1;
for (size_t s = 0; s < in->at(i).shape.size(); ++s) {
elem_num *= in->at(i).shape[s];
}
key_len += elem_num;
int64_t *data_ptr = static_cast<int64_t *>(in->at(i).data.data());
dataptr_size_pairs.push_back(std::make_pair(data_ptr, elem_num));
}
keys.resize(key_len);
int key_idx = 0;
for (size_t i = 0; i < dataptr_size_pairs.size(); ++i) {
std::copy(dataptr_size_pairs[i].first,
dataptr_size_pairs[i].first + dataptr_size_pairs[i].second,
keys.begin() + key_idx);
key_idx += dataptr_size_pairs[i].second;
}
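// Look up all collected keys in one batched request against the first cube table.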
rec::mcube::CubeAPI *cube = rec::mcube::CubeAPI::instance();
std::vector<std::string> table_names = cube->get_table_names();
if (table_names.size() == 0) {
LOG(ERROR) << "cube init error or cube config not given.";
return -1;
}
int ret = cube->seek(table_names[0], keys, &values);
if (ret != 0) {
  LOG(ERROR) << "cube seek failed, error code: " << ret;
  return -1;
}
if (values.size() != keys.size() || values[0].buff.size() == 0) {
  LOG(ERROR) << "cube returned no value for the given keys";
  return -1;
}
TensorVector sparse_out;
sparse_out.resize(sparse_count);
TensorVector dense_out;
dense_out.resize(dense_count);
int cube_val_idx = 0;
int sparse_idx = 0;
int dense_idx = 0;
std::unordered_map<int, int> in_out_map;
baidu::paddle_serving::predictor::Resource &resource =
baidu::paddle_serving::predictor::Resource::instance();
std::shared_ptr<PaddleGeneralModelConfig> model_config =
resource.get_general_model_config();
int cube_quant_bits = resource.get_cube_quant_bits();
size_t EMBEDDING_SIZE = 0;
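// A quantized cube record carries min and max as two leading floats followed
// by one byte per element; a raw record is just floats.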
if (cube_quant_bits == 0) {
EMBEDDING_SIZE = values[0].buff.size() / sizeof(float);
} else {
EMBEDDING_SIZE = values[0].buff.size() - 2 * sizeof(float);
}
for (size_t i = 0; i < in->size(); ++i) {
if (in->at(i).dtype != paddle::PaddleDType::INT64) {
dense_out[dense_idx] = in->at(i);
++dense_idx;
continue;
}
sparse_out[sparse_idx].lod.resize(in->at(i).lod.size());
for (size_t x = 0; x < sparse_out[sparse_idx].lod.size(); ++x) {
sparse_out[sparse_idx].lod[x].resize(in->at(i).lod[x].size());
std::copy(in->at(i).lod[x].begin(),
in->at(i).lod[x].end(),
sparse_out[sparse_idx].lod[x].begin());
}
sparse_out[sparse_idx].dtype = paddle::PaddleDType::FLOAT32;
sparse_out[sparse_idx].shape.push_back(
sparse_out[sparse_idx].lod[0].back());
sparse_out[sparse_idx].shape.push_back(EMBEDDING_SIZE);
sparse_out[sparse_idx].name = model_config->_feed_name[i];
sparse_out[sparse_idx].data.Resize(sparse_out[sparse_idx].lod[0].back() *
EMBEDDING_SIZE * sizeof(float));
float *dst_ptr = static_cast<float *>(sparse_out[sparse_idx].data.data());
for (int x = 0; x < sparse_out[sparse_idx].lod[0].back(); ++x) {
float *data_ptr = dst_ptr + x * EMBEDDING_SIZE;
if (cube_quant_bits == 0) {
memcpy(data_ptr,
values[cube_val_idx].buff.data(),
values[cube_val_idx].buff.size());
} else {
// min (float), max (float), num, num, num... (Byte)
size_t num_of_float =
values[cube_val_idx].buff.size() - 2 * sizeof(float);
float *float_ptr = new float[num_of_float];
char *src_ptr = new char[values[cube_val_idx].buff.size()];
memcpy(src_ptr,
values[cube_val_idx].buff.data(),
values[cube_val_idx].buff.size());
float *minmax = reinterpret_cast<float *>(src_ptr);
dequant(src_ptr + 2 * sizeof(float),
float_ptr,
minmax[0],
minmax[1],
num_of_float,
cube_quant_bits);
memcpy(data_ptr, float_ptr, sizeof(float) * num_of_float);
      delete[] float_ptr;
      delete[] src_ptr;
}
cube_val_idx++;
}
++sparse_idx;
}
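// Assemble the final feed: dense tensors first, then the dequantized sparse
// embeddings.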
TensorVector infer_in;
infer_in.insert(infer_in.end(), dense_out.begin(), dense_out.end());
infer_in.insert(infer_in.end(), sparse_out.begin(), sparse_out.end());
output_blob->SetBatchSize(batch_size);
VLOG(2) << "infer batch size: " << batch_size;
Timer timeline;
int64_t start = timeline.TimeStampUS();
timeline.Start();
if (InferManager::instance().infer(
GENERAL_MODEL_NAME, &infer_in, out, batch_size)) {
LOG(ERROR) << "Failed do infer in fluid model: " << GENERAL_MODEL_NAME;
return -1;
}
int64_t end = timeline.TimeStampUS();
CopyBlobInfo(input_blob, output_blob);
AddBlobInfo(output_blob, start);
AddBlobInfo(output_blob, end);
return 0;
}
DEFINE_OP(GeneralDistKVQuantInferOp);
} // namespace serving
} // namespace paddle_serving
} // namespace baidu
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#ifdef BCLOUD
#ifdef WITH_GPU
#include "paddle/paddle_inference_api.h"
#else
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#endif
#else
#include "paddle_inference_api.h" // NOLINT
#endif
#include "core/general-server/general_model_service.pb.h"
#include "core/general-server/op/general_infer_helper.h"
namespace baidu {
namespace paddle_serving {
namespace serving {
class GeneralDistKVQuantInferOp
: public baidu::paddle_serving::predictor::OpWithChannel<GeneralBlob> {
public:
typedef std::vector<paddle::PaddleTensor> TensorVector;
DECLARE_OP(GeneralDistKVQuantInferOp);
int inference();
};
} // namespace serving
} // namespace paddle_serving
} // namespace baidu
......@@ -151,6 +151,18 @@ int Resource::initialize(const std::string& path, const std::string& file) {
std::string cube_config_fullpath = "./" + resource_conf.cube_config_path() +
"/" + resource_conf.cube_config_file();
this->cube_config_fullpath = cube_config_fullpath;
this->cube_quant_bits = resource_conf.has_cube_quant_bits()
? resource_conf.cube_quant_bits()
: 0;
if (this->cube_quant_bits != 0 && this->cube_quant_bits != 8) {
LOG(ERROR) << "Cube quant bits illegal! should be 0 or 8.";
return -1;
}
if (this->cube_quant_bits == 0) {
LOG(INFO) << "cube quant mode OFF";
} else {
LOG(INFO) << "cube quant mode ON, quant bits: " << this->cube_quant_bits;
}
}
THREAD_SETSPECIFIC(_tls_bspec_key, NULL);
......@@ -258,38 +270,6 @@ int Resource::general_model_initialize(const std::string& path,
return 0;
}
int Resource::cube_initialize(const std::string& path,
const std::string& file) {
// cube
if (!FLAGS_enable_cube) {
return 0;
}
ResourceConf resource_conf;
if (configure::read_proto_conf(path, file, &resource_conf) != 0) {
LOG(ERROR) << "Failed initialize resource from: " << path << "/" << file;
return -1;
}
int err = 0;
std::string cube_config_file = resource_conf.cube_config_file();
if (err != 0) {
LOG(ERROR) << "reade cube_config_file failed, path[" << path << "], file["
<< cube_config_file << "]";
return -1;
}
err = CubeAPI::instance()->init(cube_config_file.c_str());
if (err != 0) {
LOG(ERROR) << "failed initialize cube, config: " << cube_config_file
<< " error code : " << err;
return -1;
}
LOG(INFO) << "Successfully initialize cube";
return 0;
}
int Resource::thread_initialize() {
// mempool
if (MempoolWrapper::instance().thread_initialize() != 0) {
......@@ -373,6 +353,7 @@ int Resource::thread_clear() {
// ...
return 0;
}
size_t Resource::get_cube_quant_bits() { return this->cube_quant_bits; }
int Resource::reload() {
if (FLAGS_enable_model_toolkit && InferManager::instance().reload() != 0) {
......
......@@ -82,7 +82,6 @@ class Resource {
}
int initialize(const std::string& path, const std::string& file);
int cube_initialize(const std::string& path, const std::string& file);
int general_model_initialize(const std::string& path,
const std::string& file);
......@@ -104,11 +103,13 @@ class Resource {
return reinterpret_cast<DynamicResource*>(
THREAD_GETSPECIFIC(_tls_bspec_key));
}
size_t get_cube_quant_bits();
private:
int thread_finalize() { return 0; }
std::shared_ptr<PaddleGeneralModelConfig> _config;
std::string cube_config_fullpath;
int cube_quant_bits;  // 0 if quantization is disabled
THREAD_KEY_T _tls_bspec_key;
};
......
......@@ -202,14 +202,6 @@ int main(int argc, char** argv) {
}
VLOG(2) << "Succ call pthread worker start function";
if (Resource::instance().cube_initialize(FLAGS_resource_path,
FLAGS_resource_file) != 0) {
LOG(ERROR) << "Failed initialize cube, conf: " << FLAGS_resource_path << "/"
<< FLAGS_resource_file;
return -1;
}
VLOG(2) << "Succ initialize cube";
#ifndef BCLOUD
if (Resource::instance().general_model_initialize(FLAGS_resource_path,
......
set(seq_gen_src ${CMAKE_CURRENT_LIST_DIR}/seq_generator.cpp ${CMAKE_CURRENT_LIST_DIR}/seq_file.cpp)
set(seq_gen_src ${CMAKE_CURRENT_LIST_DIR}/seq_generator.cpp ${CMAKE_CURRENT_LIST_DIR}/seq_file.cpp ${CMAKE_CURRENT_LIST_DIR}/quant.cpp)
LIST(APPEND seq_gen_src ${PROTO_SRCS})
add_executable(seq_generator ${seq_gen_src})
target_link_libraries(seq_generator protobuf -lpthread)
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "quant.h"
#include <algorithm>
#include <cmath>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <vector>
#include "seq_file.h"
using paddle::framework::proto::VarType;
float compute_loss(float* a, float* b, int emb_size) {
float sum = 0;
for (size_t i = 0; i < emb_size; i++) {
sum += (a[i] - b[i]) * (a[i] - b[i]);
}
return sum;
}
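// transfer() simulates quantization: snap each value to its nearest bits-bit
// bucket in [min, max], then map the bucket back to a float.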
float* transfer(
float* in, float* out, float min, float max, int emb_size, int bits) {
float scale = (max - min) / pow(2, bits);
for (size_t i = 0; i < emb_size; i++) {
float x = in[i];
int val = round((x - min) / (max - min) * (pow(2, bits) - 1));
val = std::max(0, val);
val = std::min((int)pow(2, bits) - 1, val);
out[i] = val * scale + min;
}
return out;
}
char* quant(
    float* in, char** out, float min, float max, int emb_size, int bits) {
  for (int i = 0; i < emb_size; ++i) {
    float x = in[i];
    int val = round((x - min) / (max - min) * (pow(2, bits) - 1));
    val = std::max(0, val);
    val = std::min((int)pow(2, bits) - 1, val);
    (*out)[i] = val;
  }
  return *out;
}
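// dequant() is the inverse of quant(): rebuild approximate floats from the
// per-byte bucket indices using the stored min/max.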
float* dequant(
char* in, float* out, float min, float max, int emb_size, int bits) {
float scale = (max - min) / pow(2, bits);
for (size_t i = 0; i < emb_size; ++i) {
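    // (in[i] + 2^bits) % 2^bits maps the signed char back to its unsigned
    // bucket index before scaling into [min, max].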
float x =
scale * (((int)in[i] + (int)pow(2, bits)) % (int)pow(2, bits)) + min;
out[i] = x;
}
return out;
}
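// Greedy range search: start from the true [min, max], then repeatedly move
// whichever boundary (by 1/b of the range per step) lowers the reconstruction
// loss, shrinking the range by at most a fraction r, and keep the best bounds.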
void greedy_search(float* in,
float& xmin,
float& xmax,
float& loss,
size_t emb_size,
int bits) {
int b = 200;
float r = 0.16;
xmin = 2147483647;
xmax = -2147483648;
float cur_min = xmin;
float cur_max = xmax;
for (size_t i = 0; i < emb_size; i++) {
xmin = std::min(xmin, in[i]);
xmax = std::max(xmax, in[i]);
}
cur_min = xmin;
cur_max = xmax;
  std::vector<float> out(emb_size);
  loss = compute_loss(
      in, transfer(in, out.data(), cur_min, cur_max, emb_size, bits), emb_size);
  float stepsize = (cur_max - cur_min) / b;
  float min_steps = b * (1 - r) * stepsize;
  while (cur_min + min_steps < cur_max) {
    float loss_l = compute_loss(
        in,
        transfer(in, out.data(), cur_min + stepsize, cur_max, emb_size, bits),
        emb_size);
    float loss_r = compute_loss(
        in,
        transfer(in, out.data(), cur_min, cur_max - stepsize, emb_size, bits),
        emb_size);
if (loss_l < loss) {
cur_min = cur_min + stepsize;
if (loss_l < loss_r) {
loss = loss_l;
xmin = cur_min;
}
} else {
cur_max = cur_max - stepsize;
if (loss_r < loss) {
loss = loss_r;
xmax = cur_max;
}
}
}
}
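A minimal round-trip sketch of the helpers above (hypothetical standalone usage; assumes quant.h is on the include path and 8-bit mode, with illustrative array contents):

#include <iostream>
#include "quant.h"

int main() {
  const int emb_size = 8;
  const int bits = 8;
  float emb[emb_size] = {
      0.10f, -0.40f, 0.25f, 0.90f, -0.70f, 0.00f, 0.33f, -0.20f};
  float xmin = 0, xmax = 0, loss = 0;
  // pick a clipping range that minimizes reconstruction loss
  greedy_search(emb, xmin, xmax, loss, emb_size, bits);
  char buf[emb_size];  // one quantized byte per element
  char *buf_ptr = buf;
  quant(emb, &buf_ptr, xmin, xmax, emb_size, bits);
  float restored[emb_size];
  dequant(buf, restored, xmin, xmax, emb_size, bits);
  std::cout << "greedy loss: " << loss << ", round-trip loss: "
            << compute_loss(emb, restored, emb_size) << std::endl;
  return 0;
}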
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <cmath>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include "core/predictor/framework.pb.h"
#include "seq_file.h"
using paddle::framework::proto::VarType;
void greedy_search(float* in,
float& xmin,
float& xmax,
float& loss,
size_t emb_size,
int bits);
float compute_loss(float* a, float* b, int emb_size);
float* transfer(
float* in, float* out, float min, float max, int emb_size, int bits);
char* quant(
float* in, char** out, float min, float max, int emb_size, int bits);
float* dequant(
char* in, float* out, float min, float max, int emb_size, int bits);
......@@ -16,7 +16,9 @@
#include <iostream>
#include <memory>
#include "core/predictor/framework.pb.h"
#include "quant.h"
#include "seq_file.h"
using paddle::framework::proto::VarType;
std::map<int, size_t> var_type_size;
void reg_var_types() {
......@@ -31,6 +33,7 @@ void reg_var_types() {
var_type_size[static_cast<int>(VarType::UINT8)] = sizeof(uint8_t);
var_type_size[static_cast<int>(VarType::INT8)] = sizeof(int8_t);
}
int dump_parameter(const char *input_file, const char *output_file) {
std::ifstream is(input_file);
// the 1st field, unit32_t version for LoDTensor
......@@ -105,12 +108,127 @@ int dump_parameter(const char *input_file, const char *output_file) {
}
return 0;
}
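// compress_parameter: read a 2-D float LoDTensor parameter file and write one
// sequence-file record per embedding row, laid out as
// [min (float)][max (float)][one quantized byte per element].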
int compress_parameter(const char *file1, const char *file2, int bits) {
std::ifstream is(file1);
// Step 1: read and check the LoDTensor version
uint32_t version;
is.read(reinterpret_cast<char *>(&version), sizeof(version));
if (version != 0) {
std::cout << "Version number " << version << " not supported" << std::endl;
return -1;
}
std::cout << "Version size: " << sizeof(version) << std::endl;
// Step 2: read the LoD level and LoD data
uint64_t lod_level;
is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
std::vector<std::vector<size_t>> lod;
lod.resize(lod_level);
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size;
is.read(reinterpret_cast<char *>(&size), sizeof(size));
std::vector<size_t> tmp(size / sizeof(size_t));
is.read(reinterpret_cast<char *>(tmp.data()),
static_cast<std::streamsize>(size));
lod[i] = tmp;
}
// Step 3: read the protobuf TensorDesc
// Note: the version field is duplicated here
is.read(reinterpret_cast<char *>(&version), sizeof(version));
if (version != 0) {
std::cout << "Version number " << version << " not supported" << std::endl;
return -1;
}
// Step 4: read the tensor data and write min/max/quantized records
VarType::TensorDesc desc;
int32_t size;
is.read(reinterpret_cast<char *>(&size), sizeof(size));
std::unique_ptr<char[]> buf(new char[size]);
is.read(reinterpret_cast<char *>(buf.get()), size);
if (!desc.ParseFromArray(buf.get(), size)) {
std::cout << "Cannot parse tensor desc" << std::endl;
return -1;
}
// read tensor
std::vector<int64_t> dims;
dims.reserve(static_cast<size_t>(desc.dims().size()));
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
std::cout << "Dims:";
for (auto x : dims) {
std::cout << " " << x;
}
std::cout << std::endl;
if (dims.size() != 2) {
std::cout << "Parameter dims not 2D" << std::endl;
return -1;
}
size_t numel = 1;
for (auto x : dims) {
numel *= x;
}
  size_t buf_size = numel * var_type_size[desc.data_type()];
  std::cout << "Tensor buffer size: " << buf_size << std::endl;
  char *tensor_buf = new char[buf_size];
  is.read(static_cast<char *>(tensor_buf), buf_size);
  float *tensor_float_buf = reinterpret_cast<float *>(tensor_buf);
  // one quantized byte per element, plus the leading min/max floats
  size_t per_line_size = dims[1] * 1 + 2 * sizeof(float);
  std::cout << "Start Quant" << std::endl;
  SeqFileWriter seq_file_writer(file2);
  for (int64_t i = 0; i < dims[0]; ++i) {
    float xmin = 0, xmax = 0, loss = 0;
    size_t emb_size = dims[1];
    char *tensor_temp = new char[per_line_size];
    greedy_search(
        tensor_float_buf + i * dims[1], xmin, xmax, loss, emb_size, bits);
    // record layout: min (float), max (float), one quantized byte per element
    memcpy(tensor_temp, &xmin, sizeof(float));
    memcpy(tensor_temp + sizeof(float), &xmax, sizeof(float));
    for (size_t e = 0; e < emb_size; ++e) {
      float x = *(tensor_float_buf + i * dims[1] + e);
      int val = round((x - xmin) / (xmax - xmin) * (pow(2, bits) - 1));
      val = std::max(0, val);
      val = std::min((int)pow(2, bits) - 1, val);
      *(tensor_temp + 2 * sizeof(float) + e) = val;
    }
    seq_file_writer.write((char *)&i, sizeof(i), tensor_temp, per_line_size);
    delete[] tensor_temp;
  }
  delete[] tensor_buf;
return 0;
}
int main(int argc, char **argv) {
if (argc != 3) {
std::cout << "Usage: seq_generator PARAMETER_FILE OUTPUT_FILE" << std::endl;
  if (argc < 3 || argc > 4) {
    std::cout << "Usage (no compression):" << std::endl;
    std::cout << "  seq_generator PARAMETER_FILE OUTPUT_FILE\n" << std::endl;
    std::cout << "Usage (with compression):" << std::endl;
    std::cout << "  seq_generator PARAMETER_FILE OUTPUT_FILE QUANT_BITS"
              << std::endl;
    std::cout << "Currently only 8-bit quantization is supported."
              << std::endl;
    return -1;
}
reg_var_types();
dump_parameter(argv[1], argv[2]);
if (argc == 3) {
std::cout << "generate normal sparse param sequence file" << std::endl;
dump_parameter(argv[1], argv[2]);
return 0;
}
if (argc == 4) {
std::cout << "generate compressed sparse param sequence file" << std::endl;
compress_parameter(argv[1], argv[2], atoi(argv[3]));
return 0;
}
}
/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
ps -ef | grep cube | awk {'print $2'} | xargs kill -9
ps -ef | grep SimpleHTTPServer | awk {'print $2'} | xargs kill -9
rm -rf cube/cube_data cube/data cube/log* cube/nohup* cube/output/ cube/donefile cube/input cube/monitor cube/cube-builder.INFO
ps -ef | grep test | awk {'print $2'} | xargs kill -9
ps -ef | grep serving | awk {'print $2'} | xargs kill -9
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#! /bin/bash
mkdir -p cube_model
mkdir -p cube/data
./seq_generator ctr_serving_model/SparseFeatFactors ./cube_model/feature
./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=./cube/data -shard_num=1 -only_build=false
mv ./cube/data/0_0/test_dict_part0/* ./cube/data/
cd cube && ./cube
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#! /bin/bash
mkdir -p cube_model
mkdir -p cube/data
./seq_generator ctr_serving_model/SparseFeatFactors ./cube_model/feature 8
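# the trailing 8 tells seq_generator to emit 8-bit quantized embeddings
# via compress_parameter instead of raw floats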
./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=./cube/data -shard_num=1 -only_build=false
mv ./cube/data/0_0/test_dict_part0/* ./cube/data/
cd cube && ./cube
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import os
import sys
from paddle_serving_server import OpMaker
from paddle_serving_server import OpSeqMaker
from paddle_serving_server import Server
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
general_dist_kv_infer_op = op_maker.create('general_dist_kv_quant_infer')
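# 'general_dist_kv_quant_infer' maps to GeneralDistKVQuantInferOp; the server
# sets cube_quant_bits to 8 for any op whose name contains "quant"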
response_op = op_maker.create('general_response')
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(general_dist_kv_infer_op)
op_seq_maker.add_op(response_op)
server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(4)
server.load_model_config(sys.argv[1])
server.prepare_server(workdir="work_dir1", port=9292, device="cpu")
server.run_server()
......@@ -33,7 +33,7 @@ class OpMaker(object):
"general_text_response": "GeneralTextResponseOp",
"general_single_kv": "GeneralSingleKVOp",
"general_dist_kv_infer": "GeneralDistKVInferOp",
"general_dist_kv": "GeneralDistKVOp",
"general_dist_kv_quant_infer": "GeneralDistKVQuantInferOp",
"general_copy": "GeneralCopyOp"
}
......@@ -164,6 +164,8 @@ class Server(object):
if "dist_kv" in node.name:
self.resource_conf.cube_config_path = workdir
self.resource_conf.cube_config_file = self.cube_config_fn
if "quant" in node.name:
self.resource_conf.cube_quant_bits = 8
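# 8 is currently the only supported quant width; see the cube_quant_bits
# validation in resource.cpp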
self.resource_conf.model_toolkit_path = workdir
self.resource_conf.model_toolkit_file = self.model_toolkit_fn
self.resource_conf.general_model_path = workdir
......