diff --git a/core/configure/proto/server_configure.proto b/core/configure/proto/server_configure.proto
index bbc4487d69757636178048da05ea608be0b9466a..4bdc233099cffbc7949a6b5cf8627fe6461f565c 100644
--- a/core/configure/proto/server_configure.proto
+++ b/core/configure/proto/server_configure.proto
@@ -56,6 +56,7 @@ message ResourceConf {
   optional string general_model_file = 4;
   optional string cube_config_path = 5;
   optional string cube_config_file = 6;
+  optional int32 cube_quant_bits = 7;  // set 0 if no quant.
 };
 
 // DAG node depency info
diff --git a/core/general-server/op/CMakeLists.txt b/core/general-server/op/CMakeLists.txt
index 9287408e5e64fa284acbbdb18563703510114e87..137bc9b236398d238e3b11c8f99a6088883abfec 100644
--- a/core/general-server/op/CMakeLists.txt
+++ b/core/general-server/op/CMakeLists.txt
@@ -1,2 +1,2 @@
-FILE(GLOB op_srcs ${CMAKE_CURRENT_LIST_DIR}/*.cpp)
+FILE(GLOB op_srcs ${CMAKE_CURRENT_LIST_DIR}/*.cpp ${CMAKE_CURRENT_LIST_DIR}/../../predictor/tools/quant.cpp)
 LIST(APPEND serving_srcs ${op_srcs})
diff --git a/core/general-server/op/general_dist_kv_quant_infer_op.cpp b/core/general-server/op/general_dist_kv_quant_infer_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..583384b79ed5ec69d14cb31b7c8239c3f786c33d
--- /dev/null
+++ b/core/general-server/op/general_dist_kv_quant_infer_op.cpp
@@ -0,0 +1,213 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "core/general-server/op/general_dist_kv_quant_infer_op.h"
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+#include "core/cube/cube-api/include/cube_api.h"
+#include "core/predictor/framework/infer.h"
+#include "core/predictor/framework/memory.h"
+#include "core/predictor/framework/resource.h"
+#include "core/predictor/tools/quant.h"
+#include "core/util/include/timer.h"
+
+namespace baidu {
+namespace paddle_serving {
+namespace serving {
+
+using baidu::paddle_serving::Timer;
+using baidu::paddle_serving::predictor::MempoolWrapper;
+using baidu::paddle_serving::predictor::general_model::Tensor;
+using baidu::paddle_serving::predictor::general_model::Response;
+using baidu::paddle_serving::predictor::general_model::Request;
+using baidu::paddle_serving::predictor::general_model::FetchInst;
+using baidu::paddle_serving::predictor::InferManager;
+using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
+
+// Replaces every INT64 (sparse) input by the embedding rows that cube returns
+// for its keys (de-quantized when cube_quant_bits is non-zero), then runs the
+// model on the dense inputs followed by those embeddings.
+// Returns 0 on success, -1 on a failed lookup, malformed value or infer error.
+int GeneralDistKVQuantInferOp::inference() {
+  VLOG(2) << "Going to run inference";
+  const GeneralBlob *input_blob = get_depend_argument<GeneralBlob>(pre_name());
+  VLOG(2) << "Get precedent op name: " << pre_name();
+  GeneralBlob *output_blob = mutable_data<GeneralBlob>();
+
+  if (!input_blob) {
+    LOG(ERROR) << "Failed mutable depended argument, op:" << pre_name();
+    return -1;
+  }
+
+  const TensorVector *in = &input_blob->tensor_vector;
+  TensorVector *out = &output_blob->tensor_vector;
+  int batch_size = input_blob->GetBatchSize();
+  VLOG(2) << "input batch size: " << batch_size;
+
+  // Gather the keys of all sparse inputs into one cube request.
+  std::vector<uint64_t> keys;
+  std::vector<rec::mcube::CubeValue> values;
+  int sparse_count = 0;
+  int dense_count = 0;
+  std::vector<std::pair<int64_t *, size_t>> dataptr_size_pairs;
+  size_t key_len = 0;
+  for (size_t i = 0; i < in->size(); ++i) {
+    if (in->at(i).dtype != paddle::PaddleDType::INT64) {
+      ++dense_count;
+      continue;
+    }
+    ++sparse_count;
+    size_t elem_num = 1;
+    for (size_t s = 0; s < in->at(i).shape.size(); ++s) {
+      elem_num *= in->at(i).shape[s];
+    }
+    key_len += elem_num;
+    int64_t *data_ptr = static_cast<int64_t *>(in->at(i).data.data());
+    dataptr_size_pairs.push_back(std::make_pair(data_ptr, elem_num));
+  }
+  keys.resize(key_len);
+  size_t key_idx = 0;
+  for (size_t i = 0; i < dataptr_size_pairs.size(); ++i) {
+    std::copy(dataptr_size_pairs[i].first,
+              dataptr_size_pairs[i].first + dataptr_size_pairs[i].second,
+              keys.begin() + key_idx);
+    key_idx += dataptr_size_pairs[i].second;
+  }
+  rec::mcube::CubeAPI *cube = rec::mcube::CubeAPI::instance();
+  std::vector<std::string> table_names = cube->get_table_names();
+  if (table_names.size() == 0) {
+    LOG(ERROR) << "cube init error or cube config not given.";
+    return -1;
+  }
+  int ret = cube->seek(table_names[0], keys, &values);
+
+  // Everything below indexes values[...]; stop here rather than read an empty
+  // or short result.
+  if (values.empty() || values.size() != keys.size() ||
+      values[0].buff.size() == 0) {
+    LOG(ERROR) << "cube value return null, seek ret: " << ret;
+    return -1;
+  }
+
+  TensorVector sparse_out;
+  sparse_out.resize(sparse_count);
+  TensorVector dense_out;
+  dense_out.resize(dense_count);
+  size_t cube_val_idx = 0;
+  int sparse_idx = 0;
+  int dense_idx = 0;
+  baidu::paddle_serving::predictor::Resource &resource =
+      baidu::paddle_serving::predictor::Resource::instance();
+  std::shared_ptr<PaddleGeneralModelConfig> model_config =
+      resource.get_general_model_config();
+  int cube_quant_bits = resource.get_cube_quant_bits();
+  // Quantized row layout: min (float), max (float), then one byte per
+  // embedding element.
+  const size_t quant_header_size = 2 * sizeof(float);
+  const size_t row_bytes = values[0].buff.size();
+  size_t EMBEDDING_SIZE = 0;
+  if (cube_quant_bits == 0) {
+    EMBEDDING_SIZE = row_bytes / sizeof(float);
+  } else {
+    if (row_bytes <= quant_header_size) {
+      LOG(ERROR) << "cube value too short for quantized row: " << row_bytes;
+      return -1;
+    }
+    EMBEDDING_SIZE = row_bytes - quant_header_size;
+  }
+
+  for (size_t i = 0; i < in->size(); ++i) {
+    if (in->at(i).dtype != paddle::PaddleDType::INT64) {
+      dense_out[dense_idx] = in->at(i);
+      ++dense_idx;
+      continue;
+    }
+
+    if (in->at(i).lod.empty() || in->at(i).lod[0].empty()) {
+      LOG(ERROR) << "sparse input without lod, index: " << i;
+      return -1;
+    }
+    sparse_out[sparse_idx].lod = in->at(i).lod;
+    const size_t row_num = sparse_out[sparse_idx].lod[0].back();
+    sparse_out[sparse_idx].dtype = paddle::PaddleDType::FLOAT32;
+    sparse_out[sparse_idx].shape.push_back(row_num);
+    sparse_out[sparse_idx].shape.push_back(EMBEDDING_SIZE);
+    sparse_out[sparse_idx].name = model_config->_feed_name[i];
+    sparse_out[sparse_idx].data.Resize(row_num * EMBEDDING_SIZE *
+                                       sizeof(float));
+    float *dst_ptr = static_cast<float *>(sparse_out[sparse_idx].data.data());
+    for (size_t x = 0; x < row_num; ++x) {
+      if (cube_val_idx >= values.size() ||
+          values[cube_val_idx].buff.size() != row_bytes) {
+        LOG(ERROR) << "cube value missing or size mismatch, index: "
+                   << cube_val_idx;
+        return -1;
+      }
+      float *data_ptr = dst_ptr + x * EMBEDDING_SIZE;
+      if (cube_quant_bits == 0) {
+        memcpy(data_ptr,
+               values[cube_val_idx].buff.data(),
+               EMBEDDING_SIZE * sizeof(float));
+      } else {
+        // dequant() takes a mutable buffer, so decode from a private copy;
+        // min/max are copied out to avoid an unaligned float read.
+        const char *raw = values[cube_val_idx].buff.data();
+        std::vector<char> src(raw, raw + row_bytes);
+        float minmax[2];
+        memcpy(minmax, src.data(), sizeof(minmax));
+        dequant(src.data() + quant_header_size,
+                data_ptr,
+                minmax[0],
+                minmax[1],
+                static_cast<int>(EMBEDDING_SIZE),
+                cube_quant_bits);
+      }
+      ++cube_val_idx;
+    }
+    ++sparse_idx;
+  }
+  TensorVector infer_in;
+  infer_in.insert(infer_in.end(), dense_out.begin(), dense_out.end());
+  infer_in.insert(infer_in.end(), sparse_out.begin(), sparse_out.end());
+
+  output_blob->SetBatchSize(batch_size);
+
+  VLOG(2) << "infer batch size: " << batch_size;
+
+  Timer timeline;
+  int64_t start = timeline.TimeStampUS();
+  timeline.Start();
+
+  if (InferManager::instance().infer(
+          GENERAL_MODEL_NAME, &infer_in, out, batch_size)) {
+    LOG(ERROR) << "Failed do infer in fluid model: " << GENERAL_MODEL_NAME;
+    return -1;
+  }
+
+  int64_t end = timeline.TimeStampUS();
+  CopyBlobInfo(input_blob, output_blob);
+  AddBlobInfo(output_blob, start);
+  AddBlobInfo(output_blob, end);
+  return 0;
+}
+DEFINE_OP(GeneralDistKVQuantInferOp);
+
+}  // namespace serving
+}  // namespace paddle_serving
+}  // namespace baidu
diff --git a/core/general-server/op/general_dist_kv_quant_infer_op.h b/core/general-server/op/general_dist_kv_quant_infer_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..e153311a2a2e2df1bd12720e2ce6cbe9ddb31ec0
--- /dev/null
+++ b/core/general-server/op/general_dist_kv_quant_infer_op.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+#ifdef BCLOUD
+#ifdef WITH_GPU
+#include "paddle/paddle_inference_api.h"
+#else
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#endif
+#else
+#include "paddle_inference_api.h"  // NOLINT
+#endif
+#include "core/general-server/general_model_service.pb.h"
+#include "core/general-server/op/general_infer_helper.h"
+
+namespace baidu {
+namespace paddle_serving {
+namespace serving {
+
+// Op that resolves INT64 (sparse) inputs through a cube lookup, optionally
+// de-quantizing the returned rows, before running the general model.
+class GeneralDistKVQuantInferOp
+    : public baidu::paddle_serving::predictor::OpWithChannel<GeneralBlob> {
+ public:
+  typedef std::vector<paddle::PaddleTensor> TensorVector;
+  DECLARE_OP(GeneralDistKVQuantInferOp);
+  int inference();
+};
+
+}  // namespace serving
+}  // namespace paddle_serving
+}  // namespace baidu
diff --git a/core/predictor/framework/resource.cpp b/core/predictor/framework/resource.cpp
index 11b015bfcb2f3c4f72a67a5687c4fdfc95481a31..ca219519e2dcf20bc961d991e3f2eb0ad060f38f 100644
--- a/core/predictor/framework/resource.cpp
+++ b/core/predictor/framework/resource.cpp
@@ -151,6 +151,18 @@ int Resource::initialize(const std::string& path, const std::string& file) {
     std::string cube_config_fullpath = "./" + resource_conf.cube_config_path() +
                                        "/" + resource_conf.cube_config_file();
     this->cube_config_fullpath = cube_config_fullpath;
+    this->cube_quant_bits = resource_conf.has_cube_quant_bits()
+                                ? resource_conf.cube_quant_bits()
+                                : 0;
+    if (this->cube_quant_bits != 0 && this->cube_quant_bits != 8) {
+      LOG(ERROR) << "Cube quant bits illegal! should be 0 or 8.";
+      return -1;
+    }
+    if (this->cube_quant_bits == 0) {
+      LOG(INFO) << "cube quant mode OFF";
+    } else {
+      LOG(INFO) << "cube quant mode ON, quant bits: " << this->cube_quant_bits;
+    }
   }
 
   THREAD_SETSPECIFIC(_tls_bspec_key, NULL);
@@ -258,38 +270,6 @@ int Resource::general_model_initialize(const std::string& path,
   return 0;
 }
 
-int Resource::cube_initialize(const std::string& path,
-                              const std::string& file) {
-  // cube
-  if (!FLAGS_enable_cube) {
-    return 0;
-  }
-
-  ResourceConf resource_conf;
-  if (configure::read_proto_conf(path, file, &resource_conf) != 0) {
-    LOG(ERROR) << "Failed initialize resource from: " << path << "/" << file;
-    return -1;
-  }
-
-  int err = 0;
-  std::string cube_config_file = resource_conf.cube_config_file();
-  if (err != 0) {
-    LOG(ERROR) << "reade cube_config_file failed, path[" << path << "], file["
-               << cube_config_file << "]";
-    return -1;
-  }
-  err = CubeAPI::instance()->init(cube_config_file.c_str());
-  if (err != 0) {
-    LOG(ERROR) << "failed initialize cube, config: " << cube_config_file
-               << " error code : " << err;
-    return -1;
-  }
-
-  LOG(INFO) << "Successfully initialize cube";
-
-  return 0;
-}
-
 int Resource::thread_initialize() {
   // mempool
   if (MempoolWrapper::instance().thread_initialize() != 0) {
@@ -373,6 +353,7 @@ int Resource::thread_clear() {
   // ...
   return 0;
 }
+size_t Resource::get_cube_quant_bits() { return this->cube_quant_bits; }
 
 int Resource::reload() {
   if (FLAGS_enable_model_toolkit && InferManager::instance().reload() != 0) {
diff --git a/core/predictor/framework/resource.h b/core/predictor/framework/resource.h
index 1a648f0fa363efa4eb915a64553949206ec96153..56b87666892fab45c2099a4de9bfbc1296b65b11 100644
--- a/core/predictor/framework/resource.h
+++ b/core/predictor/framework/resource.h
@@ -82,7 +82,6 @@ class Resource {
   }
 
   int initialize(const std::string& path, const std::string& file);
-  int cube_initialize(const std::string& path, const std::string& file);
 
   int general_model_initialize(const std::string& path,
                                const std::string& file);
@@ -104,11 +103,14 @@ class Resource {
     return reinterpret_cast(
         THREAD_GETSPECIFIC(_tls_bspec_key));
   }
+  // Quantization width of cube values as set by initialize(): 0 (off) or 8.
+  size_t get_cube_quant_bits();
 
  private:
   int thread_finalize() { return 0; }
   std::shared_ptr _config;
   std::string cube_config_fullpath;
+  int cube_quant_bits;  // 0 means cube values are not quantized
 
   THREAD_KEY_T _tls_bspec_key;
 };
diff --git a/core/predictor/src/pdserving.cpp b/core/predictor/src/pdserving.cpp
index fe8693de9349ac1c4c2ae94a07e21dc8be448da3..157d52cee1adaea0524ebde01f75a90a6b2adc2f 100644
--- a/core/predictor/src/pdserving.cpp
+++ b/core/predictor/src/pdserving.cpp
@@ -202,14 +202,6 @@ int main(int argc, char** argv) {
   }
   VLOG(2) << "Succ call pthread worker start function";
 
-  if (Resource::instance().cube_initialize(FLAGS_resource_path,
-                                           FLAGS_resource_file) != 0) {
-    LOG(ERROR) << "Failed initialize cube, conf: " << FLAGS_resource_path << "/"
-               << FLAGS_resource_file;
-    return -1;
-  }
-  VLOG(2) << "Succ initialize cube";
-
 #ifndef BCLOUD
 
   if (Resource::instance().general_model_initialize(FLAGS_resource_path,
diff --git a/core/predictor/tools/CMakeLists.txt b/core/predictor/tools/CMakeLists.txt
index f74e3ea571933665b4b8a3fc795ce4db3f1b1493..73e0d2a4b3a36681fbddd0b8789b394e89e792ff 100644
--- a/core/predictor/tools/CMakeLists.txt
+++ b/core/predictor/tools/CMakeLists.txt
@@
-1,4 +1,4 @@
-set(seq_gen_src ${CMAKE_CURRENT_LIST_DIR}/seq_generator.cpp ${CMAKE_CURRENT_LIST_DIR}/seq_file.cpp)
+set(seq_gen_src ${CMAKE_CURRENT_LIST_DIR}/seq_generator.cpp ${CMAKE_CURRENT_LIST_DIR}/seq_file.cpp ${CMAKE_CURRENT_LIST_DIR}/quant.cpp)
 LIST(APPEND seq_gen_src ${PROTO_SRCS})
 add_executable(seq_generator ${seq_gen_src})
 target_link_libraries(seq_generator protobuf -lpthread)
diff --git a/core/predictor/tools/quant.cpp b/core/predictor/tools/quant.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e2c5ebfa059f333fd0659ff43b2b99172923407a
--- /dev/null
+++ b/core/predictor/tools/quant.cpp
@@ -0,0 +1,148 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "quant.h"
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <vector>
+#include "seq_file.h"
+using paddle::framework::proto::VarType;
+
+namespace {
+
+// Largest code representable with `bits` bits (255 for 8 bits).
+inline int max_code(int bits) { return (1 << bits) - 1; }
+
+// Distance between two neighbouring codes.  Dividing by max_code (not 2^bits)
+// makes decoding map code 0 to min and the largest code exactly to max.
+inline float step_size(float min, float max, int bits) {
+  return (max - min) / max_code(bits);
+}
+
+// Nearest code for x, clamped to [0, max_code(bits)].  A zero-width range or a
+// NaN yields code 0 instead of an undefined float-to-int conversion.
+inline int encode(float x, float min, float max, int bits) {
+  const int top = max_code(bits);
+  const double scaled = (x - min) / (max - min) * static_cast<double>(top);
+  if (!(scaled > 0)) return 0;
+  if (scaled >= top) return top;
+  return static_cast<int>(std::round(scaled));
+}
+
+}  // namespace
+
+// Returns the sum of squared differences between a[0..emb_size) and b.
+float compute_loss(float* a, float* b, int emb_size) {
+  float sum = 0;
+  for (int i = 0; i < emb_size; i++) {
+    const float diff = a[i] - b[i];
+    sum += diff * diff;
+  }
+  return sum;
+}
+
+// Writes to `out` the value each element of `in` has after being encoded with
+// the range [min, max] and decoded again.  Returns `out`.
+float* transfer(
+    float* in, float* out, float min, float max, int emb_size, int bits) {
+  const float scale = step_size(min, max, bits);
+  for (int i = 0; i < emb_size; i++) {
+    out[i] = encode(in[i], min, max, bits) * scale + min;
+  }
+  return out;
+}
+
+// Encodes `in` into the caller-owned buffer `*out` (emb_size bytes), one code
+// per element.  Returns `*out`.
+char* quant(
+    float* in, char** out, float min, float max, int emb_size, int bits) {
+  char* codes = *out;
+  for (int i = 0; i < emb_size; ++i) {
+    codes[i] = static_cast<char>(encode(in[i], min, max, bits));
+  }
+  return codes;
+}
+
+// Decodes emb_size codes from `in` into `out` using the row's [min, max].
+// Returns `out`.
+float* dequant(
+    char* in, float* out, float min, float max, int emb_size, int bits) {
+  const int levels = max_code(bits) + 1;
+  const float scale = step_size(min, max, bits);
+  for (int i = 0; i < emb_size; ++i) {
+    // char may be signed: fold negative bytes back into [0, levels).
+    const int code = (static_cast<int>(in[i]) + levels) % levels;
+    out[i] = scale * code + min;
+  }
+  return out;
+}
+
+// Greedily shrinks the range [min(in), max(in)] in steps of 1/200 of its width,
+// down to 84% of it, and returns in xmin/xmax the range with the smallest
+// round-trip loss seen, and that loss in `loss`.
+// An empty input yields xmin = xmax = loss = 0.
+void greedy_search(float* in,
+                   float& xmin,
+                   float& xmax,
+                   float& loss,
+                   size_t emb_size,
+                   int bits) {
+  const int bins = 200;
+  const float ratio = 0.16;
+  if (emb_size == 0) {
+    xmin = xmax = loss = 0;
+    return;
+  }
+  xmin = xmax = in[0];
+  for (size_t i = 1; i < emb_size; i++) {
+    xmin = std::min(xmin, in[i]);
+    xmax = std::max(xmax, in[i]);
+  }
+  float cur_min = xmin;
+  float cur_max = xmax;
+  const int count = static_cast<int>(emb_size);
+  std::vector<float> out(emb_size);
+  loss = compute_loss(
+      in, transfer(in, out.data(), cur_min, cur_max, count, bits), count);
+  const float stepsize = (cur_max - cur_min) / bins;
+  const float min_steps = bins * (1 - ratio) * stepsize;
+  while (cur_min + min_steps < cur_max) {
+    const float loss_l = compute_loss(
+        in,
+        transfer(in, out.data(), cur_min + stepsize, cur_max, count, bits),
+        count);
+    const float loss_r = compute_loss(
+        in,
+        transfer(in, out.data(), cur_min, cur_max - stepsize, count, bits),
+        count);
+    // Move the cheaper bound; remember both bounds whenever the best loss
+    // improves so that `loss` always belongs to the returned (xmin, xmax).
+    if (loss_l < loss_r) {
+      cur_min = cur_min + stepsize;
+      if (loss_l < loss) {
+        loss = loss_l;
+        xmin = cur_min;
+        xmax = cur_max;
+      }
+    } else {
+      cur_max = cur_max - stepsize;
+      if (loss_r < loss) {
+        loss = loss_r;
+        xmin = cur_min;
+        xmax = cur_max;
+      }
+    }
+  }
+}
diff --git a/core/predictor/tools/quant.h b/core/predictor/tools/quant.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a8cde4bd8c9f0c80c16401fa7e12d1f1c6edd6d
--- /dev/null
+++ b/core/predictor/tools/quant.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <vector>
+#include "core/predictor/framework.pb.h"
+#include "seq_file.h"
+using paddle::framework::proto::VarType;
+
+// Row-wise min/max quantization helpers (definitions live in quant.cpp).
+// Sum of squared differences between a[0..emb_size) and b[0..emb_size).
+float compute_loss(float* a, float* b, int emb_size);
+// Fills `out` with the values of `in` after an encode/decode round trip.
+float* transfer(
+    float* in, float* out, float min, float max, int emb_size, int bits);
+// Quantizes `in` to integer codes stored through `out`.
+char* quant(
+    float* in, char** out, float min, float max, int emb_size, int bits);
+// Decodes the codes in `in` back to floats in `out` using the row's min/max.
+float* dequant(
+    char* in, float* out, float min, float max, int emb_size, int bits);
+// Searches a clipping range [xmin, xmax] for `in` and reports its `loss`.
+void greedy_search(float* in,
+                   float& xmin,
+                   float& xmax,
+                   float& loss,
+                   size_t emb_size,
+                   int bits);
diff --git a/core/predictor/tools/seq_generator.cpp b/core/predictor/tools/seq_generator.cpp
index 2efda77485b919a3fd14d05b43fa4729c97234fb..d384b9310a965503358ea3bc80e4fa8c13e7b39a 100644
--- a/core/predictor/tools/seq_generator.cpp
+++ b/core/predictor/tools/seq_generator.cpp
@@ -16,7 +16,9 @@
 #include
 #include
 #include "core/predictor/framework.pb.h"
+#include "quant.h"
 #include "seq_file.h"
+
 using paddle::framework::proto::VarType;
 std::map var_type_size;
 void reg_var_types() {
@@ -31,6 +33,7 @@ void reg_var_types() {
   var_type_size[static_cast(VarType::UINT8)] = sizeof(uint8_t);
   var_type_size[static_cast(VarType::INT8)] = sizeof(int8_t);
 }
+
 int dump_parameter(const char *input_file, const char *output_file) {
   std::ifstream is(input_file);
   // the 1st field, unit32_t version for LoDTensor
@@ -105,12 +108,146 @@ int dump_parameter(const char *input_file, const char *output_file) {
   }
   return 0;
 }
+
+// Reads the LoDTensor parameter in `file1`, quantizes each row of its 2-D
+// float tensor to 8-bit codes and writes one record per row to `file2`
+// (key: row index; value: min, max, then one code byte per column).
+// Returns 0 on success and -1 when the input is unsupported or unreadable.
+int compress_parameter(const char *file1, const char *file2, int bits) {
+  if (bits != 8) {
+    // Every code is stored in exactly one byte of the record below.
+    std::cout << "Quant bits " << bits << " not supported" << std::endl;
+    return -1;
+  }
+  std::ifstream is(file1);
+  if (!is) {
+    std::cout << "Cannot open " << file1 << std::endl;
+    return -1;
+  }
+  // Step 1: read the LoDTensor version
+  uint32_t version;
+  is.read(reinterpret_cast<char *>(&version), sizeof(version));
+  if (version != 0) {
+    std::cout << "Version number " << version << " not supported" << std::endl;
+    return -1;
+  }
+  std::cout << "Version size: " << sizeof(version) << std::endl;
+  // Step 2: read the LoD levels (parsed but not used for the output)
+  uint64_t lod_level;
+  is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
+  std::vector<std::vector<size_t>> lod;
+  lod.resize(lod_level);
+  for (uint64_t i = 0; i < lod_level; ++i) {
+    uint64_t size;
+    is.read(reinterpret_cast<char *>(&size), sizeof(size));
+
+    std::vector<size_t> tmp(size / sizeof(size_t));
+    is.read(reinterpret_cast<char *>(tmp.data()),
+            static_cast<std::streamsize>(size));
+    lod[i] = tmp;
+  }
+  // Step 3: read the tensor version
+  // Note: duplicate version field
+  is.read(reinterpret_cast<char *>(&version), sizeof(version));
+  if (version != 0) {
+    std::cout << "Version number " << version << " not supported" << std::endl;
+    return -1;
+  }
+
+  // Step 4: read the tensor description, then the tensor data
+  VarType::TensorDesc desc;
+  int32_t size;
+  is.read(reinterpret_cast<char *>(&size), sizeof(size));
+  if (!is || size < 0) {
+    std::cout << "Cannot read tensor desc size" << std::endl;
+    return -1;
+  }
+  std::unique_ptr<char[]> buf(new char[size]);
+  is.read(reinterpret_cast<char *>(buf.get()), size);
+  if (!desc.ParseFromArray(buf.get(), size)) {
+    std::cout << "Cannot parse tensor desc" << std::endl;
+    return -1;
+  }
+  // read tensor
+  std::vector<int64_t> dims;
+  dims.reserve(static_cast<size_t>(desc.dims().size()));
+  std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
+
+  std::cout << "Dims:";
+  for (auto x : dims) {
+    std::cout << " " << x;
+  }
+  std::cout << std::endl;
+
+  if (dims.size() != 2) {
+    std::cout << "Parameter dims not 2D" << std::endl;
+    return -1;
+  }
+  // The rows are reinterpreted as float below, so reject anything else.
+  if (dims[0] < 0 || dims[1] < 0 ||
+      var_type_size[desc.data_type()] != sizeof(float)) {
+    std::cout << "Parameter is not a float tensor" << std::endl;
+    return -1;
+  }
+
+  const size_t rows = static_cast<size_t>(dims[0]);
+  const size_t cols = static_cast<size_t>(dims[1]);
+  const size_t buf_size = rows * cols * sizeof(float);
+  std::cout << buf_size << std::endl;
+  std::vector<float> tensor(rows * cols);
+  is.read(reinterpret_cast<char *>(tensor.data()), buf_size);
+  if (!is) {
+    std::cout << "Cannot read tensor data" << std::endl;
+    return -1;
+  }
+  // One record: min (float), max (float), then one code byte per column.
+  const size_t header_size = 2 * sizeof(float);
+  const size_t per_line_size = cols * 1 + header_size;
+  std::vector<char> record(per_line_size);
+  const int top = (1 << bits) - 1;
+
+  std::cout << "Start Quant" << std::endl;
+  SeqFileWriter seq_file_writer(file2);
+
+  for (int64_t i = 0; i < dims[0]; ++i) {
+    float *row = tensor.data() + i * cols;
+    float xmin = 0, xmax = 0, loss = 0;
+    greedy_search(row, xmin, xmax, loss, cols, bits);
+    // The header is per row, not per element: write it once.
+    memcpy(record.data(), &xmin, sizeof(float));
+    memcpy(record.data() + sizeof(float), &xmax, sizeof(float));
+    const float range = xmax - xmin;
+    for (size_t e = 0; e < cols; ++e) {
+      // A constant row has range 0; store code 0, which decodes to xmin.
+      int val = 0;
+      if (range > 0) {
+        val = round((row[e] - xmin) / range * top);
+      }
+      val = std::max(0, val);
+      val = std::min(top, val);
+      record[header_size + e] = val;
+    }
+    seq_file_writer.write(
+        reinterpret_cast<char *>(&i), sizeof(i), record.data(), per_line_size);
+  }
+  return 0;
+}
 int main(int argc, char **argv) {
-  if (argc != 3) {
-    std::cout << "Usage: seq_generator PARAMETER_FILE OUTPUT_FILE" << std::endl;
+  if (argc < 3 || argc > 4) {
+    std::cout << "Usage: if no compress, please follow:" << std::endl;
+    std::cout << "seq_generator PARAMETER_FILE OUTPUT_FILE\n" << std::endl;
+    std::cout << "if compress, please follow: " << std::endl;
+    std::cout << "seq_generator PARAMETER_FILE OUTPUT_FILE QUANT_BITS"
+              << std::endl;
+    std::cout << "Now it only support 8 bit." << std::endl;
     return -1;
   }
   reg_var_types();
-  dump_parameter(argv[1], argv[2]);
+  if (argc == 3) {
+    std::cout << "generate normal sparse param sequence file" << std::endl;
+    return dump_parameter(argv[1], argv[2]);
+  }
+  std::cout << "generate compressed sparse param sequence file" << std::endl;
+  return compress_parameter(argv[1], argv[2], atoi(argv[3]));
 }
 /* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
diff --git a/python/examples/criteo_ctr_with_cube/clean.sh b/python/examples/criteo_ctr_with_cube/clean.sh
index 522a602a4e1ea8c9fb8902d8cd0d2d872cba6edd..99a4819802178f1910c5fced7d4c5a39c3037e4a 100755
--- a/python/examples/criteo_ctr_with_cube/clean.sh
+++ b/python/examples/criteo_ctr_with_cube/clean.sh
@@ -1,5 +1,4 @@
 ps -ef | grep cube | awk {'print $2'} | xargs kill -9
-ps -ef | grep SimpleHTTPServer | awk {'print $2'} | xargs kill -9
 rm -rf cube/cube_data cube/data cube/log* cube/nohup* cube/output/ cube/donefile cube/input cube/monitor cube/cube-builder.INFO
 ps -ef | grep test | awk {'print $2'} | xargs kill -9
 ps -ef | grep serving | awk {'print $2'} | xargs kill -9
diff --git a/python/examples/criteo_ctr_with_cube/cube_prepare.sh b/python/examples/criteo_ctr_with_cube/cube_prepare.sh
index ceeda0a603e7474f0845333ef94e05d923bde4f4..2d0efaa56f06e9ad8d1590f1316e64bcc65f268d 100755
--- a/python/examples/criteo_ctr_with_cube/cube_prepare.sh
+++ b/python/examples/criteo_ctr_with_cube/cube_prepare.sh
@@ -1,7 +1,22 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: disable=doc-string-missing
+#! /bin/bash
 
 mkdir -p cube_model
 mkdir -p cube/data
-./seq_generator ctr_serving_model/SparseFeatFactors ./cube_model/feature
+./seq_generator ctr_serving_model/SparseFeatFactors ./cube_model/feature
 ./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=./cube/data -shard_num=1 -only_build=false
 mv ./cube/data/0_0/test_dict_part0/* ./cube/data/
 cd cube && ./cube
diff --git a/python/examples/criteo_ctr_with_cube/cube_quant_prepare.sh b/python/examples/criteo_ctr_with_cube/cube_quant_prepare.sh
new file mode 100755
index 0000000000000000000000000000000000000000..7c794e103baa3a97d09966c470dd48eb56579500
--- /dev/null
+++ b/python/examples/criteo_ctr_with_cube/cube_quant_prepare.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Builds the cube dictionary from 8-bit quantized parameters and starts cube.
+
+mkdir -p cube_model
+mkdir -p cube/data
+./seq_generator ctr_serving_model/SparseFeatFactors ./cube_model/feature 8
+./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=./cube/data -shard_num=1 -only_build=false
+mv ./cube/data/0_0/test_dict_part0/* ./cube/data/
+cd cube && ./cube
diff --git a/python/examples/criteo_ctr_with_cube/test_server_quant.py b/python/examples/criteo_ctr_with_cube/test_server_quant.py
new file mode 100755
index 0000000000000000000000000000000000000000..fc278f755126cdeb204644cbc91838b1b038379e
--- /dev/null
+++ b/python/examples/criteo_ctr_with_cube/test_server_quant.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: disable=doc-string-missing
+# Starts a CPU server whose op sequence is general_reader ->
+# general_dist_kv_quant_infer -> general_response; argv[1] is the model config.
+
+import os
+import sys
+from paddle_serving_server import OpMaker
+from paddle_serving_server import OpSeqMaker
+from paddle_serving_server import Server
+
+op_maker = OpMaker()
+read_op = op_maker.create('general_reader')
+general_dist_kv_infer_op = op_maker.create('general_dist_kv_quant_infer')
+response_op = op_maker.create('general_response')
+
+op_seq_maker = OpSeqMaker()
+op_seq_maker.add_op(read_op)
+op_seq_maker.add_op(general_dist_kv_infer_op)
+op_seq_maker.add_op(response_op)
+
+server = Server()
+server.set_op_sequence(op_seq_maker.get_op_sequence())
+server.set_num_threads(4)
+server.load_model_config(sys.argv[1])
+server.prepare_server(workdir="work_dir1", port=9292, device="cpu")
+server.run_server()
diff --git a/python/paddle_serving_server/__init__.py b/python/paddle_serving_server/__init__.py
index 55c5f8b4ae2f4728be87a9f5ed15e28e584b66c6..d893861e2b0cc1615701191a66aa6ff0bcb53305 100644
--- a/python/paddle_serving_server/__init__.py
+++ b/python/paddle_serving_server/__init__.py
@@ -33,7 +33,7 @@ class OpMaker(object):
             "general_text_response": "GeneralTextResponseOp",
             "general_single_kv": "GeneralSingleKVOp",
             "general_dist_kv_infer": "GeneralDistKVInferOp",
-            "general_dist_kv": "GeneralDistKVOp",
+            "general_dist_kv_quant_infer": "GeneralDistKVQuantInferOp",
             "general_copy": "GeneralCopyOp"
         }
 
@@ -164,6 +164,8 @@ class Server(object):
                 if "dist_kv" in node.name:
                     self.resource_conf.cube_config_path = workdir
                     self.resource_conf.cube_config_file = self.cube_config_fn
+                    if "quant" in node.name:
+                        self.resource_conf.cube_quant_bits = 8
             self.resource_conf.model_toolkit_path = workdir
             self.resource_conf.model_toolkit_file = self.model_toolkit_fn
             self.resource_conf.general_model_path = workdir