提交 80093bab 编写于 作者: D Dong Daxiang 提交者: GitHub

Merge pull request #327 from wangjiawei04/develop

Add quantization to General Dist KV Op
......@@ -56,6 +56,7 @@ message ResourceConf {
optional string general_model_file = 4;
optional string cube_config_path = 5;
optional string cube_config_file = 6;
optional int32 cube_quant_bits = 7; // set 0 if no quant.
};
// DAG node depency info
......
FILE(GLOB op_srcs ${CMAKE_CURRENT_LIST_DIR}/*.cpp)
FILE(GLOB op_srcs ${CMAKE_CURRENT_LIST_DIR}/*.cpp ${CMAKE_CURRENT_LIST_DIR}/../../predictor/tools/quant.cpp)
LIST(APPEND serving_srcs ${op_srcs})
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "core/general-server/op/general_dist_kv_quant_infer_op.h"
#include <algorithm>
#include <iostream>
#include <memory>
#include <sstream>
#include <unordered_map>
#include <utility>
#include "core/cube/cube-api/include/cube_api.h"
#include "core/predictor/framework/infer.h"
#include "core/predictor/framework/memory.h"
#include "core/predictor/framework/resource.h"
#include "core/predictor/tools/quant.h"
#include "core/util/include/timer.h"
namespace baidu {
namespace paddle_serving {
namespace serving {
using baidu::paddle_serving::Timer;
using baidu::paddle_serving::predictor::MempoolWrapper;
using baidu::paddle_serving::predictor::general_model::Tensor;
using baidu::paddle_serving::predictor::general_model::Response;
using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::FetchInst;
using baidu::paddle_serving::predictor::InferManager;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
int GeneralDistKVQuantInferOp::inference() {
VLOG(2) << "Going to run inference";
const GeneralBlob *input_blob = get_depend_argument<GeneralBlob>(pre_name());
VLOG(2) << "Get precedent op name: " << pre_name();
GeneralBlob *output_blob = mutable_data<GeneralBlob>();
if (!input_blob) {
LOG(ERROR) << "Failed mutable depended argument, op:" << pre_name();
return -1;
}
const TensorVector *in = &input_blob->tensor_vector;
TensorVector *out = &output_blob->tensor_vector;
int batch_size = input_blob->GetBatchSize();
VLOG(2) << "input batch size: " << batch_size;
std::vector<uint64_t> keys;
std::vector<rec::mcube::CubeValue> values;
int sparse_count = 0;
int dense_count = 0;
std::vector<std::pair<int64_t *, size_t>> dataptr_size_pairs;
size_t key_len = 0;
for (size_t i = 0; i < in->size(); ++i) {
if (in->at(i).dtype != paddle::PaddleDType::INT64) {
++dense_count;
continue;
}
++sparse_count;
size_t elem_num = 1;
for (size_t s = 0; s < in->at(i).shape.size(); ++s) {
elem_num *= in->at(i).shape[s];
}
key_len += elem_num;
int64_t *data_ptr = static_cast<int64_t *>(in->at(i).data.data());
dataptr_size_pairs.push_back(std::make_pair(data_ptr, elem_num));
}
keys.resize(key_len);
int key_idx = 0;
for (size_t i = 0; i < dataptr_size_pairs.size(); ++i) {
std::copy(dataptr_size_pairs[i].first,
dataptr_size_pairs[i].first + dataptr_size_pairs[i].second,
keys.begin() + key_idx);
key_idx += dataptr_size_pairs[i].second;
}
rec::mcube::CubeAPI *cube = rec::mcube::CubeAPI::instance();
std::vector<std::string> table_names = cube->get_table_names();
if (table_names.size() == 0) {
LOG(ERROR) << "cube init error or cube config not given.";
return -1;
}
int ret = cube->seek(table_names[0], keys, &values);
if (values.size() != keys.size() || values[0].buff.size() == 0) {
LOG(ERROR) << "cube value return null";
}
TensorVector sparse_out;
sparse_out.resize(sparse_count);
TensorVector dense_out;
dense_out.resize(dense_count);
int cube_val_idx = 0;
int sparse_idx = 0;
int dense_idx = 0;
std::unordered_map<int, int> in_out_map;
baidu::paddle_serving::predictor::Resource &resource =
baidu::paddle_serving::predictor::Resource::instance();
std::shared_ptr<PaddleGeneralModelConfig> model_config =
resource.get_general_model_config();
int cube_quant_bits = resource.get_cube_quant_bits();
size_t EMBEDDING_SIZE = 0;
if (cube_quant_bits == 0) {
EMBEDDING_SIZE = values[0].buff.size() / sizeof(float);
} else {
EMBEDDING_SIZE = values[0].buff.size() - 2 * sizeof(float);
}
for (size_t i = 0; i < in->size(); ++i) {
if (in->at(i).dtype != paddle::PaddleDType::INT64) {
dense_out[dense_idx] = in->at(i);
++dense_idx;
continue;
}
sparse_out[sparse_idx].lod.resize(in->at(i).lod.size());
for (size_t x = 0; x < sparse_out[sparse_idx].lod.size(); ++x) {
sparse_out[sparse_idx].lod[x].resize(in->at(i).lod[x].size());
std::copy(in->at(i).lod[x].begin(),
in->at(i).lod[x].end(),
sparse_out[sparse_idx].lod[x].begin());
}
sparse_out[sparse_idx].dtype = paddle::PaddleDType::FLOAT32;
sparse_out[sparse_idx].shape.push_back(
sparse_out[sparse_idx].lod[0].back());
sparse_out[sparse_idx].shape.push_back(EMBEDDING_SIZE);
sparse_out[sparse_idx].name = model_config->_feed_name[i];
sparse_out[sparse_idx].data.Resize(sparse_out[sparse_idx].lod[0].back() *
EMBEDDING_SIZE * sizeof(float));
// END HERE
float *dst_ptr = static_cast<float *>(sparse_out[sparse_idx].data.data());
for (int x = 0; x < sparse_out[sparse_idx].lod[0].back(); ++x) {
float *data_ptr = dst_ptr + x * EMBEDDING_SIZE;
if (cube_quant_bits == 0) {
memcpy(data_ptr,
values[cube_val_idx].buff.data(),
values[cube_val_idx].buff.size());
} else {
// min (float), max (float), num, num, num... (Byte)
size_t num_of_float =
values[cube_val_idx].buff.size() - 2 * sizeof(float);
float *float_ptr = new float[num_of_float];
char *src_ptr = new char[values[cube_val_idx].buff.size()];
memcpy(src_ptr,
values[cube_val_idx].buff.data(),
values[cube_val_idx].buff.size());
float *minmax = reinterpret_cast<float *>(src_ptr);
dequant(src_ptr + 2 * sizeof(float),
float_ptr,
minmax[0],
minmax[1],
num_of_float,
cube_quant_bits);
memcpy(data_ptr, float_ptr, sizeof(float) * num_of_float);
delete float_ptr;
delete src_ptr;
}
cube_val_idx++;
}
++sparse_idx;
}
TensorVector infer_in;
infer_in.insert(infer_in.end(), dense_out.begin(), dense_out.end());
infer_in.insert(infer_in.end(), sparse_out.begin(), sparse_out.end());
output_blob->SetBatchSize(batch_size);
VLOG(2) << "infer batch size: " << batch_size;
Timer timeline;
int64_t start = timeline.TimeStampUS();
timeline.Start();
if (InferManager::instance().infer(
GENERAL_MODEL_NAME, &infer_in, out, batch_size)) {
LOG(ERROR) << "Failed do infer in fluid model: " << GENERAL_MODEL_NAME;
return -1;
}
int64_t end = timeline.TimeStampUS();
CopyBlobInfo(input_blob, output_blob);
AddBlobInfo(output_blob, start);
AddBlobInfo(output_blob, end);
return 0;
}
DEFINE_OP(GeneralDistKVQuantInferOp);
} // namespace serving
} // namespace paddle_serving
} // namespace baidu
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#ifdef BCLOUD
#ifdef WITH_GPU
#include "paddle/paddle_inference_api.h"
#else
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#endif
#else
#include "paddle_inference_api.h" // NOLINT
#endif
#include "core/general-server/general_model_service.pb.h"
#include "core/general-server/op/general_infer_helper.h"
namespace baidu {
namespace paddle_serving {
namespace serving {
class GeneralDistKVQuantInferOp
: public baidu::paddle_serving::predictor::OpWithChannel<GeneralBlob> {
public:
typedef std::vector<paddle::PaddleTensor> TensorVector;
DECLARE_OP(GeneralDistKVQuantInferOp);
int inference();
};
} // namespace serving
} // namespace paddle_serving
} // namespace baidu
......@@ -151,6 +151,18 @@ int Resource::initialize(const std::string& path, const std::string& file) {
std::string cube_config_fullpath = "./" + resource_conf.cube_config_path() +
"/" + resource_conf.cube_config_file();
this->cube_config_fullpath = cube_config_fullpath;
this->cube_quant_bits = resource_conf.has_cube_quant_bits()
? resource_conf.cube_quant_bits()
: 0;
if (this->cube_quant_bits != 0 && this->cube_quant_bits != 8) {
LOG(ERROR) << "Cube quant bits illegal! should be 0 or 8.";
return -1;
}
if (this->cube_quant_bits == 0) {
LOG(INFO) << "cube quant mode OFF";
} else {
LOG(INFO) << "cube quant mode ON, quant bits: " << this->cube_quant_bits;
}
}
THREAD_SETSPECIFIC(_tls_bspec_key, NULL);
......@@ -258,38 +270,6 @@ int Resource::general_model_initialize(const std::string& path,
return 0;
}
int Resource::cube_initialize(const std::string& path,
const std::string& file) {
// cube
if (!FLAGS_enable_cube) {
return 0;
}
ResourceConf resource_conf;
if (configure::read_proto_conf(path, file, &resource_conf) != 0) {
LOG(ERROR) << "Failed initialize resource from: " << path << "/" << file;
return -1;
}
int err = 0;
std::string cube_config_file = resource_conf.cube_config_file();
if (err != 0) {
LOG(ERROR) << "reade cube_config_file failed, path[" << path << "], file["
<< cube_config_file << "]";
return -1;
}
err = CubeAPI::instance()->init(cube_config_file.c_str());
if (err != 0) {
LOG(ERROR) << "failed initialize cube, config: " << cube_config_file
<< " error code : " << err;
return -1;
}
LOG(INFO) << "Successfully initialize cube";
return 0;
}
int Resource::thread_initialize() {
// mempool
if (MempoolWrapper::instance().thread_initialize() != 0) {
......@@ -373,6 +353,7 @@ int Resource::thread_clear() {
// ...
return 0;
}
size_t Resource::get_cube_quant_bits() { return this->cube_quant_bits; }
int Resource::reload() {
if (FLAGS_enable_model_toolkit && InferManager::instance().reload() != 0) {
......
......@@ -82,7 +82,6 @@ class Resource {
}
int initialize(const std::string& path, const std::string& file);
int cube_initialize(const std::string& path, const std::string& file);
int general_model_initialize(const std::string& path,
const std::string& file);
......@@ -104,11 +103,13 @@ class Resource {
return reinterpret_cast<DynamicResource*>(
THREAD_GETSPECIFIC(_tls_bspec_key));
}
size_t get_cube_quant_bits();
private:
int thread_finalize() { return 0; }
std::shared_ptr<PaddleGeneralModelConfig> _config;
std::string cube_config_fullpath;
int cube_quant_bits; // 0 if no empty
THREAD_KEY_T _tls_bspec_key;
};
......
......@@ -202,14 +202,6 @@ int main(int argc, char** argv) {
}
VLOG(2) << "Succ call pthread worker start function";
if (Resource::instance().cube_initialize(FLAGS_resource_path,
FLAGS_resource_file) != 0) {
LOG(ERROR) << "Failed initialize cube, conf: " << FLAGS_resource_path << "/"
<< FLAGS_resource_file;
return -1;
}
VLOG(2) << "Succ initialize cube";
#ifndef BCLOUD
if (Resource::instance().general_model_initialize(FLAGS_resource_path,
......
set(seq_gen_src ${CMAKE_CURRENT_LIST_DIR}/seq_generator.cpp ${CMAKE_CURRENT_LIST_DIR}/seq_file.cpp)
set(seq_gen_src ${CMAKE_CURRENT_LIST_DIR}/seq_generator.cpp ${CMAKE_CURRENT_LIST_DIR}/seq_file.cpp ${CMAKE_CURRENT_LIST_DIR}/quant.cpp)
LIST(APPEND seq_gen_src ${PROTO_SRCS})
add_executable(seq_generator ${seq_gen_src})
target_link_libraries(seq_generator protobuf -lpthread)
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "quant.h"
#include <cmath>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include "seq_file.h"
using paddle::framework::proto::VarType;
float compute_loss(float* a, float* b, int emb_size) {
float sum = 0;
for (size_t i = 0; i < emb_size; i++) {
sum += (a[i] - b[i]) * (a[i] - b[i]);
}
return sum;
}
float* transfer(
float* in, float* out, float min, float max, int emb_size, int bits) {
float scale = (max - min) / pow(2, bits);
for (size_t i = 0; i < emb_size; i++) {
float x = in[i];
int val = round((x - min) / (max - min) * (pow(2, bits) - 1));
val = std::max(0, val);
val = std::min((int)pow(2, bits) - 1, val);
out[i] = val * scale + min;
}
return out;
}
char* quant(
float* in, char** out, float min, float max, int emb_size, int bits) {
float scale = (max - min) / pow(2, bits);
for (size_t i = 0; i < emb_size; ++i) {
float x = in[i];
int val = round((x - min) / (max - min) * (pow(2, bits) - 1));
val = std::max(0, val);
val = std::min((int)pow(2, bits) - 1, val);
*out[emb_size] = val;
}
return *out;
}
float* dequant(
char* in, float* out, float min, float max, int emb_size, int bits) {
float scale = (max - min) / pow(2, bits);
for (size_t i = 0; i < emb_size; ++i) {
float x =
scale * (((int)in[i] + (int)pow(2, bits)) % (int)pow(2, bits)) + min;
out[i] = x;
}
return out;
}
void greedy_search(float* in,
float& xmin,
float& xmax,
float& loss,
size_t emb_size,
int bits) {
int b = 200;
float r = 0.16;
xmin = 2147483647;
xmax = -2147483648;
float cur_min = xmin;
float cur_max = xmax;
for (size_t i = 0; i < emb_size; i++) {
xmin = std::min(xmin, in[i]);
xmax = std::max(xmax, in[i]);
}
cur_min = xmin;
cur_max = xmax;
float out[emb_size];
loss = compute_loss(
in, transfer(in, out, cur_min, cur_max, emb_size, bits), emb_size);
float stepsize = (cur_max - cur_min) / b;
float min_steps = b * (1 - r) * stepsize;
while (cur_min + min_steps < cur_max) {
float loss_l = compute_loss(
in,
transfer(in, out, cur_min + stepsize, cur_max, emb_size, bits),
emb_size);
float loss_r = compute_loss(
in,
transfer(in, out, cur_min, cur_max - stepsize, emb_size, bits),
emb_size);
if (loss_l < loss) {
cur_min = cur_min + stepsize;
if (loss_l < loss_r) {
loss = loss_l;
xmin = cur_min;
}
} else {
cur_max = cur_max - stepsize;
if (loss_r < loss) {
loss = loss_r;
xmax = cur_max;
}
}
}
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cmath>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include "core/predictor/framework.pb.h"
#include "seq_file.h"
using paddle::framework::proto::VarType;
void greedy_search(float* in,
float& xmin,
float& xmax,
float& loss,
size_t emb_size,
int bits);
// std::mutex g_mtx;
float compute_loss(float* a, float* b, int emb_size);
float* transfer(
float* in, float* out, float min, float max, int emb_size, int bits);
char* quant(
float* in, char** out, float min, float max, int emb_size, int bits);
float* dequant(
char* in, float* out, float min, float max, int emb_size, int bits);
void greedy_search(float* in,
float& xmin,
float& xmax,
float& loss,
size_t emb_size,
int bits);
......@@ -16,7 +16,9 @@
#include <iostream>
#include <memory>
#include "core/predictor/framework.pb.h"
#include "quant.h"
#include "seq_file.h"
using paddle::framework::proto::VarType;
std::map<int, size_t> var_type_size;
void reg_var_types() {
......@@ -31,6 +33,7 @@ void reg_var_types() {
var_type_size[static_cast<int>(VarType::UINT8)] = sizeof(uint8_t);
var_type_size[static_cast<int>(VarType::INT8)] = sizeof(int8_t);
}
int dump_parameter(const char *input_file, const char *output_file) {
std::ifstream is(input_file);
// the 1st field, unit32_t version for LoDTensor
......@@ -105,12 +108,127 @@ int dump_parameter(const char *input_file, const char *output_file) {
}
return 0;
}
int compress_parameter(const char *file1, const char *file2, int bits) {
std::ifstream is(file1);
// Step 1: is read version, os write version
uint32_t version;
is.read(reinterpret_cast<char *>(&version), sizeof(version));
if (version != 0) {
std::cout << "Version number " << version << " not supported" << std::endl;
return -1;
}
std::cout << "Version size: " << sizeof(version) << std::endl;
// Step 2: is read LoD level, os write LoD level
uint64_t lod_level;
is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
std::vector<std::vector<size_t>> lod;
lod.resize(lod_level);
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size;
is.read(reinterpret_cast<char *>(&size), sizeof(size));
std::vector<size_t> tmp(size / sizeof(size_t));
is.read(reinterpret_cast<char *>(tmp.data()),
static_cast<std::streamsize>(size));
lod[i] = tmp;
}
// Step 3: is read Protobuf os Write Protobuf
// Note: duplicate version field
is.read(reinterpret_cast<char *>(&version), sizeof(version));
if (version != 0) {
std::cout << "Version number " << version << " not supported" << std::endl;
return -1;
}
// Step 4: is read Tensor Data, os write min/max/quant data
VarType::TensorDesc desc;
int32_t size;
is.read(reinterpret_cast<char *>(&size), sizeof(size));
std::unique_ptr<char[]> buf(new char[size]);
is.read(reinterpret_cast<char *>(buf.get()), size);
if (!desc.ParseFromArray(buf.get(), size)) {
std::cout << "Cannot parse tensor desc" << std::endl;
return -1;
}
// read tensor
std::vector<int64_t> dims;
dims.reserve(static_cast<size_t>(desc.dims().size()));
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
std::cout << "Dims:";
for (auto x : dims) {
std::cout << " " << x;
}
std::cout << std::endl;
if (dims.size() != 2) {
std::cout << "Parameter dims not 2D" << std::endl;
return -1;
}
size_t numel = 1;
for (auto x : dims) {
numel *= x;
}
size_t buf_size = numel * var_type_size[desc.data_type()];
std::cout << buf_size << std::endl;
char *tensor_buf = new char[buf_size];
is.read(static_cast<char *>(tensor_buf), buf_size);
float *tensor_float_buf = reinterpret_cast<float *>(tensor_buf);
size_t per_line_size = dims[1] * 1 + 2 * sizeof(float);
char *tensor_out = new char[per_line_size * dims[0]];
float loss = 0;
float all_loss = 0;
std::cout << "Start Quant" << std::endl;
SeqFileWriter seq_file_writer(file2);
size_t offset = 0;
for (int64_t i = 0; i < dims[0]; ++i) {
float xmin = 0, xmax = 0, loss = 0;
size_t scale = dims[1];
char *tensor_temp = new char[per_line_size];
greedy_search(
tensor_float_buf + i * dims[1], xmin, xmax, loss, scale, bits);
for (size_t e = 0; e < dims[1]; ++e) {
float x = *(tensor_float_buf + i * dims[1] + e);
int val = round((x - xmin) / (xmax - xmin) * (pow(2, bits) - 1));
val = std::max(0, val);
val = std::min((int)pow(2, bits) - 1, val);
char *min_ptr = tensor_temp;
char *max_ptr = tensor_temp + sizeof(float);
memcpy(min_ptr, &xmin, sizeof(float));
memcpy(max_ptr, &xmax, sizeof(float));
*(tensor_temp + 2 * sizeof(float) + e) = val;
float unit = (xmax - xmin) / pow(2, bits);
float trans_val = unit * val + xmin;
}
seq_file_writer.write((char *)&i, sizeof(i), tensor_temp, per_line_size);
}
return 0;
}
int main(int argc, char **argv) {
if (argc != 3) {
std::cout << "Usage: seq_generator PARAMETER_FILE OUTPUT_FILE" << std::endl;
if (argc < 3 || argc > 4) {
std::cout << "Usage: if no compress, please follow:" << std::endl;
std::cout << "seq_generator PARAMETER_FILE OUTPUT_FILE\n" << std::endl;
std::cout << "if compress, please follow: " << std::endl;
std::cout << "seq_generator PARAMETER_FILE OUTPUT_FILE QUANT_BITS"
<< std::endl;
std::cout << "Now it only support 8 bit." << std::endl;
return -1;
}
reg_var_types();
dump_parameter(argv[1], argv[2]);
if (argc == 3) {
std::cout << "generate normal sparse param sequence file" << std::endl;
dump_parameter(argv[1], argv[2]);
return 0;
}
if (argc == 4) {
std::cout << "generate compressed sparse param sequence file" << std::endl;
compress_parameter(argv[1], argv[2], atoi(argv[3]));
return 0;
}
}
/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
ps -ef | grep cube | awk {'print $2'} | xargs kill -9
ps -ef | grep SimpleHTTPServer | awk {'print $2'} | xargs kill -9
rm -rf cube/cube_data cube/data cube/log* cube/nohup* cube/output/ cube/donefile cube/input cube/monitor cube/cube-builder.INFO
ps -ef | grep test | awk {'print $2'} | xargs kill -9
ps -ef | grep serving | awk {'print $2'} | xargs kill -9
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
#! /bin/bash
mkdir -p cube_model
mkdir -p cube/data
./seq_generator ctr_serving_model/SparseFeatFactors ./cube_model/feature
./seq_generator ctr_serving_model/SparseFeatFactors ./cube_model/feature
./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=./cube/data -shard_num=1 -only_build=false
mv ./cube/data/0_0/test_dict_part0/* ./cube/data/
cd cube && ./cube
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
#! /bin/bash
mkdir -p cube_model
mkdir -p cube/data
./seq_generator ctr_serving_model/SparseFeatFactors ./cube_model/feature 8
./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=./cube/data -shard_num=1 -only_build=false
mv ./cube/data/0_0/test_dict_part0/* ./cube/data/
cd cube && ./cube
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import os
import sys
from paddle_serving_server import OpMaker
from paddle_serving_server import OpSeqMaker
from paddle_serving_server import Server
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
general_dist_kv_infer_op = op_maker.create('general_dist_kv_quant_infer')
response_op = op_maker.create('general_response')
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(general_dist_kv_infer_op)
op_seq_maker.add_op(response_op)
server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(4)
server.load_model_config(sys.argv[1])
server.prepare_server(workdir="work_dir1", port=9292, device="cpu")
server.run_server()
......@@ -33,7 +33,7 @@ class OpMaker(object):
"general_text_response": "GeneralTextResponseOp",
"general_single_kv": "GeneralSingleKVOp",
"general_dist_kv_infer": "GeneralDistKVInferOp",
"general_dist_kv": "GeneralDistKVOp",
"general_dist_kv_quant_infer": "GeneralDistKVQuantInferOp",
"general_copy": "GeneralCopyOp"
}
......@@ -164,6 +164,8 @@ class Server(object):
if "dist_kv" in node.name:
self.resource_conf.cube_config_path = workdir
self.resource_conf.cube_config_file = self.cube_config_fn
if "quant" in node.name:
self.resource_conf.cube_quant_bits = 8
self.resource_conf.model_toolkit_path = workdir
self.resource_conf.model_toolkit_file = self.model_toolkit_fn
self.resource_conf.general_model_path = workdir
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册