diff --git a/demo-client/src/bert_service.cpp b/demo-client/src/bert_service.cpp
index c0b89deb08d1f5ca787730f52a402c164ae25647..c76b979c3b71a9a3e659991f61185ab9823e8c64 100644
--- a/demo-client/src/bert_service.cpp
+++ b/demo-client/src/bert_service.cpp
@@ -22,6 +22,7 @@
 #include "sdk-cpp/bert_service.pb.h"
 #include "sdk-cpp/include/common.h"
 #include "sdk-cpp/include/predictor_sdk.h"
+#include "data_pre.h"
 
 using baidu::paddle_serving::sdk_cpp::Predictor;
 using baidu::paddle_serving::sdk_cpp::PredictorApi;
@@ -31,31 +32,17 @@ using baidu::paddle_serving::predictor::bert_service::BertResInstance;
 using baidu::paddle_serving::predictor::bert_service::BertReqInstance;
 using baidu::paddle_serving::predictor::bert_service::Embedding_values;
 
-int batch_size = 49;
-int max_seq_len = 82;
-int layer_num = 12;
-int emb_size = 768;
-int thread_num = 1;
+extern int batch_size = 1;
+extern int max_seq_len = 128;
+extern int layer_num = 12;
+extern int emb_size = 768;
+extern int thread_num = 1;
 
 std::atomic<int> g_concurrency(0);
 std::vector<std::vector<int>> response_time;
 char* data_filename = "./data/bert/demo_wiki_train";
 
-std::vector<std::string> split(const std::string& str,
-                               const std::string& pattern) {
-  std::vector<std::string> res;
-  if (str == "") return res;
-  std::string strs = str + pattern;
-  size_t pos = strs.find(pattern);
-  while (pos != strs.npos) {
-    std::string temp = strs.substr(0, pos);
-    res.push_back(temp);
-    strs = strs.substr(pos + 1, strs.size());
-    pos = strs.find(pattern);
-  }
-  return res;
-}
-/*
+#if 1
 int create_req(Request* req,
                const std::vector<std::string>& data_list,
                int data_index,
@@ -90,59 +77,13 @@ int create_req(Request* req,
         ins->add_input_masks(0.0);
       }
     }
+    ins->set_max_seq_len(max_seq_len);
   }
   return 0;
 }
-*/
-
-int create_req(Request* req,
-               const std::vector<std::string>& data_list,
-               int data_index,
-               int batch_size) {
-  // add data
-  // avoid out of boundary
-  int cur_index = data_index;
-  if (cur_index >= data_list.size()) {
-    cur_index = cur_index % data_list.size();
-  }
-
-  std::vector<std::string> feature_list = split(data_list[cur_index], ";");
-
-  std::vector<std::string> src_field = split(feature_list[0], ":");
-  std::vector<std::string> src_ids = split(src_field[1], " ");
-
-  std::vector<std::string> pos_field = split(feature_list[1], ":");
-  std::vector<std::string> pos_ids = split(pos_field[1], " ");
-
-  std::vector<std::string> sent_field = split(feature_list[2], ":");
-  std::vector<std::string> sent_ids = split(sent_field[1], " ");
-
-  std::vector<std::string> mask_field = split(feature_list[3], ":");
-  std::vector<std::string> input_mask = split(mask_field[1], " ");
-
-  std::vector<int> shape;
-  std::vector<std::string> shapes = split(src_field[0], " ");
-  for (auto x: shapes) {
-    shape.push_back(std::stoi(x));
-  }
+#else
 
-  for (int i = 0; i < batch_size && i < shape[0]; ++i) {
-    BertReqInstance* ins = req->add_instances();
-    if (!ins) {
-      LOG(ERROR) << "Failed create req instance";
-      return -1;
-    }
-    for (int fi = 0; fi < max_seq_len; fi++) {
-      ins->add_token_ids(std::stoi(src_ids[i * max_seq_len + fi]));
-      ins->add_position_ids(std::stoi(pos_ids[i * max_seq_len + fi]));
-      ins->add_sentence_type_ids(std::stoi(sent_ids[i * max_seq_len + fi]));
-      ins->add_input_masks(std::stof(input_mask[i * max_seq_len + fi]));
-    }
-  }
-  return 0;
-}
-#if 0
 int create_req(Request* req,
                const std::vector<std::string>& data_list,
                int data_index,
@@ -167,11 +108,11 @@ int create_req(Request* req,
     std::vector<std::string> seg_list = split(feature_list[3], " ");
     std::vector<std::string> mask_list = split(feature_list[4], " ");
     for (int fi = 0; fi < max_seq_len; fi++) {
-      if (fi < std::stoi(shape_list[1])) {
-        ins->add_token_ids(std::stoi(token_list[fi + (i * max_seq_len)]));
-        ins->add_sentence_type_ids(std::stoll(seg_list[fi + (i * max_seq_len)]));
-        ins->add_position_ids(std::stoll(pos_list[fi + (i * max_seq_len)]));
-        ins->add_input_masks(std::stof(mask_list[fi + (i * max_seq_len)]));
+      if (fi < token_list.size()) {
+        ins->add_token_ids(std::stoi(token_list[fi]));
+        ins->add_sentence_type_ids(std::stoll(seg_list[fi]));
+        ins->add_position_ids(std::stoll(pos_list[fi]));
+        ins->add_input_masks(std::stof(mask_list[fi]));
       } else {
         ins->add_token_ids(0);
         ins->add_sentence_type_ids(0);
@@ -182,6 +123,7 @@ int create_req(Request* req,
   }
   return 0;
 }
+
 #endif
 
 void print_res(const Request& req,
@@ -232,17 +174,11 @@ void thread_worker(PredictorApi* api,
     }
     g_concurrency++;
     LOG(INFO) << "Current concurrency " << g_concurrency.load();
-#if 0
     int data_index = turns * batch_size;
     if (create_req(&req, data_list, data_index, batch_size) != 0) {
       return;
     }
-#else
-    if (create_req(&req, data_list, turns, batch_size) != 0) {
-      return;
-    }
-#endif
-    if (predictor->inference(&req, &res) != 0) {
+    if (predictor->inference(&req, &res) != 0) {
       LOG(ERROR) << "failed call predictor with req:" << req.ShortDebugString();
       return;
     }
@@ -310,6 +246,11 @@ int main(int argc, char** argv) {
   PredictorApi api;
   response_time.resize(thread_num);
   int server_concurrency = thread_num;
+  if (argc > 1) {
+    thread_num = std::stoi(argv[1]);
+    batch_size = std::stoi(argv[2]);
+    max_seq_len = std::stoi(argv[3]);
+  }
   // log set
 #ifdef BCLOUD
   logging::LoggingSettings settings;
diff --git a/demo-serving/op/bert_service_op.cpp b/demo-serving/op/bert_service_op.cpp
index 3ca21e64baa14f38046351793b5038f4ff728b76..c8a3cda5a4ead6a4f1be775ca0e742d357e12199 100644
--- a/demo-serving/op/bert_service_op.cpp
+++ b/demo-serving/op/bert_service_op.cpp
@@ -17,9 +17,6 @@
 #include 
 #include "predictor/framework/infer.h"
 #include "predictor/framework/memory.h"
-#if 1
-#include 
-#endif
 namespace baidu {
 namespace paddle_serving {
 namespace serving {
@@ -31,7 +28,7 @@
 using baidu::paddle_serving::predictor::bert_service::BertReqInstance;
 using baidu::paddle_serving::predictor::bert_service::Request;
 using baidu::paddle_serving::predictor::bert_service::Embedding_values;
-const uint32_t MAX_SEQ_LEN = 82;
+extern int64_t MAX_SEQ_LEN = 128;
 const bool POOLING = true;
 const int LAYER_NUM = 12;
 const int EMB_SIZE = 768;
@@ -48,6 +45,8 @@ int BertServiceOp::inference() {
     return 0;
   }
 
+  MAX_SEQ_LEN = req->instances(0).max_seq_len();
+
   paddle::PaddleTensor src_ids;
   paddle::PaddleTensor pos_ids;
   paddle::PaddleTensor seg_ids;
@@ -96,6 +95,7 @@ int BertServiceOp::inference() {
     memcpy(src_data,
            req_instance.token_ids().data(),
            sizeof(int64_t) * MAX_SEQ_LEN);
+#if 1
     memcpy(pos_data,
            req_instance.position_ids().data(),
            sizeof(int64_t) * MAX_SEQ_LEN);
@@ -105,54 +105,27 @@ int BertServiceOp::inference() {
     memcpy(input_masks_data,
            req_instance.input_masks().data(),
            sizeof(float) * MAX_SEQ_LEN);
+#endif
     index += MAX_SEQ_LEN;
   }
 
-#if 0
-  int64_t *src_data = static_cast<int64_t *>(src_ids.data.data());
-  std::ostringstream oss;
-  oss << "src_ids: ";
-  for (int i = 0; i < MAX_SEQ_LEN * batch_size; ++i) {
-    oss << src_data[i] << " ";
-  }
-  LOG(INFO) << oss.str();
-
-#endif
   in->push_back(src_ids);
   in->push_back(pos_ids);
   in->push_back(seg_ids);
   in->push_back(input_masks);
 
   TensorVector *out = butil::get_object<TensorVector>();
-// TensorVector out;
-/*
   if (!out) {
     LOG(ERROR) << "Failed get tls output object";
     return -1;
   }
+
+/*
+  float* example = (float*)(*in)[3].data.data();
+  for(uint32_t i = 0; i < MAX_SEQ_LEN; i++){
+    LOG(INFO) << *(example + i);
 */
"batch_size : " << batch_size; - for (int j = 0; j < 3; j ++) { - LOG(INFO) << "name : " << (*in)[j].name << " shape : " << (*in)[j].shape[0] - << " " << (*in)[j].shape[1] << " " << (*in)[j].shape[2]; - int64_t* example = (int64_t*)(*in)[j].data.data(); - std::ostringstream oss; - for(uint32_t i = MAX_SEQ_LEN * (batch_size - 1); i < MAX_SEQ_LEN * batch_size; i++){ - oss << *(example + i); - } - LOG(INFO) << "data : " << oss.str(); - } - for (int j =3; j < 4; j++) { - LOG(INFO) << "name : " << (*in)[j].name << " shape : " << (*in)[j].shape[0] - << " " << (*in)[j].shape[1] << " " << (*in)[j].shape[2]; - float* example = (float*)(*in)[j].data.data(); - std::ostringstream oss; - for(uint32_t i = MAX_SEQ_LEN * (batch_size - 1); i < MAX_SEQ_LEN * batch_size; i++){ - oss << *(example + i); - } - LOG(INFO) << "data : " << oss.str(); - } if (predictor::InferManager::instance().infer( BERT_MODEL_NAME, in, out, batch_size)) { @@ -160,20 +133,12 @@ int BertServiceOp::inference() { return -1; } -/* - paddle::NativeConfig config; - config.model_dir = "./data/model/paddle/fluid/bert"; - auto predictor = CreatePaddlePredictor(config); - predictor->Run(*in, &out); -*/ #if 0 - // float *out_data = static_cast(out->at(0).data.data()); - LOG(INFO) << "check point"; - /* LOG(INFO) << "batch_size : " << out->at(0).shape[0] << " seq_len : " << out->at(0).shape[1] << " emb_size : " << out->at(0).shape[2]; + float *out_data = (float*) out->at(0).data.data(); for (uint32_t bi = 0; bi < batch_size; bi++) { BertResInstance *res_instance = res->add_instances(); for (uint32_t si = 0; si < MAX_SEQ_LEN; si++) { @@ -184,7 +149,22 @@ int BertServiceOp::inference() { } } } +#else + LOG(INFO) << "batch_size : " << out->at(0).shape[0] + << " emb_size : " << out->at(0).shape[1]; + float *out_data = (float*) out->at(0).data.data(); + for (uint32_t bi = 0; bi < batch_size; bi++) { + BertResInstance *res_instance = res->add_instances(); + for (uint32_t si = 0; si < 1; si++) { + Embedding_values *emb_instance = res_instance->add_instances(); + for (uint32_t ei = 0; ei < EMB_SIZE; ei++) { + uint32_t index = bi * MAX_SEQ_LEN * EMB_SIZE + si * EMB_SIZE + ei; + emb_instance->add_values(out_data[index]); + } + } + } +#endif for (size_t i = 0; i < in->size(); ++i) { (*in)[i].shape.clear(); } @@ -196,43 +176,6 @@ int BertServiceOp::inference() { } out->clear(); butil::return_object(out); - */ - -#else - float *out_data = static_cast(out->at(0).data.data()); - std::ostringstream oss; - oss << "Shape: ["; - - for (auto x: out->at(0).shape) { - oss << x << " "; - } - oss << "]"; - - LOG(INFO) << oss.str(); - - // Output shape is [batch_size x 3] - for (uint32_t bi = 0; bi < batch_size; bi++) { - BertResInstance *res_instance = res->add_instances(); - std::ostringstream oss; - oss << "Sample " << bi << " ["; - oss << out_data[bi * 3 + 0] << " " - << out_data[bi * 3 + 1] << " " - << out_data[bi * 3 + 2] << "]"; - LOG(INFO) << oss.str(); - } - - for (size_t i = 0; i < in->size(); ++i) { - (*in)[i].shape.clear(); - } - in->clear(); - butil::return_object(in); - - for (size_t i = 0; i < out->size(); ++i) { - (*out)[i].shape.clear(); - } - out->clear(); - butil::return_object(out); -#endif return 0; } diff --git a/demo-serving/proto/bert_service.proto b/demo-serving/proto/bert_service.proto index 2d78ba557c68e3701d783bd25151909a889f498c..ce1ceeb5ee440c5fd6c8ff2e573d46c30fb4e8ff 100644 --- a/demo-serving/proto/bert_service.proto +++ b/demo-serving/proto/bert_service.proto @@ -25,6 +25,7 @@ message BertReqInstance { repeated int64 sentence_type_ids 
   repeated int64 sentence_type_ids = 2;
   repeated int64 position_ids = 3;
   repeated float input_masks = 4;
+  required int64 max_seq_len = 5;
 };
 
 message Request { repeated BertReqInstance instances = 1; };
diff --git a/inferencer-fluid-gpu/include/fluid_gpu_engine.h b/inferencer-fluid-gpu/include/fluid_gpu_engine.h
index 226e114f8cea32eda110cde6d7810506f7236d61..81c20d69121b19e0f43b03630c476dc8c2ae3d4f 100644
--- a/inferencer-fluid-gpu/include/fluid_gpu_engine.h
+++ b/inferencer-fluid-gpu/include/fluid_gpu_engine.h
@@ -200,6 +200,7 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore {
     analysis_config.EnableUseGpu(100, FLAGS_gpuid);
     analysis_config.SwitchSpecifyInputNames(true);
     analysis_config.SetCpuMathLibraryNumThreads(1);
+    analysis_config.SwitchIrOptim(true);
 
     if (params.enable_memory_optimization()) {
       analysis_config.EnableMemoryOptim(params.static_optimization(),
diff --git a/sdk-cpp/proto/bert_service.proto b/sdk-cpp/proto/bert_service.proto
index ae078f41c2ac94d9060a4ba7323fc0131c420fcd..6d130fa3245d4111442fe12792203527af4813d1 100644
--- a/sdk-cpp/proto/bert_service.proto
+++ b/sdk-cpp/proto/bert_service.proto
@@ -25,6 +25,7 @@ message BertReqInstance {
   repeated int64 sentence_type_ids = 2;
   repeated int64 position_ids = 3;
   repeated float input_masks = 4;
+  required int64 max_seq_len = 5;
 };
 
 message Request { repeated BertReqInstance instances = 1; };