Unverified commit 5699ab64, authored by Jiawei Wang, committed via GitHub

Merge branch 'develop' into cube_062

@@ -30,7 +30,7 @@ find_package(Threads REQUIRED)
find_package(CUDA QUIET)
include(simd)
# SET(CMAKE_BUILD_TYPE "Debug")
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
......
@@ -175,9 +175,12 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --p
| Argument                                       | Type  | Default | Description                                            |
| ---------------------------------------------- | ----- | ------- | ------------------------------------------------------ |
| `thread`                                       | int   | `2`     | Number of brpc service threads                          |
| `op_num`                                       | int[] | `0`     | Number of threads for each model in asynchronous mode   |
| `op_max_batch`                                 | int[] | `32`    | Maximum batch size for each model in asynchronous mode  |
| `gpu_ids`                                      | str[] | `"-1"`  | GPU card id(s) for each model                           |
| `port`                                         | int   | `9292`  | Exposed port of current service to users                |
| `model`                                        | str[] | `""`    | Path of paddle model directory to be served             |
| `mem_optim_off`                                | -     | -       | Disable memory / graphic memory optimization            |
| `ir_optim`                                     | bool  | False   | Enable analysis and optimization of calculation graph   |
| `use_mkl` (Only for cpu version)               | -     | -       | Run inference with MKL                                  |
@@ -186,7 +189,24 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --p
| `use_xpu`                                      | -     | -       | Run PaddleLite inference with Baidu Kunlun XPU          |
| `precision`                                    | str   | FP32    | Precision Mode, support FP32, FP16, INT8                |
| `use_calib`                                    | bool  | False   | Only for deployment with TensorRT                       |
| `gpu_multi_stream`                             | bool  | False   | Enable GPU multi-stream mode to get larger QPS          |
#### Description of asynchronous mode
Asynchronous mode is suitable for two cases: (1) the number of requests is very large; (2) multiple models are chained and you want to specify the concurrency of each model separately.
Asynchronous mode helps to improve the throughput (QPS) of the service, but the latency of a single request increases slightly.
In asynchronous mode, each model starts the number of threads you specify, and each thread holds one model instance. In other words, each model is equivalent to a thread pool with N threads, and tasks are taken from the pool's task queue for execution.
In asynchronous mode, each RPC server thread is only responsible for putting the request into the task queue of the model's thread pool; after a task is executed, the completed task is removed from the task queue.
In the table above, the number of RPC server threads is specified by --thread, and the default value is 2.
--op_num specifies the number of threads in each model's thread pool. The default value is 0, which means asynchronous mode is not used.
--op_max_batch specifies the maximum batch size for each model. The default value is 32, and it takes effect only when --op_num is not 0. An illustrative command combining these flags is sketched below.
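For example, a minimal sketch of a launch command that enables asynchronous mode for a single model; the flag values are illustrative only, not tuned recommendations:
```
# hypothetical values: 10 brpc threads, a 4-thread pool for the model, batches of up to 32
python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --gpu_ids 0 --op_num 4 --op_max_batch 32
```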
#### When you want a model to use multiple GPU cards
python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --gpu_ids 0,1,2
#### When you want to deploy two models
python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292
#### When you want to deploy two models, each using multiple GPU cards
python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292 --gpu_ids 0,1 1,2
#### When a service contains two models, each model needs multiple GPU cards, and asynchronous mode with a different concurrency for each model is required
python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292 --gpu_ids 0,1 1,2 --op_num 4 8
</center>
```python
......
@@ -172,19 +172,40 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --p
```
<center>
| Argument                                       | Type  | Default | Description                                            |
| ---------------------------------------------- | ----- | ------- | ------------------------------------------------------ |
| `thread`                                       | int   | `2`     | Number of brpc service threads                          |
| `op_num`                                       | int[] | `0`     | Number of threads for each model in asynchronous mode   |
| `op_max_batch`                                 | int[] | `32`    | Maximum batch size for each model in asynchronous mode  |
| `gpu_ids`                                      | str[] | `"-1"`  | GPU card id(s) for each model                           |
| `port`                                         | int   | `9292`  | Exposed port of current service to users                |
| `model`                                        | str[] | `""`    | Path of paddle model directory to be served             |
| `mem_optim_off`                                | -     | -       | Disable memory / graphic memory optimization            |
| `ir_optim`                                     | bool  | False   | Enable analysis and optimization of calculation graph   |
| `use_mkl` (Only for cpu version)               | -     | -       | Run inference with MKL                                  |
| `use_trt` (Only for trt version)               | -     | -       | Run inference with TensorRT                             |
| `use_lite` (Only for Intel x86 CPU or ARM CPU) | -     | -       | Run PaddleLite inference                                |
| `use_xpu`                                      | -     | -       | Run PaddleLite inference with Baidu Kunlun XPU          |
| `precision`                                    | str   | FP32    | Precision Mode, support FP32, FP16, INT8                |
| `use_calib`                                    | bool  | False   | Only for deployment with TensorRT                       |
| `gpu_multi_stream`                             | bool  | False   | Enable GPU multi-stream mode to get larger QPS          |
#### Description of asynchronous mode
Asynchronous mode is suitable for two cases: (1) the number of requests is very large; (2) multiple models are chained and you want to specify the concurrency of each model separately.
Asynchronous mode helps to improve the throughput (QPS) of the service, but the latency of a single request increases slightly.
In asynchronous mode, each model starts the N threads you specify, and each thread holds one model instance; in other words, each model is equivalent to a thread pool with N threads, and tasks are taken from the pool's task queue for execution.
In asynchronous mode, each RPC Server thread is only responsible for putting the request into the task queue of the model's thread pool; after a task is executed, the completed task is removed from the task queue.
In the table above, --thread 10 specifies the number of RPC Server threads (default 2), and --op_num specifies the number N of threads in each model's thread pool (default 0, which means asynchronous mode is not used).
--op_max_batch specifies the maximum batch size for each model; the default value is 32, and it takes effect only when --op_num is not 0. An illustrative command is sketched below.
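As an illustration (hypothetical values, not tuned recommendations), asynchronous mode for a single model could be enabled like this:
```
# 10 brpc threads, a 4-thread pool for the model, batches of up to 32
python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --gpu_ids 0 --op_num 4 --op_max_batch 32
```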
#### When one of your models needs to be deployed on multiple GPU cards
python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --gpu_ids 0,1,2
#### When one service deploys two models
python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292
#### When one service deploys two models and each model needs multiple GPU cards
python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292 --gpu_ids 0,1 1,2
#### When one service deploys two models, each model needs multiple GPU cards, and asynchronous mode with a different concurrency for each model is required
python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292 --gpu_ids 0,1 1,2 --op_num 4 8
</center>
......
File mode changed from 100755 to 100644
@@ -33,9 +33,7 @@ if (WITH_PYTHON)
add_custom_target(general_model_config_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(general_model_config_py_proto general_model_config_py_proto_init)
py_grpc_proto_compile(multi_lang_general_model_service_py_proto SRCS proto/multi_lang_general_model_service.proto)
add_custom_target(multi_lang_general_model_service_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(multi_lang_general_model_service_py_proto multi_lang_general_model_service_py_proto_init)
if (CLIENT)
py_proto_compile(sdk_configure_py_proto SRCS proto/sdk_configure.proto)
@@ -53,11 +51,7 @@ if (WITH_PYTHON)
COMMENT "Copy generated general_model_config proto file into directory paddle_serving_client/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
add_custom_command(TARGET multi_lang_general_model_service_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
COMMENT "Copy generated multi_lang_general_model_service proto file into directory paddle_serving_client/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
if (APP)
@@ -84,11 +78,6 @@ if (WITH_PYTHON)
COMMENT "Copy generated general_model_config proto file into directory paddle_serving_server/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
add_custom_command(TARGET multi_lang_general_model_service_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
COMMENT "Copy generated multi_lang_general_model_service proto file into directory paddle_serving_server/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package baidu.paddle_serving.multi_lang;
option java_multiple_files = true;
option java_package = "io.paddle.serving.grpc";
option java_outer_classname = "ServingProto";
message Tensor {
optional bytes data = 1;
repeated int32 int_data = 2;
repeated int64 int64_data = 3;
repeated float float_data = 4;
optional int32 elem_type = 5;
repeated int32 shape = 6;
repeated int32 lod = 7; // only for fetch tensor currently
};
message FeedInst { repeated Tensor tensor_array = 1; };
message FetchInst { repeated Tensor tensor_array = 1; };
message InferenceRequest {
repeated FeedInst insts = 1;
repeated string feed_var_names = 2;
repeated string fetch_var_names = 3;
required bool is_python = 4 [ default = false ];
required uint64 log_id = 5 [ default = 0 ];
};
message InferenceResponse {
repeated ModelOutput outputs = 1;
optional string tag = 2;
required int32 err_code = 3;
};
message ModelOutput {
repeated FetchInst insts = 1;
optional string engine_name = 2;
}
message SetTimeoutRequest { required int32 timeout_ms = 1; }
message SimpleResponse { required int32 err_code = 1; }
message GetClientConfigRequest {}
message GetClientConfigResponse { required string client_config_str = 1; }
service MultiLangGeneralModelService {
rpc Inference(InferenceRequest) returns (InferenceResponse) {}
rpc SetTimeout(SetTimeoutRequest) returns (SimpleResponse) {}
rpc GetClientConfig(GetClientConfigRequest)
returns (GetClientConfigResponse) {}
};
@@ -21,11 +21,12 @@ message EngineDesc {
  required string reloadable_meta = 3;
  required string reloadable_type = 4;
  required string model_dir = 5;
  repeated int32 gpu_ids = 6;
  required int32 runtime_thread_num = 7;
  required int32 batch_infer_size = 8;
  required int32 enable_batch_align = 9;
  optional string version_file = 10;
  optional string version_type = 11;

  /*
   * Sparse Parameter Service type. Valid types are:
@@ -38,16 +39,17 @@ message EngineDesc {
    LOCAL = 1;
    REMOTE = 2;
  }
  optional SparseParamServiceType sparse_param_service_type = 12;
  optional string sparse_param_service_table_name = 13;
  optional bool enable_memory_optimization = 14;
  optional bool enable_ir_optimization = 15;
  optional bool use_trt = 16;
  optional bool use_lite = 17;
  optional bool use_xpu = 18;
  optional bool use_gpu = 19;
  optional bool combined_model = 20;
  optional bool encrypted_model = 21;
  optional bool gpu_multi_stream = 22;
};

// model_toolkit conf
......
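For orientation, below is a rough sketch of how the new `gpu_ids` and `gpu_multi_stream` fields might look in a text-format engine configuration. The surrounding layout, the engine `name`/`type` entries (fields 1-2 are not shown in this hunk), and all values are assumptions for illustration, not taken from this commit:
```
# hypothetical model_toolkit entry illustrating the new EngineDesc fields
engines {
  name: "general_infer_0"     # assumed field name/value
  type: "PADDLE_INFER"        # assumed field name/value
  reloadable_meta: "workdir/fluid_time_file"
  reloadable_type: "timestamp_ne"
  model_dir: "uci_housing_model"
  gpu_ids: 0                  # repeated field: one engine may span several GPU cards
  gpu_ids: 1
  runtime_thread_num: 4       # per-engine thread pool size (asynchronous mode)
  batch_infer_size: 32
  enable_batch_align: 0
  gpu_multi_stream: true      # new flag: enable multi-stream execution on the GPU
}
```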
File mode changed from 100755 to 100644 (18 files)
@@ -207,7 +207,7 @@ class PredictorClient {
  void init_gflags(std::vector<std::string> argv);

  int init(const std::vector<std::string>& client_conf);

  void set_predictor_conf(const std::string& conf_path,
                          const std::string& conf_file);
@@ -218,23 +218,22 @@ class PredictorClient {
  int destroy_predictor();

  int numpy_predict(const std::vector<py::array_t<float>>& float_feed,
                    const std::vector<std::string>& float_feed_name,
                    const std::vector<std::vector<int>>& float_shape,
                    const std::vector<std::vector<int>>& float_lod_slot_batch,
                    const std::vector<py::array_t<int64_t>>& int_feed,
                    const std::vector<std::string>& int_feed_name,
                    const std::vector<std::vector<int>>& int_shape,
                    const std::vector<std::vector<int>>& int_lod_slot_batch,
                    const std::vector<std::string>& string_feed,
                    const std::vector<std::string>& string_feed_name,
                    const std::vector<std::vector<int>>& string_shape,
                    const std::vector<std::vector<int>>& string_lod_slot_batch,
                    const std::vector<std::string>& fetch_name,
                    PredictorRes& predict_res_batch,  // NOLINT
                    const int& pid,
                    const uint64_t log_id);

 private:
  PredictorApi _api;
@@ -243,6 +242,7 @@ class PredictorClient {
  std::string _predictor_path;
  std::string _conf_file;
  std::map<std::string, int> _feed_name_to_idx;
  std::vector<std::string> _feed_name;
  std::map<std::string, int> _fetch_name_to_idx;
  std::map<std::string, std::string> _fetch_name_to_var_name;
  std::map<std::string, int> _fetch_name_to_type;
......
...@@ -25,8 +25,6 @@ using baidu::paddle_serving::Timer; ...@@ -25,8 +25,6 @@ using baidu::paddle_serving::Timer;
using baidu::paddle_serving::predictor::general_model::Request; using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::Response; using baidu::paddle_serving::predictor::general_model::Response;
using baidu::paddle_serving::predictor::general_model::Tensor; using baidu::paddle_serving::predictor::general_model::Tensor;
using baidu::paddle_serving::predictor::general_model::FeedInst;
using baidu::paddle_serving::predictor::general_model::FetchInst;
enum ProtoDataType { P_INT64, P_FLOAT32, P_INT32, P_STRING }; enum ProtoDataType { P_INT64, P_FLOAT32, P_INT32, P_STRING };
std::once_flag gflags_init_flag; std::once_flag gflags_init_flag;
namespace py = pybind11; namespace py = pybind11;
...@@ -68,9 +66,13 @@ int PredictorClient::init(const std::vector<std::string> &conf_file) { ...@@ -68,9 +66,13 @@ int PredictorClient::init(const std::vector<std::string> &conf_file) {
_fetch_name_to_idx.clear(); _fetch_name_to_idx.clear();
_shape.clear(); _shape.clear();
int feed_var_num = model_config.feed_var_size(); int feed_var_num = model_config.feed_var_size();
_feed_name.clear();
VLOG(2) << "feed var num: " << feed_var_num; VLOG(2) << "feed var num: " << feed_var_num;
for (int i = 0; i < feed_var_num; ++i) { for (int i = 0; i < feed_var_num; ++i) {
_feed_name_to_idx[model_config.feed_var(i).alias_name()] = i; _feed_name_to_idx[model_config.feed_var(i).alias_name()] = i;
VLOG(2) << "feed [" << i << "]"
<< " name: " << model_config.feed_var(i).name();
_feed_name.push_back(model_config.feed_var(i).name());
VLOG(2) << "feed alias name: " << model_config.feed_var(i).alias_name() VLOG(2) << "feed alias name: " << model_config.feed_var(i).alias_name()
<< " index: " << i; << " index: " << i;
std::vector<int> tmp_feed_shape; std::vector<int> tmp_feed_shape;
...@@ -146,15 +148,15 @@ int PredictorClient::create_predictor() { ...@@ -146,15 +148,15 @@ int PredictorClient::create_predictor() {
} }
int PredictorClient::numpy_predict( int PredictorClient::numpy_predict(
const std::vector<std::vector<py::array_t<float>>> &float_feed_batch, const std::vector<py::array_t<float>> &float_feed,
const std::vector<std::string> &float_feed_name, const std::vector<std::string> &float_feed_name,
const std::vector<std::vector<int>> &float_shape, const std::vector<std::vector<int>> &float_shape,
const std::vector<std::vector<int>> &float_lod_slot_batch, const std::vector<std::vector<int>> &float_lod_slot_batch,
const std::vector<std::vector<py::array_t<int64_t>>> &int_feed_batch, const std::vector<py::array_t<int64_t>> &int_feed,
const std::vector<std::string> &int_feed_name, const std::vector<std::string> &int_feed_name,
const std::vector<std::vector<int>> &int_shape, const std::vector<std::vector<int>> &int_shape,
const std::vector<std::vector<int>> &int_lod_slot_batch, const std::vector<std::vector<int>> &int_lod_slot_batch,
const std::vector<std::vector<std::string>> &string_feed_batch, const std::vector<std::string> &string_feed,
const std::vector<std::string> &string_feed_name, const std::vector<std::string> &string_feed_name,
const std::vector<std::vector<int>> &string_shape, const std::vector<std::vector<int>> &string_shape,
const std::vector<std::vector<int>> &string_lod_slot_batch, const std::vector<std::vector<int>> &string_lod_slot_batch,
...@@ -162,10 +164,6 @@ int PredictorClient::numpy_predict( ...@@ -162,10 +164,6 @@ int PredictorClient::numpy_predict(
PredictorRes &predict_res_batch, PredictorRes &predict_res_batch,
const int &pid, const int &pid,
const uint64_t log_id) { const uint64_t log_id) {
int batch_size = std::max(float_feed_batch.size(), int_feed_batch.size());
batch_size = batch_size > string_feed_batch.size() ? batch_size
: string_feed_batch.size();
VLOG(2) << "batch size: " << batch_size;
predict_res_batch.clear(); predict_res_batch.clear();
Timer timeline; Timer timeline;
int64_t preprocess_start = timeline.TimeStampUS(); int64_t preprocess_start = timeline.TimeStampUS();
...@@ -188,134 +186,122 @@ int PredictorClient::numpy_predict( ...@@ -188,134 +186,122 @@ int PredictorClient::numpy_predict(
} }
int vec_idx = 0; int vec_idx = 0;
for (int bi = 0; bi < batch_size; bi++) { // batch is already in Tensor.
VLOG(2) << "prepare batch " << bi; std::vector<Tensor *> tensor_vec;
std::vector<Tensor *> tensor_vec;
FeedInst *inst = req.add_insts();
std::vector<py::array_t<float>> float_feed = float_feed_batch[bi];
std::vector<py::array_t<int64_t>> int_feed = int_feed_batch[bi];
std::vector<std::string> string_feed = string_feed_batch[bi];
for (auto &name : float_feed_name) {
tensor_vec.push_back(inst->add_tensor_array());
}
for (auto &name : int_feed_name) {
tensor_vec.push_back(inst->add_tensor_array());
}
for (auto &name : string_feed_name) { for (auto &name : float_feed_name) {
tensor_vec.push_back(inst->add_tensor_array()); tensor_vec.push_back(req.add_tensor());
} }
VLOG(2) << "batch [" << bi << "] " for (auto &name : int_feed_name) {
<< "prepared"; tensor_vec.push_back(req.add_tensor());
}
vec_idx = 0; for (auto &name : string_feed_name) {
for (auto &name : float_feed_name) { tensor_vec.push_back(req.add_tensor());
int idx = _feed_name_to_idx[name]; }
if (idx >= tensor_vec.size()) {
LOG(ERROR) << "idx > tensor_vec.size()";
return -1;
}
int nbytes = float_feed[vec_idx].nbytes();
void *rawdata_ptr = (void *)(float_feed[vec_idx].data(0));
int total_number = float_feed[vec_idx].size();
Tensor *tensor = tensor_vec[idx];
VLOG(2) << "prepare float feed " << name << " shape size "
<< float_shape[vec_idx].size();
for (uint32_t j = 0; j < float_shape[vec_idx].size(); ++j) {
tensor->add_shape(float_shape[vec_idx][j]);
}
for (uint32_t j = 0; j < float_lod_slot_batch[vec_idx].size(); ++j) {
tensor->add_lod(float_lod_slot_batch[vec_idx][j]);
}
tensor->set_elem_type(P_FLOAT32);
tensor->mutable_float_data()->Resize(total_number, 0); vec_idx = 0;
memcpy(tensor->mutable_float_data()->mutable_data(), rawdata_ptr, nbytes); for (auto &name : float_feed_name) {
vec_idx++; int idx = _feed_name_to_idx[name];
if (idx >= tensor_vec.size()) {
LOG(ERROR) << "idx > tensor_vec.size()";
return -1;
}
VLOG(2) << "prepare float feed " << name << " idx " << idx;
int nbytes = float_feed[vec_idx].nbytes();
void *rawdata_ptr = (void *)(float_feed[vec_idx].data(0));
int total_number = float_feed[vec_idx].size();
Tensor *tensor = tensor_vec[idx];
VLOG(2) << "prepare float feed " << name << " shape size "
<< float_shape[vec_idx].size();
for (uint32_t j = 0; j < float_shape[vec_idx].size(); ++j) {
tensor->add_shape(float_shape[vec_idx][j]);
}
for (uint32_t j = 0; j < float_lod_slot_batch[vec_idx].size(); ++j) {
tensor->add_lod(float_lod_slot_batch[vec_idx][j]);
} }
tensor->set_elem_type(P_FLOAT32);
VLOG(2) << "batch [" << bi << "] " tensor->set_name(_feed_name[idx]);
<< "float feed value prepared"; tensor->set_alias_name(name);
vec_idx = 0; tensor->mutable_float_data()->Resize(total_number, 0);
for (auto &name : int_feed_name) { memcpy(tensor->mutable_float_data()->mutable_data(), rawdata_ptr, nbytes);
int idx = _feed_name_to_idx[name]; vec_idx++;
if (idx >= tensor_vec.size()) { }
LOG(ERROR) << "idx > tensor_vec.size()";
return -1;
}
Tensor *tensor = tensor_vec[idx];
int nbytes = int_feed[vec_idx].nbytes();
void *rawdata_ptr = (void *)(int_feed[vec_idx].data(0));
int total_number = int_feed[vec_idx].size();
for (uint32_t j = 0; j < int_shape[vec_idx].size(); ++j) { vec_idx = 0;
tensor->add_shape(int_shape[vec_idx][j]); for (auto &name : int_feed_name) {
} int idx = _feed_name_to_idx[name];
for (uint32_t j = 0; j < int_lod_slot_batch[vec_idx].size(); ++j) { if (idx >= tensor_vec.size()) {
tensor->add_lod(int_lod_slot_batch[vec_idx][j]); LOG(ERROR) << "idx > tensor_vec.size()";
} return -1;
tensor->set_elem_type(_type[idx]);
if (_type[idx] == P_INT64) {
tensor->mutable_int64_data()->Resize(total_number, 0);
memcpy(
tensor->mutable_int64_data()->mutable_data(), rawdata_ptr, nbytes);
} else {
tensor->mutable_int_data()->Resize(total_number, 0);
memcpy(tensor->mutable_int_data()->mutable_data(), rawdata_ptr, nbytes);
}
vec_idx++;
} }
Tensor *tensor = tensor_vec[idx];
int nbytes = int_feed[vec_idx].nbytes();
void *rawdata_ptr = (void *)(int_feed[vec_idx].data(0));
int total_number = int_feed[vec_idx].size();
VLOG(2) << "batch [" << bi << "] " for (uint32_t j = 0; j < int_shape[vec_idx].size(); ++j) {
<< "int feed value prepared"; tensor->add_shape(int_shape[vec_idx][j]);
}
for (uint32_t j = 0; j < int_lod_slot_batch[vec_idx].size(); ++j) {
tensor->add_lod(int_lod_slot_batch[vec_idx][j]);
}
tensor->set_elem_type(_type[idx]);
tensor->set_name(_feed_name[idx]);
tensor->set_alias_name(name);
if (_type[idx] == P_INT64) {
tensor->mutable_int64_data()->Resize(total_number, 0);
memcpy(tensor->mutable_int64_data()->mutable_data(), rawdata_ptr, nbytes);
} else {
tensor->mutable_int_data()->Resize(total_number, 0);
memcpy(tensor->mutable_int_data()->mutable_data(), rawdata_ptr, nbytes);
}
vec_idx++;
}
vec_idx = 0; vec_idx = 0;
for (auto &name : string_feed_name) { for (auto &name : string_feed_name) {
int idx = _feed_name_to_idx[name]; int idx = _feed_name_to_idx[name];
if (idx >= tensor_vec.size()) { if (idx >= tensor_vec.size()) {
LOG(ERROR) << "idx > tensor_vec.size()"; LOG(ERROR) << "idx > tensor_vec.size()";
return -1; return -1;
} }
Tensor *tensor = tensor_vec[idx]; Tensor *tensor = tensor_vec[idx];
for (uint32_t j = 0; j < string_shape[vec_idx].size(); ++j) { for (uint32_t j = 0; j < string_shape[vec_idx].size(); ++j) {
tensor->add_shape(string_shape[vec_idx][j]); tensor->add_shape(string_shape[vec_idx][j]);
} }
for (uint32_t j = 0; j < string_lod_slot_batch[vec_idx].size(); ++j) { for (uint32_t j = 0; j < string_lod_slot_batch[vec_idx].size(); ++j) {
tensor->add_lod(string_lod_slot_batch[vec_idx][j]); tensor->add_lod(string_lod_slot_batch[vec_idx][j]);
} }
tensor->set_elem_type(P_STRING); tensor->set_elem_type(P_STRING);
tensor->set_name(_feed_name[idx]);
const int string_shape_size = string_shape[vec_idx].size(); tensor->set_alias_name(name);
// string_shape[vec_idx] = [1];cause numpy has no datatype of string.
// we pass string via vector<vector<string> >. const int string_shape_size = string_shape[vec_idx].size();
if (string_shape_size != 1) { // string_shape[vec_idx] = [1];cause numpy has no datatype of string.
LOG(ERROR) << "string_shape_size should be 1-D, but received is : " // we pass string via vector<vector<string> >.
<< string_shape_size; if (string_shape_size != 1) {
return -1; LOG(ERROR) << "string_shape_size should be 1-D, but received is : "
} << string_shape_size;
switch (string_shape_size) { return -1;
case 1: { }
tensor->add_data(string_feed[vec_idx]); switch (string_shape_size) {
break; case 1: {
} tensor->add_data(string_feed[vec_idx]);
break;
} }
vec_idx++;
} }
vec_idx++;
VLOG(2) << "batch [" << bi << "] "
<< "string feed value prepared";
} }
int64_t preprocess_end = timeline.TimeStampUS(); int64_t preprocess_end = timeline.TimeStampUS();
int64_t client_infer_start = timeline.TimeStampUS(); int64_t client_infer_start = timeline.TimeStampUS();
Response res; Response res;
int64_t client_infer_end = 0; int64_t client_infer_end = 0;
...@@ -347,19 +333,18 @@ int PredictorClient::numpy_predict( ...@@ -347,19 +333,18 @@ int PredictorClient::numpy_predict(
int idx = 0; int idx = 0;
for (auto &name : fetch_name) { for (auto &name : fetch_name) {
// int idx = _fetch_name_to_idx[name]; // int idx = _fetch_name_to_idx[name];
int shape_size = output.insts(0).tensor_array(idx).shape_size(); int shape_size = output.tensor(idx).shape_size();
VLOG(2) << "fetch var " << name << " index " << idx << " shape size " VLOG(2) << "fetch var " << name << " index " << idx << " shape size "
<< shape_size; << shape_size;
model._shape_map[name].resize(shape_size); model._shape_map[name].resize(shape_size);
for (int i = 0; i < shape_size; ++i) { for (int i = 0; i < shape_size; ++i) {
model._shape_map[name][i] = model._shape_map[name][i] = output.tensor(idx).shape(i);
output.insts(0).tensor_array(idx).shape(i);
} }
int lod_size = output.insts(0).tensor_array(idx).lod_size(); int lod_size = output.tensor(idx).lod_size();
if (lod_size > 0) { if (lod_size > 0) {
model._lod_map[name].resize(lod_size); model._lod_map[name].resize(lod_size);
for (int i = 0; i < lod_size; ++i) { for (int i = 0; i < lod_size; ++i) {
model._lod_map[name][i] = output.insts(0).tensor_array(idx).lod(i); model._lod_map[name][i] = output.tensor(idx).lod(i);
} }
} }
idx += 1; idx += 1;
...@@ -371,22 +356,22 @@ int PredictorClient::numpy_predict( ...@@ -371,22 +356,22 @@ int PredictorClient::numpy_predict(
// int idx = _fetch_name_to_idx[name]; // int idx = _fetch_name_to_idx[name];
if (_fetch_name_to_type[name] == P_INT64) { if (_fetch_name_to_type[name] == P_INT64) {
VLOG(2) << "ferch var " << name << "type int64"; VLOG(2) << "ferch var " << name << "type int64";
int size = output.insts(0).tensor_array(idx).int64_data_size(); int size = output.tensor(idx).int64_data_size();
model._int64_value_map[name] = std::vector<int64_t>( model._int64_value_map[name] = std::vector<int64_t>(
output.insts(0).tensor_array(idx).int64_data().begin(), output.tensor(idx).int64_data().begin(),
output.insts(0).tensor_array(idx).int64_data().begin() + size); output.tensor(idx).int64_data().begin() + size);
} else if (_fetch_name_to_type[name] == P_FLOAT32) { } else if (_fetch_name_to_type[name] == P_FLOAT32) {
VLOG(2) << "fetch var " << name << "type float"; VLOG(2) << "fetch var " << name << "type float";
int size = output.insts(0).tensor_array(idx).float_data_size(); int size = output.tensor(idx).float_data_size();
model._float_value_map[name] = std::vector<float>( model._float_value_map[name] = std::vector<float>(
output.insts(0).tensor_array(idx).float_data().begin(), output.tensor(idx).float_data().begin(),
output.insts(0).tensor_array(idx).float_data().begin() + size); output.tensor(idx).float_data().begin() + size);
} else if (_fetch_name_to_type[name] == P_INT32) { } else if (_fetch_name_to_type[name] == P_INT32) {
VLOG(2) << "fetch var " << name << "type int32"; VLOG(2) << "fetch var " << name << "type int32";
int size = output.insts(0).tensor_array(idx).int_data_size(); int size = output.tensor(idx).int_data_size();
model._int32_value_map[name] = std::vector<int32_t>( model._int32_value_map[name] = std::vector<int32_t>(
output.insts(0).tensor_array(idx).int_data().begin(), output.tensor(idx).int_data().begin(),
output.insts(0).tensor_array(idx).int_data().begin() + size); output.tensor(idx).int_data().begin() + size);
} }
idx += 1; idx += 1;
} }
......
...@@ -97,33 +97,31 @@ PYBIND11_MODULE(serving_client, m) { ...@@ -97,33 +97,31 @@ PYBIND11_MODULE(serving_client, m) {
[](PredictorClient &self) { self.destroy_predictor(); }) [](PredictorClient &self) { self.destroy_predictor(); })
.def("numpy_predict", .def("numpy_predict",
[](PredictorClient &self, [](PredictorClient &self,
const std::vector<std::vector<py::array_t<float>>> const std::vector<py::array_t<float>> &float_feed,
&float_feed_batch,
const std::vector<std::string> &float_feed_name, const std::vector<std::string> &float_feed_name,
const std::vector<std::vector<int>> &float_shape, const std::vector<std::vector<int>> &float_shape,
const std::vector<std::vector<int>> &float_lod_slot_batch, const std::vector<std::vector<int>> &float_lod_slot_batch,
const std::vector<std::vector<py::array_t<int64_t>>> const std::vector<py::array_t<int64_t>> &int_feed,
&int_feed_batch,
const std::vector<std::string> &int_feed_name, const std::vector<std::string> &int_feed_name,
const std::vector<std::vector<int>> &int_shape, const std::vector<std::vector<int>> &int_shape,
const std::vector<std::vector<int>> &int_lod_slot_batch, const std::vector<std::vector<int>> &int_lod_slot_batch,
const std::vector<std::vector<std::string>>& string_feed_batch, const std::vector<std::string> &string_feed,
const std::vector<std::string>& string_feed_name, const std::vector<std::string> &string_feed_name,
const std::vector<std::vector<int>>& string_shape, const std::vector<std::vector<int>> &string_shape,
const std::vector<std::vector<int>>& string_lod_slot_batch, const std::vector<std::vector<int>> &string_lod_slot_batch,
const std::vector<std::string> &fetch_name, const std::vector<std::string> &fetch_name,
PredictorRes &predict_res_batch, PredictorRes &predict_res_batch,
const int &pid, const int &pid,
const uint64_t log_id) { const uint64_t log_id) {
return self.numpy_predict(float_feed_batch, return self.numpy_predict(float_feed,
float_feed_name, float_feed_name,
float_shape, float_shape,
float_lod_slot_batch, float_lod_slot_batch,
int_feed_batch, int_feed,
int_feed_name, int_feed_name,
int_shape, int_shape,
int_lod_slot_batch, int_lod_slot_batch,
string_feed_batch, string_feed,
string_feed_name, string_feed_name,
string_shape, string_shape,
string_lod_slot_batch, string_lod_slot_batch,
......
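For context on how this refactored binding is reached in practice, here is a minimal Python sketch using the high-level `paddle_serving_client` API with the `uci_housing` example names referenced elsewhere in this document; the config path, feed name `x`, fetch name `price`, and the `batch` argument follow that example and are assumptions rather than part of this commit:
```python
# Minimal sketch, assuming the paddle_serving_client high-level Client API.
import numpy as np
from paddle_serving_client import Client

client = Client()
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])

# After this change a request carries one tensor per feed variable, with the
# batch dimension folded into the tensor shape instead of a per-sample list.
x = np.random.rand(1, 13).astype("float32")  # uci_housing has 13 input features
fetch_map = client.predict(feed={"x": x}, fetch=["price"], batch=True)
print(fetch_map["price"])
```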
File mode changed from 100755 to 100644 (2 files)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "core/general-server/op/general_copy_op.h"
#include <algorithm>
#include <iostream>
#include <memory>
#include <sstream>
#include "core/general-server/op/general_infer_helper.h"
#include "core/predictor/framework/infer.h"
#include "core/predictor/framework/memory.h"
#include "core/util/include/timer.h"
namespace baidu {
namespace paddle_serving {
namespace serving {
using baidu::paddle_serving::Timer;
using baidu::paddle_serving::predictor::MempoolWrapper;
using baidu::paddle_serving::predictor::general_model::Tensor;
using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::FeedInst;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
int GeneralCopyOp::inference() {
// reade request from client
const std::vector<std::string> pre_node_names = pre_names();
if (pre_node_names.size() != 1) {
LOG(ERROR) << "This op(" << op_name()
<< ") can only have one predecessor op, but received "
<< pre_node_names.size();
return -1;
}
const std::string pre_name = pre_node_names[0];
const GeneralBlob *input_blob = get_depend_argument<GeneralBlob>(pre_name);
uint64_t log_id = input_blob->GetLogId();
VLOG(2) << "(logid=" << log_id << ") precedent name: " << pre_name;
const TensorVector *in = &input_blob->tensor_vector;
VLOG(2) << "(logid=" << log_id << ") input size: " << in->size();
int batch_size = input_blob->GetBatchSize();
int input_var_num = 0;
GeneralBlob *res = mutable_data<GeneralBlob>();
res->SetLogId(log_id);
TensorVector *out = &res->tensor_vector;
VLOG(2) << "(logid=" << log_id << ") input batch size: " << batch_size;
res->SetBatchSize(batch_size);
if (!res) {
LOG(ERROR) << "(logid=" << log_id
<< ") Failed get op tls reader object output";
}
Timer timeline;
int64_t start = timeline.TimeStampUS();
VLOG(2) << "(logid=" << log_id << ") Going to init lod tensor";
for (int i = 0; i < in->size(); ++i) {
paddle::PaddleTensor lod_tensor;
CopyLod(&in->at(i), &lod_tensor);
lod_tensor.dtype = in->at(i).dtype;
lod_tensor.name = in->at(i).name;
VLOG(2) << "(logid=" << log_id << ") lod tensor [" << i
<< "].name = " << lod_tensor.name;
out->push_back(lod_tensor);
}
VLOG(2) << "(logid=" << log_id << ") pack done.";
for (int i = 0; i < out->size(); ++i) {
int64_t *src_ptr = static_cast<int64_t *>(in->at(i).data.data());
out->at(i).data.Resize(out->at(i).lod[0].back() * sizeof(int64_t));
out->at(i).shape = {out->at(i).lod[0].back(), 1};
int64_t *tgt_ptr = static_cast<int64_t *>(out->at(i).data.data());
for (int j = 0; j < out->at(i).lod[0].back(); ++j) {
tgt_ptr[j] = src_ptr[j];
}
}
VLOG(2) << "(logid=" << log_id << ") output done.";
timeline.Pause();
int64_t end = timeline.TimeStampUS();
CopyBlobInfo(input_blob, res);
AddBlobInfo(res, start);
AddBlobInfo(res, end);
VLOG(2) << "(logid=" << log_id << ") read data from client success";
return 0;
}
DEFINE_OP(GeneralCopyOp);
} // namespace serving
} // namespace paddle_serving
} // namespace baidu
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "core/general-server/general_model_service.pb.h"
#include "core/general-server/op/general_infer_helper.h"
#include "core/predictor/framework/resource.h"
#include "paddle_inference_api.h" // NOLINT
namespace baidu {
namespace paddle_serving {
namespace serving {
class GeneralCopyOp
: public baidu::paddle_serving::predictor::OpWithChannel<GeneralBlob> {
public:
typedef std::vector<paddle::PaddleTensor> TensorVector;
DECLARE_OP(GeneralCopyOp);
int inference();
};
} // namespace serving
} // namespace paddle_serving
} // namespace baidu
...@@ -36,7 +36,6 @@ using baidu::paddle_serving::predictor::MempoolWrapper; ...@@ -36,7 +36,6 @@ using baidu::paddle_serving::predictor::MempoolWrapper;
using baidu::paddle_serving::predictor::general_model::Tensor; using baidu::paddle_serving::predictor::general_model::Tensor;
using baidu::paddle_serving::predictor::general_model::Response; using baidu::paddle_serving::predictor::general_model::Response;
using baidu::paddle_serving::predictor::general_model::Request; using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::FetchInst;
using baidu::paddle_serving::predictor::InferManager; using baidu::paddle_serving::predictor::InferManager;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
......
File mode changed from 100755 to 100644
...@@ -34,7 +34,6 @@ using baidu::paddle_serving::predictor::MempoolWrapper; ...@@ -34,7 +34,6 @@ using baidu::paddle_serving::predictor::MempoolWrapper;
using baidu::paddle_serving::predictor::general_model::Tensor; using baidu::paddle_serving::predictor::general_model::Tensor;
using baidu::paddle_serving::predictor::general_model::Response; using baidu::paddle_serving::predictor::general_model::Response;
using baidu::paddle_serving::predictor::general_model::Request; using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::FetchInst;
using baidu::paddle_serving::predictor::InferManager; using baidu::paddle_serving::predictor::InferManager;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
......
...@@ -35,7 +35,6 @@ using baidu::paddle_serving::predictor::MempoolWrapper; ...@@ -35,7 +35,6 @@ using baidu::paddle_serving::predictor::MempoolWrapper;
using baidu::paddle_serving::predictor::general_model::Tensor; using baidu::paddle_serving::predictor::general_model::Tensor;
using baidu::paddle_serving::predictor::general_model::Response; using baidu::paddle_serving::predictor::general_model::Response;
using baidu::paddle_serving::predictor::general_model::Request; using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::FetchInst;
using baidu::paddle_serving::predictor::InferManager; using baidu::paddle_serving::predictor::InferManager;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
...@@ -117,9 +116,6 @@ int GeneralDistKVQuantInferOp::inference() { ...@@ -117,9 +116,6 @@ int GeneralDistKVQuantInferOp::inference() {
std::unordered_map<int, int> in_out_map; std::unordered_map<int, int> in_out_map;
baidu::paddle_serving::predictor::Resource &resource = baidu::paddle_serving::predictor::Resource &resource =
baidu::paddle_serving::predictor::Resource::instance(); baidu::paddle_serving::predictor::Resource::instance();
//TODO:Temporary addition, specific details to be studied by HexToString
std::shared_ptr<PaddleGeneralModelConfig> model_config =
resource.get_general_model_config()[0];
int cube_quant_bits = resource.get_cube_quant_bits(); int cube_quant_bits = resource.get_cube_quant_bits();
size_t EMBEDDING_SIZE = 0; size_t EMBEDDING_SIZE = 0;
if (cube_quant_bits == 0) { if (cube_quant_bits == 0) {
...@@ -146,7 +142,7 @@ int GeneralDistKVQuantInferOp::inference() { ...@@ -146,7 +142,7 @@ int GeneralDistKVQuantInferOp::inference() {
sparse_out[sparse_idx].shape.push_back( sparse_out[sparse_idx].shape.push_back(
sparse_out[sparse_idx].lod[0].back()); sparse_out[sparse_idx].lod[0].back());
sparse_out[sparse_idx].shape.push_back(EMBEDDING_SIZE); sparse_out[sparse_idx].shape.push_back(EMBEDDING_SIZE);
sparse_out[sparse_idx].name = model_config->_feed_name[i]; sparse_out[sparse_idx].name = in->at(i).name;
sparse_out[sparse_idx].data.Resize(sparse_out[sparse_idx].lod[0].back() * sparse_out[sparse_idx].data.Resize(sparse_out[sparse_idx].lod[0].back() *
EMBEDDING_SIZE * sizeof(float)); EMBEDDING_SIZE * sizeof(float));
// END HERE // END HERE
......
...@@ -31,7 +31,6 @@ using baidu::paddle_serving::predictor::MempoolWrapper; ...@@ -31,7 +31,6 @@ using baidu::paddle_serving::predictor::MempoolWrapper;
using baidu::paddle_serving::predictor::general_model::Tensor; using baidu::paddle_serving::predictor::general_model::Tensor;
using baidu::paddle_serving::predictor::general_model::Response; using baidu::paddle_serving::predictor::general_model::Response;
using baidu::paddle_serving::predictor::general_model::Request; using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::FetchInst;
using baidu::paddle_serving::predictor::InferManager; using baidu::paddle_serving::predictor::InferManager;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
...@@ -49,7 +48,7 @@ int GeneralInferOp::inference() { ...@@ -49,7 +48,7 @@ int GeneralInferOp::inference() {
const GeneralBlob *input_blob = get_depend_argument<GeneralBlob>(pre_name); const GeneralBlob *input_blob = get_depend_argument<GeneralBlob>(pre_name);
if (!input_blob) { if (!input_blob) {
LOG(ERROR) << "input_blob is nullptr,error"; LOG(ERROR) << "input_blob is nullptr,error";
return -1; return -1;
} }
uint64_t log_id = input_blob->GetLogId(); uint64_t log_id = input_blob->GetLogId();
VLOG(2) << "(logid=" << log_id << ") Get precedent op name: " << pre_name; VLOG(2) << "(logid=" << log_id << ") Get precedent op name: " << pre_name;
...@@ -57,7 +56,7 @@ int GeneralInferOp::inference() { ...@@ -57,7 +56,7 @@ int GeneralInferOp::inference() {
GeneralBlob *output_blob = mutable_data<GeneralBlob>(); GeneralBlob *output_blob = mutable_data<GeneralBlob>();
if (!output_blob) { if (!output_blob) {
LOG(ERROR) << "output_blob is nullptr,error"; LOG(ERROR) << "output_blob is nullptr,error";
return -1; return -1;
} }
output_blob->SetLogId(log_id); output_blob->SetLogId(log_id);
......
...@@ -30,42 +30,8 @@ using baidu::paddle_serving::Timer; ...@@ -30,42 +30,8 @@ using baidu::paddle_serving::Timer;
using baidu::paddle_serving::predictor::MempoolWrapper; using baidu::paddle_serving::predictor::MempoolWrapper;
using baidu::paddle_serving::predictor::general_model::Tensor; using baidu::paddle_serving::predictor::general_model::Tensor;
using baidu::paddle_serving::predictor::general_model::Request; using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::FeedInst;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
enum ProtoDataType { P_INT64, P_FLOAT32, P_INT32, P_STRING }; enum ProtoDataType { P_INT64, P_FLOAT32, P_INT32, P_STRING };
int conf_check(const Request *req,
const std::shared_ptr<PaddleGeneralModelConfig> &model_config) {
int var_num = req->insts(0).tensor_array_size();
if (var_num != model_config->_feed_type.size()) {
LOG(ERROR) << "feed var number not match: model config["
<< model_config->_feed_type.size() << "] vs. actual[" << var_num
<< "]";
return -1;
}
VLOG(2) << "fetch var num in reader op: " << req->fetch_var_names_size();
for (int i = 0; i < var_num; ++i) {
const Tensor &tensor = req->insts(0).tensor_array(i);
if (model_config->_feed_type[i] != tensor.elem_type()) {
LOG(ERROR) << "feed type not match.";
return -1;
}
if (model_config->_feed_shape[i].size() == tensor.shape_size()) {
for (int j = 0; j < model_config->_feed_shape[i].size(); ++j) {
tensor.shape(j);
if (model_config->_feed_shape[i][j] != tensor.shape(j)) {
LOG(ERROR) << "feed shape not match.";
return -1;
}
}
} else {
LOG(ERROR) << "feed shape not match.";
return -1;
}
}
return 0;
}
int GeneralReaderOp::inference() { int GeneralReaderOp::inference() {
// read request from client // read request from client
...@@ -93,7 +59,8 @@ int GeneralReaderOp::inference() { ...@@ -93,7 +59,8 @@ int GeneralReaderOp::inference() {
res->SetLogId(log_id); res->SetLogId(log_id);
Timer timeline; Timer timeline;
int64_t start = timeline.TimeStampUS(); int64_t start = timeline.TimeStampUS();
int var_num = req->insts(0).tensor_array_size(); // var_num means the number of feed_var.
int var_num = req->tensor_size();
VLOG(2) << "(logid=" << log_id << ") var num: " << var_num VLOG(2) << "(logid=" << log_id << ") var num: " << var_num
<< ") start to call load general model_conf op"; << ") start to call load general model_conf op";
...@@ -102,19 +69,7 @@ int GeneralReaderOp::inference() { ...@@ -102,19 +69,7 @@ int GeneralReaderOp::inference() {
baidu::paddle_serving::predictor::Resource::instance(); baidu::paddle_serving::predictor::Resource::instance();
VLOG(2) << "(logid=" << log_id << ") get resource pointer done."; VLOG(2) << "(logid=" << log_id << ") get resource pointer done.";
// get the first InferOP's model_config as ReaderOp's model_config by default.
std::shared_ptr<PaddleGeneralModelConfig> model_config =
resource.get_general_model_config().front();
// TODO(guru4elephant): how to do conditional check?
/*
int ret = conf_check(req, model_config);
if (ret != 0) {
LOG(ERROR) << "model conf of server:";
resource.print_general_model_config(model_config);
return 0;
}
*/
// package tensor // package tensor
// prepare basic information for input // prepare basic information for input
// specify the memory needed for output tensor_vector // specify the memory needed for output tensor_vector
...@@ -125,7 +80,7 @@ int GeneralReaderOp::inference() { ...@@ -125,7 +80,7 @@ int GeneralReaderOp::inference() {
int64_t databuf_size = 0; int64_t databuf_size = 0;
for (int i = 0; i < var_num; ++i) { for (int i = 0; i < var_num; ++i) {
paddle::PaddleTensor paddleTensor; paddle::PaddleTensor paddleTensor;
const Tensor &tensor = req->insts(0).tensor_array(i); const Tensor &tensor = req->tensor(i);
data_len = 0; data_len = 0;
elem_type = 0; elem_type = 0;
elem_size = 0; elem_size = 0;
...@@ -172,13 +127,16 @@ int GeneralReaderOp::inference() { ...@@ -172,13 +127,16 @@ int GeneralReaderOp::inference() {
VLOG(2) << "(logid=" << log_id << ") shape for var[" << i << "]: " << dim; VLOG(2) << "(logid=" << log_id << ") shape for var[" << i << "]: " << dim;
paddleTensor.shape.push_back(dim); paddleTensor.shape.push_back(dim);
} }
paddleTensor.name = model_config->_feed_name[i]; paddleTensor.name = tensor.name();
out->push_back(paddleTensor); out->push_back(paddleTensor);
VLOG(2) << "(logid=" << log_id << ") tensor size for var[" << i VLOG(2) << "(logid=" << log_id << ") tensor size for var[" << i
<< "]: " << data_len; << "]: " << data_len;
databuf_size = data_len * elem_size; databuf_size = data_len * elem_size;
out->at(i).data.Resize(databuf_size); void *databuf_char = MempoolWrapper::instance().malloc(databuf_size);
paddle::PaddleBuf paddleBuf(databuf_char, databuf_size);
out->at(i).data = paddleBuf;
// out->at(i).data.Resize(databuf_size);
if (out->at(i).lod.size() > 0) { if (out->at(i).lod.size() > 0) {
VLOG(2) << "(logid=" << log_id << ") var[" << i VLOG(2) << "(logid=" << log_id << ") var[" << i
<< "] has lod_tensor and len=" << out->at(i).lod[0].back(); << "] has lod_tensor and len=" << out->at(i).lod[0].back();
......
...@@ -34,7 +34,6 @@ using baidu::paddle_serving::predictor::MempoolWrapper; ...@@ -34,7 +34,6 @@ using baidu::paddle_serving::predictor::MempoolWrapper;
using baidu::paddle_serving::predictor::general_model::Tensor; using baidu::paddle_serving::predictor::general_model::Tensor;
using baidu::paddle_serving::predictor::general_model::Response; using baidu::paddle_serving::predictor::general_model::Response;
using baidu::paddle_serving::predictor::general_model::Request; using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::FetchInst;
using baidu::paddle_serving::predictor::general_model::ModelOutput; using baidu::paddle_serving::predictor::general_model::ModelOutput;
using baidu::paddle_serving::predictor::InferManager; using baidu::paddle_serving::predictor::InferManager;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
...@@ -49,7 +48,6 @@ int GeneralResponseOp::inference() { ...@@ -49,7 +48,6 @@ int GeneralResponseOp::inference() {
get_depend_argument<GeneralBlob>(pre_node_names[0])->GetLogId(); get_depend_argument<GeneralBlob>(pre_node_names[0])->GetLogId();
const Request *req = dynamic_cast<const Request *>(get_request_message()); const Request *req = dynamic_cast<const Request *>(get_request_message());
// response inst with only fetch_var_names
Response *res = mutable_data<Response>(); Response *res = mutable_data<Response>();
Timer timeline; Timer timeline;
...@@ -63,7 +61,8 @@ int GeneralResponseOp::inference() { ...@@ -63,7 +61,8 @@ int GeneralResponseOp::inference() {
baidu::paddle_serving::predictor::Resource::instance(); baidu::paddle_serving::predictor::Resource::instance();
VLOG(2) << "(logid=" << log_id << ") get resource pointer done."; VLOG(2) << "(logid=" << log_id << ") get resource pointer done.";
//get the last InferOP's model_config as ResponseOp's model_config by default. // get the last InferOP's model_config as ResponseOp's model_config by
// default.
std::shared_ptr<PaddleGeneralModelConfig> model_config = std::shared_ptr<PaddleGeneralModelConfig> model_config =
resource.get_general_model_config().back(); resource.get_general_model_config().back();
...@@ -71,6 +70,10 @@ int GeneralResponseOp::inference() { ...@@ -71,6 +70,10 @@ int GeneralResponseOp::inference() {
<< ") max body size : " << brpc::fLU64::FLAGS_max_body_size; << ") max body size : " << brpc::fLU64::FLAGS_max_body_size;
std::vector<int> fetch_index; std::vector<int> fetch_index;
// this is based on GetOutPutNames() is ordered map.
// and the order of Output is the same as the prototxt FetchVar.
// otherwise, you can only get the Output by the corresponding of
// Name -- Alias_name.
fetch_index.resize(req->fetch_var_names_size()); fetch_index.resize(req->fetch_var_names_size());
for (int i = 0; i < req->fetch_var_names_size(); ++i) { for (int i = 0; i < req->fetch_var_names_size(); ++i) {
fetch_index[i] = fetch_index[i] =
...@@ -95,40 +98,41 @@ int GeneralResponseOp::inference() { ...@@ -95,40 +98,41 @@ int GeneralResponseOp::inference() {
ModelOutput *output = res->add_outputs(); ModelOutput *output = res->add_outputs();
// To get the order of model return values // To get the order of model return values
output->set_engine_name(pre_name); output->set_engine_name(pre_name);
FetchInst *fetch_inst = output->add_insts();
var_idx = 0;
// idx is the real index of FetchVar.
// idx is not the index of FetchList.
// fetch_index is the real index in FetchVar of Fetchlist
// for example, FetchVar = {0:A, 1:B, 2:C}
// FetchList = {0:C,1:A}, at this situation.
// fetch_index = [2,0], C`index = 2 and A`index = 0
for (auto &idx : fetch_index) { for (auto &idx : fetch_index) {
Tensor *tensor = fetch_inst->add_tensor_array(); Tensor *tensor = output->add_tensor();
//tensor->set_elem_type(1); tensor->set_name(in->at(idx).name);
if (model_config->_is_lod_fetch[idx]) { tensor->set_alias_name(model_config->_fetch_alias_name[idx]);
VLOG(2) << "(logid=" << log_id << ") out[" << idx << "] " for (int k = 0; k < in->at(idx).shape.size(); ++k) {
<< model_config->_fetch_name[idx] << " is lod_tensor"; VLOG(2) << "(logid=" << log_id << ") shape[" << k
for (int k = 0; k < in->at(idx).shape.size(); ++k) { << "]: " << in->at(idx).shape[k];
VLOG(2) << "(logid=" << log_id << ") shape[" << k tensor->add_shape(in->at(idx).shape[k]);
<< "]: " << in->at(idx).shape[k]; }
tensor->add_shape(in->at(idx).shape[k]); std::string str_tensor_type = "is tensor";
} if (model_config->_is_lod_fetch[idx] && in->at(idx).lod.size() > 0) {
} else { str_tensor_type = "is lod_tensor";
VLOG(2) << "(logid=" << log_id << ") out[" << idx << "] " for (int j = 0; j < in->at(idx).lod[0].size(); ++j) {
<< model_config->_fetch_name[idx] << " is tensor"; tensor->add_lod(in->at(idx).lod[0][j]);
for (int k = 0; k < in->at(idx).shape.size(); ++k) {
VLOG(2) << "(logid=" << log_id << ") shape[" << k
<< "]: " << in->at(idx).shape[k];
tensor->add_shape(in->at(idx).shape[k]);
} }
} }
} VLOG(2) << "(logid=" << log_id << ") out[" << idx << "] "
<< model_config->_fetch_name[idx] << str_tensor_type;
var_idx = 0;
for (auto &idx : fetch_index) {
cap = 1; cap = 1;
for (int j = 0; j < in->at(idx).shape.size(); ++j) { for (int j = 0; j < in->at(idx).shape.size(); ++j) {
cap *= in->at(idx).shape[j]; cap *= in->at(idx).shape[j];
} }
FetchInst *fetch_p = output->mutable_insts(0);
auto dtype = in->at(idx).dtype; auto dtype = in->at(idx).dtype;
if (dtype == paddle::PaddleDType::INT64) { if (dtype == paddle::PaddleDType::INT64) {
tensor->set_elem_type(0);
VLOG(2) << "(logid=" << log_id << ") Prepare int64 var [" VLOG(2) << "(logid=" << log_id << ") Prepare int64 var ["
<< model_config->_fetch_name[idx] << "]."; << model_config->_fetch_name[idx] << "].";
int64_t *data_ptr = static_cast<int64_t *>(in->at(idx).data.data()); int64_t *data_ptr = static_cast<int64_t *>(in->at(idx).data.data());
...@@ -137,35 +141,24 @@ int GeneralResponseOp::inference() { ...@@ -137,35 +141,24 @@ int GeneralResponseOp::inference() {
// `Swap` method is faster than `{}` method. // `Swap` method is faster than `{}` method.
google::protobuf::RepeatedField<int64_t> tmp_data(data_ptr, google::protobuf::RepeatedField<int64_t> tmp_data(data_ptr,
data_ptr + cap); data_ptr + cap);
fetch_p->mutable_tensor_array(var_idx)->mutable_int64_data()->Swap( output->mutable_tensor(var_idx)->mutable_int64_data()->Swap(&tmp_data);
&tmp_data);
} else if (dtype == paddle::PaddleDType::FLOAT32) { } else if (dtype == paddle::PaddleDType::FLOAT32) {
tensor->set_elem_type(1);
VLOG(2) << "(logid=" << log_id << ") Prepare float var [" VLOG(2) << "(logid=" << log_id << ") Prepare float var ["
<< model_config->_fetch_name[idx] << "]."; << model_config->_fetch_name[idx] << "].";
float *data_ptr = static_cast<float *>(in->at(idx).data.data()); float *data_ptr = static_cast<float *>(in->at(idx).data.data());
google::protobuf::RepeatedField<float> tmp_data(data_ptr, google::protobuf::RepeatedField<float> tmp_data(data_ptr,
data_ptr + cap); data_ptr + cap);
fetch_p->mutable_tensor_array(var_idx)->mutable_float_data()->Swap( output->mutable_tensor(var_idx)->mutable_float_data()->Swap(&tmp_data);
&tmp_data);
} else if (dtype == paddle::PaddleDType::INT32) { } else if (dtype == paddle::PaddleDType::INT32) {
tensor->set_elem_type(2);
VLOG(2) << "(logid=" << log_id << ")Prepare int32 var [" VLOG(2) << "(logid=" << log_id << ")Prepare int32 var ["
<< model_config->_fetch_name[idx] << "]."; << model_config->_fetch_name[idx] << "].";
int32_t *data_ptr = static_cast<int32_t *>(in->at(idx).data.data()); int32_t *data_ptr = static_cast<int32_t *>(in->at(idx).data.data());
google::protobuf::RepeatedField<int32_t> tmp_data(data_ptr, google::protobuf::RepeatedField<int32_t> tmp_data(data_ptr,
data_ptr + cap); data_ptr + cap);
fetch_p->mutable_tensor_array(var_idx)->mutable_int_data()->Swap( output->mutable_tensor(var_idx)->mutable_int_data()->Swap(&tmp_data);
&tmp_data);
}
if (model_config->_is_lod_fetch[idx]) {
if (in->at(idx).lod.size() > 0) {
for (int j = 0; j < in->at(idx).lod[0].size(); ++j) {
fetch_p->mutable_tensor_array(var_idx)->add_lod(
in->at(idx).lod[0][j]);
}
}
} }
VLOG(2) << "(logid=" << log_id << ") fetch var [" VLOG(2) << "(logid=" << log_id << ") fetch var ["
...@@ -205,4 +198,4 @@ DEFINE_OP(GeneralResponseOp); ...@@ -205,4 +198,4 @@ DEFINE_OP(GeneralResponseOp);
} // namespace serving } // namespace serving
} // namespace paddle_serving } // namespace paddle_serving
} // namespace baidu } // namespace baidu
\ No newline at end of file
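The fetch_index mapping described in the comments of `GeneralResponseOp::inference()` above can be reproduced in a few lines. The sketch below is standalone and illustrative only; the map stands in for `model_config->_fetch_alias_name_to_index`, and the variable names are not taken from the repository.

```cpp
// Minimal sketch of the FetchVar / FetchList index mapping described above.
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  // FetchVar order from the model prototxt: {0:A, 1:B, 2:C}
  std::map<std::string, int> fetch_alias_name_to_index = {
      {"A", 0}, {"B", 1}, {"C", 2}};
  // FetchList requested by the client: {0:C, 1:A}
  std::vector<std::string> fetch_var_names = {"C", "A"};

  std::vector<int> fetch_index(fetch_var_names.size());
  for (size_t i = 0; i < fetch_var_names.size(); ++i) {
    fetch_index[i] = fetch_alias_name_to_index[fetch_var_names[i]];
  }
  // Prints "2 0", matching the comment: C's index is 2 and A's index is 0.
  for (int idx : fetch_index) std::cout << idx << " ";
  std::cout << std::endl;
  return 0;
}
```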
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "core/general-server/op/general_text_reader_op.h"
#include <algorithm>
#include <iostream>
#include <memory>
#include <sstream>
#include "core/predictor/framework/infer.h"
#include "core/predictor/framework/memory.h"
#include "core/util/include/timer.h"
namespace baidu {
namespace paddle_serving {
namespace serving {
using baidu::paddle_serving::Timer;
using baidu::paddle_serving::predictor::MempoolWrapper;
using baidu::paddle_serving::predictor::general_model::Tensor;
using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::FeedInst;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
int GeneralTextReaderOp::inference() {
// read the request from the client
const Request *req = dynamic_cast<const Request *>(get_request_message());
uint64_t log_id = req->log_id();
int batch_size = req->insts_size();
int input_var_num = 0;
std::vector<int64_t> elem_type;
std::vector<int64_t> elem_size;
std::vector<int64_t> capacity;
GeneralBlob *res = mutable_data<GeneralBlob>();
if (!res) {
LOG(ERROR) << "(logid=" << log_id
<< ") Failed get op tls reader object output";
}
TensorVector *out = &res->tensor_vector;
res->SetBatchSize(batch_size);
res->SetLogId(log_id);
if (batch_size <= 0) {
LOG(ERROR) << "(logid=" << log_id << ") Batch size < 0";
return -1;
}
Timer timeline;
int64_t start = timeline.TimeStampUS();
int var_num = req->insts(0).tensor_array_size();
VLOG(2) << "(logid=" << log_id << ") var num: " << var_num;
VLOG(2) << "(logid=" << log_id
<< ") start to call load general model_conf op";
baidu::paddle_serving::predictor::Resource &resource =
baidu::paddle_serving::predictor::Resource::instance();
VLOG(2) << "(logid=" << log_id << ") get resource pointer done.";
std::shared_ptr<PaddleGeneralModelConfig> model_config =
resource.get_general_model_config()[0];
VLOG(2) << "(logid=" << log_id << ") print general model config done.";
elem_type.resize(var_num);
elem_size.resize(var_num);
capacity.resize(var_num);
for (int i = 0; i < var_num; ++i) {
paddle::PaddleTensor lod_tensor;
elem_type[i] = req->insts(0).tensor_array(i).elem_type();
VLOG(2) << "(logid=" << log_id << ") var[" << i
<< "] has elem type: " << elem_type[i];
if (elem_type[i] == 0) { // int64
elem_size[i] = sizeof(int64_t);
lod_tensor.dtype = paddle::PaddleDType::INT64;
} else {
elem_size[i] = sizeof(float);
lod_tensor.dtype = paddle::PaddleDType::FLOAT32;
}
if (req->insts(0).tensor_array(i).shape(0) == -1) {
lod_tensor.lod.resize(1);
lod_tensor.lod[0].push_back(0);
VLOG(2) << "(logid=" << log_id << ") var[" << i << "] is lod_tensor";
} else {
lod_tensor.shape.push_back(batch_size);
capacity[i] = 1;
for (int k = 0; k < req->insts(0).tensor_array(i).shape_size(); ++k) {
int dim = req->insts(0).tensor_array(i).shape(k);
VLOG(2) << "(logid=" << log_id << ") shape for var[" << i
<< "]: " << dim;
capacity[i] *= dim;
lod_tensor.shape.push_back(dim);
}
VLOG(2) << "(logid=" << log_id << ") var[" << i
<< "] is tensor, capacity: " << capacity[i];
}
lod_tensor.name = model_config->_feed_name[i];
out->push_back(lod_tensor);
}
for (int i = 0; i < var_num; ++i) {
if (out->at(i).lod.size() == 1) {
for (int j = 0; j < batch_size; ++j) {
const Tensor &tensor = req->insts(j).tensor_array(i);
int data_len = tensor.int_data_size();
int cur_len = out->at(i).lod[0].back();
out->at(i).lod[0].push_back(cur_len + data_len);
}
out->at(i).data.Resize(out->at(i).lod[0].back() * elem_size[i]);
out->at(i).shape = {out->at(i).lod[0].back(), 1};
VLOG(2) << "(logid=" << log_id << ") var[" << i
<< "] is lod_tensor and len=" << out->at(i).lod[0].back();
} else {
out->at(i).data.Resize(batch_size * capacity[i] * elem_size[i]);
VLOG(2) << "(logid=" << log_id << ") var[" << i
<< "] is tensor and capacity=" << batch_size * capacity[i];
}
}
for (int i = 0; i < var_num; ++i) {
if (elem_type[i] == 0) {
int64_t *dst_ptr = static_cast<int64_t *>(out->at(i).data.data());
int offset = 0;
for (int j = 0; j < batch_size; ++j) {
for (int k = 0; k < req->insts(j).tensor_array(i).int_data_size();
++k) {
dst_ptr[offset + k] = req->insts(j).tensor_array(i).int_data(k);
}
if (out->at(i).lod.size() == 1) {
offset = out->at(i).lod[0][j + 1];
} else {
offset += capacity[i];
}
}
} else {
float *dst_ptr = static_cast<float *>(out->at(i).data.data());
int offset = 0;
for (int j = 0; j < batch_size; ++j) {
for (int k = 0; k < req->insts(j).tensor_array(i).int_data_size();
++k) {
dst_ptr[offset + k] = req->insts(j).tensor_array(i).int_data(k);
}
if (out->at(i).lod.size() == 1) {
offset = out->at(i).lod[0][j + 1];
} else {
offset += capacity[i];
}
}
}
}
int64_t end = timeline.TimeStampUS();
res->p_size = 0;
AddBlobInfo(res, start);
AddBlobInfo(res, end);
VLOG(2) << "(logid=" << log_id << ") read data from client success";
return 0;
}
DEFINE_OP(GeneralTextReaderOp);
} // namespace serving
} // namespace paddle_serving
} // namespace baidu
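The LoD bookkeeping in `GeneralTextReaderOp::inference()` above grows `lod[0]` by each instance's data length and then sizes the flat buffer from the final offset. A minimal sketch of that accumulation, with made-up instance lengths:

```cpp
// Sketch of the lod[0] accumulation used above for variable-length inputs.
// The per-request lengths are hypothetical example values.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int> instance_lengths = {4, 2, 5};  // per-request data lengths
  std::vector<size_t> lod = {0};                  // lod[0] starts at 0
  for (int len : instance_lengths) {
    lod.push_back(lod.back() + len);              // cumulative offsets
  }
  // lod == {0, 4, 6, 11}; the flat buffer holds lod.back() elements,
  // and its byte size is lod.back() * sizeof(int64_t) for an int64 feed var.
  std::cout << "total elements: " << lod.back()
            << ", bytes: " << lod.back() * sizeof(int64_t) << std::endl;
  return 0;
}
```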
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "core/general-server/general_model_service.pb.h"
#include "core/general-server/load_general_model_service.pb.h"
#include "core/general-server/op/general_infer_helper.h"
#include "core/predictor/framework/resource.h"
#include "paddle_inference_api.h" // NOLINT
namespace baidu {
namespace paddle_serving {
namespace serving {
class GeneralTextReaderOp
: public baidu::paddle_serving::predictor::OpWithChannel<GeneralBlob> {
public:
typedef std::vector<paddle::PaddleTensor> TensorVector;
DECLARE_OP(GeneralTextReaderOp);
int inference();
};
} // namespace serving
} // namespace paddle_serving
} // namespace baidu
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "core/general-server/op/general_text_response_op.h"
#include <algorithm>
#include <iostream>
#include <memory>
#include <sstream>
#include "core/predictor/framework/infer.h"
#include "core/predictor/framework/memory.h"
#include "core/predictor/framework/resource.h"
#include "core/util/include/timer.h"
namespace baidu {
namespace paddle_serving {
namespace serving {
using baidu::paddle_serving::Timer;
using baidu::paddle_serving::predictor::MempoolWrapper;
using baidu::paddle_serving::predictor::general_model::Tensor;
using baidu::paddle_serving::predictor::general_model::Response;
using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::FetchInst;
using baidu::paddle_serving::predictor::general_model::ModelOutput;
using baidu::paddle_serving::predictor::InferManager;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
int GeneralTextResponseOp::inference() {
VLOG(2) << "Going to run inference";
const std::vector<std::string> pre_node_names = pre_names();
VLOG(2) << "pre node names size: " << pre_node_names.size();
const GeneralBlob *input_blob;
uint64_t log_id =
get_depend_argument<GeneralBlob>(pre_node_names[0])->GetLogId();
const Request *req = dynamic_cast<const Request *>(get_request_message());
// response inst with only fetch_var_names
Response *res = mutable_data<Response>();
Timer timeline;
int64_t start = timeline.TimeStampUS();
VLOG(2) << "(logid=" << log_id
<< ") start to call load general model_conf op";
baidu::paddle_serving::predictor::Resource &resource =
baidu::paddle_serving::predictor::Resource::instance();
VLOG(2) << "(logid=" << log_id << ") get resource pointer done.";
std::shared_ptr<PaddleGeneralModelConfig> model_config =
resource.get_general_model_config().back();
std::vector<int> fetch_index;
fetch_index.resize(req->fetch_var_names_size());
for (int i = 0; i < req->fetch_var_names_size(); ++i) {
fetch_index[i] =
model_config->_fetch_alias_name_to_index[req->fetch_var_names(i)];
}
for (uint32_t pi = 0; pi < pre_node_names.size(); ++pi) {
const std::string &pre_name = pre_node_names[pi];
VLOG(2) << "(logid=" << log_id << ") pre names[" << pi << "]: " << pre_name
<< " (" << pre_node_names.size() << ")";
input_blob = get_depend_argument<GeneralBlob>(pre_name);
if (!input_blob) {
LOG(ERROR) << "(logid=" << log_id
<< ") Failed mutable depended argument, op: " << pre_name;
return -1;
}
const TensorVector *in = &input_blob->tensor_vector;
int batch_size = input_blob->GetBatchSize();
VLOG(2) << "(logid=" << log_id << ") input batch size: " << batch_size;
ModelOutput *output = res->add_outputs();
output->set_engine_name(
pre_name); // To get the order of model return values
for (int i = 0; i < batch_size; ++i) {
FetchInst *fetch_inst = output->add_insts();
for (auto &idx : fetch_index) {
Tensor *tensor = fetch_inst->add_tensor_array();
// currently only response float tensor or lod_tensor
tensor->set_elem_type(1);
if (model_config->_is_lod_fetch[idx]) {
VLOG(2) << "(logid=" << log_id << ") out[" << idx << " is lod_tensor";
tensor->add_shape(-1);
} else {
VLOG(2) << "(logid=" << log_id << ") out[" << idx << "] is tensor";
for (int k = 1; k < in->at(idx).shape.size(); ++k) {
VLOG(2) << "(logid=" << log_id << ") shape[" << k - 1
<< "]: " << in->at(idx).shape[k];
tensor->add_shape(in->at(idx).shape[k]);
}
}
}
}
int var_idx = 0;
for (auto &idx : fetch_index) {
float *data_ptr = static_cast<float *>(in->at(idx).data.data());
int cap = 1;
for (int j = 1; j < in->at(idx).shape.size(); ++j) {
cap *= in->at(idx).shape[j];
}
if (model_config->_is_lod_fetch[idx]) {
for (int j = 0; j < batch_size; ++j) {
for (int k = in->at(idx).lod[0][j]; k < in->at(idx).lod[0][j + 1];
k++) {
output->mutable_insts(j)
->mutable_tensor_array(var_idx)
->add_float_data(data_ptr[k]);
}
}
} else {
for (int j = 0; j < batch_size; ++j) {
for (int k = j * cap; k < (j + 1) * cap; ++k) {
output->mutable_insts(j)
->mutable_tensor_array(var_idx)
->add_float_data(data_ptr[k]);
}
}
}
var_idx++;
}
}
if (req->profile_server()) {
int64_t end = timeline.TimeStampUS();
// TODO(barriery): multi-model profile_time.
// At present, only the response_op is multi-input, so here we get
// the profile_time by hard coding. It needs to be replaced with
// a more elegant way.
for (uint32_t pi = 0; pi < pre_node_names.size(); ++pi) {
input_blob = get_depend_argument<GeneralBlob>(pre_node_names[pi]);
VLOG(2) << "(logid=" << log_id
<< ") p size for input blob: " << input_blob->p_size;
int profile_time_idx = -1;
if (pi == 0) {
profile_time_idx = 0;
} else {
profile_time_idx = input_blob->p_size - 2;
}
for (; profile_time_idx < input_blob->p_size; ++profile_time_idx) {
res->add_profile_time(input_blob->time_stamp[profile_time_idx]);
}
}
// TODO(guru4elephant): find more elegant way to do this
res->add_profile_time(start);
res->add_profile_time(end);
}
return 0;
}
DEFINE_OP(GeneralTextResponseOp);
} // namespace serving
} // namespace paddle_serving
} // namespace baidu
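For non-lod fetch vars, `GeneralTextResponseOp::inference()` above slices the flat output buffer in fixed strides of `cap` elements per batch instance. A small standalone sketch of that index math, using made-up sizes:

```cpp
// Sketch of the fixed-stride copy used for non-lod fetch vars above.
// batch_size and per-instance capacity are hypothetical example values.
#include <iostream>
#include <vector>

int main() {
  const int batch_size = 2;
  const int cap = 3;  // elements per instance (product of shape[1..])
  std::vector<float> flat = {0.1f, 0.2f, 0.3f, 1.1f, 1.2f, 1.3f};

  std::vector<std::vector<float>> per_instance(batch_size);
  for (int j = 0; j < batch_size; ++j) {
    for (int k = j * cap; k < (j + 1) * cap; ++k) {
      per_instance[j].push_back(flat[k]);  // same index math as the op
    }
  }
  std::cout << "instance 1 first value: " << per_instance[1][0] << std::endl;
  return 0;
}
```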
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "core/general-server/general_model_service.pb.h"
#include "core/general-server/op/general_infer_helper.h"
#include "paddle_inference_api.h" // NOLINT
namespace baidu {
namespace paddle_serving {
namespace serving {
class GeneralTextResponseOp
: public baidu::paddle_serving::predictor::OpWithChannel<
baidu::paddle_serving::predictor::general_model::Response> {
public:
typedef std::vector<paddle::PaddleTensor> TensorVector;
DECLARE_OP(GeneralTextResponseOp);
int inference();
};
} // namespace serving
} // namespace paddle_serving
} // namespace baidu
...@@ -24,17 +24,16 @@ message Tensor { ...@@ -24,17 +24,16 @@ message Tensor {
repeated int32 int_data = 2; repeated int32 int_data = 2;
repeated int64 int64_data = 3; repeated int64 int64_data = 3;
repeated float float_data = 4; repeated float float_data = 4;
optional int32 elem_type = 5; optional int32 elem_type =
repeated int32 shape = 6; 5; // 0 means int64, 1 means float32, 2 means int32, 3 means bytes(string)
repeated int32 lod = 7; // only for fetch tensor currently repeated int32 shape = 6; // shape should include batch
repeated int32 lod = 7; // only for fetch tensor currently
optional string name = 8; // get from the Model prototxt
optional string alias_name = 9; // get from the Model prototxt
}; };
message FeedInst { repeated Tensor tensor_array = 1; };
message FetchInst { repeated Tensor tensor_array = 1; };
message Request { message Request {
repeated FeedInst insts = 1; repeated Tensor tensor = 1;
repeated string fetch_var_names = 2; repeated string fetch_var_names = 2;
optional bool profile_server = 3 [ default = false ]; optional bool profile_server = 3 [ default = false ];
required uint64 log_id = 4 [ default = 0 ]; required uint64 log_id = 4 [ default = 0 ];
...@@ -46,7 +45,7 @@ message Response { ...@@ -46,7 +45,7 @@ message Response {
}; };
message ModelOutput { message ModelOutput {
repeated FetchInst insts = 1; repeated Tensor tensor = 1;
optional string engine_name = 2; optional string engine_name = 2;
} }
......
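The proto change above replaces the nested `FeedInst`/`FetchInst` messages with a flat `repeated Tensor` field carrying `name` and `alias_name`. A hedged sketch of how a request body could be filled under the new layout is shown below; only fields visible in the diff are used, and the feed/fetch names ("x", "input", "price") are illustrative, not from the repository.

```cpp
// Sketch of filling the new flat Request layout from the .proto diff above.
#include "core/general-server/general_model_service.pb.h"

void build_request(
    baidu::paddle_serving::predictor::general_model::Request* req) {
  using baidu::paddle_serving::predictor::general_model::Tensor;
  Tensor* t = req->add_tensor();   // was: insts(0).tensor_array(i)
  t->set_name("x");                // illustrative name from the model prototxt
  t->set_alias_name("input");      // illustrative client-facing alias
  t->set_elem_type(1);             // 1 means float32
  t->add_shape(1);                 // shape should include batch
  t->add_shape(13);
  for (int i = 0; i < 13; ++i) {
    t->add_float_data(0.0f);
  }
  req->add_fetch_var_names("price");  // illustrative fetch alias
  req->set_log_id(0);
}
```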
...@@ -280,6 +280,7 @@ class PdsCodeGenerator : public CodeGenerator { ...@@ -280,6 +280,7 @@ class PdsCodeGenerator : public CodeGenerator {
" baidu::rpc::ClosureGuard done_guard(done);\n" " baidu::rpc::ClosureGuard done_guard(done);\n"
" baidu::rpc::Controller* cntl = \n" " baidu::rpc::Controller* cntl = \n"
" static_cast<baidu::rpc::Controller*>(cntl_base);\n" " static_cast<baidu::rpc::Controller*>(cntl_base);\n"
" cntl->set_response_compress_type(brpc::COMPRESS_TYPE_GZIP);\n"
" uint64_t log_id = request->log_id();\n" " uint64_t log_id = request->log_id();\n"
" cntl->set_log_id(log_id);\n" " cntl->set_log_id(log_id);\n"
" ::baidu::paddle_serving::predictor::InferService* svr = \n" " ::baidu::paddle_serving::predictor::InferService* svr = \n"
...@@ -322,6 +323,7 @@ class PdsCodeGenerator : public CodeGenerator { ...@@ -322,6 +323,7 @@ class PdsCodeGenerator : public CodeGenerator {
" baidu::rpc::ClosureGuard done_guard(done);\n" " baidu::rpc::ClosureGuard done_guard(done);\n"
" baidu::rpc::Controller* cntl = \n" " baidu::rpc::Controller* cntl = \n"
" static_cast<baidu::rpc::Controller*>(cntl_base);\n" " static_cast<baidu::rpc::Controller*>(cntl_base);\n"
" cntl->set_response_compress_type(brpc::COMPRESS_TYPE_GZIP);\n"
" uint64_t log_id = equest->log_id();\n" " uint64_t log_id = equest->log_id();\n"
" cntl->set_log_id(log_id);\n" " cntl->set_log_id(log_id);\n"
" ::baidu::paddle_serving::predictor::InferService* svr = \n" " ::baidu::paddle_serving::predictor::InferService* svr = \n"
...@@ -1023,6 +1025,7 @@ class PdsCodeGenerator : public CodeGenerator { ...@@ -1023,6 +1025,7 @@ class PdsCodeGenerator : public CodeGenerator {
" brpc::ClosureGuard done_guard(done);\n" " brpc::ClosureGuard done_guard(done);\n"
" brpc::Controller* cntl = \n" " brpc::Controller* cntl = \n"
" static_cast<brpc::Controller*>(cntl_base);\n" " static_cast<brpc::Controller*>(cntl_base);\n"
" cntl->set_response_compress_type(brpc::COMPRESS_TYPE_GZIP);\n"
" uint64_t log_id = request->log_id();\n" " uint64_t log_id = request->log_id();\n"
" cntl->set_log_id(log_id);\n" " cntl->set_log_id(log_id);\n"
" ::baidu::paddle_serving::predictor::InferService* svr = \n" " ::baidu::paddle_serving::predictor::InferService* svr = \n"
...@@ -1067,6 +1070,7 @@ class PdsCodeGenerator : public CodeGenerator { ...@@ -1067,6 +1070,7 @@ class PdsCodeGenerator : public CodeGenerator {
" brpc::ClosureGuard done_guard(done);\n" " brpc::ClosureGuard done_guard(done);\n"
" brpc::Controller* cntl = \n" " brpc::Controller* cntl = \n"
" static_cast<brpc::Controller*>(cntl_base);\n" " static_cast<brpc::Controller*>(cntl_base);\n"
" cntl->set_response_compress_type(brpc::COMPRESS_TYPE_GZIP);\n"
" uint64_t log_id = request->log_id();\n" " uint64_t log_id = request->log_id();\n"
" cntl->set_log_id(log_id);\n" " cntl->set_log_id(log_id);\n"
" ::baidu::paddle_serving::predictor::InferService* svr = \n" " ::baidu::paddle_serving::predictor::InferService* svr = \n"
......
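All four hunks above inject the same line into the generated brpc service methods, so responses are gzip-compressed at the RPC layer. A minimal sketch of that call in isolation (the wrapper function is hypothetical; `set_response_compress_type` and `COMPRESS_TYPE_GZIP` are standard brpc API):

```cpp
// Minimal sketch of what the generated code now does: gzip-compress responses.
#include <brpc/controller.h>

void enable_gzip_response(brpc::Controller* cntl) {
  // brpc compresses the response body before sending it back to the client.
  cntl->set_response_compress_type(brpc::COMPRESS_TYPE_GZIP);
}
```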
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef BCLOUD
#include <base/atomicops.h>
#else
#include <butil/atomicops.h>
#endif
#include <errno.h>
#include <algorithm>
#include <deque>
#include <vector>
#include "core/predictor/common/inner_common.h"
#include "core/predictor/framework/infer_data.h"
#include "core/predictor/framework/memory.h"
#include <boost/function.hpp>
namespace im {
namespace bsf {
template <>
struct Task<baidu::paddle_serving::predictor::Tensor,
baidu::paddle_serving::predictor::Tensor> {
typedef Task<baidu::paddle_serving::predictor::Tensor,
baidu::paddle_serving::predictor::Tensor>
TaskT;
typedef baidu::paddle_serving::predictor::Tensor Tensor;
typedef baidu::paddle_serving::predictor::Tensor InType;
typedef baidu::paddle_serving::predictor::Tensor OutType;
typedef baidu::paddle_serving::predictor::BatchTensor BatchTensor;
typedef baidu::paddle_serving::predictor::BatchTensor InArrayT;
typedef baidu::paddle_serving::predictor::BatchTensor OutArrayT;
struct Segment {
Segment(void* p, size_t b, size_t s) : ptr(p), begin(b), size(s) {}
void* ptr;
size_t begin;
size_t size;
};
int read_fd;
int write_fd;
pid_t owner_tid;
const InArrayT* in;
OutArrayT* out;
size_t rem;
size_t size;
butil::atomic<size_t> index;
const BatchTensor* get(bool is_in) const {
if (is_in) {
return in;
} else {
return out;
}
}
BatchTensor* get(bool is_in) {
if (is_in) {
return const_cast<BatchTensor*>(in);
} else {
return out;
}
}
Task() {
read_fd = -1;
write_fd = -1;
owner_tid = -1;
in = NULL;
out = NULL;
rem = -1;
size = -1;
index.store(0, butil::memory_order_relaxed);
}
};
template <>
class BatchTasks<Task<baidu::paddle_serving::predictor::Tensor,
baidu::paddle_serving::predictor::Tensor>> {
public:
typedef baidu::paddle_serving::predictor::Tensor Tensor;
typedef baidu::paddle_serving::predictor::Tensor InType;
typedef baidu::paddle_serving::predictor::Tensor OutType;
typedef baidu::paddle_serving::predictor::DataBuf DataBuf;
typedef baidu::paddle_serving::predictor::MempoolWrapper MempoolWrapper;
typedef Task<baidu::paddle_serving::predictor::Tensor,
baidu::paddle_serving::predictor::Tensor>
TaskT;
typedef TaskMeta<TaskT> TaskMetaT;
typedef TaskT::InArrayT InArrayT;
typedef TaskT::OutArrayT OutArrayT;
explicit BatchTasks(size_t batch_size, bool batch_align = false)
: _batch_size(batch_size),
_rem_size(batch_size),
_batch_align(batch_align) {
_batch_in.clear();
_batch_out.clear();
_tasks.clear();
}
~BatchTasks() {
_batch_in.clear();
_batch_out.clear();
_tasks.clear();
}
static bool check_valid(const InArrayT& in,
OutArrayT& out, // NOLINT
bool align) { // NOLINT
if (align) {
if (out.count() <= 0 || out.size() <= 0) {
LOG(ERROR) << "Out tensor is empty, when aligned";
return false;
}
if (out.size() != in.size()) {
LOG(ERROR) << "In/Out tensor size not eq: " << out.size()
<< "!=" << in.size();
return false;
}
for (size_t fi = 0, shape0 = 0; fi < out.count(); ++fi) {
if (!out[fi].valid()) {
LOG(ERROR) << "Out[" << fi << "] tensor not valid";
return false;
}
if (out.size() != out[fi].shape0()) {
LOG(ERROR) << "Shape0 not consistency, " << out.size()
<< "!=" << out[fi].shape0() << ", " << fi;
return false;
}
}
}
return true;
}
size_t append_task(TaskT* task) {
size_t add = std::min(task->rem, _rem_size);
if (!_batch_align) {
add = task->rem;
}
TaskMetaT tm(task, task->in->size() - task->rem, add);
_tasks.push_back(tm);
task->rem -= add;
_rem_size -= add;
return _rem_size;
}
void merge_tasks() {
merge_input();
merge_output();
}
void merge_input() {
if (_tasks.size() <= 0 || _tasks[0].task->in->count() <= 0) {
return;
}
if (_tasks.size() == 1 && !_batch_align) {
TaskMetaT& tm = _tasks[0];
_batch_in = *(tm.task->in);
return;
}
merge_tensor(true);
}
void merge_output() {
if (_batch_align) {
if (_tasks.size() <= 0 || _tasks[0].task->out->count() <= 0) {
return;
}
}
if (_tasks.size() <= 0 || _tasks[0].task->out->count() <= 0) {
return;
}
TaskMetaT& tm = _tasks[0];
if (_tasks.size() == 1 && !_batch_align) {
_batch_out = *(tm.task->out);
return;
}
if (tm.task->out->size() <= 0) {
// shape is empty
_batch_out = *(tm.task->out);
return;
}
if ((*tm.task->out)[0].data.data() == 0 ||
(*tm.task->out)[0].data.size() == 0) {
_batch_out = *(tm.task->out);
return;
}
merge_tensor(false);
}
void merge_tensor(bool is_in) {
// accumulate batch size from fetched tasks
size_t batch_size = 0;
for (size_t ti = 0; ti < _tasks.size(); ++ti) {
TaskMetaT& tm = _tasks[ti];
size_t add = tm.end - tm.begin;
batch_size += add;
}
// merge all instances in each tensor's data
size_t tensor_count = _tasks[0].task->get(is_in)->count();
for (size_t fi = 0; fi < tensor_count; ++fi) {
const Tensor& head = (*(_tasks[0].task->get(is_in)))[fi];
Tensor batch_tensor;
batch_tensor.name = head.name;
batch_tensor.type = head.type;
batch_tensor.shape.push_back(batch_size);
size_t ins_ele_count = 1;
for (size_t si = 1; si < head.shape.size(); ++si) {
batch_tensor.shape.push_back(head.shape[si]);
ins_ele_count *= head.shape[si];
}
size_t tensor_ele_count = ins_ele_count * batch_size;
size_t ins_byte = ins_ele_count * head.ele_byte();
size_t tensor_byte = tensor_ele_count * head.ele_byte();
void* data_buf = MempoolWrapper::instance().malloc(tensor_byte);
if (!data_buf) {
LOG(ERROR) << "Malloc failed, size: " << tensor_byte;
return;
}
size_t data_byte = 0;
for (size_t ti = 0; ti < _tasks.size(); ++ti) {
TaskMetaT& tm = _tasks[ti];
size_t acc_byte = ins_byte * (tm.end - tm.begin);
if (data_byte + acc_byte > tensor_byte) {
LOG(ERROR) << "Invalid bytes: " << data_byte << " + " << acc_byte
<< " >= " << tensor_byte;
return;
}
const Tensor& tensor = (*(tm.task->get(is_in)))[fi];
memcpy(
reinterpret_cast<char*>(data_buf) + data_byte,
reinterpret_cast<char*>(tensor.data.data()) + tm.begin * ins_byte,
acc_byte);
data_byte += acc_byte;
}
if (data_byte != tensor_byte) {
LOG(ERROR) << "Invalid tensor byte: " << data_byte
<< " != " << tensor_byte;
return;
}
batch_tensor.data =
DataBuf(reinterpret_cast<char*>(data_buf), tensor_byte);
if (is_in) {
_batch_in.push_back(batch_tensor);
} else {
_batch_out.push_back(batch_tensor);
}
}
LOG(INFO) << "merge input(" << is_in << ") samples: " << batch_size
<< " from " << _tasks.size() << " pvs";
}
void notify_tasks() {
if (_batch_out.size() != _batch_in.size()) {
LOG(ERROR) << "batch size not consistency: " << _batch_out.size()
<< " != " << _batch_in.size();
return;
}
size_t tensor_count = _batch_out.count();
size_t batch_size = _batch_out.size();
for (size_t fi = 0; fi < tensor_count; ++fi) {
const Tensor& tensor = _batch_out[fi];
size_t ins_byte = tensor.ele_byte();
for (size_t si = 1; si < tensor.shape.size(); ++si) {
ins_byte *= tensor.shape[si];
}
for (size_t ti = 0, bi = 0, add = 0; ti < _tasks.size();
++ti, bi += add) {
OutArrayT* dst = _tasks[ti].task->out;
add = _tasks[ti].end - _tasks[ti].begin;
size_t offset_src = ins_byte * bi;
size_t add_byte = add * ins_byte;
if (_batch_align) {  // merge all batches
size_t offset_dst = ins_byte * _tasks[ti].begin;
void* ptr = const_cast<void*>((*dst)[fi].data.data());
memcpy(
reinterpret_cast<char*>(ptr) + offset_dst,
reinterpret_cast<char*>(_batch_out[fi].data.data()) + offset_src,
add_byte);
} else { // overwrite
if (dst->count() <= 0) {
dst->push_back(_batch_out[fi]);
} else {
(*dst)[fi] = _batch_out[fi];
}
(*dst)[fi].shape[0] = add;
(*dst)[fi].data = DataBuf(
reinterpret_cast<char*>(_batch_out[fi].data.data()) + offset_src,
add_byte);
}
}
}
for (size_t ti = 0; ti < _tasks.size(); ++ti) {
TaskT* task = _tasks[ti].task;
size_t begin = _tasks[ti].begin;
size_t end = _tasks[ti].end;
size_t add = end - begin;
size_t index = task->index.fetch_add(add);
if ((index + add) >= task->in->size()) {
char c = 0;
while (write(task->write_fd, &c, 1) != 1 && errno == EINTR) {
}
butil::return_object(task);
}
}
}
const typename TaskT::InArrayT& in() const { return _batch_in; }
typename TaskT::OutArrayT& out() { return _batch_out; }
size_t task_size() { return _tasks.size(); }
private:
std::vector<TaskMetaT> _tasks;
InArrayT _batch_in;
OutArrayT _batch_out;
size_t _batch_size;
size_t _rem_size;
bool _batch_align;
};
} // namespace bsf
} // namespace im
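`merge_tensor()` above sizes the merged buffer as `ins_ele_count * batch_size * ele_byte` and memcpy's each task's slice at a running byte offset. A standalone sketch of that arithmetic, with hypothetical shapes and task sizes:

```cpp
// Sketch of the buffer arithmetic in merge_tensor() above.
// Shapes and task sizes are hypothetical example values.
#include <cstring>
#include <iostream>
#include <vector>

int main() {
  // Per-instance shape (excluding batch): {1, 4} -> 4 elements per instance.
  size_t ins_ele_count = 1 * 4;
  size_t ele_byte = sizeof(float);
  // Two tasks contribute 3 and 2 instances respectively.
  std::vector<size_t> task_instances = {3, 2};
  size_t batch_size = 0;
  for (size_t n : task_instances) batch_size += n;

  size_t ins_byte = ins_ele_count * ele_byte;
  size_t tensor_byte = ins_ele_count * batch_size * ele_byte;
  std::vector<char> merged(tensor_byte);

  size_t data_byte = 0;
  for (size_t n : task_instances) {
    std::vector<float> src(n * ins_ele_count, 1.0f);  // stand-in task data
    size_t acc_byte = ins_byte * n;
    std::memcpy(merged.data() + data_byte, src.data(), acc_byte);
    data_byte += acc_byte;  // same running offset as merge_tensor()
  }
  std::cout << "merged bytes: " << data_byte << " / " << tensor_byte
            << std::endl;
  return 0;
}
```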
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include <boost/bind.hpp> #include <boost/bind.hpp>
#include "core/predictor/common/inner_common.h" #include "core/predictor/common/inner_common.h"
#include "core/predictor/framework/memory.h"
namespace im { namespace im {
namespace bsf { namespace bsf {
...@@ -35,7 +36,7 @@ void* TaskExecutor<TaskT>::thread_entry(void* args) { ...@@ -35,7 +36,7 @@ void* TaskExecutor<TaskT>::thread_entry(void* args) {
static_cast<TaskExecutor<TaskT>*>(context->executor); static_cast<TaskExecutor<TaskT>*>(context->executor);
executor->work(context); executor->work(context);
return NULL; return nullptr;
} }
template <typename TaskT> template <typename TaskT>
...@@ -70,7 +71,7 @@ int TaskExecutor<TaskT>::start(uint32_t thread_num, uint32_t init_timeout_sec) { ...@@ -70,7 +71,7 @@ int TaskExecutor<TaskT>::start(uint32_t thread_num, uint32_t init_timeout_sec) {
_thread_contexts.push_back(&contexts[i]); _thread_contexts.push_back(&contexts[i]);
} }
int init_timeout = init_timeout_sec * 1000 * 1000; size_t init_timeout = init_timeout_sec * 1000 * 1000;
bool has_error = false; bool has_error = false;
bool has_timeout = true; bool has_timeout = true;
...@@ -102,7 +103,7 @@ int TaskExecutor<TaskT>::start(uint32_t thread_num, uint32_t init_timeout_sec) { ...@@ -102,7 +103,7 @@ int TaskExecutor<TaskT>::start(uint32_t thread_num, uint32_t init_timeout_sec) {
} }
// 100ms // 100ms
const int sleep_interval = 100 * 1000; const size_t sleep_interval = 100 * 1000;
usleep(sleep_interval); usleep(sleep_interval);
init_timeout -= sleep_interval; init_timeout -= sleep_interval;
} }
...@@ -125,18 +126,21 @@ void TaskExecutor<TaskT>::stop() { ...@@ -125,18 +126,21 @@ void TaskExecutor<TaskT>::stop() {
} }
template <typename TaskT> template <typename TaskT>
TaskHandler<TaskT> TaskExecutor<TaskT>::schedule(const InArrayT& in, TaskHandler<TaskT> TaskExecutor<TaskT>::schedule(
OutArrayT& out) { // NOLINT const void* inVectorT_ptr,
void* outVectorT_ptr) { // NOLINT
TaskT* task = butil::get_object<TaskT>(); TaskT* task = butil::get_object<TaskT>();
if (!task) { if (!task) {
LOG(ERROR) << "Failed get TaskT from object pool"; LOG(ERROR) << "Failed get TaskT from object pool";
return TaskHandler<TaskT>::valid_handle(); return TaskHandler<TaskT>::valid_handle();
} }
/*
if (!BatchTasks<TaskT>::check_valid(in, out, _batch_align)) { if (!BatchTasks<TaskT>::check_valid(in, out, _batch_align)) {
LOG(ERROR) << "Invalid input & output"; LOG(ERROR) << "Invalid input & output";
return TaskHandler<TaskT>::valid_handle(); return TaskHandler<TaskT>::valid_handle();
} }
*/
int fds[2]; int fds[2];
int rc = pipe(fds); int rc = pipe(fds);
...@@ -150,10 +154,9 @@ TaskHandler<TaskT> TaskExecutor<TaskT>::schedule(const InArrayT& in, ...@@ -150,10 +154,9 @@ TaskHandler<TaskT> TaskExecutor<TaskT>::schedule(const InArrayT& in,
task->write_fd = fds[1]; task->write_fd = fds[1];
task->owner_tid = ::syscall(SYS_gettid); task->owner_tid = ::syscall(SYS_gettid);
task->in = &in; task->inVectorT_ptr = (const InVectorT*)inVectorT_ptr;
task->out = &out; task->outVectorT_ptr = (OutVectorT*)outVectorT_ptr;
task->rem = in.size(); task->rem = task->batch_size();
task->size = in.size();
task->index.store(0, butil::memory_order_relaxed); task->index.store(0, butil::memory_order_relaxed);
AutoMutex lock(_mut); AutoMutex lock(_mut);
...@@ -163,8 +166,13 @@ TaskHandler<TaskT> TaskExecutor<TaskT>::schedule(const InArrayT& in, ...@@ -163,8 +166,13 @@ TaskHandler<TaskT> TaskExecutor<TaskT>::schedule(const InArrayT& in,
return TaskHandler<TaskT>(*task); return TaskHandler<TaskT>(*task);
} }
// This function is accessed by multiple threads, so it takes AutoMutex first.
// That makes batch.append_task() thread safe; no extra lock is needed
// inside append_task().
template <typename TaskT> template <typename TaskT>
bool TaskExecutor<TaskT>::fetch_batch(BatchTasks<TaskT>& batch) { // NOLINT bool TaskExecutor<TaskT>::move_task_to_batch(
BatchTasks<TaskT>& batch) { // NOLINT
AutoMutex lock(_mut); AutoMutex lock(_mut);
while (_task_queue.empty()) { while (_task_queue.empty()) {
THREAD_COND_WAIT(&_cond, &_mut); THREAD_COND_WAIT(&_cond, &_mut);
...@@ -187,8 +195,30 @@ bool TaskExecutor<TaskT>::fetch_batch(BatchTasks<TaskT>& batch) { // NOLINT ...@@ -187,8 +195,30 @@ bool TaskExecutor<TaskT>::fetch_batch(BatchTasks<TaskT>& batch) { // NOLINT
return true; return true;
} }
// This function is accessed by multiple threads;
// move_task_to_batch() takes the lock inside the function.
// One TaskT is packaged as one or several TaskMeta objects.
// TaskT comes from the singleton TaskExecutor's _task_queue.
// Although each TaskMeta is a local variable, several TaskMeta may point to
// the same TaskT taken from the singleton TaskExecutor's _task_queue.
// TaskMeta objects are put into the local variable BatchTasks<TaskT> batch.
// batch.merge_tasks() and batch.notify_tasks() hold no lock.
// BatchTasks<TaskT> batch itself is a local variable, so it is thread safe,
// and each thread deals with its own TaskMeta (created as local variables).
// But different TaskMeta may point to the same TaskT from the singleton
// TaskExecutor's _task_queue, so be careful if merge_tasks() or
// notify_tasks() ever mutates the underlying TaskT.
template <typename TaskT> template <typename TaskT>
int TaskExecutor<TaskT>::work(ThreadContext<TaskT>* context) { int TaskExecutor<TaskT>::work(ThreadContext<TaskT>* context) {
if (MempoolWrapper::instance().thread_initialize() != 0) {
LOG(ERROR) << "Failed thread initialize mempool";
return -1;
}
if (_thread_init_fn != NULL) { if (_thread_init_fn != NULL) {
if (_thread_init_fn(context->user_thread_context) != 0) { if (_thread_init_fn(context->user_thread_context) != 0) {
LOG(ERROR) << "execute thread init thunk failed, BSF thread will exit"; LOG(ERROR) << "execute thread init thunk failed, BSF thread will exit";
...@@ -207,10 +237,15 @@ int TaskExecutor<TaskT>::work(ThreadContext<TaskT>* context) { ...@@ -207,10 +237,15 @@ int TaskExecutor<TaskT>::work(ThreadContext<TaskT>* context) {
} }
} }
if (MempoolWrapper::instance().thread_clear() != 0) {
LOG(ERROR) << "Failed thread clear mempool";
return -1;
}
BatchTasks<TaskT> batch(_batch_size, _batch_align); BatchTasks<TaskT> batch(_batch_size, _batch_align);
if (fetch_batch(batch)) { if (move_task_to_batch(batch)) {
batch.merge_tasks(); batch.merge_tasks();
_fn(batch.in(), batch.out()); _fn(&batch.in(), &batch.out());
batch.notify_tasks(); batch.notify_tasks();
} }
} }
...@@ -219,9 +254,10 @@ int TaskExecutor<TaskT>::work(ThreadContext<TaskT>* context) { ...@@ -219,9 +254,10 @@ int TaskExecutor<TaskT>::work(ThreadContext<TaskT>* context) {
} }
template <typename InItemT, typename OutItemT> template <typename InItemT, typename OutItemT>
bool TaskManager<InItemT, OutItemT>::schedule(const InArrayT& in, bool TaskManager<InItemT, OutItemT>::schedule(const void* in,
OutArrayT& out) { // NOLINT void* out) { // NOLINT
TaskHandler<TaskT> handler = _executor.schedule(in, out); TaskHandler<TaskT> handler =
TaskExecutorVector<TaskT>::instance()[_model_index].schedule(in, out);
if (handler.valid()) { if (handler.valid()) {
_task_owned = handler; _task_owned = handler;
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
#include <errno.h> #include <errno.h>
#include <algorithm> #include <algorithm>
#include <deque> #include <list>
#include <vector> #include <vector>
#ifdef BCLOUD #ifdef BCLOUD
...@@ -29,46 +29,186 @@ ...@@ -29,46 +29,186 @@
#include "boost/function.hpp" #include "boost/function.hpp"
#include "core/predictor/framework/memory.h"
#include "paddle_inference_api.h"
namespace im { namespace im {
namespace bsf { namespace bsf {
static const size_t DEFAULT_BATCH_SIZE = 100; static const size_t DEFAULT_BATCH_SIZE = 100;
// InItemT is paddle::PaddleTensor.
// InVectorT is std::vector<paddle::PaddleTensor>.
// InVectorT holds the different feed vars, not the batch;
// the batch is already inside each paddle::PaddleTensor.
// size_t `rem` records how many batch items have not been put into BatchTasks.
// `rem` does not need to be atomic because the `put` operation is synchronous;
// more precisely, a lock is already taken outside of `put`.
// size_t `index` records how many batch items have finished processing.
// `index` must be atomic because the `notify` operation is asynchronous.
template <typename InItemT, typename OutItemT> template <typename InItemT, typename OutItemT>
struct Task { struct Task {
typedef std::vector<InItemT> InArrayT; typedef std::vector<InItemT> InVectorT;
typedef std::vector<OutItemT> OutArrayT; typedef std::vector<OutItemT> OutVectorT;
typedef InItemT InType; typedef InItemT InType;
typedef OutItemT OutType; typedef OutItemT OutType;
typedef Task<InItemT, OutItemT> TaskT; typedef Task<InItemT, OutItemT> TaskT;
typedef std::vector<int> ShapeVector;
typedef std::vector<ShapeVector> VectorOfShapeVector;
int read_fd; int read_fd;
int write_fd; int write_fd;
pid_t owner_tid; pid_t owner_tid;
const InVectorT* inVectorT_ptr;
const InArrayT* in; OutVectorT* outVectorT_ptr;
OutArrayT* out;
size_t rem; size_t rem;
size_t size;
size_t batch_size() { return in->size(); }
butil::atomic<size_t> index; butil::atomic<size_t> index;
Task() { Task() {
read_fd = -1; read_fd = -1;
write_fd = -1; write_fd = -1;
owner_tid = -1; owner_tid = -1;
in = NULL; inVectorT_ptr = NULL;
out = NULL; outVectorT_ptr = NULL;
rem = -1; rem = -1;
size = -1;
index.store(0, butil::memory_order_relaxed); index.store(0, butil::memory_order_relaxed);
} }
bool check_feedvar_valid(int feedvar_index) {
if (feedvar_index < 0 || inVectorT_ptr->size() <= feedvar_index) {
LOG(ERROR) << "feedvar doesnt exsit or feedvar_index error";
return 0;
}
if ((*inVectorT_ptr)[feedvar_index].shape.size() <= 0) {
LOG(ERROR) << "feedvar[" << feedvar_index << "].shape.size()<=0,error";
return 0;
}
return 1;
}
// For now, this simply assumes that the first dimension of the data is the
// batch, i.e. the batch is PaddleTensor.shape[0].
// If batch information is later added to feedvar.prototxt, it can be read
// from there instead of being assumed.
// (A worked example of these size helpers follows this struct.)
size_t feedvar_batch_size(int feedvar_index) {
if (!check_feedvar_valid(feedvar_index)) {
return 0;
}
return (*inVectorT_ptr)[feedvar_index].shape[0];
}
size_t feedvar_element_bytesize(int feedvar_index) {
if (!check_feedvar_valid(feedvar_index)) {
return 0;
}
int dtype = (*inVectorT_ptr)[feedvar_index].dtype;
if (dtype == paddle::PaddleDType::INT64) {
return sizeof(int64_t);
}
if (dtype == paddle::PaddleDType::FLOAT32) {
return sizeof(float);
}
if (dtype == paddle::PaddleDType::INT32) {
return sizeof(int32_t);
}
if (dtype == paddle::PaddleDType::UINT8) {
return sizeof(char);
}
return 0;
}
// For now, the implementation of this function is based on the assumption
// that shape[0] == batch_size.
size_t feedvar_element_num(int feedvar_index) {
if (!check_feedvar_valid(feedvar_index)) {
return 0;
}
size_t element_num = 1;
if ((*inVectorT_ptr)[feedvar_index].shape.size() == 1) {
// shape[0] is the batch_size; [10, 1] is equivalent to [10],
// so if shape[1] does not exist, the per-instance element count is 1.
return 1;
}
// start from shape[1], because shape[0] is the batch_size.
for (int i = 1; i < (*inVectorT_ptr)[feedvar_index].shape.size(); ++i) {
element_num *= (*inVectorT_ptr)[feedvar_index].shape[i];
}
return element_num;
}
size_t feedvar_bytesize(int feedvar_index) {
return feedvar_element_num(feedvar_index) *
feedvar_element_bytesize(feedvar_index);
}
ShapeVector feedvar_shape_nobatch(int feedvar_index) {
if (!check_feedvar_valid(feedvar_index)) {
return ShapeVector();
}
return ShapeVector{(*inVectorT_ptr)[feedvar_index].shape.begin() + 1,
(*inVectorT_ptr)[feedvar_index].shape.end()};
}
VectorOfShapeVector feedvar_shape_nobatch() {
VectorOfShapeVector vector_of_feedvar_shape_nobatch(inVectorT_ptr->size());
for (int index = 0; index < inVectorT_ptr->size(); ++index) {
vector_of_feedvar_shape_nobatch.push_back(feedvar_shape_nobatch(index));
}
return vector_of_feedvar_shape_nobatch;
}
// At present, the batch of all feed vars is assumed to be consistent,
// so PaddleTensor.shape[0] should be the same for every feed var.
bool check_batch_align() {
int batch_size_align = feedvar_batch_size(0);
for (int feedvar_index = 0; feedvar_index < inVectorT_ptr->size();
++feedvar_index) {
if (feedvar_batch_size(feedvar_index) != batch_size_align) {
return 0;
}
}
/*
for(int fetchvar_index = 0; fetchvar_index < outVectorT_ptr->size();
++fetchvar_index) {
if(fetchvar_batch_size(fetchvar_index) != batch_size_align) {
return 0;
}
}
*/
return 1;
}
size_t batch_size() {
if (check_batch_align()) {
return feedvar_batch_size(0);
}
return 0;
}
}; };
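A worked example of the feedvar size helpers defined in the `Task` struct above, under the stated assumption that `shape[0]` is the batch dimension; the tensor values are illustrative:

```cpp
// Worked example of feedvar_batch_size / feedvar_element_num /
// feedvar_bytesize; the tensor below is an illustrative stand-in.
#include <iostream>
#include "paddle_inference_api.h"

int main() {
  paddle::PaddleTensor t;
  t.dtype = paddle::PaddleDType::FLOAT32;
  t.shape = {8, 3, 224, 224};  // batch = 8, per-instance shape = {3, 224, 224}

  size_t batch = t.shape[0];
  size_t element_num = 1;
  for (size_t i = 1; i < t.shape.size(); ++i) element_num *= t.shape[i];
  size_t bytesize = element_num * sizeof(float);  // per-instance bytes

  std::cout << batch << " instances, " << element_num << " elements, "
            << bytesize << " bytes per instance" << std::endl;
  return 0;
}
```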
// A whole Task, or part of the batch in a Task, becomes a TaskMeta.
// A Task is the original Request from the user.
// For example, say each Task has a batch of 30 and there are 4 Requests,
// while BatchTasks has a batch of 100, i.e. 100 batch items per run.
// TaskMeta-1:{task-1,0,30} TaskMeta-2:{task-2,0,30} TaskMeta-3:{task-3,0,30},
// but the last Task is divided into 2 TaskMeta:
// TaskMeta-4:{task-4,0,10} and TaskMeta-5:{task-4,10,30}.
// TaskMeta-1 ~ TaskMeta-4 end up inside BatchTasks-1,
// and TaskMeta-5 ends up inside BatchTasks-2.
// TaskMeta is necessary because we need to know the correspondence between
// `batch_out` (which lives in BatchTasks) and `outVectorT_ptr` (which lives
// in the Task), especially when one Task is divided into several TaskMeta
// placed into several different BatchTasks.
// (A numeric sketch of this split follows the TaskMeta struct below.)
template <typename TaskT> template <typename TaskT>
struct TaskMeta { struct TaskMeta {
TaskMeta(TaskT* ptr, size_t start, size_t add) TaskMeta(TaskT* ptr, size_t start, size_t add)
...@@ -79,6 +219,11 @@ struct TaskMeta { ...@@ -79,6 +219,11 @@ struct TaskMeta {
size_t end; size_t end;
}; };
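The Task-to-TaskMeta split described in the comment above `TaskMeta` can be checked with a small numeric simulation. The sketch below is not repository code; it only mimics the `append_task()` bookkeeping for four tasks of batch 30 and a BatchTasks capacity of 100:

```cpp
// Numeric sketch of the Task -> TaskMeta split described above.
#include <algorithm>
#include <iostream>
#include <vector>

struct MetaSketch { int task_id, begin, end; };

int main() {
  const int capacity = 100;
  std::vector<int> task_batch = {30, 30, 30, 30};  // task-1 .. task-4
  std::vector<std::vector<MetaSketch>> batches(1);
  int rem_size = capacity;
  for (int id = 0; id < (int)task_batch.size(); ++id) {
    int rem = task_batch[id];
    while (rem > 0) {
      if (rem_size == 0) {  // current BatchTasks is full, open a new one
        batches.emplace_back();
        rem_size = capacity;
      }
      int add = std::min(rem, rem_size);      // same rule as append_task()
      int begin = task_batch[id] - rem;
      batches.back().push_back({id + 1, begin, begin + add});
      rem -= add;
      rem_size -= add;
    }
  }
  // batches[0]: {1,0,30} {2,0,30} {3,0,30} {4,0,10}; batches[1]: {4,10,30}
  for (size_t b = 0; b < batches.size(); ++b)
    for (const MetaSketch& m : batches[b])
      std::cout << "BatchTasks-" << b + 1 << " TaskMeta{task-" << m.task_id
                << "," << m.begin << "," << m.end << "}\n";
  return 0;
}
```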
// Each TaskT already includes its batch.
// BatchTasks needs to combine several `small TaskMeta` into a new `big TaskT`.
// The only difference between the `big TaskT` and a `small TaskT` is
// (*TaskT.inVectorT_ptr)[feedvar_index].shape[0],
// which is in fact the batch_size.
template <typename TaskT> template <typename TaskT>
class BatchTasks { class BatchTasks {
public: public:
...@@ -91,33 +236,38 @@ class BatchTasks { ...@@ -91,33 +236,38 @@ class BatchTasks {
_rem_size(batch_size), _rem_size(batch_size),
_batch_align(batch_align) { _batch_align(batch_align) {
_batch_in.clear(); _batch_in.clear();
_batch_in_offset.clear();
_batch_out.clear(); _batch_out.clear();
_tasks.clear(); _batch_out_offset.clear();
_taskmeta_vector.clear();
} }
~BatchTasks() { ~BatchTasks() {
_batch_in.clear(); _batch_in.clear();
_batch_in_offset.clear();
_batch_out.clear(); _batch_out.clear();
_tasks.clear(); _batch_out_offset.clear();
_taskmeta_vector.clear();
} }
// synchronized operation // synchronized operation
// because upper-level callers of this function have already taken the lock.
size_t append_task(TaskT* task) { size_t append_task(TaskT* task) {
size_t add = std::min(task->rem, _rem_size); size_t add = std::min(task->rem, _rem_size);
if (!_batch_align) { if (!_batch_align) {
add = task->rem; add = task->rem;
} }
int start_index = task->batch_size() - task->rem;
TaskMetaT tm(task, task->in->size() - task->rem, add); TaskMetaT tm(task, start_index, add);
_tasks.push_back(tm); _taskmeta_vector.push_back(tm);
task->rem -= add; task->rem -= add;
_rem_size -= add; _rem_size -= add;
return _rem_size; return _rem_size;
} }
static bool check_valid(const typename TaskT::InArrayT& in, static bool check_valid(const typename TaskT::InVectorT& in,
const typename TaskT::OutArrayT& out, const typename TaskT::OutVectorT& out,
bool align) { bool align) {
(void)in; (void)in;
(void)out; (void)out;
...@@ -125,40 +275,220 @@ class BatchTasks { ...@@ -125,40 +275,220 @@ class BatchTasks {
return true; return true;
} }
// This should be reworked completely: maybe the copy does not need to happen
// inside BatchTasks at all. The copy work could be done outside BatchTasks,
// since a later version may avoid the extra copy and feed every Task into
// the Predictor directly.
// lod is not supported: if lod is set, the bsf task should not be used.
// batch.merge_tasks() is a thread-safe function because batch is a local
// variable and the Task is only read, not written.
void merge_tasks() { void merge_tasks() {
for (size_t ti = 0; ti < _tasks.size(); ++ti) { if (_taskmeta_vector.size() <= 0) {
TaskMetaT& tm = _tasks[ti]; return;
for (size_t vi = tm.begin; vi < tm.end; ++vi) { }
_batch_in.push_back((*tm.task->in)[vi]);
_batch_out.push_back((*tm.task->out)[vi]); // Temporarily, the batch of each feedvar is consistent
// If not consistent, use feedvar_batch_size instead of task->batch_size().
int temp_batch = 0;
for (size_t ti = 0; ti < _taskmeta_vector.size(); ++ti) {
TaskMetaT& tm = _taskmeta_vector[ti];
temp_batch += tm.task->batch_size();
}
if (temp_batch > _batch_size) {
LOG(ERROR) << "_realNumber_batch_in >_batch_size, error.";
return;
}
int feedvar_num = _taskmeta_vector[0].task->inVectorT_ptr->size();
if (_batch_in_offset.size() == 0) {
_batch_in_offset.resize(feedvar_num, 0);
_realNumber_batch_in.resize(feedvar_num, temp_batch);
}
for (size_t ti = 0; ti < _taskmeta_vector.size(); ++ti) {
TaskMetaT& tm = _taskmeta_vector[ti];
for (int index = 0; index < feedvar_num; ++index) {
const paddle::PaddleTensor& feedVarTensor =
(*tm.task->inVectorT_ptr)[index];
size_t feedvar_bytesize = tm.task->feedvar_bytesize(index);
if (ti == 0) {
if (feedVarTensor.lod.size() > 0 && feedVarTensor.lod[0].size() > 0) {
LOG(ERROR) << "lod Tensor is not supported now.";
return;
}
// For now, we assume that feedvar_bytesize is the same for every task,
// which means auto embedding is not supported;
// it can still differ between different feed vars.
paddle::PaddleTensor paddleTensor;
paddleTensor.dtype = feedVarTensor.dtype;
paddleTensor.name = feedVarTensor.name;
paddleTensor.lod = feedVarTensor.lod;
paddleTensor.shape = feedVarTensor.shape;
paddleTensor.shape[0] = _realNumber_batch_in[index];
paddleTensor.data.Resize(feedvar_bytesize *
_realNumber_batch_in[index]);
_batch_in.push_back(paddleTensor);
}
void* dst_ptr = _batch_in[index].data.data() + _batch_in_offset[index];
void* source_ptr =
feedVarTensor.data.data() + feedvar_bytesize * tm.begin;
size_t length = feedvar_bytesize * (tm.end - tm.begin);
memcpy(dst_ptr, source_ptr, length);
_batch_in_offset[index] += length;
} }
} }
} }
bool check_fetchvar_valid(int fetchvar_index) {
if (fetchvar_index < 0 || _batch_out.size() <= fetchvar_index) {
LOG(ERROR) << "fetchvar doesnt exsit or fetchvar_index error";
return 0;
}
if (_batch_out[fetchvar_index].shape.size() <= 0) {
LOG(ERROR) << "fetchvar[" << fetchvar_index << "].shape.size()<=0,error";
return 0;
}
return 1;
}
size_t fetchvar_batch_size(int fetchvar_index) {
if (!check_fetchvar_valid(fetchvar_index)) {
return 0;
}
return _batch_out[fetchvar_index].shape[0];
}
size_t fetchvar_element_bytesize(int fetchvar_index) {
if (!check_fetchvar_valid(fetchvar_index)) {
return 0;
}
int dtype = _batch_out[fetchvar_index].dtype;
if (dtype == paddle::PaddleDType::INT64) {
return sizeof(int64_t);
}
if (dtype == paddle::PaddleDType::FLOAT32) {
return sizeof(float);
}
if (dtype == paddle::PaddleDType::INT32) {
return sizeof(int32_t);
}
if (dtype == paddle::PaddleDType::UINT8) {
return sizeof(char);
}
return 0;
}
// For now, the implementation of this function is based on the assumption
// that shape[0] == batch_size.
size_t fetchvar_element_num(int fetchvar_index) {
if (!check_fetchvar_valid(fetchvar_index)) {
return 0;
}
size_t element_num = 1;
if (_batch_out[fetchvar_index].shape.size() == 1) {
// shape[0] is the batch_size, so the per-instance element count is 1.
return 1;
}
// start from shape[1], because shape[0] is the batch_size.
for (int i = 1; i < _batch_out[fetchvar_index].shape.size(); ++i) {
element_num *= _batch_out[fetchvar_index].shape[i];
}
return element_num;
}
size_t fetchvar_bytesize(int fetchvar_index) {
return fetchvar_element_num(fetchvar_index) *
fetchvar_element_bytesize(fetchvar_index);
}
bool check_fetchvar_batch_align() {
int batch_size_align = fetchvar_batch_size(0);
for (int fetchvar_index = 0; fetchvar_index < _batch_out.size();
++fetchvar_index) {
if (fetchvar_batch_size(fetchvar_index) != batch_size_align) {
return 0;
}
}
return 1;
}
size_t fetchvar_batch_size() {
if (check_fetchvar_batch_align()) {
return fetchvar_batch_size(0);
}
return 0;
}
void notify_tasks() { void notify_tasks() {
if (_batch_out.size() != _batch_in.size()) { if (_taskmeta_vector.size() <= 0) {
LOG(ERROR) << "batch size not consistency: " << _batch_out.size() LOG(ERROR) << "_taskmeta_vector.size() <=0, error.";
<< " != " << _batch_in.size(); return;
}
if (_realNumber_batch_in[0] != fetchvar_batch_size()) {
LOG(ERROR) << "_batch_out`s batch != _batch_in`s batch, error.";
return; return;
} }
for (size_t ti = 0, bi = 0; ti < _tasks.size(); ++ti) { int fetchvar_num = _batch_out.size();
TaskT* task = _tasks[ti].task; if (_batch_out_offset.size() == 0) {
size_t begin = _tasks[ti].begin; _batch_out_offset.resize(fetchvar_num, 0);
size_t end = _tasks[ti].end; }
for (size_t ti = 0; ti < _taskmeta_vector.size(); ++ti) {
TaskT* task = _taskmeta_vector[ti].task;
size_t begin = _taskmeta_vector[ti].begin;
size_t end = _taskmeta_vector[ti].end;
size_t add = end - begin; size_t add = end - begin;
for (size_t oi = begin; oi < end; ++oi, ++bi) { for (int index = 0; index < fetchvar_num; ++index) {
if (bi >= _batch_in.size()) { // the task->outVectorT_ptr is null before core->run().
LOG(ERROR) << "batch index overflow: " << bi << " > " // first time we should copy from _batch_out
<< _batch_in.size(); // so we need init.
size_t fetchvar_bytesize_index = fetchvar_bytesize(index);
if (task->outVectorT_ptr->size() <= index) {
paddle::PaddleTensor tensor_out;
tensor_out.name = _batch_out[index].name;
tensor_out.dtype = paddle::PaddleDType(_batch_out[index].dtype);
tensor_out.shape = _batch_out[index].shape;
tensor_out.shape[0] = task->batch_size();
tensor_out.lod = _batch_out[index].lod;
// resize all batch memory at one time
size_t databuf_size = task->batch_size() * fetchvar_bytesize_index;
tensor_out.data.Resize(databuf_size);
task->outVectorT_ptr->push_back(tensor_out);
}
paddle::PaddleTensor& fetchVarTensor = (*task->outVectorT_ptr)[index];
void* dst_ptr =
fetchVarTensor.data.data() + fetchvar_bytesize_index * begin;
size_t length = fetchvar_bytesize_index * add;
if (_batch_out_offset[index] + length >
fetchvar_batch_size() * fetchvar_bytesize(index)) {
LOG(ERROR) << "_batch_out is less than taskmeta, error.";
return; return;
} }
(*task->out)[oi] = _batch_out[bi]; void* source_ptr =
_batch_out[index].data.data() + _batch_out_offset[index];
memcpy(dst_ptr, source_ptr, length);
_batch_out_offset[index] += length;
} }
size_t index = task->index.fetch_add(add); size_t index = task->index.fetch_add(add);
if ((index + add) >= task->in->size()) { if ((index + add) >= task->batch_size()) {
char c = 0; char c = 0;
while (write(task->write_fd, &c, 1) != 1 && errno == EINTR) { while (write(task->write_fd, &c, 1) != 1 && errno == EINTR) {
} }
...@@ -167,22 +497,33 @@ class BatchTasks { ...@@ -167,22 +497,33 @@ class BatchTasks {
} }
} }
const typename TaskT::InArrayT& in() const { return _batch_in; } const typename TaskT::InVectorT& in() const { return _batch_in; }
typename TaskT::OutArrayT& out() { return _batch_out; } typename TaskT::OutVectorT& out() { return _batch_out; }
size_t task_size() { return _tasks.size(); } size_t task_size() { return _taskmeta_vector.size(); }
private: private:
std::vector<TaskMetaT> _tasks; std::vector<TaskMetaT> _taskmeta_vector;
typename TaskT::InArrayT _batch_in; typename TaskT::InVectorT _batch_in;
typename TaskT::OutArrayT _batch_out; std::vector<size_t> _batch_in_offset;
std::vector<size_t> _realNumber_batch_in;
typename TaskT::OutVectorT _batch_out;
std::vector<size_t> _batch_out_offset;
std::vector<size_t> _realNumber_batch_out;
size_t _rem_size; size_t _rem_size;
size_t _batch_size; size_t _batch_size;
bool _batch_align; bool _batch_align;
}; };
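`notify_tasks()` above scatters the merged `_batch_out` buffer back into each task's own output tensor at `fetchvar_bytesize * begin`. A minimal standalone sketch of that copy for one task of batch 5 split into two TaskMeta, with made-up sizes:

```cpp
// Sketch of the scatter in notify_tasks(): each TaskMeta copies its slice of
// the merged output back into the owning task's output tensor.
#include <cstring>
#include <iostream>
#include <vector>

int main() {
  const size_t fetchvar_bytesize = 4 * sizeof(float);  // per-instance bytes
  std::vector<char> task_out(5 * fetchvar_bytesize);   // whole-task output

  struct Meta { size_t begin, end; };
  Meta metas[2] = {{0, 3}, {3, 5}};  // one task of batch 5, split in two
  for (const Meta& m : metas) {
    // Stand-in for the slice of _batch_out owned by this TaskMeta.
    std::vector<float> batch_out_slice((m.end - m.begin) * 4, 1.0f);
    size_t length = fetchvar_bytesize * (m.end - m.begin);
    // dst offset is fetchvar_bytesize * begin, as in notify_tasks().
    std::memcpy(task_out.data() + fetchvar_bytesize * m.begin,
                batch_out_slice.data(),
                length);
  }
  std::cout << "task output bytes filled: " << task_out.size() << std::endl;
  return 0;
}
```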
// BSF task handle // BSF task handle
// TaskHandler is the handle of a Task.
// `read_fd` is used to receive the completion signal in the brpc thread.
// `write_fd` is used to write the signal in the bsf thread:
// when a TaskMeta is done, the bsf thread writes to `write_fd`,
// while the brpc thread keeps reading `read_fd` in a while loop and
// receives the signal once the TaskMeta is done.
// So `read_fd` and `write_fd` are used to communicate between threads.
// (A minimal sketch of this pipe hand-off follows this struct.)
template <typename TaskT> template <typename TaskT>
struct TaskHandler { struct TaskHandler {
int read_fd; int read_fd;
...@@ -205,12 +546,11 @@ struct TaskHandler { ...@@ -205,12 +546,11 @@ struct TaskHandler {
} }
}; };
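A minimal sketch of the pipe hand-off described in the `TaskHandler` comment above. The writer side mirrors `notify_tasks()`; the blocking reader side is an assumption based on the comment that the brpc thread keeps reading `read_fd`:

```cpp
#include <unistd.h>
#include <cerrno>

// Writer side: mirrors the loop in notify_tasks(); one byte signals completion.
void notify_done(int write_fd) {
  char c = 0;
  while (write(write_fd, &c, 1) != 1 && errno == EINTR) {
    // retry if interrupted by a signal
  }
}

// Reader side (assumed from the comment above): the brpc thread blocks here
// until the bsf worker has written the completion byte.
void wait_done(int read_fd) {
  char c = 0;
  while (read(read_fd, &c, 1) != 1 && errno == EINTR) {
    // retry if interrupted by a signal
  }
}
```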
// TaskExecutor is a Thread pool.
template <typename TaskT> template <typename TaskT>
class TaskExecutor; class TaskExecutor;
template <typename InItemT, typename OutItemT> // ThreadContext is used for start a bsf Thread.
class TaskManager;
template <typename TaskT> template <typename TaskT>
struct ThreadContext { struct ThreadContext {
TaskExecutor<TaskT>* executor; TaskExecutor<TaskT>* executor;
...@@ -231,14 +571,24 @@ struct ThreadContext { ...@@ -231,14 +571,24 @@ struct ThreadContext {
} }
}; };
// TaskExecutor is a Thread pool.
// Each model corresponds to one TaskExecutor.
// TaskT is actually a Request preprocessed by the ReaderOp.
// A TaskT is divided into TaskMeta objects, which are put into
// _task_queue in the brpc thread by schedule().
// A TaskHandler is returned to the brpc thread.
// The start() function creates `thread_num` bsf threads.
// Every bsf thread checks _task_queue and takes TaskMeta from it.
// When all TaskMeta of a Task are done, the TaskHandler is notified.
template <typename TaskT> template <typename TaskT>
class TaskExecutor { class TaskExecutor {
public: public:
typedef typename TaskT::InType InType; typedef typename TaskT::InType InType;
typedef typename TaskT::OutType OutType; typedef typename TaskT::OutType OutType;
typedef typename TaskT::InArrayT InArrayT; typedef typename TaskT::InVectorT InVectorT;
typedef typename TaskT::OutArrayT OutArrayT; typedef typename TaskT::OutVectorT OutVectorT;
typedef std::vector<TaskT> TaskArrayT; typedef std::vector<TaskT> TaskArrayT;
typedef baidu::paddle_serving::predictor::MempoolWrapper MempoolWrapper;
TaskExecutor() TaskExecutor()
: _stop(false), : _stop(false),
...@@ -258,9 +608,11 @@ class TaskExecutor { ...@@ -258,9 +608,11 @@ class TaskExecutor {
THREAD_COND_DESTROY(&_cond); THREAD_COND_DESTROY(&_cond);
} }
static TaskExecutor<TaskT>* instance() { // cause vector.resize will use copy or move construct.
static TaskExecutor<TaskT> singleton; TaskExecutor(TaskExecutor<TaskT>&& other) noexcept {
return &singleton; if (this != &other) {
TaskExecutor();
}
} }
void set_batch_size(size_t batch_size) { _batch_size = batch_size; } void set_batch_size(size_t batch_size) { _batch_size = batch_size; }
...@@ -277,8 +629,7 @@ class TaskExecutor { ...@@ -277,8 +629,7 @@ class TaskExecutor {
_thread_reset_fn = reset_fn; _thread_reset_fn = reset_fn;
} }
void set_thread_callback_fn( void set_thread_callback_fn(boost::function<void(const void*, void*)> cb) {
boost::function<void(const InArrayT&, OutArrayT&)> cb) {
_fn = cb; _fn = cb;
} }
...@@ -287,15 +638,21 @@ class TaskExecutor { ...@@ -287,15 +638,21 @@ class TaskExecutor {
static void* thread_entry(void* args); static void* thread_entry(void* args);
private:
TaskExecutor(TaskExecutor<TaskT> const& other);
TaskExecutor* operator=(TaskExecutor<TaskT> const& other);
int work(ThreadContext<TaskT>* context); int work(ThreadContext<TaskT>* context);
TaskHandler<TaskT> schedule(const InArrayT&, OutArrayT&); TaskHandler<TaskT> schedule(const void*, void*);
bool fetch_batch(BatchTasks<TaskT>& batch); // NOLINT bool move_task_to_batch(BatchTasks<TaskT>& batch); // NOLINT
private:
TaskExecutor(TaskExecutor<TaskT> const& other) = delete;
TaskExecutor& operator=(TaskExecutor<TaskT> const& other) = delete;
/*
TaskExecutor(TaskExecutor<TaskT> && other) = delete;
TaskExecutor& operator=(TaskExecutor<TaskT> && other) = delete;
*/
bool _stop; bool _stop;
...@@ -303,43 +660,76 @@ class TaskExecutor { ...@@ -303,43 +660,76 @@ class TaskExecutor {
THREAD_MUTEX_T _mut; THREAD_MUTEX_T _mut;
THREAD_COND_T _cond; THREAD_COND_T _cond;
std::deque<TaskT*> _task_queue; std::list<TaskT*> _task_queue;
boost::function<int(void*)> _thread_init_fn; boost::function<int(void*)> _thread_init_fn;
boost::function<int(void*)> _thread_reset_fn; boost::function<int(void*)> _thread_reset_fn;
void** _user_thread_contexts; void** _user_thread_contexts;
std::vector<ThreadContext<TaskT>*> _thread_contexts; std::vector<ThreadContext<TaskT>*> _thread_contexts;
friend class TaskManager<InType, OutType>;
size_t _batch_size; size_t _batch_size;
bool _batch_align; bool _batch_align;
boost::function<void(const InArrayT&, OutArrayT&)> _fn; boost::function<void(const void*, void*)> _fn;
}; };
// TaskExecutorVector is a Singleton class.
// Each Model corresponds to one TaskExecutor,
// so several TaskExecutors are needed when there is more than one Model.
template <typename TaskT>
class TaskExecutorVector {
public:
static TaskExecutorVector<TaskT>& instance() {
static TaskExecutorVector<TaskT> singleton;
return singleton;
}
void resize(int size) { _vector_executor.resize(size); }
TaskExecutor<TaskT>& operator[](int index) {
if (_vector_executor.size() <= index || index <= -1) {
LOG(ERROR) << "_vector_executor.size() <= index or <= -1";
throw "_vector_executor.size() <= index or <= -1";
}
return _vector_executor[index];
}
private:
TaskExecutorVector() = default;
TaskExecutorVector(const TaskExecutorVector<TaskT>& other) = delete;
TaskExecutorVector& operator=(const TaskExecutorVector<TaskT>& other) =
delete;
TaskExecutorVector(TaskExecutorVector<TaskT>&& other) = delete;
TaskExecutorVector& operator=(TaskExecutorVector<TaskT>&& other) = delete;
std::vector<TaskExecutor<TaskT>> _vector_executor;
};
// TaskManager is actually a wrapper of a Request in bsf.
// TaskManager's schedule() converts the Request into a TaskT
// and divides the TaskT into several TaskMetas, which are put into the
// TaskExecutor's task_queue.
// wait() is a while loop that receives the signal when the whole Task is done.
template <typename InItemT, typename OutItemT> template <typename InItemT, typename OutItemT>
class TaskManager { class TaskManager {
public: public:
typedef Task<InItemT, OutItemT> TaskT; typedef Task<InItemT, OutItemT> TaskT;
typedef typename TaskT::InArrayT InArrayT; typedef typename TaskT::InVectorT InVectorT;
typedef typename TaskT::OutArrayT OutArrayT; typedef typename TaskT::OutVectorT OutVectorT;
explicit TaskManager(TaskExecutor<TaskT>& exe, size_t batch_size) // NOLINT
: _executor(exe) {}
TaskManager() : _executor(*TaskExecutor<TaskT>::instance()) {} explicit TaskManager(uint32_t index) // NOLINT
: _model_index(index) {}
~TaskManager() { wait(); } ~TaskManager() { wait(); }
bool schedule(const InArrayT& in, OutArrayT& out); // NOLINT bool schedule(const void* in, void* out); // NOLINT
void wait(); void wait();
inline void clear() { wait(); } inline void clear() { wait(); }
private: private:
TaskExecutor<TaskT>& _executor;
TaskHandler<TaskT> _task_owned; TaskHandler<TaskT> _task_owned;
uint32_t _model_index;
}; // class TaskManager }; // class TaskManager
class AutoMutex { class AutoMutex {
...@@ -357,5 +747,5 @@ class AutoMutex { ...@@ -357,5 +747,5 @@ class AutoMutex {
} // namespace bsf } // namespace bsf
} // namespace im } // namespace im
#include "core/predictor/framework/bsf-inl-tensor.h" // #include "core/predictor/framework/bsf-inl-tensor.h"
#include "core/predictor/framework/bsf-inl.h" #include "core/predictor/framework/bsf-inl.h"
...@@ -56,15 +56,23 @@ int ReloadableInferEngine::proc_initialize(const configure::EngineDesc& conf, ...@@ -56,15 +56,23 @@ int ReloadableInferEngine::proc_initialize(const configure::EngineDesc& conf,
} }
// init bsf framework // init bsf framework
im::bsf::TaskExecutor<TaskT>::instance()->set_thread_init_fn( im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index]
boost::bind(&InferEngine::thrd_initialize_impl, this)); .set_thread_init_fn(
im::bsf::TaskExecutor<TaskT>::instance()->set_thread_reset_fn( boost::bind(&InferEngine::thrd_initialize_impl, this));
boost::bind(&InferEngine::thrd_clear_impl, this)); im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index]
im::bsf::TaskExecutor<TaskT>::instance()->set_thread_callback_fn( .set_thread_init_fn(
boost::bind(&InferEngine::task_infer_impl, this, _1, _2)); boost::bind(&InferEngine::thrd_initialize_impl, this));
im::bsf::TaskExecutor<TaskT>::instance()->set_batch_size(_infer_batch_size); im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index]
im::bsf::TaskExecutor<TaskT>::instance()->set_batch_align(_infer_batch_align); .set_thread_reset_fn(boost::bind(&InferEngine::thrd_clear_impl, this));
if (im::bsf::TaskExecutor<TaskT>::instance()->start(_infer_thread_num) != 0) { im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index]
.set_thread_callback_fn(
boost::bind(&InferEngine::task_infer_impl, this, _1, _2));
im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index].set_batch_size(
_infer_batch_size);
im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index].set_batch_align(
_infer_batch_align);
if (im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index].start(
_infer_thread_num) != 0) {
LOG(ERROR) << "Failed start bsf executor, threads:" << _infer_thread_num; LOG(ERROR) << "Failed start bsf executor, threads:" << _infer_thread_num;
return -1; return -1;
} }
...@@ -75,6 +83,11 @@ int ReloadableInferEngine::proc_initialize(const configure::EngineDesc& conf, ...@@ -75,6 +83,11 @@ int ReloadableInferEngine::proc_initialize(const configure::EngineDesc& conf,
return 0; return 0;
} }
// Multiple threads will enter this method of the same object.
// One Model corresponds to one ReloadableInferEngine object.
// A ReloadableInferEngine object is a process-level object.
// One ReloadableInferEngine object can own several ModelData<EngineCore>;
// ModelData<EngineCore> is a thread-level object.
int ReloadableInferEngine::infer(const void* in, int ReloadableInferEngine::infer(const void* in,
void* out, void* out,
uint32_t batch_size) { uint32_t batch_size) {
...@@ -82,9 +95,10 @@ int ReloadableInferEngine::infer(const void* in, ...@@ -82,9 +95,10 @@ int ReloadableInferEngine::infer(const void* in,
return infer_impl(in, out, batch_size); return infer_impl(in, out, batch_size);
} }
im::bsf::TaskManager<Tensor, Tensor> task_manager; im::bsf::TaskManager<paddle::PaddleTensor, paddle::PaddleTensor> task_manager(
task_manager.schedule(*(reinterpret_cast<const BatchTensor*>(in)), _model_index);
*(reinterpret_cast<BatchTensor*>(out)));
task_manager.schedule(in, out);
task_manager.wait(); task_manager.wait();
return 0; return 0;
} }
...@@ -110,7 +124,7 @@ int ReloadableInferEngine::proc_finalize() { ...@@ -110,7 +124,7 @@ int ReloadableInferEngine::proc_finalize() {
} }
if (_infer_thread_num > 0) { if (_infer_thread_num > 0) {
im::bsf::TaskExecutor<TaskT>::instance()->stop(); im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index].stop();
} }
return 0; return 0;
} }
...@@ -191,6 +205,7 @@ int VersionedInferEngine::proc_initialize(const configure::EngineDesc& conf, ...@@ -191,6 +205,7 @@ int VersionedInferEngine::proc_initialize(const configure::EngineDesc& conf,
std::string engine_type = conf.type(); std::string engine_type = conf.type();
InferEngine* engine = InferEngine* engine =
StaticInferFactory::instance().generate_object(engine_type); StaticInferFactory::instance().generate_object(engine_type);
engine->set_model_index(_model_index);
if (!engine) { if (!engine) {
LOG(ERROR) << "Failed generate engine with type:" << engine_type; LOG(ERROR) << "Failed generate engine with type:" << engine_type;
return -1; return -1;
...@@ -362,23 +377,30 @@ int VersionedInferEngine::infer_impl(const void* in, ...@@ -362,23 +377,30 @@ int VersionedInferEngine::infer_impl(const void* in,
uint32_t batch_size) { uint32_t batch_size) {
return -1; return -1;
} }
int VersionedInferEngine::task_infer_impl(const BatchTensor& in, int VersionedInferEngine::task_infer_impl(const void* in,
BatchTensor& out) { // NOLINT void* out) { // NOLINT
return -1; return -1;
} }
int InferManager::proc_initialize(const char* path, const char* file) { int InferManager::proc_initialize(const char* path,
const char* file,
std::shared_ptr<int> engine_index_ptr) {
ModelToolkitConf model_toolkit_conf; ModelToolkitConf model_toolkit_conf;
if (configure::read_proto_conf(path, file, &model_toolkit_conf) != 0) { if (configure::read_proto_conf(path, file, &model_toolkit_conf) != 0) {
LOG(ERROR) << "failed load infer config, path: " << path << "/" << file; LOG(ERROR) << "failed load infer config, path: " << path << "/" << file;
return -1; return -1;
} }
size_t engine_num = model_toolkit_conf.engines_size(); uint32_t engine_num = model_toolkit_conf.engines_size();
for (size_t ei = 0; ei < engine_num; ++ei) { im::bsf::TaskExecutorVector<TaskT>::instance().resize(*engine_index_ptr +
engine_num);
for (uint32_t ei = 0; ei < engine_num; ++ei) {
LOG(INFO) << "model_toolkit_conf.engines(" << ei LOG(INFO) << "model_toolkit_conf.engines(" << ei
<< ").name: " << model_toolkit_conf.engines(ei).name(); << ").name: " << model_toolkit_conf.engines(ei).name();
std::string engine_name = model_toolkit_conf.engines(ei).name(); std::string engine_name = model_toolkit_conf.engines(ei).name();
VersionedInferEngine* engine = new (std::nothrow) VersionedInferEngine(); VersionedInferEngine* engine = new (std::nothrow) VersionedInferEngine();
int temp_engine_index_ptr = *engine_index_ptr;
engine->set_model_index(temp_engine_index_ptr);
*engine_index_ptr = temp_engine_index_ptr + 1;
if (!engine) { if (!engine) {
LOG(ERROR) << "Failed generate versioned engine: " << engine_name; LOG(ERROR) << "Failed generate versioned engine: " << engine_name;
return -1; return -1;
......
...@@ -17,6 +17,8 @@ ...@@ -17,6 +17,8 @@
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/types.h> #include <sys/types.h>
#include <unistd.h> #include <unistd.h>
#include <functional>
#include <memory>
#include <numeric> #include <numeric>
#include <string> #include <string>
#include <utility> #include <utility>
...@@ -25,6 +27,7 @@ ...@@ -25,6 +27,7 @@
#include "core/predictor/framework/bsf.h" #include "core/predictor/framework/bsf.h"
#include "core/predictor/framework/factory.h" #include "core/predictor/framework/factory.h"
#include "core/predictor/framework/infer_data.h" #include "core/predictor/framework/infer_data.h"
#include "core/predictor/framework/memory.h"
#include "paddle_inference_api.h" // NOLINT #include "paddle_inference_api.h" // NOLINT
namespace baidu { namespace baidu {
namespace paddle_serving { namespace paddle_serving {
...@@ -71,7 +74,7 @@ class InferEngine { ...@@ -71,7 +74,7 @@ class InferEngine {
virtual int infer(const void* in, void* out, uint32_t batch_size = -1) { virtual int infer(const void* in, void* out, uint32_t batch_size = -1) {
return infer_impl(in, out, batch_size); return infer_impl(in, out, batch_size);
} }
virtual void set_model_index(uint32_t index) { _model_index = index; }
virtual int reload() = 0; virtual int reload() = 0;
virtual uint64_t version() const = 0; virtual uint64_t version() const = 0;
...@@ -86,12 +89,13 @@ class InferEngine { ...@@ -86,12 +89,13 @@ class InferEngine {
virtual int infer_impl(const void* in, virtual int infer_impl(const void* in,
void* out, void* out,
uint32_t batch_size = -1) = 0; uint32_t batch_size = -1) = 0;
virtual int task_infer_impl(const BatchTensor& in, virtual int task_infer_impl(const void* in, void* out) = 0; // NOLINT
BatchTensor& out) = 0; // NOLINT
protected:
uint32_t _model_index;
// end: framework inner call // end: framework inner call
}; };
typedef im::bsf::Task<paddle::PaddleTensor, paddle::PaddleTensor> TaskT;
class ReloadableInferEngine : public InferEngine { class ReloadableInferEngine : public InferEngine {
public: public:
virtual ~ReloadableInferEngine() {} virtual ~ReloadableInferEngine() {}
...@@ -104,7 +108,6 @@ class ReloadableInferEngine : public InferEngine { ...@@ -104,7 +108,6 @@ class ReloadableInferEngine : public InferEngine {
}; };
virtual int load(const configure::EngineDesc& conf) = 0; virtual int load(const configure::EngineDesc& conf) = 0;
typedef im::bsf::Task<Tensor, Tensor> TaskT;
int proc_initialize_impl(const configure::EngineDesc& conf, bool version); int proc_initialize_impl(const configure::EngineDesc& conf, bool version);
...@@ -179,6 +182,8 @@ struct ModelData { ...@@ -179,6 +182,8 @@ struct ModelData {
delete cores[1]; delete cores[1];
} }
void* get() { return cores[current_idx]->get(); }
EngineCore* cores[2]; EngineCore* cores[2];
uint32_t current_idx; uint32_t current_idx;
}; };
...@@ -191,14 +196,20 @@ class DBReloadableInferEngine : public ReloadableInferEngine { ...@@ -191,14 +196,20 @@ class DBReloadableInferEngine : public ReloadableInferEngine {
int proc_initialize(const configure::EngineDesc& conf, bool version) { int proc_initialize(const configure::EngineDesc& conf, bool version) {
THREAD_KEY_CREATE(&_skey, NULL); THREAD_KEY_CREATE(&_skey, NULL);
THREAD_MUTEX_INIT(&_mutex, NULL); THREAD_MUTEX_INIT(&_mutex, NULL);
gpu_index = 0;
return ReloadableInferEngine::proc_initialize(conf, version); return ReloadableInferEngine::proc_initialize(conf, version);
} }
// Process initialization calls load(), but since thread initialization has not run yet,
// _reload_vec is empty and load() returns without doing anything more.
// Hot reloading also calls load(); the threads are already initialized, so _reload_vec
// is not empty and load_data() is executed to reload the data.
// Thread initialization runs load_data() to load the data and then appends the engine to _reload_vec.
// Each model has exactly one CloneDBReloadableInferEngine object,
// but one CloneDBReloadableInferEngine object can hold N EngineCores.
virtual int load(const configure::EngineDesc& conf) { virtual int load(const configure::EngineDesc& conf) {
if (_reload_vec.empty()) { if (_reload_vec.empty()) {
return 0; return 0;
} }
gpu_index = 0;
for (uint32_t ti = 0; ti < _reload_vec.size(); ++ti) { for (uint32_t ti = 0; ti < _reload_vec.size(); ++ti) {
if (load_data(_reload_vec[ti], conf) != 0) { if (load_data(_reload_vec[ti], conf) != 0) {
LOG(ERROR) << "Failed reload engine model: " << ti; LOG(ERROR) << "Failed reload engine model: " << ti;
...@@ -210,7 +221,8 @@ class DBReloadableInferEngine : public ReloadableInferEngine { ...@@ -210,7 +221,8 @@ class DBReloadableInferEngine : public ReloadableInferEngine {
return 0; return 0;
} }
int load_data(ModelData<EngineCore>* md, const configure::EngineDesc& conf) { virtual int load_data(ModelData<EngineCore>* md,
const configure::EngineDesc& conf) {
uint32_t next_idx = (md->current_idx + 1) % 2; uint32_t next_idx = (md->current_idx + 1) % 2;
if (md->cores[next_idx]) { if (md->cores[next_idx]) {
delete md->cores[next_idx]; delete md->cores[next_idx];
...@@ -219,28 +231,29 @@ class DBReloadableInferEngine : public ReloadableInferEngine { ...@@ -219,28 +231,29 @@ class DBReloadableInferEngine : public ReloadableInferEngine {
md->cores[next_idx] = new (std::nothrow) EngineCore; md->cores[next_idx] = new (std::nothrow) EngineCore;
// params.dump(); // params.dump();
if (!md->cores[next_idx] || md->cores[next_idx]->create(conf) != 0) { size_t gpu_ids_num = conf.gpu_ids_size();
im::bsf::AutoMutex lock(_mutex);
int gpu_id = -1;
if (gpu_ids_num > 0) {
gpu_id = conf.gpu_ids(gpu_index % gpu_ids_num);
}
if (!md->cores[next_idx] ||
md->cores[next_idx]->create(conf, gpu_id) != 0) {
LOG(ERROR) << "Failed create model, path: " << conf.model_dir(); LOG(ERROR) << "Failed create model, path: " << conf.model_dir();
return -1; return -1;
} }
gpu_index++;
md->current_idx = next_idx; md->current_idx = next_idx;
return 0; return 0;
} }
virtual int thrd_initialize_impl() { virtual int thrd_initialize_impl() {
// memory pool to be inited in non-serving-threads
if (MempoolWrapper::instance().thread_initialize() != 0) {
LOG(ERROR) << "Failed thread initialize mempool";
return -1;
}
ModelData<EngineCore>* md = new (std::nothrow) ModelData<EngineCore>; ModelData<EngineCore>* md = new (std::nothrow) ModelData<EngineCore>;
if (!md || load_data(md, _conf) != 0) { if (!md || load_data(md, _conf) != 0) {
LOG(ERROR) << "Failed create thread data from " << _conf.model_dir(); LOG(ERROR) << "Failed create thread data from " << _conf.model_dir();
return -1; return -1;
} }
LOG(ERROR) << "THREAD_SETSPECIFIC _skey = md";
THREAD_SETSPECIFIC(_skey, md); THREAD_SETSPECIFIC(_skey, md);
im::bsf::AutoMutex lock(_mutex); im::bsf::AutoMutex lock(_mutex);
_reload_vec.push_back(md); _reload_vec.push_back(md);
...@@ -248,11 +261,33 @@ class DBReloadableInferEngine : public ReloadableInferEngine { ...@@ -248,11 +261,33 @@ class DBReloadableInferEngine : public ReloadableInferEngine {
} }
int thrd_clear_impl() { int thrd_clear_impl() {
// for non-serving-threads // actually, there are 2 kinds of multi-thread.
if (MempoolWrapper::instance().thread_clear() != 0) { // 1. brpc thread 2. bsf Task thread
LOG(ERROR) << "Failed thread clear mempool"; // each request is in 1-single brpc thread.
return -1; // IF (bsf Task thread is not used)
} // every single brpc thread corresponds to all the DBReloadableInferEngines.
// each request runs all models in 1-single brpc thread.
// every single brpc thread will create or clone N predictor.
// N = the number of Model.
// so if there are 2 models, and --thread 10.
// each brpc thread will create predictor of Model-1 and Model-2.
// there are totally 10 predictors of Model-1 and 10 predictors of Model-2
// cause there are 10 brpc threads.
// IF bsf Task thread is used。
// there will be a ThreadPool called bsf TaskExecutor.
// TaskExecutorVector is the vector of TaskExecutor.
// the number of TaskExecutor equals to the number of Model.
// 1 TaskExecutor corresponding to 1 Model.
// 1 TaskExecutor have N bsf threads.
// 1 bsf thread corresponds to 1 predictor of
// the Model corresponding to the TaskExecutor.
// brpc thread only put the data into the task_queue(which is in
// TaskExecutor)
// EngineCore->infer() is running in bsf Task thread.
// MempoolWrapper::instance() is actually a Thread-Local Mempool.
// so it belongs to a single Thread.
return 0; return 0;
} }
...@@ -278,6 +313,7 @@ class DBReloadableInferEngine : public ReloadableInferEngine { ...@@ -278,6 +313,7 @@ class DBReloadableInferEngine : public ReloadableInferEngine {
THREAD_KEY_T _skey; THREAD_KEY_T _skey;
THREAD_MUTEX_T _mutex; THREAD_MUTEX_T _mutex;
std::vector<ModelData<EngineCore>*> _reload_vec; std::vector<ModelData<EngineCore>*> _reload_vec;
int gpu_index = 0;
}; };
// 多个EngineCore共用同一份模型数据 // 多个EngineCore共用同一份模型数据
...@@ -287,88 +323,76 @@ class CloneDBReloadableInferEngine ...@@ -287,88 +323,76 @@ class CloneDBReloadableInferEngine
public: public:
virtual ~CloneDBReloadableInferEngine() {} virtual ~CloneDBReloadableInferEngine() {}
virtual int proc_initialize(const configure::EngineDesc& conf, bool version) { // Process initialization calls load(), but since thread initialization has not run yet, _reload_vec is empty and nothing more happens.
_pd = new (std::nothrow) ModelData<EngineCore>; // Hot reloading also calls load(); the threads are already initialized, so _reload_vec is not empty and load_data() is executed to reload the data.
if (!_pd) { // Thread initialization runs load_data() to load the data and then appends the engine to _reload_vec.
LOG(ERROR) << "Failed to allocate for ProcData"; // Each model has exactly one CloneDBReloadableInferEngine object,
return -1; // but one CloneDBReloadableInferEngine object can hold N EngineCores.
}
return DBReloadableInferEngine<EngineCore>::proc_initialize(conf, version);
}
virtual int load(const configure::EngineDesc& conf) { virtual int load_data(ModelData<EngineCore>* md,
// 加载进程级模型数据 const configure::EngineDesc& conf) {
if (!_pd || uint32_t next_idx = (md->current_idx + 1) % 2;
DBReloadableInferEngine<EngineCore>::load_data(_pd, conf) != 0) { if (md->cores[next_idx]) {
LOG(ERROR) << "Failed to create common model from [" << conf.model_dir() delete md->cores[next_idx];
<< "].";
return -1;
} }
LOG(WARNING) << "Succ load common model[" << _pd->cores[_pd->current_idx] md->cores[next_idx] = new (std::nothrow) EngineCore;
<< "], path[" << conf.model_dir() << "].";
if (DBReloadableInferEngine<EngineCore>::_reload_vec.empty()) { // params.dump();
return 0; // gpu_ids_num > 0 is always true.
// if the CPU is used, gpu_ids = [-1].
// if gpu_ids_num = 0, no gpu id was given,
// so we set gpu_ids_num = 1 and gpu_id = -1
// so that at least 1 predictor can be created.
size_t gpu_ids_num = conf.gpu_ids_size();
im::bsf::AutoMutex lock(DBReloadableInferEngine<EngineCore>::_mutex);
int gpu_id = -1;
if (gpu_ids_num > 0) {
gpu_id = conf.gpu_ids(DBReloadableInferEngine<EngineCore>::gpu_index %
gpu_ids_num);
} else {
gpu_ids_num = 1;
} }
// gpu_index is reset to 0 when load() or proc_initialize() is called.
for (uint32_t ti = 0; // gpu_index < gpu_ids_num means some GPU card still has no predictor
ti < DBReloadableInferEngine<EngineCore>::_reload_vec.size(); // created on it,
++ti) { // so we need to create the predictor.
if (load_data(DBReloadableInferEngine<EngineCore>::_reload_vec[ti], // gpu_index >= gpu_ids_num means each GPU card has already created one,
_pd->cores[_pd->current_idx]) != 0) { // so we need to clone the predictor.
LOG(ERROR) << "Failed reload engine model: " << ti; if (DBReloadableInferEngine<EngineCore>::gpu_index < gpu_ids_num) {
if (!md->cores[next_idx] ||
md->cores[next_idx]->create(conf, gpu_id) != 0) {
LOG(ERROR) << "Failed create model, path: " << conf.model_dir();
return -1; return -1;
} }
DBReloadableInferEngine<EngineCore>::gpu_index++;
md->current_idx = next_idx;
if (_cloneTemplate.size() <
DBReloadableInferEngine<EngineCore>::gpu_index) {
_cloneTemplate.push_back(md);
} else {
_cloneTemplate[DBReloadableInferEngine<EngineCore>::gpu_index - 1] = md;
}
} else {
int template_index = DBReloadableInferEngine<EngineCore>::gpu_index %
_cloneTemplate.size();
if (!md->cores[next_idx] ||
md->cores[next_idx]->clone(_cloneTemplate[template_index]->get()) !=
0) {
LOG(ERROR) << "Failed clone model from core";
return -1;
}
DBReloadableInferEngine<EngineCore>::gpu_index++;
md->current_idx = next_idx;
LOG(WARNING) << "core clone model succ, cur_idx[" << md->current_idx
<< "].";
} }
LOG(WARNING) << "Succ load clone model, path[" << conf.model_dir() << "]";
return 0;
}
// 加载线程级对象,多个线程级对象共用pd_core的模型数据
int load_data(ModelData<EngineCore>* td, EngineCore* pd_core) {
uint32_t next_idx = (td->current_idx + 1) % 2;
if (td->cores[next_idx]) {
delete td->cores[next_idx];
}
td->cores[next_idx] = new (std::nothrow) EngineCore;
if (!td->cores[next_idx] ||
td->cores[next_idx]->clone(pd_core->get()) != 0) {
LOG(ERROR) << "Failed clone model from pd_core[ " << pd_core << "], idx["
<< next_idx << "]";
return -1;
}
td->current_idx = next_idx;
LOG(WARNING) << "td_core[" << td->cores[td->current_idx]
<< "] clone model from pd_core[" << pd_core
<< "] succ, cur_idx[" << td->current_idx << "].";
return 0;
}
virtual int thrd_initialize_impl() {
// memory pool to be inited in non-serving-threads
if (MempoolWrapper::instance().thread_initialize() != 0) {
LOG(ERROR) << "Failed thread initialize mempool";
return -1;
}
ModelData<EngineCore>* md = new (std::nothrow) ModelData<EngineCore>;
if (!md || load_data(md, _pd->cores[_pd->current_idx]) != 0) {
LOG(ERROR) << "Failed clone thread data, origin_core["
<< _pd->cores[_pd->current_idx] << "].";
return -1;
}
THREAD_SETSPECIFIC(DBReloadableInferEngine<EngineCore>::_skey, md);
im::bsf::AutoMutex lock(DBReloadableInferEngine<EngineCore>::_mutex);
DBReloadableInferEngine<EngineCore>::_reload_vec.push_back(md);
return 0; return 0;
} }
protected: protected:
ModelData<EngineCore>* // Template EngineCore: once created, multiple thread-level EngineCores share this object's model data.
_pd; // 进程级EngineCore,多个线程级EngineCore共用该对象的模型数据 std::vector<ModelData<EngineCore>*> _cloneTemplate;
}; };
template <typename EngineCore> template <typename EngineCore>
...@@ -505,8 +529,8 @@ class FluidInferEngine : public CloneDBReloadableInferEngine<EngineCore> { ...@@ -505,8 +529,8 @@ class FluidInferEngine : public CloneDBReloadableInferEngine<EngineCore> {
return 0; return 0;
} }
int task_infer_impl(const BatchTensor& in, BatchTensor& out) { // NOLINT int task_infer_impl(const void* in, void* out) { // NOLINT
return infer_impl(&in, &out); return infer_impl(in, out);
} }
}; };
...@@ -559,7 +583,7 @@ class VersionedInferEngine : public InferEngine { ...@@ -559,7 +583,7 @@ class VersionedInferEngine : public InferEngine {
int infer_impl(const void* in, void* out, uint32_t batch_size = -1); int infer_impl(const void* in, void* out, uint32_t batch_size = -1);
int task_infer_impl(const BatchTensor& in, BatchTensor& out); int task_infer_impl(const void* in, void* out);
private: private:
boost::unordered_map<uint64_t, InferEngine*> _versions; boost::unordered_map<uint64_t, InferEngine*> _versions;
...@@ -572,7 +596,9 @@ class InferManager { ...@@ -572,7 +596,9 @@ class InferManager {
return ins; return ins;
} }
int proc_initialize(const char* path, const char* file); int proc_initialize(const char* path,
const char* file,
std::shared_ptr<int> engine_index_ptr);
int thrd_initialize(); int thrd_initialize();
......
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
...@@ -135,12 +135,14 @@ int Resource::initialize(const std::string& path, const std::string& file) { ...@@ -135,12 +135,14 @@ int Resource::initialize(const std::string& path, const std::string& file) {
if (FLAGS_enable_model_toolkit) { if (FLAGS_enable_model_toolkit) {
size_t model_toolkit_num = resource_conf.model_toolkit_path_size(); size_t model_toolkit_num = resource_conf.model_toolkit_path_size();
std::shared_ptr<int> engine_index_ptr(new int(0));
for (size_t mi = 0; mi < model_toolkit_num; ++mi) { for (size_t mi = 0; mi < model_toolkit_num; ++mi) {
std::string model_toolkit_path = resource_conf.model_toolkit_path(mi); std::string model_toolkit_path = resource_conf.model_toolkit_path(mi);
std::string model_toolkit_file = resource_conf.model_toolkit_file(mi); std::string model_toolkit_file = resource_conf.model_toolkit_file(mi);
if (InferManager::instance().proc_initialize( if (InferManager::instance().proc_initialize(model_toolkit_path.c_str(),
model_toolkit_path.c_str(), model_toolkit_file.c_str()) != 0) { model_toolkit_file.c_str(),
engine_index_ptr) != 0) {
LOG(ERROR) << "failed proc initialize modeltoolkit, config: " LOG(ERROR) << "failed proc initialize modeltoolkit, config: "
<< model_toolkit_path << "/" << model_toolkit_file; << model_toolkit_path << "/" << model_toolkit_file;
return -1; return -1;
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <map> #include <map>
#include <memory> #include <memory>
#include <string> #include <string>
#include <utility>
#include <vector> #include <vector>
#include "core/cube/cube-api/include/cube_api.h" #include "core/cube/cube-api/include/cube_api.h"
#include "core/predictor/common/inner_common.h" #include "core/predictor/common/inner_common.h"
......
...@@ -91,6 +91,7 @@ int ServerManager::start_and_wait() { ...@@ -91,6 +91,7 @@ int ServerManager::start_and_wait() {
} }
} }
// rpc multi-thread start from here.
if (_server.Start(FLAGS_port, &_options) != 0) { if (_server.Start(FLAGS_port, &_options) != 0) {
LOG(ERROR) << "Failed to start Paddle Inference Server"; LOG(ERROR) << "Failed to start Paddle Inference Server";
return -1; return -1;
......
文件模式从 100755 更改为 100644
...@@ -24,7 +24,7 @@ namespace fugue { ...@@ -24,7 +24,7 @@ namespace fugue {
namespace memory { namespace memory {
void Region::init() { void Region::init() {
_big_mem_capacity = 64 * 1024 * 1024; // 64MB _big_mem_capacity = 128 * 1024 * 1024; // 128MB
_big_mem_start = new char[_big_mem_capacity]; _big_mem_start = new char[_big_mem_capacity];
} }
......
...@@ -129,7 +129,7 @@ class FreeList { ...@@ -129,7 +129,7 @@ class FreeList {
to get the class Pointer to get the class Pointer
for example for example
T is the member of class Node, T data, 'data' is the name. T is the member of class Node, T data, 'data' is the name.
T* value is the member(pointer type) class Node T* value is the member(pointer type) of class Node
so we can get the Node* by calling container_of(value, Node, data) so we can get the Node* by calling container_of(value, Node, data)
*/ */
Node* node = container_of(value, Node, data); Node* node = container_of(value, Node, data);
...@@ -261,7 +261,11 @@ struct BlockReference { ...@@ -261,7 +261,11 @@ struct BlockReference {
// because BlockFreeList is a thread-safe Singleton. // because BlockFreeList is a thread-safe Singleton.
// so we don't release the Block; it is global memory. // so we don't release the Block; it is global memory.
// total number is 32*1024 // total number is 256*1024.
// the MAX_BLOCK_COUNT of a Region (one thread, one Region) is 1024,
// so BlockFreeList allows 256 Regions (i.e. 256 threads).
// the memory used by BlockFreeListType is sizeof(void*)*256*1024.
// Block (2MB) memory is created only when get() is called.
class BlockFreeList { class BlockFreeList {
public: public:
static const int MAX_BLOCK_COUNT = 256 * 1024; static const int MAX_BLOCK_COUNT = 256 * 1024;
...@@ -341,9 +345,10 @@ class Region { ...@@ -341,9 +345,10 @@ class Region {
2 * 1024 * 2 * 1024 *
1024; // 2MB,means when you need less than 2M, get memory from Block. 1024; // 2MB,means when you need less than 2M, get memory from Block.
// 64MB,means when you need less than 64MB, get memory from BigMemory instead // 128MB,means when you need less than 128MB, get memory from BigMemory
// instead
// of BigNode // of BigNode
static const int BIGNODE_MEM_THRESHOLD = (64 * 1024 * 1024 + 1); static const int BIGNODE_MEM_THRESHOLD = (128 * 1024 * 1024 + 1);
static const int COUNTER_SIZE = static const int COUNTER_SIZE =
BIGNODE_MEM_THRESHOLD / BIG_MEM_THRESHOLD + 1; // this is not used BIGNODE_MEM_THRESHOLD / BIG_MEM_THRESHOLD + 1; // this is not used
...@@ -374,7 +379,8 @@ class Mempool { ...@@ -374,7 +379,8 @@ class Mempool {
void* malloc(size_t size) { void* malloc(size_t size) {
size = _align(size); size = _align(size);
// It does not enter the if statement the first time. // It does not enter the if statement the first time.
// Because the block has not been used up, it will enter. // The if branch may be taken once the block has been created.
// If the block has not been used up, it will be entered.
if (size <= _free_size) { if (size <= _free_size) {
void* p = _free_cursor; void* p = _free_cursor;
_free_size -= size; _free_size -= size;
...@@ -392,7 +398,7 @@ class Mempool { ...@@ -392,7 +398,7 @@ class Mempool {
return; return;
} }
// memory in Block,update the pointer. // memory in _block,update the pointer.
if (_free_cursor - size == static_cast<char*>(p)) { if (_free_cursor - size == static_cast<char*>(p)) {
// for example, you need to release -(8+1)bytes // for example, you need to release -(8+1)bytes
// you can only release -8bytes,cause -(8+2)byte is used by other. // you can only release -8bytes,cause -(8+2)byte is used by other.
...@@ -424,9 +430,8 @@ class Mempool { ...@@ -424,9 +430,8 @@ class Mempool {
} }
// 可能返回的是单独Region中malloc的内存。 // 可能返回的是单独Region中malloc的内存。
// 也可能是Block,例如new_size=1M, old_data原本的指针头就在1.2M处,old_size // 也可能是Block,例如new_size=1M, old_data原本的指针头就在1.2M处
// = // old_size = 0.5M
// 0.5M
// 此时,_free_size = 0.3M,new_size<2M,但是required = 1-0.5 >0.3 // 此时,_free_size = 0.3M,new_size<2M,但是required = 1-0.5 >0.3
// 分配出来的就是Block,但是该Block没有并很完美的利用完全。 // 分配出来的就是Block,但是该Block没有并很完美的利用完全。
void* p = this->malloc_from_region(new_size); void* p = this->malloc_from_region(new_size);
......
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
...@@ -68,13 +68,14 @@ static bvar::PassiveStatus<std::string> s_predictor_revision( ...@@ -68,13 +68,14 @@ static bvar::PassiveStatus<std::string> s_predictor_revision(
DEFINE_bool(V, false, "print version, bool"); DEFINE_bool(V, false, "print version, bool");
DEFINE_bool(g, false, "user defined gflag path"); DEFINE_bool(g, false, "user defined gflag path");
DECLARE_string(flagfile); DECLARE_string(flagfile);
/*
namespace bthread { namespace bthread {
extern pthread_mutex_t g_task_control_mutex; extern pthread_mutex_t g_task_control_mutex;
} }
pthread_mutex_t g_worker_start_fn_mutex = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_t g_worker_start_fn_mutex = PTHREAD_MUTEX_INITIALIZER;
*/
void pthread_worker_start_fn() { void pthread_worker_start_fn() {
/*
while (pthread_mutex_lock(&g_worker_start_fn_mutex) != 0) { while (pthread_mutex_lock(&g_worker_start_fn_mutex) != 0) {
} }
...@@ -83,15 +84,18 @@ void pthread_worker_start_fn() { ...@@ -83,15 +84,18 @@ void pthread_worker_start_fn() {
if (lock_status == EBUSY || lock_status == EAGAIN) { if (lock_status == EBUSY || lock_status == EAGAIN) {
pthread_mutex_unlock(&bthread::g_task_control_mutex); pthread_mutex_unlock(&bthread::g_task_control_mutex);
} }
*/
Resource::instance().thread_initialize(); Resource::instance().thread_initialize();
// Try to avoid deadlock in bthread // Try to avoid deadlock in bthread
/*
if (lock_status == EBUSY || lock_status == EAGAIN) { if (lock_status == EBUSY || lock_status == EAGAIN) {
while (pthread_mutex_lock(&bthread::g_task_control_mutex) != 0) { while (pthread_mutex_lock(&bthread::g_task_control_mutex) != 0) {
} }
} }
pthread_mutex_unlock(&g_worker_start_fn_mutex); pthread_mutex_unlock(&g_worker_start_fn_mutex);
*/
} }
static void g_change_server_port() { static void g_change_server_port() {
...@@ -126,7 +130,7 @@ int main(int argc, char** argv) { ...@@ -126,7 +130,7 @@ int main(int argc, char** argv) {
return 0; return 0;
} }
//google::ParseCommandLineFlags(&argc, &argv, true); // google::ParseCommandLineFlags(&argc, &argv, true);
g_change_server_port(); g_change_server_port();
...@@ -202,7 +206,7 @@ int main(int argc, char** argv) { ...@@ -202,7 +206,7 @@ int main(int argc, char** argv) {
} }
VLOG(2) << "Succ call pthread worker start function"; VLOG(2) << "Succ call pthread worker start function";
//this is not used by any code segment,which can be cancelled. // this is not used by any code segment and could be removed.
if (Resource::instance().general_model_initialize(FLAGS_resource_path, if (Resource::instance().general_model_initialize(FLAGS_resource_path,
FLAGS_resource_file) != 0) { FLAGS_resource_file) != 0) {
LOG(ERROR) << "Failed to initialize general model conf: " LOG(ERROR) << "Failed to initialize general model conf: "
......
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
...@@ -24,17 +24,16 @@ message Tensor { ...@@ -24,17 +24,16 @@ message Tensor {
repeated int32 int_data = 2; repeated int32 int_data = 2;
repeated int64 int64_data = 3; repeated int64 int64_data = 3;
repeated float float_data = 4; repeated float float_data = 4;
optional int32 elem_type = 5; optional int32 elem_type =
repeated int32 shape = 6; 5; // 0 means int64, 1 means float32, 2 means int32, 3 means bytes(string)
repeated int32 lod = 7; // only for fetch tensor currently repeated int32 shape = 6; // shape should include batch
repeated int32 lod = 7; // only for fetch tensor currently
optional string name = 8; // get from the Model prototxt
optional string alias_name = 9; // get from the Model prototxt
}; };
message FeedInst { repeated Tensor tensor_array = 1; };
message FetchInst { repeated Tensor tensor_array = 1; };
message Request { message Request {
repeated FeedInst insts = 1; repeated Tensor tensor = 1;
repeated string fetch_var_names = 2; repeated string fetch_var_names = 2;
optional bool profile_server = 3 [ default = false ]; optional bool profile_server = 3 [ default = false ];
required uint64 log_id = 4 [ default = 0 ]; required uint64 log_id = 4 [ default = 0 ];
...@@ -46,7 +45,7 @@ message Response { ...@@ -46,7 +45,7 @@ message Response {
}; };
message ModelOutput { message ModelOutput {
repeated FetchInst insts = 1; repeated Tensor tensor = 1;
optional string engine_name = 2; optional string engine_name = 2;
} }
......
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
...@@ -242,6 +242,9 @@ InvalidArgumentError: Device id must be less than GPU count, but received id is: ...@@ -242,6 +242,9 @@ InvalidArgumentError: Device id must be less than GPU count, but received id is:
**A:** 支持离线部署,需要把一些相关的[依赖包](https://github.com/PaddlePaddle/Serving/blob/develop/doc/COMPILE.md)提前准备安装好 **A:** 支持离线部署,需要把一些相关的[依赖包](https://github.com/PaddlePaddle/Serving/blob/develop/doc/COMPILE.md)提前准备安装好
#### Q: What is the difference between binding the server to 127.0.0.1 and 0.0.0.0 inside Docker?
**A:** You must make the container's main process bind to the special 0.0.0.0 "all interfaces" address, otherwise it cannot be reached from outside the container. Inside Docker, 127.0.0.1 means "this container", not "this machine". An outbound connection from the container to 127.0.0.1 loops back to the same container, and a server bound to 127.0.0.1 cannot receive connections from outside.
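A minimal sketch of the idea (the image name and model path here are placeholders, not taken from this repository): start the server inside the container, publish the port with `-p`, and make sure the process listens on 0.0.0.0 rather than 127.0.0.1.

```shell
# Hypothetical image/model names; adjust to your setup.
docker run -p 9292:9292 my_serving_image \
    python3 -m paddle_serving_server.serve --model uci_housing_model --port 9292
# If the server binds to 127.0.0.1 inside the container, the -p mapping above
# will not help; it must listen on 0.0.0.0 to be reachable from the host.
```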
## 预测问题 ## 预测问题
#### Q: 使用GPU第一次预测时特别慢,如何调整RPC服务的等待时间避免超时? #### Q: 使用GPU第一次预测时特别慢,如何调整RPC服务的等待时间避免超时?
...@@ -321,6 +324,15 @@ GLOG_v=2 python -m paddle_serving_server.serve --model xxx_conf/ --port 9999 ...@@ -321,6 +324,15 @@ GLOG_v=2 python -m paddle_serving_server.serve --model xxx_conf/ --port 9999
**A:** Logid默认为0(后续应该有自动生成Logid的计划,当前版本0.4.0),Client端通过在predict函数中指定log_id参数传递 **A:** Logid默认为0(后续应该有自动生成Logid的计划,当前版本0.4.0),Client端通过在predict函数中指定log_id参数传递
#### Q: How do I debug and locate problems in the C++ Server?
**A:** We recommend gdb. If you run inside Docker, start the container with docker run --privileged to enable privileged mode; otherwise gdb cannot be used inside the container.
If the C++ side core-dumps, a core file is usually generated; if not, enable core files with ulimit -c unlimited.
To debug the core file: gdb <executable> <core file>, then enter bt, which normally shows the line where the error occurred.
Note: the executable path is the C++ serving binary, not the python command. It typically looks like /usr/local/lib/python3.6/site-packages/paddle_serving_server/serving-gpu-102-0.6.2/serving
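A compact sketch of that workflow (the binary path is the example path from the answer above; yours may differ, and the core file may be named core.<pid>):

```shell
ulimit -c unlimited    # enable core dumps before reproducing the crash
gdb /usr/local/lib/python3.6/site-packages/paddle_serving_server/serving-gpu-102-0.6.2/serving core
# then, at the (gdb) prompt, run:  bt
```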
## 性能优化 ## 性能优化
# Accessing the Server over HTTP
The Paddle Serving server currently supports direct HTTP access; this document describes the details.
## How it works
The BRPC server can be accessed over HTTP. Since every language has libraries for making HTTP requests, languages with weaker BRPC support such as Java/Python/Go can query the server for prediction directly over HTTP.
### HTTP (JSON body)
Basic flow: the client packs the data, in the format defined by the proto file (see [`core/general-server/proto/general_model_service.proto`](../core/general-server/proto/general_model_service.proto)), into the body of the HTTP request.
The BRPC server then tries to deserialize the proto-format data from the JSON string and continues with the normal processing.
### HTTP + protobuf
Every language has ProtoBuf support. If you are familiar with it, you can also serialize the data with ProtoBuf first, put the serialized bytes into the HTTP request body and set Content-Type: application/proto, so that the service is accessed with an http/h2+protobuf binary payload.
**In theory, serialization/deserialization performance from fastest to slowest is: protobuf > http/h2+protobuf > http**
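A rough sketch of the HTTP + protobuf variant, assuming `protoc` is installed, the request is written in protobuf text format in a local `request.txt`, and the fully qualified message name below matches your checkout (all three are assumptions, so adjust as needed):

```shell
# Encode a text-format Request into binary protobuf (message name/paths are assumptions).
protoc --encode=baidu.paddle_serving.predictor.general_model.Request \
    core/general-server/proto/general_model_service.proto < request.txt > request.pb
# Send the binary body and mark it as protobuf.
curl --data-binary @request.pb -H'Content-Type: application/proto' \
    -XPOST http://127.0.0.1:9393/GeneralModelService/inference
```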
## Example
We use python/examples/fit_a_line as an example to show how to access the server over HTTP.
### Get the model
```shell
sh get_data.sh
```
## Start the server
```shell
python3.6 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393
```
The server needs no modification at all; it supports both BRPC and HTTP access.
## Client access
### Sending HTTP requests with HttpClient (Python/Java)
To make it easy to query the prediction service over HTTP, the commonly needed pieces (building the request body, compression, request encryption, and so on) are wrapped in an HttpClient class for you.
Using HttpClient takes only three steps: 1. create an HttpClient object; 2. load the client prototxt configuration (in this example, uci_housing_client/serving_client_conf.prototxt under python/examples/fit_a_line/); 3. call the Predict function to request the prediction service over HTTP.
In addition, you can configure the server IP, port and service name (the service name must match the Service name and rpc method name in [`core/general-server/proto/general_model_service.proto`](../core/general-server/proto/general_model_service.proto), i.e. the `GeneralModelService` and `inference` fields), enable Request body compression, enable compressed Responses, use encrypted-model prediction (the server must be configured for model encryption), set the response timeout, and so on.
A Python HttpClient example is in [`python/examples/fit_a_line/test_httpclient.py`](../python/examples/fit_a_line/test_httpclient.py); the interface is in [`python/paddle_serving_client/httpclient.py`](../python/paddle_serving_client/httpclient.py).
A Java HttpClient example is in [`java/examples/src/main/java/PaddleServingClientExample.java`](../java/examples/src/main/java/PaddleServingClientExample.java); the interface is in [`java/src/main/java/io/paddle/serving/client/HttpClient.java`](../java/src/main/java/io/paddle/serving/client/HttpClient.java).
If this does not meet your needs, you can also extend it yourself.
If you need HTTPS or custom Response status codes, some secondary development of the C++ brpc server is required; see https://github.com/apache/incubator-brpc/blob/master/docs/cn/http_service.md. If there is enough demand, we will add these features to the Server later.
### Sending HTTP requests with curl (the underlying mechanism)
```shell
curl -XPOST http://0.0.0.0:9393/GeneralModelService/inference -d ' {"tensor":[{"float_data":[0.0137,-0.1136,0.2553,-0.0692,0.0582,-0.0727,-0.1583,-0.0584,0.6283,0.4919,0.1856,0.0795,-0.0332],"elem_type":1,"name":"x","alias_name":"x","shape":[1,13]}],"fetch_var_names":["price"],"log_id":0}'
```
The IP and port (`0.0.0.0:9393` in the command above) should be set according to the IP and port your server was started with.
`GeneralModelService` and `inference` are the Service name and the rpc method name in the proto file; see [`core/general-server/proto/general_model_service.proto`](../core/general-server/proto/general_model_service.proto).
The argument after -d is the request body. The JSON must contain the required fields of the proto above, otherwise the conversion fails and the request is rejected.
Note that the shape field in the data is the shape the model actually expects, including the batch dimension, so it may differ from the shape in the proto file.
#### message
Maps to a rapidjson Object, enclosed in curly braces; its elements are parsed recursively.
```protobuf
// protobuf
message Foo {
required string field1 = 1;
required int32 field2 = 2;
}
message Bar {
required Foo foo = 1;
optional bool flag = 2;
required string name = 3;
}
// rapidjson
{"foo":{"field1":"hello", "field2":3},"name":"Tom" }
```
#### repeated field
Maps to a rapidjson Array, enclosed in square brackets; its elements are parsed recursively. Unlike a message, all elements have the same type.
```protobuf
// protobuf
repeated int32 numbers = 1;
// rapidjson
{"numbers" : [12, 17, 1, 24] }
```
#### elem_type
The data type: 0 means int64, 1 means float32, 2 means int32, 3 means bytes (string).
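For instance, an int64 feed sets `elem_type` to 0 and puts the values in `int64_data`. The tensor below is a made-up illustration (the names `words`/`prediction` and the values are not from the fit_a_line model):

```shell
# Hypothetical request body with an int64 tensor (elem_type = 0).
curl -XPOST http://127.0.0.1:9393/GeneralModelService/inference -d \
    '{"tensor":[{"int64_data":[8,233,52],"elem_type":0,"name":"words","alias_name":"words","shape":[1,3]}],"fetch_var_names":["prediction"],"log_id":0}'
```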
#### fetch_var_names
The names of the outputs you want returned; see the `alias_name` entries under the `fetch_var` field in the model file serving_client_conf.prototxt.
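One way to look these up, using the fit_a_line client config from this example (the path is relative to where get_data.sh was run):

```shell
# Show the fetch_var entries and their alias_name values.
grep -A 5 "fetch_var" uci_housing_client/serving_client_conf.prototxt
```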
### HTTP compression
gzip compression is supported, but gzip is not a particularly fast compressor/decompressor. For small payloads, gzip costs more than it saves; we recommend considering gzip only when the data is larger than about 512 bytes.
#### Compressing the client request body
Using the fit_a_line request body from above again. This is only to demonstrate the usage; for a payload this small, compression is not actually worth it.
```shell
echo ' {"tensor":[{"float_data":[0.0137,-0.1136,0.2553,-0.0692,0.0582,-0.0727,-0.1583,-0.0584,0.6283,0.4919,0.1856,0.0795,-0.0332],"elem_type":1,"shape":[1,13]}],"fetch_var_names":["price"],"log_id":0}' | gzip -c > data.txt.gz
```
```shell
curl --data-binary @data.txt.gz -H'Content-Encoding: gzip' -XPOST http://127.0.0.1:9393/GeneralModelService/inference
```
**Note: when the request body is compressed, the request header Content-Encoding: gzip must be set.**
#### Compressing the server Response
When the HTTP request header contains Accept-encoding: gzip, the server will try to gzip the Response data. "Try" means compression may not happen, namely when:
- the request did not set Accept-encoding: gzip, or
- the body is smaller than the number of bytes given by -http_body_compress_threshold (default 512). gzip is not a fast algorithm; for small bodies the extra compression latency may exceed the network time it saves, so not compressing small packets can be the better choice.
In those cases the server always returns an uncompressed result.
If you use curl, the --compressed option is the recommended way to get a compressed Response: it automatically sets Accept-encoding: gzip on the request and automatically decompresses the Response it receives, so the whole compression/decompression round trip is transparent to the user.
```shell
curl --data-binary @data.txt.gz -H'Content-Encoding: gzip' --compressed -XPOST http://127.0.0.1:9393/GeneralModelService/inference
```
If you only set -H'Accept-encoding: gzip' on the request yourself, you will receive a compressed Response and have to decompress it manually.
In other words, --compressed = -H'Accept-encoding: gzip' + automatic decompression, so --compressed is recommended; the example below only illustrates setting the header manually and decompressing by hand.
If you want to verify that the returned data really is compressed, add only the -H'Accept-encoding: gzip' header and skip the decompression; you will see that the response is compressed data (generally unreadable).
```shell
curl --data-binary @data.txt.gz -H'Content-Encoding: gzip' -H'Accept-encoding: gzip' -XPOST http://127.0.0.1:9393/GeneralModelService/inference | gunzip
```
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
package main
import (
"io"
"os"
"fmt"
"bufio"
"strings"
"strconv"
)
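// IMDB accuracy checker: reads a result file whose lines are "<probability>\t<label>"
// and reports how often the prediction and the label fall on the same side of 0.5.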
func main() {
score_file := os.Args[1]
fi, err := os.Open(score_file)
if err != nil {
fmt.Print(err)
}
defer fi.Close()
br := bufio.NewReader(fi)
total := int(0)
acc := int(0)
for {
line, err := br.ReadString('\n')
if err == io.EOF {
break
}
line = strings.Trim(line, "\n")
s := strings.Split(line, "\t")
prob_str := strings.Trim(s[0], " ")
label_str := strings.Trim(s[1], " ")
prob, err := strconv.ParseFloat(prob_str, 32)
if err != nil {
panic(err)
}
label, err := strconv.ParseFloat(label_str, 32)
if err != nil {
panic(err)
}
if (prob - 0.5) * (label - 0.5) > 0 {
acc++
}
total++
}
fmt.Println("total num: ", total)
fmt.Println("acc num: ", acc)
fmt.Println("acc: ", float32(acc) / float32(total))
}
\ No newline at end of file
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
"io"
"fmt"
"strings"
"bufio"
"strconv"
"os"
serving_client "github.com/PaddlePaddle/Serving/go/serving_client"
)
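// IMDB example client: each input line is "<len> <word ids...> <label>"; the word ids
// and label are fed to a local Serving instance and the predicted probability is
// printed next to the label.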
func main() {
var config_file_path string
config_file_path = os.Args[1]
handle := serving_client.LoadModelConfig(config_file_path)
handle = serving_client.Connect("127.0.0.1", "9292", handle)
test_file_path := os.Args[2]
fi, err := os.Open(test_file_path)
if err != nil {
fmt.Print(err)
}
defer fi.Close()
br := bufio.NewReader(fi)
fetch := []string{"cost", "acc", "prediction"}
var result map[string][]float32
for {
line, err := br.ReadString('\n')
if err == io.EOF {
break
}
line = strings.Trim(line, "\n")
var words = []int64{}
s := strings.Split(line, " ")
value, err := strconv.Atoi(s[0])
var feed_int_map map[string][]int64
for _, v := range s[1:value + 1] {
int_v, _ := strconv.Atoi(v)
words = append(words, int64(int_v))
}
label, err := strconv.Atoi(s[len(s)-1])
if err != nil {
panic(err)
}
feed_int_map = map[string][]int64{}
feed_int_map["words"] = words
feed_int_map["label"] = []int64{int64(label)}
result = serving_client.Predict(handle,
feed_int_map, fetch)
fmt.Println(result["prediction"][1], "\t", int64(label))
}
}
\ No newline at end of file
// Code generated by protoc-gen-go. DO NOT EDIT.
// source: general_model_config.proto
package baidu_paddle_serving_configure
import (
fmt "fmt"
proto "github.com/golang/protobuf/proto"
math "math"
)
// Reference imports to suppress errors if they are not otherwise used.
var _ = proto.Marshal
var _ = fmt.Errorf
var _ = math.Inf
// This is a compile-time assertion to ensure that this generated file
// is compatible with the proto package it is being compiled against.
// A compilation error at this line likely means your copy of the
// proto package needs to be updated.
const _ = proto.ProtoPackageIsVersion3 // please upgrade the proto package
type FeedVar struct {
Name *string `protobuf:"bytes,1,opt,name=name" json:"name,omitempty"`
AliasName *string `protobuf:"bytes,2,opt,name=alias_name,json=aliasName" json:"alias_name,omitempty"`
IsLodTensor *bool `protobuf:"varint,3,opt,name=is_lod_tensor,json=isLodTensor,def=0" json:"is_lod_tensor,omitempty"`
FeedType *int32 `protobuf:"varint,4,opt,name=feed_type,json=feedType,def=0" json:"feed_type,omitempty"`
Shape []int32 `protobuf:"varint,5,rep,name=shape" json:"shape,omitempty"`
XXX_NoUnkeyedLiteral struct{} `json:"-"`
XXX_unrecognized []byte `json:"-"`
XXX_sizecache int32 `json:"-"`
}
func (m *FeedVar) Reset() { *m = FeedVar{} }
func (m *FeedVar) String() string { return proto.CompactTextString(m) }
func (*FeedVar) ProtoMessage() {}
func (*FeedVar) Descriptor() ([]byte, []int) {
return fileDescriptor_efa52beffa29d37a, []int{0}
}
func (m *FeedVar) XXX_Unmarshal(b []byte) error {
return xxx_messageInfo_FeedVar.Unmarshal(m, b)
}
func (m *FeedVar) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
return xxx_messageInfo_FeedVar.Marshal(b, m, deterministic)
}
func (m *FeedVar) XXX_Merge(src proto.Message) {
xxx_messageInfo_FeedVar.Merge(m, src)
}
func (m *FeedVar) XXX_Size() int {
return xxx_messageInfo_FeedVar.Size(m)
}
func (m *FeedVar) XXX_DiscardUnknown() {
xxx_messageInfo_FeedVar.DiscardUnknown(m)
}
var xxx_messageInfo_FeedVar proto.InternalMessageInfo
const Default_FeedVar_IsLodTensor bool = false
const Default_FeedVar_FeedType int32 = 0
func (m *FeedVar) GetName() string {
if m != nil && m.Name != nil {
return *m.Name
}
return ""
}
func (m *FeedVar) GetAliasName() string {
if m != nil && m.AliasName != nil {
return *m.AliasName
}
return ""
}
func (m *FeedVar) GetIsLodTensor() bool {
if m != nil && m.IsLodTensor != nil {
return *m.IsLodTensor
}
return Default_FeedVar_IsLodTensor
}
func (m *FeedVar) GetFeedType() int32 {
if m != nil && m.FeedType != nil {
return *m.FeedType
}
return Default_FeedVar_FeedType
}
func (m *FeedVar) GetShape() []int32 {
if m != nil {
return m.Shape
}
return nil
}
type FetchVar struct {
Name *string `protobuf:"bytes,1,opt,name=name" json:"name,omitempty"`
AliasName *string `protobuf:"bytes,2,opt,name=alias_name,json=aliasName" json:"alias_name,omitempty"`
IsLodTensor *bool `protobuf:"varint,3,opt,name=is_lod_tensor,json=isLodTensor,def=0" json:"is_lod_tensor,omitempty"`
Shape []int32 `protobuf:"varint,4,rep,name=shape" json:"shape,omitempty"`
XXX_NoUnkeyedLiteral struct{} `json:"-"`
XXX_unrecognized []byte `json:"-"`
XXX_sizecache int32 `json:"-"`
}
func (m *FetchVar) Reset() { *m = FetchVar{} }
func (m *FetchVar) String() string { return proto.CompactTextString(m) }
func (*FetchVar) ProtoMessage() {}
func (*FetchVar) Descriptor() ([]byte, []int) {
return fileDescriptor_efa52beffa29d37a, []int{1}
}
func (m *FetchVar) XXX_Unmarshal(b []byte) error {
return xxx_messageInfo_FetchVar.Unmarshal(m, b)
}
func (m *FetchVar) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
return xxx_messageInfo_FetchVar.Marshal(b, m, deterministic)
}
func (m *FetchVar) XXX_Merge(src proto.Message) {
xxx_messageInfo_FetchVar.Merge(m, src)
}
func (m *FetchVar) XXX_Size() int {
return xxx_messageInfo_FetchVar.Size(m)
}
func (m *FetchVar) XXX_DiscardUnknown() {
xxx_messageInfo_FetchVar.DiscardUnknown(m)
}
var xxx_messageInfo_FetchVar proto.InternalMessageInfo
const Default_FetchVar_IsLodTensor bool = false
func (m *FetchVar) GetName() string {
if m != nil && m.Name != nil {
return *m.Name
}
return ""
}
func (m *FetchVar) GetAliasName() string {
if m != nil && m.AliasName != nil {
return *m.AliasName
}
return ""
}
func (m *FetchVar) GetIsLodTensor() bool {
if m != nil && m.IsLodTensor != nil {
return *m.IsLodTensor
}
return Default_FetchVar_IsLodTensor
}
func (m *FetchVar) GetShape() []int32 {
if m != nil {
return m.Shape
}
return nil
}
type GeneralModelConfig struct {
FeedVar []*FeedVar `protobuf:"bytes,1,rep,name=feed_var,json=feedVar" json:"feed_var,omitempty"`
FetchVar []*FetchVar `protobuf:"bytes,2,rep,name=fetch_var,json=fetchVar" json:"fetch_var,omitempty"`
XXX_NoUnkeyedLiteral struct{} `json:"-"`
XXX_unrecognized []byte `json:"-"`
XXX_sizecache int32 `json:"-"`
}
func (m *GeneralModelConfig) Reset() { *m = GeneralModelConfig{} }
func (m *GeneralModelConfig) String() string { return proto.CompactTextString(m) }
func (*GeneralModelConfig) ProtoMessage() {}
func (*GeneralModelConfig) Descriptor() ([]byte, []int) {
return fileDescriptor_efa52beffa29d37a, []int{2}
}
func (m *GeneralModelConfig) XXX_Unmarshal(b []byte) error {
return xxx_messageInfo_GeneralModelConfig.Unmarshal(m, b)
}
func (m *GeneralModelConfig) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
return xxx_messageInfo_GeneralModelConfig.Marshal(b, m, deterministic)
}
func (m *GeneralModelConfig) XXX_Merge(src proto.Message) {
xxx_messageInfo_GeneralModelConfig.Merge(m, src)
}
func (m *GeneralModelConfig) XXX_Size() int {
return xxx_messageInfo_GeneralModelConfig.Size(m)
}
func (m *GeneralModelConfig) XXX_DiscardUnknown() {
xxx_messageInfo_GeneralModelConfig.DiscardUnknown(m)
}
var xxx_messageInfo_GeneralModelConfig proto.InternalMessageInfo
func (m *GeneralModelConfig) GetFeedVar() []*FeedVar {
if m != nil {
return m.FeedVar
}
return nil
}
func (m *GeneralModelConfig) GetFetchVar() []*FetchVar {
if m != nil {
return m.FetchVar
}
return nil
}
func init() {
proto.RegisterType((*FeedVar)(nil), "baidu.paddle_serving.configure.FeedVar")
proto.RegisterType((*FetchVar)(nil), "baidu.paddle_serving.configure.FetchVar")
proto.RegisterType((*GeneralModelConfig)(nil), "baidu.paddle_serving.configure.GeneralModelConfig")
}
func init() { proto.RegisterFile("general_model_config.proto", fileDescriptor_efa52beffa29d37a) }
var fileDescriptor_efa52beffa29d37a = []byte{
// 283 bytes of a gzipped FileDescriptorProto
0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xb4, 0xd0, 0x31, 0x4b, 0xc4, 0x30,
0x14, 0x07, 0x70, 0x72, 0x6d, 0xb9, 0xf6, 0x1d, 0x2e, 0xc1, 0xa1, 0x08, 0x1e, 0xe5, 0x16, 0xe3,
0x52, 0xc4, 0xf1, 0x46, 0xc5, 0x73, 0x51, 0x87, 0x72, 0xb8, 0x86, 0xd8, 0xbc, 0xb6, 0x81, 0x5c,
0x53, 0x92, 0xde, 0xc1, 0x2d, 0x7e, 0x13, 0xf1, 0xab, 0x4a, 0x93, 0x43, 0x9c, 0x74, 0x72, 0x7b,
0x79, 0xff, 0xf0, 0xde, 0xe3, 0x07, 0x17, 0x2d, 0xf6, 0x68, 0x85, 0xe6, 0x3b, 0x23, 0x51, 0xf3,
0xda, 0xf4, 0x8d, 0x6a, 0xcb, 0xc1, 0x9a, 0xd1, 0xd0, 0xe5, 0x9b, 0x50, 0x72, 0x5f, 0x0e, 0x42,
0x4a, 0x8d, 0xdc, 0xa1, 0x3d, 0xa8, 0xbe, 0x2d, 0xc3, 0x97, 0xbd, 0xc5, 0xd5, 0x07, 0x81, 0xf9,
0x06, 0x51, 0xbe, 0x0a, 0x4b, 0x29, 0xc4, 0xbd, 0xd8, 0x61, 0x4e, 0x0a, 0xc2, 0xb2, 0xca, 0xd7,
0xf4, 0x12, 0x40, 0x68, 0x25, 0x1c, 0xf7, 0xc9, 0xcc, 0x27, 0x99, 0xef, 0xbc, 0x4c, 0xf1, 0x35,
0x9c, 0x29, 0xc7, 0xb5, 0x91, 0x7c, 0xc4, 0xde, 0x19, 0x9b, 0x47, 0x05, 0x61, 0xe9, 0x3a, 0x69,
0x84, 0x76, 0x58, 0x2d, 0x94, 0x7b, 0x32, 0x72, 0xeb, 0x13, 0xba, 0x84, 0xac, 0x41, 0x94, 0x7c,
0x3c, 0x0e, 0x98, 0xc7, 0x05, 0x61, 0xc9, 0x9a, 0xdc, 0x54, 0xe9, 0xd4, 0xdb, 0x1e, 0x07, 0xa4,
0xe7, 0x90, 0xb8, 0x4e, 0x0c, 0x98, 0x27, 0x45, 0xc4, 0x92, 0x2a, 0x3c, 0x56, 0xef, 0x90, 0x6e,
0x70, 0xac, 0xbb, 0xff, 0xbf, 0xef, 0x7b, 0x7f, 0xfc, 0x73, 0xff, 0x27, 0x01, 0xfa, 0x18, 0x78,
0x9f, 0x27, 0xdd, 0x7b, 0x2f, 0x47, 0xef, 0xc0, 0x1f, 0xce, 0x0f, 0xc2, 0xe6, 0xa4, 0x88, 0xd8,
0xe2, 0xf6, 0xaa, 0xfc, 0x5d, 0xba, 0x3c, 0x29, 0x57, 0xf3, 0xe6, 0xc4, 0xfd, 0x30, 0x81, 0x8c,
0x75, 0xe7, 0x87, 0xcc, 0xfc, 0x10, 0xf6, 0xf7, 0x90, 0x60, 0x31, 0xb9, 0x85, 0xea, 0x2b, 0x00,
0x00, 0xff, 0xff, 0x08, 0x27, 0x9c, 0x1a, 0xfe, 0x01, 0x00, 0x00,
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package serving_client
import (
"bytes"
"encoding/json"
"io/ioutil"
"log"
"net/http"
pb "github.com/PaddlePaddle/Serving/go/proto"
"github.com/golang/protobuf/proto"
)
type Tensor struct {
Data []byte `json:"data"`
FloatData []float32 `json:"float_data"`
IntData []int `json:"int_data"`
Int64Data []int64 `json:"int64_data"`
ElemType int `json:"elem_type"`
Shape []int `json:"shape"`
}
type FeedInst struct {
TensorArray []Tensor `json:"tensor_array"`
}
type FetchInst struct {
TensorArray []Tensor `json:"tensor_array"`
}
type Request struct {
Insts []FeedInst `json:"insts"`
FetchVarNames []string `json:"fetch_var_names"`
ProfileServer bool `json:"profile_server"`
}
type Response struct {
Insts []FetchInst `json:"insts"`
ProfileTime []int64 `json:"profile_time"`
}
type Handle struct {
Url string
Port string
FeedAliasNameMap map[string]string
FeedShapeMap map[string][]int
FeedNameMap map[string]int
FeedAliasNames []string
FetchNameMap map[string]int
FetchAliasNameMap map[string]string
}
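// LoadModelConfig parses the serialized GeneralModelConfig protobuf at `config`
// and fills the feed/fetch name, alias and shape lookup tables used by Predict.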
func LoadModelConfig(config string) Handle {
in, err := ioutil.ReadFile(config)
if err != nil {
log.Fatalln("Failed to read general model: ", err)
}
general_model_config := &pb.GeneralModelConfig{}
if err := proto.Unmarshal(in, general_model_config); err != nil {
log.Fatalln("Failed to parse GeneralModelConfig: ", err)
}
log.Println("read protobuf succeed")
handle := Handle{}
handle.FeedNameMap = map[string]int{}
handle.FeedAliasNameMap = map[string]string{}
handle.FeedShapeMap = map[string][]int{}
handle.FetchNameMap = map[string]int{}
handle.FetchAliasNameMap = map[string]string{}
handle.FeedAliasNames = []string{}
for i, v := range general_model_config.FeedVar {
handle.FeedNameMap[*v.Name] = i
tmp_array := []int{}
for _, vv := range v.Shape {
tmp_array = append(tmp_array, int(vv))
}
handle.FeedShapeMap[*v.Name] = tmp_array
handle.FeedAliasNameMap[*v.AliasName] = *v.Name
handle.FeedAliasNames = append(handle.FeedAliasNames, *v.AliasName)
}
for i, v := range general_model_config.FetchVar {
handle.FetchNameMap[*v.Name] = i
handle.FetchAliasNameMap[*v.AliasName] = *v.Name
}
return handle
}
func Connect(url string, port string, handle Handle) Handle {
handle.Url = url
handle.Port = port
return handle
}
func Predict(handle Handle, int_feed_map map[string][]int64, fetch []string) map[string][]float32 {
contentType := "application/json;charset=utf-8"
var tensor_array []Tensor
var inst FeedInst
tensor_array = []Tensor{}
inst = FeedInst{}
for i := 0; i < len(handle.FeedAliasNames); i++ {
key_i := handle.FeedAliasNames[i]
var tmp Tensor
tmp.IntData = []int{}
tmp.Shape = []int{}
tmp.Int64Data = int_feed_map[key_i]
tmp.ElemType = 0
tmp.Shape = handle.FeedShapeMap[key_i]
tensor_array = append(tensor_array, tmp)
}
inst.TensorArray = tensor_array
var profile_server bool
profile_server = false
req := &Request{
Insts: []FeedInst{inst},
FetchVarNames: fetch,
ProfileServer: profile_server}
b, err := json.Marshal(req)
if err != nil {
log.Println("Marshal failed:", err)
}
body := bytes.NewBuffer(b)
var post_address bytes.Buffer
post_address.WriteString("http://")
post_address.WriteString(handle.Url)
post_address.WriteString(":")
post_address.WriteString(handle.Port)
post_address.WriteString("/GeneralModelService/inference")
resp, err := http.Post(post_address.String(), contentType, body)
if err != nil {
log.Println("Post failed:", err)
return nil
}
defer resp.Body.Close()
content, err := ioutil.ReadAll(resp.Body)
if err != nil {
log.Println("Read failed:", err)
}
response_json := Response{}
err = json.Unmarshal(content, &response_json)
if err != nil {
log.Println("Unmarshal failed:", err)
}
var result map[string][]float32
result = map[string][]float32{}
for i, v := range fetch {
result[v] = response_json.Insts[0].TensorArray[i].FloatData
}
return result
}
...@@ -11,39 +11,83 @@ import org.nd4j.linalg.factory.Nd4j; ...@@ -11,39 +11,83 @@ import org.nd4j.linalg.factory.Nd4j;
import java.util.*; import java.util.*;
public class PaddleServingClientExample { public class PaddleServingClientExample {
boolean fit_a_line() { boolean fit_a_line(String model_config_path) {
float[] data = {0.0137f, -0.1136f, 0.2553f, -0.0692f, float[] data = {0.0137f, -0.1136f, 0.2553f, -0.0692f,
0.0582f, -0.0727f, -0.1583f, -0.0584f, 0.0582f, -0.0727f, -0.1583f, -0.0584f,
0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f}; 0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f};
INDArray npdata = Nd4j.createFromArray(data); INDArray npdata = Nd4j.createFromArray(data);
long[] batch_shape = {1,13}; long[] batch_shape = {1,13};
INDArray batch_npdata = npdata.reshape(batch_shape); INDArray batch_npdata = npdata.reshape(batch_shape);
HashMap<String, INDArray> feed_data HashMap<String, Object> feed_data
= new HashMap<String, INDArray>() {{ = new HashMap<String, Object>() {{
put("x", batch_npdata); put("x", batch_npdata);
}}; }};
List<String> fetch = Arrays.asList("price"); List<String> fetch = Arrays.asList("price");
Client client = new Client(); HttpClient client = new HttpClient();
String target = "localhost:9393"; client.setIP("0.0.0.0");
boolean succ = client.connect(target); client.setPort("9393");
if (succ != true) { client.loadClientConfig(model_config_path);
System.out.println("connect failed."); String result = client.predict(feed_data, fetch, true, 0);
return false;
} System.out.println(result);
return true;
}
Map<String, INDArray> fetch_map = client.predict(feed_data, fetch); boolean encrypt(String model_config_path,String keyFilePath) {
if (fetch_map == null) { float[] data = {0.0137f, -0.1136f, 0.2553f, -0.0692f,
return false; 0.0582f, -0.0727f, -0.1583f, -0.0584f,
0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f};
INDArray npdata = Nd4j.createFromArray(data);
long[] batch_shape = {1,13};
INDArray batch_npdata = npdata.reshape(batch_shape);
HashMap<String, Object> feed_data
= new HashMap<String, Object>() {{
put("x", batch_npdata);
}};
List<String> fetch = Arrays.asList("price");
HttpClient client = new HttpClient();
client.setIP("0.0.0.0");
client.setPort("9393");
client.loadClientConfig(model_config_path);
client.use_key(keyFilePath);
try {
Thread.sleep(1000*3); // sleep for 3 seconds, waiting for the Server to start
} catch (Exception e) {
//TODO: handle exception
} }
String result = client.predict(feed_data, fetch, true, 0);
System.out.println(result);
return true;
}
for (Map.Entry<String, INDArray> e : fetch_map.entrySet()) { boolean compress(String model_config_path) {
System.out.println("Key = " + e.getKey() + ", Value = " + e.getValue()); float[] data = {0.0137f, -0.1136f, 0.2553f, -0.0692f,
} 0.0582f, -0.0727f, -0.1583f, -0.0584f,
0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f};
INDArray npdata = Nd4j.createFromArray(data);
long[] batch_shape = {500,13};
INDArray batch_npdata = npdata.broadcast(batch_shape);
HashMap<String, Object> feed_data
= new HashMap<String, Object>() {{
put("x", batch_npdata);
}};
List<String> fetch = Arrays.asList("price");
HttpClient client = new HttpClient();
client.setIP("0.0.0.0");
client.setPort("9393");
client.loadClientConfig(model_config_path);
client.set_request_compress(true);
client.set_response_compress(true);
String result = client.predict(feed_data, fetch, true, 0);
System.out.println(result);
return true; return true;
} }
boolean yolov4(String filename) { boolean yolov4(String model_config_path,String filename) {
// https://deeplearning4j.konduit.ai/ // https://deeplearning4j.konduit.ai/
int height = 608; int height = 608;
int width = 608; int width = 608;
...@@ -77,171 +121,44 @@ public class PaddleServingClientExample { ...@@ -77,171 +121,44 @@ public class PaddleServingClientExample {
INDArray im_size = Nd4j.createFromArray(new int[]{height, width}); INDArray im_size = Nd4j.createFromArray(new int[]{height, width});
long[] batch_size_shape = {1,2}; long[] batch_size_shape = {1,2};
INDArray batch_im_size = im_size.reshape(batch_size_shape); INDArray batch_im_size = im_size.reshape(batch_size_shape);
HashMap<String, INDArray> feed_data HashMap<String, Object> feed_data
= new HashMap<String, INDArray>() {{ = new HashMap<String, Object>() {{
put("image", batch_image); put("image", batch_image);
put("im_size", batch_im_size); put("im_size", batch_im_size);
}}; }};
List<String> fetch = Arrays.asList("save_infer_model/scale_0.tmp_0"); List<String> fetch = Arrays.asList("save_infer_model/scale_0.tmp_0");
HttpClient client = new HttpClient();
Client client = new Client(); client.setIP("0.0.0.0");
String target = "localhost:9393"; client.setPort("9393");
boolean succ = client.connect(target); client.loadClientConfig(model_config_path);
if (succ != true) { String result = client.predict(feed_data, fetch, true, 0);
System.out.println("connect failed."); System.out.println(result);
return false;
}
succ = client.setRpcTimeoutMs(20000); // cpu
if (succ != true) {
System.out.println("set timeout failed.");
return false;
}
Map<String, INDArray> fetch_map = client.predict(feed_data, fetch);
if (fetch_map == null) {
return false;
}
for (Map.Entry<String, INDArray> e : fetch_map.entrySet()) {
System.out.println("Key = " + e.getKey() + ", Value = " + e.getValue());
}
return true;
}
boolean batch_predict() {
float[] data = {0.0137f, -0.1136f, 0.2553f, -0.0692f,
0.0582f, -0.0727f, -0.1583f, -0.0584f,
0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f};
INDArray npdata = Nd4j.createFromArray(data);
HashMap<String, INDArray> feed_data
= new HashMap<String, INDArray>() {{
put("x", npdata);
}};
List<HashMap<String, INDArray>> feed_batch
= new ArrayList<HashMap<String, INDArray>>() {{
add(feed_data);
add(feed_data);
}};
List<String> fetch = Arrays.asList("price");
Client client = new Client();
String target = "localhost:9393";
boolean succ = client.connect(target);
if (succ != true) {
System.out.println("connect failed.");
return false;
}
Map<String, INDArray> fetch_map = client.predict(feed_batch, fetch);
if (fetch_map == null) {
return false;
}
for (Map.Entry<String, INDArray> e : fetch_map.entrySet()) {
System.out.println("Key = " + e.getKey() + ", Value = " + e.getValue());
}
return true; return true;
} }
boolean asyn_predict() { boolean bert(String model_config_path) {
float[] data = {0.0137f, -0.1136f, 0.2553f, -0.0692f,
0.0582f, -0.0727f, -0.1583f, -0.0584f,
0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f};
INDArray npdata = Nd4j.createFromArray(data);
HashMap<String, INDArray> feed_data
= new HashMap<String, INDArray>() {{
put("x", npdata);
}};
List<String> fetch = Arrays.asList("price");
Client client = new Client();
String target = "localhost:9393";
boolean succ = client.connect(target);
if (succ != true) {
System.out.println("connect failed.");
return false;
}
PredictFuture future = client.asyn_predict(feed_data, fetch);
Map<String, INDArray> fetch_map = future.get();
if (fetch_map == null) {
System.out.println("Get future reslut failed");
return false;
}
for (Map.Entry<String, INDArray> e : fetch_map.entrySet()) {
System.out.println("Key = " + e.getKey() + ", Value = " + e.getValue());
}
return true;
}
boolean model_ensemble() {
long[] data = {8, 233, 52, 601};
INDArray npdata = Nd4j.createFromArray(data);
HashMap<String, INDArray> feed_data
= new HashMap<String, INDArray>() {{
put("words", npdata);
}};
List<String> fetch = Arrays.asList("prediction");
Client client = new Client();
String target = "localhost:9393";
boolean succ = client.connect(target);
if (succ != true) {
System.out.println("connect failed.");
return false;
}
Map<String, HashMap<String, INDArray>> fetch_map
= client.ensemble_predict(feed_data, fetch);
if (fetch_map == null) {
return false;
}
for (Map.Entry<String, HashMap<String, INDArray>> entry : fetch_map.entrySet()) {
System.out.println("Model = " + entry.getKey());
HashMap<String, INDArray> tt = entry.getValue();
for (Map.Entry<String, INDArray> e : tt.entrySet()) {
System.out.println("Key = " + e.getKey() + ", Value = " + e.getValue());
}
}
return true;
}
boolean bert() {
float[] input_mask = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; float[] input_mask = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
long[] position_ids = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; long[] position_ids = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
long[] input_ids = {101, 6843, 3241, 749, 8024, 7662, 2533, 1391, 2533, 2523, 7676, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; long[] input_ids = {101, 6843, 3241, 749, 8024, 7662, 2533, 1391, 2533, 2523, 7676, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
long[] segment_ids = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; long[] segment_ids = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
HashMap<String, INDArray> feed_data HashMap<String, Object> feed_data
= new HashMap<String, INDArray>() {{ = new HashMap<String, Object>() {{
put("input_mask", Nd4j.createFromArray(input_mask)); put("input_mask", Nd4j.createFromArray(input_mask));
put("position_ids", Nd4j.createFromArray(position_ids)); put("position_ids", Nd4j.createFromArray(position_ids));
put("input_ids", Nd4j.createFromArray(input_ids)); put("input_ids", Nd4j.createFromArray(input_ids));
put("segment_ids", Nd4j.createFromArray(segment_ids)); put("segment_ids", Nd4j.createFromArray(segment_ids));
}}; }};
List<String> fetch = Arrays.asList("pooled_output"); List<String> fetch = Arrays.asList("pooled_output");
HttpClient client = new HttpClient();
Client client = new Client(); client.setIP("0.0.0.0");
String target = "localhost:9393"; client.setPort("9393");
boolean succ = client.connect(target); client.loadClientConfig(model_config_path);
if (succ != true) { String result = client.predict(feed_data, fetch, true, 0);
System.out.println("connect failed."); System.out.println(result);
return false;
}
Map<String, INDArray> fetch_map = client.predict(feed_data, fetch);
if (fetch_map == null) {
return false;
}
for (Map.Entry<String, INDArray> e : fetch_map.entrySet()) {
System.out.println("Key = " + e.getKey() + ", Value = " + e.getValue());
}
return true; return true;
} }
boolean cube_local() { boolean cube_local(String model_config_path) {
long[] embedding_14 = {250644}; long[] embedding_14 = {250644};
long[] embedding_2 = {890346}; long[] embedding_2 = {890346};
long[] embedding_10 = {3939}; long[] embedding_10 = {3939};
...@@ -271,8 +188,8 @@ public class PaddleServingClientExample { ...@@ -271,8 +188,8 @@ public class PaddleServingClientExample {
long[] embedding_19 = {537425}; long[] embedding_19 = {537425};
long[] embedding_0 = {737395}; long[] embedding_0 = {737395};
HashMap<String, INDArray> feed_data HashMap<String, Object> feed_data
= new HashMap<String, INDArray>() {{ = new HashMap<String, Object>() {{
put("embedding_14.tmp_0", Nd4j.createFromArray(embedding_14)); put("embedding_14.tmp_0", Nd4j.createFromArray(embedding_14));
put("embedding_2.tmp_0", Nd4j.createFromArray(embedding_2)); put("embedding_2.tmp_0", Nd4j.createFromArray(embedding_2));
put("embedding_10.tmp_0", Nd4j.createFromArray(embedding_10)); put("embedding_10.tmp_0", Nd4j.createFromArray(embedding_10));
...@@ -302,23 +219,12 @@ public class PaddleServingClientExample { ...@@ -302,23 +219,12 @@ public class PaddleServingClientExample {
put("embedding_0.tmp_0", Nd4j.createFromArray(embedding_0)); put("embedding_0.tmp_0", Nd4j.createFromArray(embedding_0));
}}; }};
List<String> fetch = Arrays.asList("prob"); List<String> fetch = Arrays.asList("prob");
HttpClient client = new HttpClient();
Client client = new Client(); client.setIP("0.0.0.0");
String target = "localhost:9393"; client.setPort("9393");
boolean succ = client.connect(target); client.loadClientConfig(model_config_path);
if (succ != true) { String result = client.predict(feed_data, fetch, true, 0);
System.out.println("connect failed."); System.out.println(result);
return false;
}
Map<String, INDArray> fetch_map = client.predict(feed_data, fetch);
if (fetch_map == null) {
return false;
}
for (Map.Entry<String, INDArray> e : fetch_map.entrySet()) {
System.out.println("Key = " + e.getKey() + ", Value = " + e.getValue());
}
return true; return true;
} }
...@@ -328,33 +234,33 @@ public class PaddleServingClientExample { ...@@ -328,33 +234,33 @@ public class PaddleServingClientExample {
PaddleServingClientExample e = new PaddleServingClientExample(); PaddleServingClientExample e = new PaddleServingClientExample();
boolean succ = false; boolean succ = false;
if (args.length < 1) { if (args.length < 2) {
System.out.println("Usage: java -cp <jar> PaddleServingClientExample <test-type>."); System.out.println("Usage: java -cp <jar> PaddleServingClientExample <test-type> <configPath>.");
System.out.println("<test-type>: fit_a_line bert model_ensemble asyn_predict batch_predict cube_local cube_quant yolov4"); System.out.println("<test-type>: fit_a_line bert cube_local yolov4 encrypt");
return; return;
} }
String testType = args[0]; String testType = args[0];
System.out.format("[Example] %s\n", testType); System.out.format("[Example] %s\n", testType);
if ("fit_a_line".equals(testType)) { if ("fit_a_line".equals(testType)) {
succ = e.fit_a_line(); succ = e.fit_a_line(args[1]);
} else if ("compress".equals(testType)) {
succ = e.compress(args[1]);
} else if ("bert".equals(testType)) { } else if ("bert".equals(testType)) {
succ = e.bert(); succ = e.bert(args[1]);
} else if ("model_ensemble".equals(testType)) {
succ = e.model_ensemble();
} else if ("asyn_predict".equals(testType)) {
succ = e.asyn_predict();
} else if ("batch_predict".equals(testType)) {
succ = e.batch_predict();
} else if ("cube_local".equals(testType)) { } else if ("cube_local".equals(testType)) {
succ = e.cube_local(); succ = e.cube_local(args[1]);
} else if ("cube_quant".equals(testType)) {
succ = e.cube_local();
} else if ("yolov4".equals(testType)) { } else if ("yolov4".equals(testType)) {
if (args.length < 2) { if (args.length < 3) {
System.out.println("Usage: java -cp <jar> PaddleServingClientExample yolov4 <image-filepath>."); System.out.println("Usage: java -cp <jar> PaddleServingClientExample yolov4 <configPath> <image-filepath>.");
return;
}
succ = e.yolov4(args[1],args[2]);
} else if ("encrypt".equals(testType)) {
if (args.length < 3) {
System.out.println("Usage: java -cp <jar> PaddleServingClientExample encrypt <configPath> <keyPath>.");
return; return;
} }
succ = e.yolov4(args[1]); succ = e.encrypt(args[1],args[2]);
} else { } else {
System.out.format("test-type(%s) not match.\n", testType); System.out.format("test-type(%s) not match.\n", testType);
return; return;
......
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
...@@ -145,6 +145,11 @@ ...@@ -145,6 +145,11 @@
<artifactId>json</artifactId> <artifactId>json</artifactId>
<version>20190722</version> <version>20190722</version>
</dependency> </dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.12</version>
</dependency>
<dependency> <dependency>
<groupId>org.slf4j</groupId> <groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId> <artifactId>slf4j-api</artifactId>
......
package io.paddle.serving.client;
import java.util.*;
import java.util.function.Function;
import java.lang.management.ManagementFactory;
import java.lang.management.RuntimeMXBean;
import java.util.stream.Collectors;
import java.util.List;
import java.util.ArrayList;
import io.grpc.ManagedChannel;
import io.grpc.ManagedChannelBuilder;
import io.grpc.StatusRuntimeException;
import com.google.protobuf.ByteString;
import com.google.common.util.concurrent.ListenableFuture;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.api.iter.NdIndexIterator;
import org.nd4j.linalg.factory.Nd4j;
import io.paddle.serving.grpc.*;
import io.paddle.serving.configure.*;
import io.paddle.serving.client.PredictFuture;
class Profiler {
int pid_;
String print_head_ = null;
List<String> time_record_ = null;
boolean enable_ = false;
Profiler() {
RuntimeMXBean runtimeMXBean = ManagementFactory.getRuntimeMXBean();
pid_ = Integer.valueOf(runtimeMXBean.getName().split("@")[0]).intValue();
print_head_ = "\nPROFILE\tpid:" + pid_ + "\t";
time_record_ = new ArrayList<String>();
time_record_.add(print_head_);
}
void record(String name) {
if (enable_) {
long ctime = System.currentTimeMillis() * 1000;
time_record_.add(name + ":" + String.valueOf(ctime) + " ");
}
}
void printProfile() {
if (enable_) {
String profile_str = String.join("", time_record_);
System.out.println(profile_str);
time_record_ = new ArrayList<String>();
time_record_.add(print_head_);
}
}
void enable(boolean flag) {
enable_ = flag;
}
}
public class Client {
private ManagedChannel channel_;
private MultiLangGeneralModelServiceGrpc.MultiLangGeneralModelServiceBlockingStub blockingStub_;
private MultiLangGeneralModelServiceGrpc.MultiLangGeneralModelServiceFutureStub futureStub_;
private double rpcTimeoutS_;
private List<String> feedNames_;
private Map<String, Integer> feedTypes_;
private Map<String, List<Integer>> feedShapes_;
private List<String> fetchNames_;
private Map<String, Integer> fetchTypes_;
private Set<String> lodTensorSet_;
private Map<String, Integer> feedTensorLen_;
private Profiler profiler_;
public Client() {
channel_ = null;
blockingStub_ = null;
futureStub_ = null;
rpcTimeoutS_ = 2;
feedNames_ = null;
feedTypes_ = null;
feedShapes_ = null;
fetchNames_ = null;
fetchTypes_ = null;
lodTensorSet_ = null;
feedTensorLen_ = null;
profiler_ = new Profiler();
boolean is_profile = false;
String FLAGS_profile_client = System.getenv("FLAGS_profile_client");
if (FLAGS_profile_client != null && FLAGS_profile_client.equals("1")) {
is_profile = true;
}
profiler_.enable(is_profile);
}
public boolean setRpcTimeoutMs(int rpc_timeout) {
if (futureStub_ == null || blockingStub_ == null) {
System.out.println("set timeout must be set after connect.");
return false;
}
rpcTimeoutS_ = rpc_timeout / 1000.0;
SetTimeoutRequest timeout_req = SetTimeoutRequest.newBuilder()
.setTimeoutMs(rpc_timeout)
.build();
SimpleResponse resp;
try {
resp = blockingStub_.setTimeout(timeout_req);
} catch (StatusRuntimeException e) {
System.out.format("Set RPC timeout failed: %s\n", e.toString());
return false;
}
return resp.getErrCode() == 0;
}
public boolean connect(String target) {
// TODO: target must be NameResolver-compliant URI
// https://grpc.github.io/grpc-java/javadoc/io/grpc/ManagedChannelBuilder.html
try {
channel_ = ManagedChannelBuilder.forTarget(target)
.defaultLoadBalancingPolicy("round_robin")
.maxInboundMessageSize(Integer.MAX_VALUE)
.usePlaintext()
.build();
blockingStub_ = MultiLangGeneralModelServiceGrpc.newBlockingStub(channel_);
futureStub_ = MultiLangGeneralModelServiceGrpc.newFutureStub(channel_);
} catch (Exception e) {
System.out.format("Connect failed: %s\n", e.toString());
return false;
}
GetClientConfigRequest get_client_config_req = GetClientConfigRequest.newBuilder().build();
GetClientConfigResponse resp;
try {
resp = blockingStub_.getClientConfig(get_client_config_req);
} catch (Exception e) {
System.out.format("Get Client config failed: %s\n", e.toString());
return false;
}
String model_config_str = resp.getClientConfigStr();
_parseModelConfig(model_config_str);
return true;
}
private void _parseModelConfig(String model_config_str) {
GeneralModelConfig.Builder model_conf_builder = GeneralModelConfig.newBuilder();
try {
com.google.protobuf.TextFormat.getParser().merge(model_config_str, model_conf_builder);
} catch (com.google.protobuf.TextFormat.ParseException e) {
System.out.format("Parse client config failed: %s\n", e.toString());
}
GeneralModelConfig model_conf = model_conf_builder.build();
feedNames_ = new ArrayList<String>();
fetchNames_ = new ArrayList<String>();
feedTypes_ = new HashMap<String, Integer>();
feedShapes_ = new HashMap<String, List<Integer>>();
fetchTypes_ = new HashMap<String, Integer>();
lodTensorSet_ = new HashSet<String>();
feedTensorLen_ = new HashMap<String, Integer>();
List<FeedVar> feed_var_list = model_conf.getFeedVarList();
for (FeedVar feed_var : feed_var_list) {
feedNames_.add(feed_var.getAliasName());
}
List<FetchVar> fetch_var_list = model_conf.getFetchVarList();
for (FetchVar fetch_var : fetch_var_list) {
fetchNames_.add(fetch_var.getAliasName());
}
for (int i = 0; i < feed_var_list.size(); ++i) {
FeedVar feed_var = feed_var_list.get(i);
String var_name = feed_var.getAliasName();
feedTypes_.put(var_name, feed_var.getFeedType());
feedShapes_.put(var_name, feed_var.getShapeList());
if (feed_var.getIsLodTensor()) {
lodTensorSet_.add(var_name);
} else {
int counter = 1;
for (int dim : feedShapes_.get(var_name)) {
counter *= dim;
}
feedTensorLen_.put(var_name, counter);
}
}
for (int i = 0; i < fetch_var_list.size(); i++) {
FetchVar fetch_var = fetch_var_list.get(i);
String var_name = fetch_var.getAliasName();
fetchTypes_.put(var_name, fetch_var.getFetchType());
if (fetch_var.getIsLodTensor()) {
lodTensorSet_.add(var_name);
}
}
}
private InferenceRequest _packInferenceRequest(
List<HashMap<String, INDArray>> feed_batch,
Iterable<String> fetch,
long log_id) throws IllegalArgumentException {
List<String> feed_var_names = new ArrayList<String>();
feed_var_names.addAll(feed_batch.get(0).keySet());
InferenceRequest.Builder req_builder = InferenceRequest.newBuilder()
.addAllFeedVarNames(feed_var_names)
.addAllFetchVarNames(fetch)
.setIsPython(false)
.setLogId(log_id);
for (HashMap<String, INDArray> feed_data: feed_batch) {
FeedInst.Builder inst_builder = FeedInst.newBuilder();
for (String name: feed_var_names) {
Tensor.Builder tensor_builder = Tensor.newBuilder();
INDArray variable = feed_data.get(name);
long[] flattened_shape = {-1};
INDArray flattened_list = variable.reshape(flattened_shape);
int v_type = feedTypes_.get(name);
NdIndexIterator iter = new NdIndexIterator(flattened_list.shape());
if (v_type == 0) { // int64
while (iter.hasNext()) {
long[] next_index = iter.next();
long x = flattened_list.getLong(next_index);
tensor_builder.addInt64Data(x);
}
} else if (v_type == 1) { // float32
while (iter.hasNext()) {
long[] next_index = iter.next();
float x = flattened_list.getFloat(next_index);
tensor_builder.addFloatData(x);
}
} else if (v_type == 2) { // int32
while (iter.hasNext()) {
long[] next_index = iter.next();
// the interface of INDArray is strange:
// https://deeplearning4j.org/api/latest/org/nd4j/linalg/api/ndarray/INDArray.html
int[] int_next_index = new int[next_index.length];
for(int i = 0; i < next_index.length; i++) {
int_next_index[i] = (int)next_index[i];
}
int x = flattened_list.getInt(int_next_index);
tensor_builder.addIntData(x);
}
} else {
throw new IllegalArgumentException("error tensor value type.");
}
long[] longArray = variable.shape();
int[] intArray = Arrays.stream(longArray).mapToInt(i -> (int) i).toArray();
List<Integer> indarrayShapeList = Arrays.stream(intArray).boxed().collect(Collectors.toList());
//tensor_builder.addAllShape(feedShapes_.get(name));
tensor_builder.addAllShape(indarrayShapeList);
inst_builder.addTensorArray(tensor_builder.build());
}
req_builder.addInsts(inst_builder.build());
}
return req_builder.build();
}
private Map<String, HashMap<String, INDArray>>
_unpackInferenceResponse(
InferenceResponse resp,
Iterable<String> fetch,
Boolean need_variant_tag) throws IllegalArgumentException {
return Client._staticUnpackInferenceResponse(
resp, fetch, fetchTypes_, lodTensorSet_, need_variant_tag);
}
private static Map<String, HashMap<String, INDArray>>
_staticUnpackInferenceResponse(
InferenceResponse resp,
Iterable<String> fetch,
Map<String, Integer> fetchTypes,
Set<String> lodTensorSet,
Boolean need_variant_tag) throws IllegalArgumentException {
if (resp.getErrCode() != 0) {
return null;
}
String tag = resp.getTag();
HashMap<String, HashMap<String, INDArray>> multi_result_map
= new HashMap<String, HashMap<String, INDArray>>();
for (ModelOutput model_result: resp.getOutputsList()) {
String engine_name = model_result.getEngineName();
FetchInst inst = model_result.getInsts(0);
HashMap<String, INDArray> result_map
= new HashMap<String, INDArray>();
int index = 0;
for (String name: fetch) {
Tensor variable = inst.getTensorArray(index);
int v_type = fetchTypes.get(name);
INDArray data = null;
if (v_type == 0) { // int64
List<Long> list = variable.getInt64DataList();
long[] array = new long[list.size()];
for (int i = 0; i < list.size(); i++) {
array[i] = list.get(i);
}
data = Nd4j.createFromArray(array);
} else if (v_type == 1) { // float32
List<Float> list = variable.getFloatDataList();
float[] array = new float[list.size()];
for (int i = 0; i < list.size(); i++) {
array[i] = list.get(i);
}
data = Nd4j.createFromArray(array);
} else if (v_type == 2) { // int32
List<Integer> list = variable.getIntDataList();
int[] array = new int[list.size()];
for (int i = 0; i < list.size(); i++) {
array[i] = list.get(i);
}
data = Nd4j.createFromArray(array);
} else {
throw new IllegalArgumentException("error tensor value type.");
}
// shape
List<Integer> shape_list = variable.getShapeList();
int[] shape_array = new int[shape_list.size()];
for (int i = 0; i < shape_list.size(); ++i) {
shape_array[i] = shape_list.get(i);
}
data = data.reshape(shape_array);
// put data to result_map
result_map.put(name, data);
// lod
if (lodTensorSet.contains(name)) {
List<Integer> list = variable.getLodList();
int[] array = new int[list.size()];
for (int i = 0; i < list.size(); i++) {
array[i] = list.get(i);
}
result_map.put(name + ".lod", Nd4j.createFromArray(array));
}
index += 1;
}
multi_result_map.put(engine_name, result_map);
}
// TODO: tag(ABtest not support now)
return multi_result_map;
}
public Map<String, INDArray> predict(
HashMap<String, INDArray> feed,
Iterable<String> fetch) {
return predict(feed, fetch, false, 0);
}
public Map<String, INDArray> predict(
HashMap<String, INDArray> feed,
Iterable<String> fetch,
long log_id) {
return predict(feed, fetch, false, log_id);
}
public Map<String, HashMap<String, INDArray>> ensemble_predict(
HashMap<String, INDArray> feed,
Iterable<String> fetch) {
return ensemble_predict(feed, fetch, false, 0);
}
public Map<String, HashMap<String, INDArray>> ensemble_predict(
HashMap<String, INDArray> feed,
Iterable<String> fetch,
long log_id) {
return ensemble_predict(feed, fetch, false, log_id);
}
public PredictFuture asyn_predict(
HashMap<String, INDArray> feed,
Iterable<String> fetch) {
return asyn_predict(feed, fetch, false, 0);
}
public PredictFuture asyn_predict(
HashMap<String, INDArray> feed,
Iterable<String> fetch,
long log_id) {
return asyn_predict(feed, fetch, false, log_id);
}
public Map<String, INDArray> predict(
HashMap<String, INDArray> feed,
Iterable<String> fetch,
Boolean need_variant_tag) {
return predict(feed, fetch, need_variant_tag, 0);
}
public Map<String, INDArray> predict(
HashMap<String, INDArray> feed,
Iterable<String> fetch,
Boolean need_variant_tag,
long log_id) {
List<HashMap<String, INDArray>> feed_batch
= new ArrayList<HashMap<String, INDArray>>();
feed_batch.add(feed);
return predict(feed_batch, fetch, need_variant_tag, log_id);
}
public Map<String, HashMap<String, INDArray>> ensemble_predict(
HashMap<String, INDArray> feed,
Iterable<String> fetch,
Boolean need_variant_tag) {
return ensemble_predict(feed, fetch, need_variant_tag, 0);
}
public Map<String, HashMap<String, INDArray>> ensemble_predict(
HashMap<String, INDArray> feed,
Iterable<String> fetch,
Boolean need_variant_tag,
long log_id) {
List<HashMap<String, INDArray>> feed_batch
= new ArrayList<HashMap<String, INDArray>>();
feed_batch.add(feed);
return ensemble_predict(feed_batch, fetch, need_variant_tag, log_id);
}
public PredictFuture asyn_predict(
HashMap<String, INDArray> feed,
Iterable<String> fetch,
Boolean need_variant_tag) {
return asyn_predict(feed, fetch, need_variant_tag, 0);
}
public PredictFuture asyn_predict(
HashMap<String, INDArray> feed,
Iterable<String> fetch,
Boolean need_variant_tag,
long log_id) {
List<HashMap<String, INDArray>> feed_batch
= new ArrayList<HashMap<String, INDArray>>();
feed_batch.add(feed);
return asyn_predict(feed_batch, fetch, need_variant_tag, log_id);
}
public Map<String, INDArray> predict(
List<HashMap<String, INDArray>> feed_batch,
Iterable<String> fetch) {
return predict(feed_batch, fetch, false, 0);
}
public Map<String, INDArray> predict(
List<HashMap<String, INDArray>> feed_batch,
Iterable<String> fetch,
long log_id) {
return predict(feed_batch, fetch, false, log_id);
}
public Map<String, HashMap<String, INDArray>> ensemble_predict(
List<HashMap<String, INDArray>> feed_batch,
Iterable<String> fetch) {
return ensemble_predict(feed_batch, fetch, false, 0);
}
public Map<String, HashMap<String, INDArray>> ensemble_predict(
List<HashMap<String, INDArray>> feed_batch,
Iterable<String> fetch,
long log_id) {
return ensemble_predict(feed_batch, fetch, false, log_id);
}
public PredictFuture asyn_predict(
List<HashMap<String, INDArray>> feed_batch,
Iterable<String> fetch) {
return asyn_predict(feed_batch, fetch, false, 0);
}
public PredictFuture asyn_predict(
List<HashMap<String, INDArray>> feed_batch,
Iterable<String> fetch,
long log_id) {
return asyn_predict(feed_batch, fetch, false, log_id);
}
public Map<String, INDArray> predict(
List<HashMap<String, INDArray>> feed_batch,
Iterable<String> fetch,
Boolean need_variant_tag) {
return predict(feed_batch, fetch, need_variant_tag, 0);
}
public Map<String, INDArray> predict(
List<HashMap<String, INDArray>> feed_batch,
Iterable<String> fetch,
Boolean need_variant_tag,
long log_id) {
try {
profiler_.record("java_prepro_0");
InferenceRequest req = _packInferenceRequest(
feed_batch, fetch, log_id);
profiler_.record("java_prepro_1");
profiler_.record("java_client_infer_0");
InferenceResponse resp = blockingStub_.inference(req);
profiler_.record("java_client_infer_1");
profiler_.record("java_postpro_0");
Map<String, HashMap<String, INDArray>> ensemble_result
= _unpackInferenceResponse(resp, fetch, need_variant_tag);
List<Map.Entry<String, HashMap<String, INDArray>>> list
= new ArrayList<Map.Entry<String, HashMap<String, INDArray>>>(
ensemble_result.entrySet());
if (list.size() != 1) {
System.out.format("Failed to predict: please use ensemble_predict impl.\n");
return null;
}
profiler_.record("java_postpro_1");
profiler_.printProfile();
return list.get(0).getValue();
} catch (StatusRuntimeException e) {
System.out.format("Failed to predict: %s\n", e.toString());
return null;
}
}
public Map<String, HashMap<String, INDArray>> ensemble_predict(
List<HashMap<String, INDArray>> feed_batch,
Iterable<String> fetch,
Boolean need_variant_tag) {
return ensemble_predict(feed_batch, fetch, need_variant_tag, 0);
}
public Map<String, HashMap<String, INDArray>> ensemble_predict(
List<HashMap<String, INDArray>> feed_batch,
Iterable<String> fetch,
Boolean need_variant_tag,
long log_id) {
try {
profiler_.record("java_prepro_0");
InferenceRequest req = _packInferenceRequest(
feed_batch, fetch, log_id);
profiler_.record("java_prepro_1");
profiler_.record("java_client_infer_0");
InferenceResponse resp = blockingStub_.inference(req);
profiler_.record("java_client_infer_1");
profiler_.record("java_postpro_0");
Map<String, HashMap<String, INDArray>> ensemble_result
= _unpackInferenceResponse(resp, fetch, need_variant_tag);
profiler_.record("java_postpro_1");
profiler_.printProfile();
return ensemble_result;
} catch (StatusRuntimeException e) {
System.out.format("Failed to predict: %s\n", e.toString());
return null;
}
}
public PredictFuture asyn_predict(
List<HashMap<String, INDArray>> feed_batch,
Iterable<String> fetch,
Boolean need_variant_tag) {
return asyn_predict(feed_batch, fetch, need_variant_tag, 0);
}
public PredictFuture asyn_predict(
List<HashMap<String, INDArray>> feed_batch,
Iterable<String> fetch,
Boolean need_variant_tag,
long log_id) {
InferenceRequest req = _packInferenceRequest(
feed_batch, fetch, log_id);
ListenableFuture<InferenceResponse> future = futureStub_.inference(req);
PredictFuture predict_future = new PredictFuture(future,
(InferenceResponse resp) -> {
return Client._staticUnpackInferenceResponse(
resp, fetch, fetchTypes_, lodTensorSet_, need_variant_tag);
}
);
return predict_future;
}
}
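
For orientation, here is a minimal, self-contained sketch of driving the gRPC `Client` above synchronously. The endpoint `localhost:9393`, the feed alias `x` with shape `[1, 13]`, and the fetch alias `price` are assumptions borrowed from the fit_a_line example earlier in this diff, not guarantees of any particular deployment.

```
import java.util.*;

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

import io.paddle.serving.client.Client;

public class GrpcClientSketch {
    public static void main(String[] args) {
        // Connect to a multi-lang gRPC server; the address is an assumption.
        Client client = new Client();
        if (!client.connect("localhost:9393")) {
            System.out.println("connect failed.");
            return;
        }
        // One sample of the fit_a_line model: feed alias "x" with shape [1, 13].
        INDArray x = Nd4j.createFromArray(new float[]{
                0.0137f, -0.1136f, 0.2553f, -0.0692f, 0.0582f, -0.0727f, -0.1583f,
                -0.0584f, 0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f}).reshape(1, 13);
        HashMap<String, INDArray> feed = new HashMap<>();
        feed.put("x", x);
        // Fetch the variable alias "price" and print the returned tensors.
        Map<String, INDArray> fetchMap = client.predict(feed, Arrays.asList("price"));
        if (fetchMap == null) {
            System.out.println("predict failed.");
            return;
        }
        for (Map.Entry<String, INDArray> e : fetchMap.entrySet()) {
            System.out.println("Key = " + e.getKey() + ", Value = " + e.getValue());
        }
    }
}
```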
package io.paddle.serving.client;
import java.util.*;
import java.util.function.Function;
import java.lang.management.ManagementFactory;
import java.lang.management.RuntimeMXBean;
import java.util.stream.Collectors;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Map.Entry;
import java.nio.file.*;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.api.iter.NdIndexIterator;
import org.nd4j.linalg.factory.Nd4j;
import java.lang.reflect.*;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.entity.StringEntity;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.Header;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.apache.http.entity.InputStreamEntity;
import org.json.*;
import io.paddle.serving.configure.*;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
enum ElementType
{
Int64_type, Float32_type, Int32_type, Bytes_type;
}
class Profiler {
int pid_;
String print_head_ = null;
List<String> time_record_ = null;
boolean enable_ = false;
Profiler() {
RuntimeMXBean runtimeMXBean = ManagementFactory.getRuntimeMXBean();
pid_ = Integer.valueOf(runtimeMXBean.getName().split("@")[0]).intValue();
print_head_ = "\nPROFILE\tpid:" + pid_ + "\t";
time_record_ = new ArrayList<String>();
time_record_.add(print_head_);
}
void record(String name) {
if (enable_) {
long ctime = System.currentTimeMillis() * 1000;
time_record_.add(name + ":" + String.valueOf(ctime) + " ");
}
}
void printProfile() {
if (enable_) {
String profile_str = String.join("", time_record_);
System.out.println(profile_str);
time_record_ = new ArrayList<String>();
time_record_.add(print_head_);
}
}
void enable(boolean flag) {
enable_ = flag;
}
}
public class HttpClient {
private int httpTimeoutS_;
private List<String> feedNames_;
private Map<String, String> feedRealNames_;
private Map<String, Integer> feedTypes_;
private Map<String, List<Integer>> feedShapes_;
private Map<String, Integer> feedNameToIndex_;
private Map<Integer, String> feedTypeToDataKey_;
private List<String> fetchNames_;
private Map<String, Integer> fetchTypes_;
private Set<String> lodTensorSet_;
private Map<String, Integer> feedTensorLen_;
private Profiler profiler_;
private String ip;
private String serverPort;
private String port;
private String serviceName;
private boolean request_compress_flag;
private boolean response_compress_flag;
private String GLOG_v;
public HttpClient() {
feedNames_ = null;
feedRealNames_ = null;
feedTypes_ = null;
feedShapes_ = null;
fetchNames_ = null;
fetchTypes_ = null;
lodTensorSet_ = null;
feedTensorLen_ = null;
feedNameToIndex_ = null;
httpTimeoutS_ = 200000;
ip = "0.0.0.0";
port = "9393";
serverPort = "9393";
serviceName = "/GeneralModelService/inference";
request_compress_flag = false;
response_compress_flag = false;
GLOG_v = System.getenv("GLOG_v");
feedTypeToDataKey_ = new HashMap<Integer, String>();
feedTypeToDataKey_.put(0, "int64_data");
feedTypeToDataKey_.put(1, "float_data");
feedTypeToDataKey_.put(2, "int_data");
feedTypeToDataKey_.put(3, "data");
profiler_ = new Profiler();
boolean is_profile = false;
String FLAGS_profile_client = System.getenv("FLAGS_profile_client");
if (FLAGS_profile_client != null && FLAGS_profile_client.equals("1")) {
is_profile = true;
}
profiler_.enable(is_profile);
}
public void setTimeOut(int httpTimeoutS_) {
this.httpTimeoutS_ = httpTimeoutS_;
}
public void setIP(String ip) {
this.ip = ip;
}
public void setPort(String port) {
this.port = port;
this.serverPort = port;
}
public void setServiceName(String serviceName){
this.serviceName = serviceName;
}
public void loadClientConfig(String model_config_path) {
GeneralModelConfig.Builder model_conf_builder = GeneralModelConfig.newBuilder();
try {
byte[] data = Files.readAllBytes(Paths.get(model_config_path));
String model_config_str = new String(data, "utf-8");
com.google.protobuf.TextFormat.getParser().merge(model_config_str, model_conf_builder);
} catch (com.google.protobuf.TextFormat.ParseException e) {
System.out.format("Parse client config failed: %s\n", e.toString());
} catch (Exception e) {
System.out.format("Open client config failed: %s\n", e.toString());
}
GeneralModelConfig model_conf = model_conf_builder.build();
feedNames_ = new ArrayList<String>();
feedRealNames_ = new HashMap<String, String>();
feedTypes_ = new HashMap<String, Integer>();
feedShapes_ = new HashMap<String, List<Integer>>();
lodTensorSet_ = new HashSet<String>();
feedTensorLen_ = new HashMap<String, Integer>();
feedNameToIndex_ = new HashMap<String, Integer>();
fetchNames_ = new ArrayList<String>();
fetchTypes_ = new HashMap<String, Integer>();
List<FeedVar> feed_var_list = model_conf.getFeedVarList();
for (int i = 0; i < feed_var_list.size(); ++i) {
FeedVar feed_var = feed_var_list.get(i);
String var_name = feed_var.getAliasName();
feedNames_.add(var_name);
feedRealNames_.put(var_name, feed_var.getName());
feedTypes_.put(var_name, feed_var.getFeedType());
feedShapes_.put(var_name, feed_var.getShapeList());
feedNameToIndex_.put(var_name, i);
if (feed_var.getIsLodTensor()) {
lodTensorSet_.add(var_name);
} else {
int counter = 1;
for (int dim : feedShapes_.get(var_name)) {
counter *= dim;
}
feedTensorLen_.put(var_name, counter);
}
}
List<FetchVar> fetch_var_list = model_conf.getFetchVarList();
for (int i = 0; i < fetch_var_list.size(); i++) {
FetchVar fetch_var = fetch_var_list.get(i);
String var_name = fetch_var.getAliasName();
fetchNames_.add(var_name);
fetchTypes_.put(var_name, fetch_var.getFetchType());
}
}
public void use_key(String keyFilePath) {
String key_str = null;
String encrypt_url = "http://" + this.ip + ":" +this.port;
try {
byte[] data = Files.readAllBytes(Paths.get(keyFilePath));
key_str = Base64.getEncoder().encodeToString(data);
} catch (Exception e) {
System.out.format("Open key file failed: %s\n", e.toString());
}
JSONObject jsonKey = new JSONObject();
if( key_str != null) {
jsonKey.put("key", key_str);
}else{
jsonKey.put("key", "");
}
String result = doPost(encrypt_url, jsonKey.toString());
try {
JSONObject jsonObject = new JSONObject(result);
JSONArray jsonArray = jsonObject.getJSONArray("endpoint_list");
this.serverPort = jsonArray.getString(0);
System.out.format("Real ServerPort is: %s\n", this.serverPort);
}catch (JSONException err) {
System.out.format("Parse serverPort failed: %s\n", err.toString());
}
}
public void set_request_compress(boolean request_compress_flag) {
// need to be done.
this.request_compress_flag = request_compress_flag;
}
public void set_response_compress(boolean response_compress_flag) {
// need to be done.
this.response_compress_flag = response_compress_flag;
}
public byte[] compress(String str) {
if (str == null || str.length() == 0) {
return null;
}
ByteArrayOutputStream out = new ByteArrayOutputStream();
GZIPOutputStream gzip;
try {
gzip = new GZIPOutputStream(out);
gzip.write(str.getBytes("UTF-8"));
gzip.close();
} catch (Exception e) {
e.printStackTrace();
}
return out.toByteArray();
}
// Helper that wraps the HTTP request for the user: only FeedData, Lod and the fetch list need to be passed in.
// Assembling the JSON body according to the proto is done inside this method, and the interface is largely the same as the Python client.
// Four overloads are provided: at minimum feedData and fetch, optionally also lod and batchFlag.
public String predict(Map<String, Object> feedData,
List<String> fetch,
int log_id) {
return predict(feedData,null,fetch,false,log_id);
}
public String predict(Map<String, Object> feedData,
List<String> fetch,
boolean batchFlag,
int log_id) {
return predict(feedData,null,fetch,batchFlag,log_id);
}
public String predict(Map<String, Object> feedData,
Map<String, Object> feedLod,
List<String> fetch,
int log_id) {
return predict(feedData,feedLod,fetch,false,log_id);
}
public String predict(Map<String, Object> feedData,
Map<String, Object> feedLod,
List<String> fetch,
boolean batchFlag,
int log_id) {
String server_url = "http://" + this.ip + ":" + this.serverPort + this.serviceName;
// handle the fetch list
JSONArray jsonFetchList = new JSONArray();
Iterator<String> fetchIterator = fetch.iterator();
while (fetchIterator.hasNext()) {
jsonFetchList.put(fetchIterator.next());
}
// handle the tensors
JSONArray jsonTensorArray = new JSONArray();
try{
if (null != feedData && feedData.size() > 0) {
// obtain the entries of the feed map via entrySet()
Set<Entry<String, Object>> entrySet = feedData.entrySet();
// iterate over the entries with an iterator
Iterator<Entry<String, Object>> iterator = entrySet.iterator();
while (iterator.hasNext()) {
JSONObject jsonTensor = new JSONObject();
Entry<String, Object> mapEntry = iterator.next();
Object objectValue = mapEntry.getValue();
String feed_alias_name = mapEntry.getKey();
String feed_real_name = feedRealNames_.get(feed_alias_name);
List<Integer> shape = new ArrayList<Integer>(feedShapes_.get(feed_alias_name));
int element_type = feedTypes_.get(feed_alias_name);
jsonTensor.put("alias_name", feed_alias_name);
jsonTensor.put("name", feed_real_name);
jsonTensor.put("elem_type", element_type);
// handle the data and the shape
String protoDataKey = feedTypeToDataKey_.get(element_type);
// If the value is an INDArray, flatten it to 1-D first;
// in that case the shape is taken from the INDArray itself.
if(objectValue instanceof INDArray){
INDArray tempIndArray = (INDArray)objectValue;
long[] indarrayShape = tempIndArray.shape();
shape.clear();
for(long dim:indarrayShape){
shape.add((int)dim);
}
objectValue = tempIndArray.data().asDouble();
}else if(objectValue.getClass().isArray()){
// If the value is a plain array it can be used directly.
// Arrays cannot be nested, so the batch size cannot be derived from the data;
// the batch dimension defaults to 1, or is already part of the feedVar shape.
}else if(objectValue instanceof List){
// If the value is a List it may be nested and has to be flattened.
// If batchFlag is true the value is treated as a nested list,
// and the outermost level is taken as the batch dimension.
if (batchFlag) {
List<?> list = new ArrayList<>();
list = new ArrayList<>((Collection<?>)objectValue);
// insert the batch size at index 0
shape.add(0, list.size());
}
objectValue = recursiveExtract(objectValue);
}else{
// Otherwise the value is treated as a single String, Int, etc.
// The batch size cannot be derived here, so the shape is left untouched.
// Since the proto field is repeated, the value has to be wrapped into a list.
if(objectValue instanceof String){
if(element_type != ElementType.Bytes_type.ordinal()){
throw new Exception("feedvar is not string-type, feed can't be a single string.");
}
}else{
if(element_type == ElementType.Bytes_type.ordinal()){
throw new Exception("feedvar is string-type, feed can't be a single int or others.");
}
}
List<Object> list = new ArrayList<>();
list.add(objectValue);
objectValue = list;
}
jsonTensor.put(protoDataKey,objectValue);
if(!batchFlag){
// insert batch=1 at index 0
shape.add(0, 1);
}
jsonTensor.put("shape", shape);
// handle the lod info; INDArray, Array and Iterable are supported
Object feedLodValue = null;
if(feedLod != null){
feedLodValue = feedLod.get(feed_alias_name);
if(feedLodValue != null) {
if(feedLodValue instanceof INDArray){
INDArray tempIndArray = (INDArray)feedLodValue;
feedLodValue = tempIndArray.data().asInt();
}else if(feedLodValue.getClass().isArray()){
// If it is a plain array it can be used directly.
}else if(feedLodValue instanceof Iterable){
// If it is a List it may be nested and has to be flattened.
feedLodValue = recursiveExtract(feedLodValue);
}else{
throw new Exception("Lod must be INDArray or Array or Iterable.");
}
jsonTensor.put("lod", feedLodValue);
}
}
jsonTensorArray.put(jsonTensor);
}
}
}catch (Exception e) {
e.printStackTrace();
}
JSONObject jsonRequest = new JSONObject();
jsonRequest.put("log_id",log_id);
jsonRequest.put("fetch_var_names", jsonFetchList);
jsonRequest.put("tensor",jsonTensorArray);
if(GLOG_v != null){
System.out.format("------- Final jsonRequest: %s\n", jsonRequest.toString());
}
return doPost(server_url, jsonRequest.toString());
}
public String doPost(String url, String strPostData) {
CloseableHttpClient httpClient = null;
CloseableHttpResponse httpResponse = null;
String result = "";
// create the httpClient instance
httpClient = HttpClients.createDefault();
// create the httpPost instance for the remote connection
HttpPost httpPost = new HttpPost(url);
// build the request configuration
RequestConfig requestConfig = RequestConfig.custom().setConnectTimeout(httpTimeoutS_)// timeout for connecting to the host
.setConnectionRequestTimeout(httpTimeoutS_)// timeout for requesting a connection
.setSocketTimeout(httpTimeoutS_)// socket read timeout
.build();
// apply the configuration to the httpPost instance
httpPost.setConfig(requestConfig);
httpPost.setHeader("Content-Type", "application/json;charset=utf-8");
// set the request headers
if(response_compress_flag){
httpPost.addHeader("Accept-encoding", "gzip");
if(GLOG_v != null){
System.out.format("------- Accept-encoding gzip: \n");
}
}
try {
if(request_compress_flag && strPostData.length()>1024){
try{
byte[] gzipEncrypt = compress(strPostData);
httpPost.setEntity(new InputStreamEntity(new ByteArrayInputStream(gzipEncrypt), gzipEncrypt.length));
httpPost.addHeader("Content-Encoding", "gzip");
} catch (Exception e) {
e.printStackTrace();
}
}else{
httpPost.setEntity(new StringEntity(strPostData, "UTF-8"));
}
// execute the POST request and obtain the response object
httpResponse = httpClient.execute(httpPost);
// read the response content from the response object
HttpEntity entity = httpResponse.getEntity();
Header header = entity.getContentEncoding();
if(GLOG_v != null){
System.out.format("------- response header: %s\n", header);
}
if(header != null && header.getValue().equalsIgnoreCase("gzip")){ // check whether the response body is gzip-compressed
GzipDecompressingEntity gzipEntity = new GzipDecompressingEntity(entity);
result = EntityUtils.toString(gzipEntity);
if(GLOG_v != null){
System.out.format("------- degzip response: %s\n", result);
}
}else{
result = EntityUtils.toString(entity);
}
} catch (ClientProtocolException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
// release the resources
if (null != httpResponse) {
try {
httpResponse.close();
} catch (IOException e) {
e.printStackTrace();
}
}
if (null != httpClient) {
try {
httpClient.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
return result;
}
public List<Object> recursiveExtract(Object stuff) {
List<Object> mylist = new ArrayList<Object>();
if(stuff instanceof Iterable) {
for(Object o : (Iterable< ? >)stuff) {
mylist.addAll(recursiveExtract(o));
}
} else if(stuff instanceof Map) {
for(Object o : ((Map<?, ? extends Object>) stuff).values()) {
mylist.addAll(recursiveExtract(o));
}
} else {
mylist.add(stuff);
}
return mylist;
}
}
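
As a complement to the example class earlier in this diff, here is a minimal sketch of calling `predict` on the `HttpClient` above with explicit LoD information. The config path, the feed alias `words`, and the fetch alias `prediction` are placeholders chosen for illustration and must be replaced with the values of the actual model.

```
import java.util.*;

import io.paddle.serving.client.HttpClient;

public class HttpLodSketch {
    public static void main(String[] args) {
        // Config path, feed alias "words" and fetch alias "prediction" are
        // placeholders; replace them with the values of your own model.
        HttpClient client = new HttpClient();
        client.setIP("0.0.0.0");
        client.setPort("9393");
        client.loadClientConfig("serving_client_conf/serving_client_conf.prototxt");

        HashMap<String, Object> feedData = new HashMap<String, Object>();
        feedData.put("words", new long[]{8, 233, 52, 601});
        // lod = [0, 4] marks a single sequence covering the four ids above.
        HashMap<String, Object> feedLod = new HashMap<String, Object>();
        feedLod.put("words", new int[]{0, 4});
        List<String> fetch = Arrays.asList("prediction");

        // batchFlag=true: the flattened array already carries the batch dimension.
        String result = client.predict(feedData, feedLod, fetch, true, 0);
        System.out.println(result);
    }
}
```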
package io.paddle.serving.client;
import java.util.*;
import java.util.function.Function;
import io.grpc.StatusRuntimeException;
import com.google.common.util.concurrent.ListenableFuture;
import org.nd4j.linalg.api.ndarray.INDArray;
import io.paddle.serving.client.Client;
import io.paddle.serving.grpc.*;
public class PredictFuture {
private ListenableFuture<InferenceResponse> callFuture_;
private Function<InferenceResponse,
Map<String, HashMap<String, INDArray>>> callBackFunc_;
PredictFuture(ListenableFuture<InferenceResponse> call_future,
Function<InferenceResponse,
Map<String, HashMap<String, INDArray>>> call_back_func) {
callFuture_ = call_future;
callBackFunc_ = call_back_func;
}
public Map<String, INDArray> get() {
InferenceResponse resp = null;
try {
resp = callFuture_.get();
} catch (Exception e) {
System.out.format("predict failed: %s\n", e.toString());
return null;
}
Map<String, HashMap<String, INDArray>> ensemble_result
= callBackFunc_.apply(resp);
List<Map.Entry<String, HashMap<String, INDArray>>> list
= new ArrayList<Map.Entry<String, HashMap<String, INDArray>>>(
ensemble_result.entrySet());
if (list.size() != 1) {
System.out.format("predict failed: please use get_ensemble impl.\n");
return null;
}
return list.get(0).getValue();
}
public Map<String, HashMap<String, INDArray>> ensemble_get() {
InferenceResponse resp = null;
try {
resp = callFuture_.get();
} catch (Exception e) {
System.out.format("predict failed: %s\n", e.toString());
return null;
}
return callBackFunc_.apply(resp);
}
}
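
A short sketch of how `PredictFuture` is meant to be consumed together with `Client.asyn_predict`: the call returns immediately and `get()` blocks until the response arrives. The endpoint and the `x`/`price` aliases are again assumptions taken from the fit_a_line example.

```
import java.util.*;

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

import io.paddle.serving.client.Client;
import io.paddle.serving.client.PredictFuture;

public class AsyncPredictSketch {
    public static void main(String[] args) {
        Client client = new Client();
        if (!client.connect("localhost:9393")) {   // hypothetical endpoint
            System.out.println("connect failed.");
            return;
        }
        HashMap<String, INDArray> feed = new HashMap<>();
        feed.put("x", Nd4j.createFromArray(new float[]{
                0.0137f, -0.1136f, 0.2553f, -0.0692f, 0.0582f, -0.0727f, -0.1583f,
                -0.0584f, 0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f}).reshape(1, 13));
        // asyn_predict returns immediately; get() blocks until the RPC finishes
        // and unpacks the single-model result, like the synchronous path.
        PredictFuture future = client.asyn_predict(feed, Arrays.asList("price"));
        Map<String, INDArray> fetchMap = future.get();
        if (fetchMap == null) {
            System.out.println("Get future result failed");
            return;
        }
        for (Map.Entry<String, INDArray> e : fetchMap.entrySet()) {
            System.out.println("Key = " + e.getKey() + ", Value = " + e.getValue());
        }
    }
}
```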
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package baidu.paddle_serving.multi_lang;
option java_multiple_files = true;
option java_package = "io.paddle.serving.grpc";
option java_outer_classname = "ServingProto";
message Tensor {
optional bytes data = 1;
repeated int32 int_data = 2;
repeated int64 int64_data = 3;
repeated float float_data = 4;
optional int32 elem_type = 5;
repeated int32 shape = 6;
repeated int32 lod = 7; // only for fetch tensor currently
};
message FeedInst { repeated Tensor tensor_array = 1; };
message FetchInst { repeated Tensor tensor_array = 1; };
message InferenceRequest {
repeated FeedInst insts = 1;
repeated string feed_var_names = 2;
repeated string fetch_var_names = 3;
required bool is_python = 4 [ default = false ];
required uint64 log_id = 5 [ default = 0 ];
};
message InferenceResponse {
repeated ModelOutput outputs = 1;
optional string tag = 2;
required int32 err_code = 3;
};
message ModelOutput {
repeated FetchInst insts = 1;
optional string engine_name = 2;
}
message SetTimeoutRequest { required int32 timeout_ms = 1; }
message SimpleResponse { required int32 err_code = 1; }
message GetClientConfigRequest {}
message GetClientConfigResponse { required string client_config_str = 1; }
service MultiLangGeneralModelService {
rpc Inference(InferenceRequest) returns (InferenceResponse) {}
rpc SetTimeout(SetTimeoutRequest) returns (SimpleResponse) {}
rpc GetClientConfig(GetClientConfigRequest)
returns (GetClientConfigResponse) {}
};
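
For readers who want to see how the messages above map onto the generated Java bindings in `io.paddle.serving.grpc`, here is a minimal sketch that builds an `InferenceRequest` by hand, mirroring what `_packInferenceRequest` does in `Client.java`. The feed and fetch aliases are hypothetical.

```
import io.paddle.serving.grpc.*;

public class BuildRequestSketch {
    public static void main(String[] args) {
        // One float32 tensor (elem_type 1 in the convention used by Client.java)
        // with shape [1, 2].
        Tensor tensor = Tensor.newBuilder()
                .addFloatData(0.5f)
                .addFloatData(1.5f)
                .setElemType(1)
                .addShape(1)
                .addShape(2)
                .build();
        FeedInst inst = FeedInst.newBuilder().addTensorArray(tensor).build();
        InferenceRequest req = InferenceRequest.newBuilder()
                .addInsts(inst)
                .addFeedVarNames("x")        // hypothetical feed alias
                .addFetchVarNames("price")   // hypothetical fetch alias
                .setIsPython(false)
                .setLogId(0)
                .build();
        System.out.println(req);
    }
}
```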
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include <map> #include <map>
#include <memory> #include <memory>
#include <string> #include <string>
#include <utility>
#include <vector> #include <vector>
#include "core/configure/include/configure_parser.h" #include "core/configure/include/configure_parser.h"
#include "core/configure/inferencer_configure.pb.h" #include "core/configure/inferencer_configure.pb.h"
...@@ -96,7 +97,7 @@ class EngineCore { ...@@ -96,7 +97,7 @@ class EngineCore {
return true; return true;
} }
virtual int create(const configure::EngineDesc& conf) = 0; virtual int create(const configure::EngineDesc& conf, int gpu_id) = 0;
virtual int clone(void* predictor) { virtual int clone(void* predictor) {
if (predictor == NULL) { if (predictor == NULL) {
...@@ -121,7 +122,7 @@ class EngineCore { ...@@ -121,7 +122,7 @@ class EngineCore {
// Paddle Inference Engine // Paddle Inference Engine
class PaddleInferenceEngine : public EngineCore { class PaddleInferenceEngine : public EngineCore {
public: public:
int create(const configure::EngineDesc& engine_conf) { int create(const configure::EngineDesc& engine_conf, int gpu_id) {
std::string model_path = engine_conf.model_dir(); std::string model_path = engine_conf.model_dir();
if (access(model_path.c_str(), F_OK) == -1) { if (access(model_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: " LOG(ERROR) << "create paddle predictor failed, path not exits: "
...@@ -162,7 +163,11 @@ class PaddleInferenceEngine : public EngineCore { ...@@ -162,7 +163,11 @@ class PaddleInferenceEngine : public EngineCore {
config.SetCpuMathLibraryNumThreads(1); config.SetCpuMathLibraryNumThreads(1);
if (engine_conf.has_use_gpu() && engine_conf.use_gpu()) { if (engine_conf.has_use_gpu() && engine_conf.use_gpu()) {
// 2000MB GPU memory // 2000MB GPU memory
config.EnableUseGpu(2000, FLAGS_gpuid); config.EnableUseGpu(50, gpu_id);
if (engine_conf.has_gpu_multi_stream() &&
engine_conf.gpu_multi_stream()) {
config.EnableGpuMultiStream();
}
} }
precision_type = GetPrecision(FLAGS_precision); precision_type = GetPrecision(FLAGS_precision);
...@@ -174,8 +179,13 @@ class PaddleInferenceEngine : public EngineCore { ...@@ -174,8 +179,13 @@ class PaddleInferenceEngine : public EngineCore {
} }
if (engine_conf.has_use_trt() && engine_conf.use_trt()) { if (engine_conf.has_use_trt() && engine_conf.use_trt()) {
config.SwitchIrOptim(true);
if (!engine_conf.has_use_gpu() || !engine_conf.use_gpu()) { if (!engine_conf.has_use_gpu() || !engine_conf.use_gpu()) {
config.EnableUseGpu(2000, FLAGS_gpuid); config.EnableUseGpu(50, gpu_id);
if (engine_conf.has_gpu_multi_stream() &&
engine_conf.gpu_multi_stream()) {
config.EnableGpuMultiStream();
}
} }
config.EnableTensorRtEngine(1 << 20, config.EnableTensorRtEngine(1 << 20,
max_batch, max_batch,
...@@ -203,7 +213,7 @@ class PaddleInferenceEngine : public EngineCore { ...@@ -203,7 +213,7 @@ class PaddleInferenceEngine : public EngineCore {
if (precision_type == PrecisionType::kInt8) { if (precision_type == PrecisionType::kInt8) {
config.EnableMkldnnQuantizer(); config.EnableMkldnnQuantizer();
auto quantizer_config = config.mkldnn_quantizer_config(); auto quantizer_config = config.mkldnn_quantizer_config();
// TODO: warmup data // TODO(somebody): warmup data
// quantizer_config -> SetWarmupData(); // quantizer_config -> SetWarmupData();
// quantizer_config -> SetWarmupBatchSize(); // quantizer_config -> SetWarmupBatchSize();
// quantizer_config -> SetEnabledOpTypes(4); // quantizer_config -> SetEnabledOpTypes(4);
......
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
...@@ -42,6 +42,7 @@ class BertService(WebService): ...@@ -42,6 +42,7 @@ class BertService(WebService):
bert_service = BertService(name="bert") bert_service = BertService(name="bert")
bert_service.load() bert_service.load()
bert_service.load_model_config(sys.argv[1]) bert_service.load_model_config(sys.argv[1])
bert_service.set_gpus("0")
bert_service.prepare_server( bert_service.prepare_server(
workdir="workdir", port=int(sys.argv[2]), device="gpu") workdir="workdir", port=int(sys.argv[2]), device="gpu")
bert_service.run_rpc_service() bert_service.run_rpc_service()
......
...@@ -13,7 +13,8 @@ tar xf faster_rcnn_hrnetv2p_w18_1x.tar ...@@ -13,7 +13,8 @@ tar xf faster_rcnn_hrnetv2p_w18_1x.tar
python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
This model supports TensorRT. If you want faster inference, please use `--use_trt`, but you need to do some extra work.
Please refer to https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40
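As a rough idea of that extra work: the linked demo registers the minimum/maximum/optimal input shapes for the TensorRT subgraph. A hypothetical sketch of the equivalent Paddle Inference Python calls is shown below; the input name `image`, the shapes and the model paths are purely illustrative, and in Serving this configuration happens inside the engine rather than in user code:

```python
# Hypothetical sketch: TensorRT with dynamic input shapes (names/shapes illustrative).
from paddle.inference import Config

config = Config("serving_server/__model__", "serving_server/__params__")
config.enable_use_gpu(100, 0)
config.enable_tensorrt_engine(max_batch_size=1, min_subgraph_size=3)
config.set_trt_dynamic_shape_info(
    {"image": [1, 3, 224, 224]},    # min shape
    {"image": [1, 3, 1333, 1333]},  # max shape
    {"image": [1, 3, 640, 640]})    # optimal shape
```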
### Prediction ### Prediction
......
...@@ -13,7 +13,8 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ...@@ -13,7 +13,8 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
tar xf faster_rcnn_hrnetv2p_w18_1x.tar tar xf faster_rcnn_hrnetv2p_w18_1x.tar
python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。 该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项,但此时需要额外设置子图的TRT变长最大最小最优shape.
请参考https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40
### 执行预测 ### 执行预测
``` ```
......
...@@ -13,7 +13,8 @@ tar xf faster_rcnn_r50_fpn_1x_coco.tar ...@@ -13,7 +13,8 @@ tar xf faster_rcnn_r50_fpn_1x_coco.tar
python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
This model supports TensorRT. If you want faster inference, please use `--use_trt`, but you need to do some extra work.
Please refer to https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40
### Perform prediction ### Perform prediction
......
...@@ -13,7 +13,8 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ...@@ -13,7 +13,8 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
tar xf faster_rcnn_r50_fpn_1x_coco.tar tar xf faster_rcnn_r50_fpn_1x_coco.tar
python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。 该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项,但此时需要额外设置子图的TRT变长最大最小最优shape.
请参考https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40
### 执行预测 ### 执行预测
``` ```
......
...@@ -35,11 +35,11 @@ client-side configuration file are stored in the `encrypt_client` directory. ...@@ -35,11 +35,11 @@ client-side configuration file are stored in the `encrypt_client` directory.
## Start Encryption Service ## Start Encryption Service
CPU Service CPU Service
``` ```
python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_encryption_model python -m paddle_serving_server.serve --model encrypt_server/ --port 9393 --use_encryption_model
``` ```
GPU Service GPU Service
``` ```
python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_encryption_model --gpu_ids 0 python -m paddle_serving_server.serve --model encrypt_server/ --port 9393 --use_encryption_model --gpu_ids 0
``` ```
## Prediction ## Prediction
......
...@@ -36,14 +36,14 @@ def serving_encryption(): ...@@ -36,14 +36,14 @@ def serving_encryption():
## 启动加密预测服务 ## 启动加密预测服务
CPU预测服务 CPU预测服务
``` ```
python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_encryption_model python -m paddle_serving_server.serve --model encrypt_server/ --port 9393 --use_encryption_model
``` ```
GPU预测服务 GPU预测服务
``` ```
python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_encryption_model --gpu_ids 0 python -m paddle_serving_server.serve --model encrypt_server/ --port 9393 --use_encryption_model --gpu_ids 0
``` ```
## 预测 ## 预测
``` ```
python test_client.py encrypt_client/ python test_client.py encrypt_client/serving_client_conf.prototxt
``` ```
...@@ -19,7 +19,8 @@ import sys ...@@ -19,7 +19,8 @@ import sys
client = Client() client = Client()
client.load_client_config(sys.argv[1]) client.load_client_config(sys.argv[1])
client.use_key("./key") client.use_key("./key")
client.connect(["127.0.0.1:9300"], encryption=True) client.connect(["0.0.0.0:9393"], encryption=True)
fetch_list = client.get_fetch_names()
import paddle import paddle
test_reader = paddle.batch( test_reader = paddle.batch(
...@@ -28,5 +29,5 @@ test_reader = paddle.batch( ...@@ -28,5 +29,5 @@ test_reader = paddle.batch(
batch_size=1) batch_size=1)
for data in test_reader(): for data in test_reader():
fetch_map = client.predict(feed={"x": data[0][0]}, fetch=["price"]) fetch_map = client.predict(feed={"x": data[0][0]}, fetch=fetch_list)
print("{} {}".format(fetch_map["price"][0], data[0][1][0])) print(fetch_map)
...@@ -18,30 +18,21 @@ sh get_data.sh ...@@ -18,30 +18,21 @@ sh get_data.sh
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393
``` ```
### Client prediction ## Client prediction
### RPC Client
The `paddlepaddle` package is used in `test_client.py`, and you may need to download the corresponding package(`pip install paddlepaddle`). The `paddlepaddle` package is used in `test_client.py`, and you may need to download the corresponding package(`pip install paddlepaddle`).
``` shell ``` shell
python test_client.py uci_housing_client/serving_client_conf.prototxt python test_client.py uci_housing_client/serving_client_conf.prototxt
``` ```
### Http Client
## HTTP service
### Start server
Start a web service with default web service hosting modules:
``` shell ``` shell
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --name uci python test_httpclient.py uci_housing_client/serving_client_conf.prototxt
``` ```
### Client prediction
``` shell
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' http://127.0.0.1:9393/uci/prediction
```
## Benchmark ## Benchmark
``` shell ``` shell
......
...@@ -10,15 +10,16 @@ sh get_data.sh ...@@ -10,15 +10,16 @@ sh get_data.sh
## RPC服务
### 开启服务端 ## 开启服务端
```shell ```shell
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393
``` ```
### 客户端预测 ## 客户端预测
### 客户端RPC
`test_client.py`中使用了`paddlepaddle`包,需要进行下载(`pip install paddlepaddle`)。 `test_client.py`中使用了`paddlepaddle`包,需要进行下载(`pip install paddlepaddle`)。
...@@ -26,23 +27,12 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po ...@@ -26,23 +27,12 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
python test_client.py uci_housing_client/serving_client_conf.prototxt python test_client.py uci_housing_client/serving_client_conf.prototxt
``` ```
### 客户端Http预测
## HTTP服务
### 开启服务端
通过下面的一行代码开启默认web服务:
``` shell ``` shell
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --name uci python test_httpclient.py uci_housing_client/serving_client_conf.prototxt
``` ```
### 客户端预测
``` shell
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' http://127.0.0.1:9393/uci/prediction
```
## 性能测试 ## 性能测试
``` shell ``` shell
......
File mode changed from 100755 to 100644
...@@ -20,7 +20,7 @@ import numpy as np ...@@ -20,7 +20,7 @@ import numpy as np
client = Client() client = Client()
client.load_client_config(sys.argv[1]) client.load_client_config(sys.argv[1])
client.connect(["127.0.0.1:9393"]) client.connect(["127.0.0.1:9393"])
fetch_list = client.get_fetch_names()
import paddle import paddle
test_reader = paddle.batch( test_reader = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
...@@ -31,6 +31,5 @@ for data in test_reader(): ...@@ -31,6 +31,5 @@ for data in test_reader():
new_data = np.zeros((1, 13)).astype("float32") new_data = np.zeros((1, 13)).astype("float32")
new_data[0] = data[0][0] new_data[0] = data[0][0]
fetch_map = client.predict( fetch_map = client.predict(
feed={"x": new_data}, fetch=["price"], batch=True) feed={"x": new_data}, fetch=fetch_list, batch=True)
print("{} {}".format(fetch_map["price"][0], data[0][1][0]))
print(fetch_map) print(fetch_map)
...@@ -13,29 +13,31 @@ ...@@ -13,29 +13,31 @@
# limitations under the License. # limitations under the License.
# pylint: disable=doc-string-missing # pylint: disable=doc-string-missing
from paddle_serving_client import MultiLangClient as Client from paddle_serving_client.httpclient import HttpClient
import sys
import numpy as np import numpy as np
client = Client() import time
client.connect(["127.0.0.1:9393"])
""" client = HttpClient()
client.load_client_config(sys.argv[1])
# if you want to enable the Encrypt Module, uncomment the following line
# client.use_key("./key")
client.set_response_compress(True)
client.set_request_compress(True)
fetch_list = client.get_fetch_names()
import paddle
test_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.uci_housing.test(), buf_size=500),
batch_size=1)
for data in test_reader(): for data in test_reader():
new_data = np.zeros((1, 1, 13)).astype("float32") new_data = np.zeros((1, 13)).astype("float32")
new_data[0] = data[0][0] new_data[0] = data[0][0]
lst_data = []
for i in range(200):
lst_data.append(data[0][0])
fetch_map = client.predict( fetch_map = client.predict(
feed={"x": new_data}, fetch=["price"], batch=True) feed={"x": lst_data}, fetch=fetch_list, batch=True)
print("{} {}".format(fetch_map["price"][0], data[0][1][0]))
print(fetch_map) print(fetch_map)
""" break
x = [
0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283,
0.4919, 0.1856, 0.0795, -0.0332
]
for i in range(3):
new_data = np.array(x).astype("float32").reshape((1, 13))
fetch_map = client.predict(
feed={"x": new_data}, fetch=["price"], batch=False)
if fetch_map["serving_status_code"] == 0:
print(fetch_map)
else:
print(fetch_map["serving_status_code"])
# Linear Regression Prediction Service Example
## Get data
```shell
sh get_data.sh
```
## Start the gRPC server
``` shell
python test_server.py uci_housing_model/
```
You can also start the default gRPC service with the following single command:
```shell
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_multilang
```
## Client prediction
### Synchronous prediction
``` shell
python test_sync_client.py
```
### Asynchronous prediction
``` shell
python test_asyn_client.py
```
### Batch prediction
``` shell
python test_batch_client.py
```
### Prediction timeout
``` shell
python test_timeout_client.py
```
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz
tar -xzf uci_housing.tar.gz
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_client import MultiLangClient as Client
import functools
import time
import threading
import grpc
import numpy as np
client = Client()
client.connect(["127.0.0.1:9393"])
complete_task_count = [0]
lock = threading.Lock()
def call_back(call_future):
try:
fetch_map = call_future.result()
print(fetch_map)
except grpc.RpcError as e:
print(e.code())
finally:
with lock:
complete_task_count[0] += 1
x = [
0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283,
0.4919, 0.1856, 0.0795, -0.0332
]
task_count = 0
for i in range(3):
new_data = np.array(x).astype("float32").reshape((1, 13))
future = client.predict(
feed={"x": new_data}, fetch=["price"], batch=False, asyn=True)
task_count += 1
future.add_done_callback(functools.partial(call_back))
while complete_task_count[0] != task_count:
time.sleep(0.1)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_client import MultiLangClient as Client
import numpy as np
client = Client()
client.connect(["127.0.0.1:9393"])
batch_size = 2
x = [
0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283,
0.4919, 0.1856, 0.0795, -0.0332
]
for i in range(3):
new_data = np.array(x).astype("float32").reshape((1, 1, 13))
batch_data = np.concatenate([new_data, new_data, new_data], axis=0)
print(batch_data.shape)
fetch_map = client.predict(
feed={"x": batch_data}, fetch=["price"], batch=True)
if fetch_map["serving_status_code"] == 0:
print(fetch_map)
else:
print(fetch_map["serving_status_code"])
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import os
import sys
from paddle_serving_server import OpMaker
from paddle_serving_server import OpSeqMaker
from paddle_serving_server import MultiLangServer as Server
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
general_infer_op = op_maker.create('general_infer')
response_op = op_maker.create('general_response')
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(general_infer_op)
op_seq_maker.add_op(response_op)
server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.load_model_config(sys.argv[1])
server.prepare_server(workdir="work_dir1", port=9393, device="cpu")
server.run_server()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import os
import sys
from paddle_serving_server import OpMaker
from paddle_serving_server import OpSeqMaker
from paddle_serving_server import MultiLangServer as Server
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
general_infer_op = op_maker.create('general_infer')
response_op = op_maker.create('general_response')
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(general_infer_op)
op_seq_maker.add_op(response_op)
server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.load_model_config(sys.argv[1])
server.set_gpuid(0)
server.prepare_server(workdir="work_dir1", port=9393, device="cpu")
server.run_server()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_client import MultiLangClient as Client
import grpc
import numpy as np
client = Client()
client.connect(["127.0.0.1:9393"])
client.set_rpc_timeout_ms(40)
x = [
0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283,
0.4919, 0.1856, 0.0795, -0.0332
]
for i in range(3):
new_data = np.array(x).astype("float32").reshape((1, 13))
fetch_map = client.predict(
feed={"x": new_data}, fetch=["price"], batch=False)
if fetch_map["serving_status_code"] == 0:
print(fetch_map)
elif fetch_map["serving_status_code"] == grpc.StatusCode.DEADLINE_EXCEEDED:
print('timeout')
else:
print(fetch_map["serving_status_code"])
## IMDB comment sentiment inference service
([简体中文](./README_CN.md)|English)
### Get model files and sample data
```
sh get_data.sh
```
The downloaded package contains the cnn, lstm and bow model configs along with their test_data and train_data.
### Start RPC inference service
```
python -m paddle_serving_server.serve --model imdb_cnn_model/ --thread 10 --port 9393 --use_multilang
```
### RPC Infer
The `paddlepaddle` package is used in `test_client.py`, and you may need to install it first (`pip install paddlepaddle`).
```
head test_data/part-0 | python test_client.py
```
It will print the prediction results for the first 10 test cases.
## IMDB评论情绪预测服务
(简体中文|[English](./README.md))
### 获取模型文件和样例数据
```
sh get_data.sh
```
脚本会下载和解压出cnn、lstm和bow三种模型的配置文文件以及test_data和train_data。
### 启动RPC预测服务
```
python -m paddle_serving_server.serve --model imdb_cnn_model/ --thread 10 --port 9393 --use_multilang
```
### 执行预测
`test_client.py`中使用了`paddlepaddle`包,需要进行下载(`pip install paddlepaddle`)。
```
head test_data/part-0 | python test_client.py
```
预测test_data/part-0的前十个样例。
wget --no-check-certificate https://fleet.bj.bcebos.com/text_classification_data.tar.gz
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/imdb-demo/imdb_model.tar.gz
tar -zxvf text_classification_data.tar.gz
tar -zxvf imdb_model.tar.gz
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import sys
import os
import paddle
import re
import paddle.fluid.incubate.data_generator as dg
py_version = sys.version_info[0]
class IMDBDataset(dg.MultiSlotDataGenerator):
def load_resource(self, dictfile):
self._vocab = {}
wid = 0
if py_version == 2:
with open(dictfile) as f:
for line in f:
self._vocab[line.strip()] = wid
wid += 1
else:
with open(dictfile, encoding="utf-8") as f:
for line in f:
self._vocab[line.strip()] = wid
wid += 1
self._unk_id = len(self._vocab)
self._pattern = re.compile(r'(;|,|\.|\?|!|\s|\(|\))')
self.return_value = ("words", [1, 2, 3, 4, 5, 6]), ("label", [0])
def get_words_only(self, line):
sent = line.lower().replace("<br />", " ").strip()
words = [x for x in self._pattern.split(sent) if x and x != " "]
feas = [
self._vocab[x] if x in self._vocab else self._unk_id for x in words
]
return feas
def get_words_and_label(self, line):
send = '|'.join(line.split('|')[:-1]).lower().replace("<br />",
" ").strip()
label = [int(line.split('|')[-1])]
words = [x for x in self._pattern.split(send) if x and x != " "]
feas = [
self._vocab[x] if x in self._vocab else self._unk_id for x in words
]
return feas, label
def infer_reader(self, infer_filelist, batch, buf_size):
def local_iter():
for fname in infer_filelist:
with open(fname, "r") as fin:
for line in fin:
feas, label = self.get_words_and_label(line)
yield feas, label
import paddle
batch_iter = paddle.batch(
paddle.reader.shuffle(
local_iter, buf_size=buf_size),
batch_size=batch)
return batch_iter
def generate_sample(self, line):
def memory_iter():
for i in range(1000):
yield self.return_value
def data_iter():
feas, label = self.get_words_and_label(line)
yield ("words", feas), ("label", label)
return data_iter
if __name__ == "__main__":
imdb = IMDBDataset()
imdb.load_resource("imdb.vocab")
imdb.run_from_stdin()
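A small usage sketch of the reader above (assuming `imdb.vocab` from `get_data.sh` is in the working directory); it only illustrates the word-id/label format that the IMDB clients below feed to the service:

```python
# Illustrative quick check of IMDBDataset, not part of this PR.
imdb = IMDBDataset()
imdb.load_resource("imdb.vocab")
word_ids, label = imdb.get_words_and_label("this movie is wonderful | 1")
print(len(word_ids), label)   # number of word ids and the label, e.g. [1]
```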
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_client import MultiLangClient as Client
from paddle_serving_app.reader.imdb_reader import IMDBDataset
import sys
import numpy as np
client = Client()
client.connect(["127.0.0.1:9393"])
# you can define any english sentence or dataset here
# This example reuses imdb reader in training, you
# can define your own data preprocessing easily.
imdb_dataset = IMDBDataset()
imdb_dataset.load_resource('imdb.vocab')
for line in sys.stdin:
word_ids, label = imdb_dataset.get_words_and_label(line)
word_len = len(word_ids)
feed = {
"words": np.array(word_ids).reshape(word_len, 1),
"words.lod": [0, word_len]
}
fetch = ["prediction"]
fetch_map = client.predict(feed=feed, fetch=fetch, batch=True)
if fetch_map["serving_status_code"] == 0:
print(fetch_map)
else:
print(fetch_map["serving_status_code"])
#print("{} {}".format(fetch_map["prediction"][0], label[0]))
# Yolov4 Detection Service
([简体中文](README_CN.md)|English)
## Get Model
```
python -m paddle_serving_app.package --get_model yolov4
tar -xzvf yolov4.tar.gz
```
## Start RPC Service
```
python -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0 --use_multilang
```
## Prediction
```
python test_client.py 000000570688.jpg
```
After the prediction is completed, a json file with the prediction results and a picture with the detection boxes drawn will be generated in the `./output` folder.
# Yolov4 检测服务
(简体中文|[English](README.md))
## 获取模型
```
python -m paddle_serving_app.package --get_model yolov4
tar -xzvf yolov4.tar.gz
```
## 启动RPC服务
```
python -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0 --use_multilang
```
## 预测
```
python test_client.py 000000570688.jpg
```
预测完成会在`./output`文件夹下生成保存预测结果的json文件以及标出检测结果框的图片。
person
bicycle
car
motorcycle
airplane
bus
train
truck
boat
traffic light
fire hydrant
stop sign
parking meter
bench
bird
cat
dog
horse
sheep
cow
elephant
bear
zebra
giraffe
backpack
umbrella
handbag
tie
suitcase
frisbee
skis
snowboard
sports ball
kite
baseball bat
baseball glove
skateboard
surfboard
tennis racket
bottle
wine glass
cup
fork
knife
spoon
bowl
banana
apple
sandwich
orange
broccoli
carrot
hot dog
pizza
donut
cake
chair
couch
potted plant
bed
dining table
toilet
tv
laptop
mouse
remote
keyboard
cell phone
microwave
oven
toaster
sink
refrigerator
book
clock
vase
scissors
teddy bear
hair drier
toothbrush
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import numpy as np
from paddle_serving_client import MultiLangClient as Client
from paddle_serving_app.reader import *
import cv2
preprocess = Sequential([
File2Image(), BGR2RGB(), Resize(
(608, 608), interpolation=cv2.INTER_LINEAR), Div(255.0), Transpose(
(2, 0, 1))
])
postprocess = RCNNPostprocess("label_list.txt", "output", [608, 608])
client = Client()
client.connect(['127.0.0.1:9393'])
client.set_rpc_timeout_ms(15000)
im = preprocess(sys.argv[1])
fetch_map = client.predict(
feed={
"image": im,
"im_size": np.array(list(im.shape[1:])),
},
fetch=["save_infer_model/scale_0.tmp_0"],
batch=False)
print(fetch_map)
fetch_map.pop("serving_status_code")
fetch_map["image"] = sys.argv[1]
postprocess(fetch_map)
File mode changed from 100755 to 100644
...@@ -26,7 +26,7 @@ tar xf test_imgs.tar ...@@ -26,7 +26,7 @@ tar xf test_imgs.tar
python -m paddle_serving_server.serve --model ocr_det_model --port 9293 python -m paddle_serving_server.serve --model ocr_det_model --port 9293
python ocr_web_server.py cpu python ocr_web_server.py cpu
#for gpu user #for gpu user
python -m paddle_serving_server.serve --model ocr_det_model --port 9293 --gpu_id 0 python -m paddle_serving_server.serve --model ocr_det_model --port 9293 --gpu_ids 0
python ocr_web_server.py gpu python ocr_web_server.py gpu
``` ```
...@@ -111,7 +111,7 @@ After the -- model parameter, the folder path of multiple model files is passed ...@@ -111,7 +111,7 @@ After the -- model parameter, the folder path of multiple model files is passed
#for cpu user #for cpu user
python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293
#for gpu user #for gpu user
python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 --gpu_id 0 python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 --gpu_ids 0
``` ```
### Client Prediction ### Client Prediction
......
...@@ -25,7 +25,7 @@ tar xf test_imgs.tar ...@@ -25,7 +25,7 @@ tar xf test_imgs.tar
python -m paddle_serving_server.serve --model ocr_det_model --port 9293 python -m paddle_serving_server.serve --model ocr_det_model --port 9293
python ocr_web_server.py cpu python ocr_web_server.py cpu
#for gpu user #for gpu user
python -m paddle_serving_server.serve --model ocr_det_model --port 9293 --gpu_id 0 python -m paddle_serving_server.serve --model ocr_det_model --port 9293 --gpu_ids 0
python ocr_web_server.py gpu python ocr_web_server.py gpu
``` ```
...@@ -110,7 +110,7 @@ python rec_web_client.py ...@@ -110,7 +110,7 @@ python rec_web_client.py
#for cpu user #for cpu user
python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293
#for gpu user #for gpu user
python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 --gpu_id 0 python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 --gpu_ids 0
``` ```
### 启动客户端 ### 启动客户端
......
...@@ -71,8 +71,7 @@ ocr_service = OCRService(name="ocr") ...@@ -71,8 +71,7 @@ ocr_service = OCRService(name="ocr")
ocr_service.load_model_config("ocr_det_model") ocr_service.load_model_config("ocr_det_model")
if sys.argv[1] == 'gpu': if sys.argv[1] == 'gpu':
ocr_service.set_gpus("0") ocr_service.set_gpus("0")
ocr_service.prepare_server( ocr_service.prepare_server(workdir="workdir", port=9292, device="gpu")
workdir="workdir", port=9292, device="gpu", gpuid=0)
elif sys.argv[1] == 'cpu': elif sys.argv[1] == 'cpu':
ocr_service.prepare_server(workdir="workdir", port=9292) ocr_service.prepare_server(workdir="workdir", port=9292)
ocr_service.init_det() ocr_service.init_det()
......
...@@ -70,8 +70,7 @@ ocr_service = OCRService(name="ocr") ...@@ -70,8 +70,7 @@ ocr_service = OCRService(name="ocr")
ocr_service.load_model_config("ocr_det_model") ocr_service.load_model_config("ocr_det_model")
if sys.argv[1] == 'gpu': if sys.argv[1] == 'gpu':
ocr_service.set_gpus("0") ocr_service.set_gpus("0")
ocr_service.prepare_server( ocr_service.prepare_server(workdir="workdir", port=9292, device="gpu")
workdir="workdir", port=9292, device="gpu", gpuid=0)
elif sys.argv[1] == 'cpu': elif sys.argv[1] == 'cpu':
ocr_service.prepare_server(workdir="workdir", port=9292, device="cpu") ocr_service.prepare_server(workdir="workdir", port=9292, device="cpu")
ocr_service.init_det() ocr_service.init_det()
......
File mode changed from 100755 to 100644
...@@ -95,8 +95,7 @@ ocr_service = OCRService(name="ocr") ...@@ -95,8 +95,7 @@ ocr_service = OCRService(name="ocr")
ocr_service.load_model_config("ocr_rec_model") ocr_service.load_model_config("ocr_rec_model")
if sys.argv[1] == 'gpu': if sys.argv[1] == 'gpu':
ocr_service.set_gpus("0") ocr_service.set_gpus("0")
ocr_service.prepare_server( ocr_service.prepare_server(workdir="workdir", port=9292, device="gpu")
workdir="workdir", port=9292, device="gpu", gpuid=0)
elif sys.argv[1] == 'cpu': elif sys.argv[1] == 'cpu':
ocr_service.prepare_server(workdir="workdir", port=9292) ocr_service.prepare_server(workdir="workdir", port=9292)
ocr_service.init_det_client( ocr_service.init_det_client(
......
...@@ -71,8 +71,7 @@ ocr_service.load_model_config("ocr_rec_model") ...@@ -71,8 +71,7 @@ ocr_service.load_model_config("ocr_rec_model")
if sys.argv[1] == 'gpu': if sys.argv[1] == 'gpu':
ocr_service.set_gpus("0") ocr_service.set_gpus("0")
ocr_service.init_rec() ocr_service.init_rec()
ocr_service.prepare_server( ocr_service.prepare_server(workdir="workdir", port=9292, device="gpu")
workdir="workdir", port=9292, device="gpu", gpuid=0)
elif sys.argv[1] == 'cpu': elif sys.argv[1] == 'cpu':
ocr_service.init_rec() ocr_service.init_rec()
ocr_service.prepare_server(workdir="workdir", port=9292, device="cpu") ocr_service.prepare_server(workdir="workdir", port=9292, device="cpu")
......
...@@ -73,8 +73,7 @@ ocr_service.load_model_config("ocr_rec_model") ...@@ -73,8 +73,7 @@ ocr_service.load_model_config("ocr_rec_model")
ocr_service.init_rec() ocr_service.init_rec()
if sys.argv[1] == 'gpu': if sys.argv[1] == 'gpu':
ocr_service.set_gpus("0") ocr_service.set_gpus("0")
ocr_service.prepare_server( ocr_service.prepare_server(workdir="workdir", port=9292, device="gpu")
workdir="workdir", port=9292, device="gpu", gpuid=0)
elif sys.argv[1] == 'cpu': elif sys.argv[1] == 'cpu':
ocr_service.prepare_server(workdir="workdir", port=9292, device="cpu") ocr_service.prepare_server(workdir="workdir", port=9292, device="cpu")
ocr_service.run_rpc_service() ocr_service.run_rpc_service()
......
File mode changed from 100755 to 100644
...@@ -91,6 +91,7 @@ class LocalPredictor(object): ...@@ -91,6 +91,7 @@ class LocalPredictor(object):
mkldnn_bf16_op_list: op list accelerated using MKLDNN bf16, None default. mkldnn_bf16_op_list: op list accelerated using MKLDNN bf16, None default.
use_feed_fetch_ops: use feed/fetch ops, False default. use_feed_fetch_ops: use feed/fetch ops, False default.
""" """
gpu_id = int(gpu_id)
client_config = "{}/serving_server_conf.prototxt".format(model_path) client_config = "{}/serving_server_conf.prototxt".format(model_path)
model_conf = m_config.GeneralModelConfig() model_conf = m_config.GeneralModelConfig()
f = open(client_config, 'r') f = open(client_config, 'r')
......
...@@ -16,5 +16,6 @@ ...@@ -16,5 +16,6 @@
from . import version from . import version
from . import client from . import client
from .client import * from .client import *
from .httpclient import *
__version__ = version.version_tag __version__ = version.version_tag
...@@ -25,11 +25,8 @@ import base64 ...@@ -25,11 +25,8 @@ import base64
import time import time
import sys import sys
import grpc
from .proto import multi_lang_general_model_service_pb2
sys.path.append( sys.path.append(
os.path.join(os.path.abspath(os.path.dirname(__file__)), 'proto')) os.path.join(os.path.abspath(os.path.dirname(__file__)), 'proto'))
from .proto import multi_lang_general_model_service_pb2_grpc
#param 'type'(which is in feed_var or fetch_var) = 0 means dataType is int64 #param 'type'(which is in feed_var or fetch_var) = 0 means dataType is int64
#param 'type'(which is in feed_var or fetch_var) = 1 means dataType is float32 #param 'type'(which is in feed_var or fetch_var) = 1 means dataType is float32
...@@ -79,7 +76,7 @@ class SDKConfig(object): ...@@ -79,7 +76,7 @@ class SDKConfig(object):
self.tag_list = [] self.tag_list = []
self.cluster_list = [] self.cluster_list = []
self.variant_weight_list = [] self.variant_weight_list = []
self.rpc_timeout_ms = 20000 self.rpc_timeout_ms = 200000
self.load_balance_strategy = "la" self.load_balance_strategy = "la"
def add_server_variant(self, tag, cluster, variant_weight): def add_server_variant(self, tag, cluster, variant_weight):
...@@ -142,7 +139,7 @@ class Client(object): ...@@ -142,7 +139,7 @@ class Client(object):
self.profile_ = _Profiler() self.profile_ = _Profiler()
self.all_numpy_input = True self.all_numpy_input = True
self.has_numpy_input = False self.has_numpy_input = False
self.rpc_timeout_ms = 20000 self.rpc_timeout_ms = 200000
from .serving_client import PredictorRes from .serving_client import PredictorRes
self.predictorres_constructor = PredictorRes self.predictorres_constructor = PredictorRes
...@@ -307,26 +304,40 @@ class Client(object): ...@@ -307,26 +304,40 @@ class Client(object):
if isinstance(feed, dict): if isinstance(feed, dict):
feed_batch.append(feed) feed_batch.append(feed)
elif isinstance(feed, list): elif isinstance(feed, list):
feed_batch = feed # if input is a list and the number of feed_var is 1.
# create a temp_dict { key = feed_var_name, value = list}
# put the temp_dict into the feed_batch.
if len(self.feed_names_) != 1:
raise ValueError(
"input is a list, but we got 0 or 2+ feed_var, don`t know how to divide the feed list"
)
temp_dict = {}
temp_dict[self.feed_names_[0]] = feed
feed_batch.append(temp_dict)
else: else:
raise ValueError("Feed only accepts dict and list of dict") raise ValueError("Feed only accepts dict and list of dict")
int_slot_batch = [] # batch_size must be 1, cause batch is already in Tensor.
if len(feed_batch) != 1:
raise ValueError("len of feed_batch can only be 1.")
int_slot = []
int_feed_names = [] int_feed_names = []
int_shape = [] int_shape = []
int_lod_slot_batch = [] int_lod_slot_batch = []
float_slot_batch = []
float_slot = []
float_feed_names = [] float_feed_names = []
float_lod_slot_batch = [] float_lod_slot_batch = []
float_shape = [] float_shape = []
string_slot_batch = []
string_slot = []
string_feed_names = [] string_feed_names = []
string_lod_slot_batch = [] string_lod_slot_batch = []
string_shape = [] string_shape = []
fetch_names = [] fetch_names = []
counter = 0 counter = 0
batch_size = len(feed_batch)
for key in fetch_list: for key in fetch_list:
if key in self.fetch_names_: if key in self.fetch_names_:
...@@ -335,89 +346,70 @@ class Client(object): ...@@ -335,89 +346,70 @@ class Client(object):
if len(fetch_names) == 0: if len(fetch_names) == 0:
raise ValueError( raise ValueError(
"Fetch names should not be empty or out of saved fetch list.") "Fetch names should not be empty or out of saved fetch list.")
return {}
feed_i = feed_batch[0]
for i, feed_i in enumerate(feed_batch): for key in feed_i:
int_slot = [] if ".lod" not in key and key not in self.feed_names_:
int_lod_slot = [] raise ValueError("Wrong feed name: {}.".format(key))
float_slot = [] if ".lod" in key:
float_lod_slot = [] continue
string_slot = []
string_lod_slot = [] self.shape_check(feed_i, key)
for key in feed_i: if self.feed_types_[key] in int_type:
if ".lod" not in key and key not in self.feed_names_: int_feed_names.append(key)
raise ValueError("Wrong feed name: {}.".format(key)) shape_lst = []
if ".lod" in key: if batch == False:
continue feed_i[key] = np.expand_dims(feed_i[key], 0).repeat(
#if not isinstance(feed_i[key], np.ndarray): 1, axis=0)
self.shape_check(feed_i, key) if isinstance(feed_i[key], np.ndarray):
if self.feed_types_[key] in int_type: shape_lst.extend(list(feed_i[key].shape))
if i == 0: int_shape.append(shape_lst)
int_feed_names.append(key) else:
shape_lst = [] int_shape.append(self.feed_shapes_[key])
if batch == False: if "{}.lod".format(key) in feed_i:
feed_i[key] = np.expand_dims(feed_i[key], 0).repeat( int_lod_slot_batch.append(feed_i["{}.lod".format(key)])
1, axis=0) else:
if isinstance(feed_i[key], np.ndarray): int_lod_slot_batch.append([])
shape_lst.extend(list(feed_i[key].shape))
int_shape.append(shape_lst) if isinstance(feed_i[key], np.ndarray):
else: int_slot.append(np.ascontiguousarray(feed_i[key]))
int_shape.append(self.feed_shapes_[key])
if "{}.lod".format(key) in feed_i:
int_lod_slot_batch.append(feed_i["{}.lod".format(
key)])
else:
int_lod_slot_batch.append([])
if isinstance(feed_i[key], np.ndarray):
int_slot.append(np.ascontiguousarray(feed_i[key]))
self.has_numpy_input = True
else:
int_slot.append(np.ascontiguousarray(feed_i[key]))
self.all_numpy_input = False
elif self.feed_types_[key] in float_type:
if i == 0:
float_feed_names.append(key)
shape_lst = []
if batch == False:
feed_i[key] = np.expand_dims(feed_i[key], 0).repeat(
1, axis=0)
if isinstance(feed_i[key], np.ndarray):
shape_lst.extend(list(feed_i[key].shape))
float_shape.append(shape_lst)
else:
float_shape.append(self.feed_shapes_[key])
if "{}.lod".format(key) in feed_i:
float_lod_slot_batch.append(feed_i["{}.lod".format(
key)])
else:
float_lod_slot_batch.append([])
if isinstance(feed_i[key], np.ndarray):
float_slot.append(np.ascontiguousarray(feed_i[key]))
self.has_numpy_input = True
else:
float_slot.append(np.ascontiguousarray(feed_i[key]))
self.all_numpy_input = False
#if input is string, feed is not numpy.
elif self.feed_types_[key] in string_type:
if i == 0:
string_feed_names.append(key)
string_shape.append(self.feed_shapes_[key])
if "{}.lod".format(key) in feed_i:
string_lod_slot_batch.append(feed_i["{}.lod".format(
key)])
else:
string_lod_slot_batch.append([])
string_slot.append(feed_i[key])
self.has_numpy_input = True self.has_numpy_input = True
int_slot_batch.append(int_slot) else:
int_lod_slot_batch.append(int_lod_slot) int_slot.append(np.ascontiguousarray(feed_i[key]))
float_slot_batch.append(float_slot) self.all_numpy_input = False
float_lod_slot_batch.append(float_lod_slot)
string_slot_batch.append(string_slot) elif self.feed_types_[key] in float_type:
string_lod_slot_batch.append(string_lod_slot) float_feed_names.append(key)
shape_lst = []
if batch == False:
feed_i[key] = np.expand_dims(feed_i[key], 0).repeat(
1, axis=0)
if isinstance(feed_i[key], np.ndarray):
shape_lst.extend(list(feed_i[key].shape))
float_shape.append(shape_lst)
else:
float_shape.append(self.feed_shapes_[key])
if "{}.lod".format(key) in feed_i:
float_lod_slot_batch.append(feed_i["{}.lod".format(key)])
else:
float_lod_slot_batch.append([])
if isinstance(feed_i[key], np.ndarray):
float_slot.append(np.ascontiguousarray(feed_i[key]))
self.has_numpy_input = True
else:
float_slot.append(np.ascontiguousarray(feed_i[key]))
self.all_numpy_input = False
#if input is string, feed is not numpy.
elif self.feed_types_[key] in string_type:
string_feed_names.append(key)
string_shape.append(self.feed_shapes_[key])
if "{}.lod".format(key) in feed_i:
string_lod_slot_batch.append(feed_i["{}.lod".format(key)])
else:
string_lod_slot_batch.append([])
string_slot.append(feed_i[key])
self.has_numpy_input = True
self.profile_.record('py_prepro_1') self.profile_.record('py_prepro_1')
self.profile_.record('py_client_infer_0') self.profile_.record('py_client_infer_0')
...@@ -425,11 +417,11 @@ class Client(object): ...@@ -425,11 +417,11 @@ class Client(object):
result_batch_handle = self.predictorres_constructor() result_batch_handle = self.predictorres_constructor()
if self.all_numpy_input: if self.all_numpy_input:
res = self.client_handle_.numpy_predict( res = self.client_handle_.numpy_predict(
float_slot_batch, float_feed_names, float_shape, float_slot, float_feed_names, float_shape, float_lod_slot_batch,
float_lod_slot_batch, int_slot_batch, int_feed_names, int_shape, int_slot, int_feed_names, int_shape, int_lod_slot_batch,
int_lod_slot_batch, string_slot_batch, string_feed_names, string_slot, string_feed_names, string_shape,
string_shape, string_lod_slot_batch, fetch_names, string_lod_slot_batch, fetch_names, result_batch_handle,
result_batch_handle, self.pid, log_id) self.pid, log_id)
elif self.has_numpy_input == False: elif self.has_numpy_input == False:
raise ValueError( raise ValueError(
"Please make sure all of your inputs are numpy array") "Please make sure all of your inputs are numpy array")
...@@ -517,243 +509,3 @@ class Client(object): ...@@ -517,243 +509,3 @@ class Client(object):
def release(self): def release(self):
self.client_handle_.destroy_predictor() self.client_handle_.destroy_predictor()
self.client_handle_ = None self.client_handle_ = None
class MultiLangClient(object):
def __init__(self):
self.channel_ = None
self.stub_ = None
self.rpc_timeout_s_ = 2
self.profile_ = _Profiler()
def add_variant(self, tag, cluster, variant_weight):
# TODO
raise Exception("cannot support ABtest yet")
def set_rpc_timeout_ms(self, rpc_timeout):
if self.stub_ is None:
raise Exception("set timeout must be set after connect.")
if not isinstance(rpc_timeout, int):
# for bclient
raise ValueError("rpc_timeout must be int type.")
self.rpc_timeout_s_ = rpc_timeout / 1000.0
timeout_req = multi_lang_general_model_service_pb2.SetTimeoutRequest()
timeout_req.timeout_ms = rpc_timeout
resp = self.stub_.SetTimeout(timeout_req)
return resp.err_code == 0
def connect(self, endpoints):
# https://github.com/tensorflow/serving/issues/1382
options = [('grpc.max_receive_message_length', 512 * 1024 * 1024),
('grpc.max_send_message_length', 512 * 1024 * 1024),
('grpc.lb_policy_name', 'round_robin')]
# TODO: weight round robin
g_endpoint = 'ipv4:{}'.format(','.join(endpoints))
self.channel_ = grpc.insecure_channel(g_endpoint, options=options)
self.stub_ = multi_lang_general_model_service_pb2_grpc.MultiLangGeneralModelServiceStub(
self.channel_)
# get client model config
get_client_config_req = multi_lang_general_model_service_pb2.GetClientConfigRequest(
)
resp = self.stub_.GetClientConfig(get_client_config_req)
model_config_str = resp.client_config_str
self._parse_model_config(model_config_str)
def _flatten_list(self, nested_list):
for item in nested_list:
if isinstance(item, (list, tuple)):
for sub_item in self._flatten_list(item):
yield sub_item
else:
yield item
def _parse_model_config(self, model_config_str):
model_conf = m_config.GeneralModelConfig()
model_conf = google.protobuf.text_format.Merge(model_config_str,
model_conf)
self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
self.feed_types_ = {}
self.feed_shapes_ = {}
self.lod_tensor_set_ = set()
for i, var in enumerate(model_conf.feed_var):
self.feed_types_[var.alias_name] = var.feed_type
self.feed_shapes_[var.alias_name] = var.shape
if var.is_lod_tensor:
self.lod_tensor_set_.add(var.alias_name)
self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
self.fetch_types_ = {}
for i, var in enumerate(model_conf.fetch_var):
self.fetch_types_[var.alias_name] = var.fetch_type
if var.is_lod_tensor:
self.lod_tensor_set_.add(var.alias_name)
def _pack_inference_request(self, feed, fetch, is_python, log_id):
req = multi_lang_general_model_service_pb2.InferenceRequest()
req.fetch_var_names.extend(fetch)
req.is_python = is_python
req.log_id = log_id
feed_var_names = []
for key in feed.keys():
if '.lod' not in key:
feed_var_names.append(key)
req.feed_var_names.extend(feed_var_names)
inst = multi_lang_general_model_service_pb2.FeedInst()
for name in req.feed_var_names:
tensor = multi_lang_general_model_service_pb2.Tensor()
var = feed[name]
v_type = self.feed_types_[name]
if is_python:
data = None
if isinstance(var, list):
if v_type == 0: # int64
data = np.array(var, dtype="int64")
elif v_type == 1: # float32
data = np.array(var, dtype="float32")
elif v_type == 2: # int32
data = np.array(var, dtype="int32")
else:
raise Exception("error tensor value type.")
elif isinstance(var, np.ndarray):
data = var
if v_type == 0:
if data.dtype != 'int64':
data = data.astype("int64")
elif v_type == 1:
if data.dtype != 'float32':
data = data.astype("float32")
elif v_type == 2:
if data.dtype != 'int32':
data = data.astype("int32")
else:
raise Exception("error tensor value type.")
else:
raise Exception("var must be list or ndarray.")
data = np.ascontiguousarray(data)
tensor.data = data.tobytes()
tensor.shape.extend(list(var.shape))
if "{}.lod".format(name) in feed.keys():
tensor.lod.extend(feed["{}.lod".format(name)])
inst.tensor_array.append(tensor)
req.insts.append(inst)
return req
def _unpack_inference_response(self, resp, fetch, is_python,
need_variant_tag):
if resp.err_code != 0:
return None
tag = resp.tag
multi_result_map = {}
for model_result in resp.outputs:
inst = model_result.insts[0]
result_map = {}
for i, name in enumerate(fetch):
var = inst.tensor_array[i]
v_type = self.fetch_types_[name]
if is_python:
if v_type == 0: # int64
result_map[name] = np.frombuffer(
var.data, dtype="int64")
elif v_type == 1: # float32
result_map[name] = np.frombuffer(
var.data, dtype="float32")
else:
raise Exception("error type.")
else:
if v_type == 0: # int64
result_map[name] = np.array(
list(var.int64_data), dtype="int64")
elif v_type == 1: # float32
result_map[name] = np.array(
list(var.float_data), dtype="float32")
else:
raise Exception("error type.")
result_map[name].shape = list(var.shape)
if name in self.lod_tensor_set_:
result_map["{}.lod".format(name)] = np.array(list(var.lod))
multi_result_map[model_result.engine_name] = result_map
ret = None
if len(resp.outputs) == 1:
ret = list(multi_result_map.values())[0]
else:
ret = multi_result_map
ret["serving_status_code"] = 0
return ret if not need_variant_tag else [ret, tag]
def _done_callback_func(self, fetch, is_python, need_variant_tag):
def unpack_resp(resp):
return self._unpack_inference_response(resp, fetch, is_python,
need_variant_tag)
return unpack_resp
def get_feed_names(self):
return self.feed_names_
def predict(self,
feed,
fetch,
batch=True,
need_variant_tag=False,
asyn=False,
is_python=True,
log_id=0):
if isinstance(feed, dict) is False:
raise ValueError("Type Error. grpc feed must be dict.")
if batch is False:
for key in feed:
if ".lod" not in key:
feed[key] = np.expand_dims(feed[key], 0).repeat(1, axis=0)
if not asyn:
try:
self.profile_.record('py_prepro_0')
req = self._pack_inference_request(
feed, fetch, is_python=is_python, log_id=log_id)
self.profile_.record('py_prepro_1')
self.profile_.record('py_client_infer_0')
resp = self.stub_.Inference(req, timeout=self.rpc_timeout_s_)
self.profile_.record('py_client_infer_1')
self.profile_.record('py_postpro_0')
ret = self._unpack_inference_response(
resp,
fetch,
is_python=is_python,
need_variant_tag=need_variant_tag)
self.profile_.record('py_postpro_1')
self.profile_.print_profile()
return ret
except grpc.RpcError as e:
return {"serving_status_code": e.code()}
else:
req = self._pack_inference_request(
feed, fetch, is_python=is_python, log_id=log_id)
call_future = self.stub_.Inference.future(
req, timeout=self.rpc_timeout_s_)
return MultiLangPredictFuture(
call_future,
self._done_callback_func(
fetch,
is_python=is_python,
need_variant_tag=need_variant_tag))
class MultiLangPredictFuture(object):
def __init__(self, call_future, callback_func):
self.call_future_ = call_future
self.callback_func_ = callback_func
def result(self):
try:
resp = self.call_future_.result()
except grpc.RpcError as e:
return {"serving_status_code": e.code()}
return self.callback_func_(resp)
def add_done_callback(self, fn):
def __fn__(call_future):
assert call_future == self.call_future_
fn(self)
self.call_future_.add_done_callback(__fn__)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import requests
import json
import numpy as np
import os
from .proto import general_model_config_pb2 as m_config
import google.protobuf.text_format
import gzip
from collections import Iterable
import base64
#param 'type'(which is in feed_var or fetch_var) = 0 means dataType is int64
#param 'type'(which is in feed_var or fetch_var) = 1 means dataType is float32
#param 'type'(which is in feed_var or fetch_var) = 2 means dataType is int32
#param 'type'(which is in feed_var or fetch_var) = 3 means dataType is string(also called bytes in proto)
int64_type = 0
float32_type = 1
int32_type = 2
bytes_type = 3
# this is corresponding to the proto
proto_data_key_list = ["int64_data", "float_data", "int_data", "data"]
def list_flatten(items, ignore_types=(str, bytes)):
for x in items:
if isinstance(x, Iterable) and not isinstance(x, ignore_types):
yield from list_flatten(x)
else:
yield x
def data_bytes_number(datalist):
total_bytes_number = 0
if isinstance(datalist, list):
if len(datalist) == 0:
return total_bytes_number
else:
for data in datalist:
if isinstance(data, str):
total_bytes_number = total_bytes_number + len(data)
else:
total_bytes_number = total_bytes_number + 4 * len(datalist)
break
else:
raise ValueError(
"In the Function data_bytes_number(), data must be list.")
class HttpClient(object):
def __init__(self,
ip="0.0.0.0",
port="9393",
service_name="/GeneralModelService/inference"):
self.feed_names_ = []
self.feed_real_names = []
self.fetch_names_ = []
self.feed_shapes_ = {}
self.feed_types_ = {}
self.feed_names_to_idx_ = {}
self.http_timeout_ms = 200000
self.ip = ip
self.port = port
self.server_port = port
self.service_name = service_name
self.key = None
self.try_request_gzip = False
self.try_response_gzip = False
def load_client_config(self, model_config_path_list):
if isinstance(model_config_path_list, str):
model_config_path_list = [model_config_path_list]
elif isinstance(model_config_path_list, list):
pass
file_path_list = []
for single_model_config in model_config_path_list:
if os.path.isdir(single_model_config):
file_path_list.append("{}/serving_client_conf.prototxt".format(
single_model_config))
elif os.path.isfile(single_model_config):
file_path_list.append(single_model_config)
model_conf = m_config.GeneralModelConfig()
f = open(file_path_list[0], 'r')
model_conf = google.protobuf.text_format.Merge(
str(f.read()), model_conf)
# load configuration here
# get feed vars, fetch vars
# get feed shapes, feed types
# map feed names to index
self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
self.feed_real_names = [var.name for var in model_conf.feed_var]
self.feed_names_to_idx_ = {} #this is useful
self.lod_tensor_set = set()
self.feed_tensor_len = {} #this is only used for shape check
self.key = None
for i, var in enumerate(model_conf.feed_var):
self.feed_names_to_idx_[var.alias_name] = i
self.feed_types_[var.alias_name] = var.feed_type
self.feed_shapes_[var.alias_name] = [dim for dim in var.shape]
if var.is_lod_tensor:
self.lod_tensor_set.add(var.alias_name)
else:
counter = 1
for dim in self.feed_shapes_[var.alias_name]:
counter *= dim
self.feed_tensor_len[var.alias_name] = counter
if len(file_path_list) > 1:
model_conf = m_config.GeneralModelConfig()
f = open(file_path_list[-1], 'r')
model_conf = google.protobuf.text_format.Merge(
str(f.read()), model_conf)
self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
self.fetch_names_to_type_ = {}
self.fetch_names_to_idx_ = {}
for i, var in enumerate(model_conf.fetch_var):
self.fetch_names_to_idx_[var.alias_name] = i
self.fetch_names_to_type_[var.alias_name] = var.fetch_type
if var.is_lod_tensor:
self.lod_tensor_set.add(var.alias_name)
return
def set_http_timeout_ms(self, http_timeout_ms):
if not isinstance(http_timeout_ms, int):
raise ValueError("http_timeout_ms must be int type.")
else:
self.http_timeout_ms = http_timeout_ms
def set_request_compress(self, try_request_gzip):
self.try_request_gzip = try_request_gzip
def set_response_compress(self, try_response_gzip):
self.try_response_gzip = try_response_gzip
# use_key enables the encrypted-model feature: it loads the local key file and then queries the server for the real serving port.
def use_key(self, key_filename):
with open(key_filename, "rb") as f:
self.key = f.read()
self.get_serving_port()
def get_serving_port(self):
encrypt_url = "http://" + str(self.ip) + ":" + str(self.port)
if self.key is not None:
req = json.dumps({"key": base64.b64encode(self.key).decode()})
else:
req = json.dumps({})
r = requests.post(encrypt_url, req)
result = r.json()
print(result)
if "endpoint_list" not in result:
raise ValueError("server not ready")
else:
self.server_port = str(result["endpoint_list"][0])
print("rpc port is ", self.server_port)
def get_feed_names(self):
return self.feed_names_
def get_fetch_names(self):
return self.fetch_names_
# feed supports numpy arrays as well as plain list/tuple values.
# A bare str is not supported, because the corresponding proto field is repeated.
def predict(self,
feed=None,
fetch=None,
batch=False,
need_variant_tag=False,
log_id=0):
if feed is None or fetch is None:
raise ValueError("You should specify feed and fetch for prediction")
fetch_list = []
if isinstance(fetch, str):
fetch_list = [fetch]
elif isinstance(fetch, (list, tuple)):
fetch_list = fetch
else:
raise ValueError("Fetch only accepts string and list of string")
feed_batch = []
if isinstance(feed, dict):
feed_batch.append(feed)
elif isinstance(feed, (list, str, tuple)):
# if input is a list or str or tuple, and the number of feed_var is 1.
# create a temp_dict { key = feed_var_name, value = list}
# put the temp_dict into the feed_batch.
if len(self.feed_names_) != 1:
raise ValueError(
"input is a list, but we got 0 or 2+ feed_var, don`t know how to divide the feed list"
)
temp_dict = {}
temp_dict[self.feed_names_[0]] = feed
feed_batch.append(temp_dict)
else:
raise ValueError("Feed only accepts dict and list of dict")
# batch_size must be 1, because the batch is already inside the Tensor.
if len(feed_batch) != 1:
raise ValueError("len of feed_batch can only be 1.")
fetch_names = []
for key in fetch_list:
if key in self.fetch_names_:
fetch_names.append(key)
if len(fetch_names) == 0:
raise ValueError(
"Fetch names should not be empty or out of saved fetch list.")
feed_i = feed_batch[0]
Request = {}
Request["fetch_var_names"] = fetch_list
Request["log_id"] = int(log_id)
Request["tensor"] = []
index = 0
total_data_number = 0
for key in feed_i:
if ".lod" not in key and key not in self.feed_names_:
raise ValueError("Wrong feed name: {}.".format(key))
if ".lod" in key:
continue
Request["tensor"].append('')
Request["tensor"][index] = {}
lod = []
if "{}.lod".format(key) in feed_i:
lod = feed_i["{}.lod".format(key)]
shape = self.feed_shapes_[key].copy()
elem_type = self.feed_types_[key]
data_value = feed_i[key]
data_key = proto_data_key_list[elem_type]
# feed_i[key] may be an np.ndarray,
# or a list/tuple.
# An np.ndarray needs to be converted to a list first.
if isinstance(feed_i[key], np.ndarray):
shape_lst = []
# a 0-d numpy array needs an extra enclosing []
if feed_i[key].ndim == 0:
data_value = [feed_i[key].tolist()]
shape_lst.append(1)
else:
shape_lst.extend(list(feed_i[key].shape))
shape = shape_lst
data_value = feed_i[key].flatten().tolist()
# When batch is False, insert a 1 at the front of shape as the batch dimension.
# When batch is True, numpy.shape is used directly and already carries the batch dimension.
if batch == False:
shape.insert(0, 1)
# A list or tuple may be nested, so it has to be flattened.
elif isinstance(feed_i[key], (list, tuple)):
# When batch is False, insert a 1 at the front of shape as the batch dimension.
# When batch is True, a list is not as regular as a numpy array, so its
# shape cannot be inferred; the length of the first level is taken as the
# batch dimension and inserted in front of feedVar.shape.
if batch == False:
shape.insert(0, 1)
else:
shape.insert(0, len(feed_i[key]))
feed_i[key] = [x for x in list_flatten(feed_i[key])]
data_value = feed_i[key]
else:
# The input may also be a single str or int value, etc.
# In that case it is first wrapped in a list.
# Since such input is special, shape keeps the original feedvar shape.
data_value = []
data_value.append(feed_i[key])
if isinstance(feed_i[key], str):
if self.feed_types_[key] != bytes_type:
raise ValueError(
"feedvar is not string-type,feed can`t be a single string."
)
else:
if self.feed_types_[key] == bytes_type:
raise ValueError(
"feedvar is string-type,feed, feed can`t be a single int or others."
)
total_data_number = total_data_number + data_bytes_number(
data_value)
Request["tensor"][index]["elem_type"] = elem_type
Request["tensor"][index]["shape"] = shape
Request["tensor"][index][data_key] = data_value
proto_index = self.feed_names_to_idx_[key]
Request["tensor"][index]["name"] = self.feed_real_names[proto_index]
Request["tensor"][index]["alias_name"] = key
if len(lod) > 0:
Request["tensor"][index]["lod"] = lod
index = index + 1
result = None
# request
web_url = "http://" + self.ip + ":" + self.server_port + self.service_name
postData = json.dumps(Request)
headers = {}
# Only compress when the data payload is larger than 512 bytes.
if self.try_request_gzip and total_data_number > 512:
postData = gzip.compress(bytes(postData, 'utf-8'))
headers["Content-Encoding"] = "gzip"
if self.try_response_gzip:
headers["Accept-encoding"] = "gzip"
# requests automatically detects and decompresses a gzipped response.
result = requests.post(url=web_url, headers=headers, data=postData)
if result == None:
return None
if result.status_code == 200:
return result.json()
return result
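For reference, a minimal usage sketch of the HttpClient defined above. The model directory, feed name "x" and fetch name "price" are placeholders taken from the uci_housing example; substitute the names from your own serving_client_conf.prototxt, and note that a serving process must already be listening on the given port:

import numpy as np

client = HttpClient(ip="127.0.0.1", port="9393")
client.load_client_config("uci_housing_client")  # dir containing serving_client_conf.prototxt
client.set_request_compress(True)    # gzip the request body once it exceeds 512 bytes
client.set_response_compress(True)   # ask the server for a gzip-compressed response
# client.use_key("./key")            # only needed for encrypted models

feed = {"x": np.random.rand(1, 13).astype("float32")}
result = client.predict(feed=feed, fetch=["price"], batch=True)
print(result)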
...@@ -14,18 +14,16 @@ ...@@ -14,18 +14,16 @@
# pylint: disable=doc-string-missing # pylint: disable=doc-string-missing
from . import monitor from . import monitor
from . import rpc_service
from . import serve from . import serve
from . import version from . import version
__all__ = ["version", "server", "serve", "monitor", "rpc_service", "dag"] __all__ = ["version", "server", "serve", "monitor", "dag"]
from paddle_serving_server import ( from paddle_serving_server import (
version, version,
server, server,
serve, serve,
monitor, monitor,
rpc_service,
dag, ) dag, )
from .dag import * from .dag import *
......
File mode changed from 100755 to 100644
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import os
import numpy as np
import google.protobuf.text_format
from .proto import general_model_config_pb2 as m_config
from .proto import multi_lang_general_model_service_pb2
sys.path.append(
os.path.join(os.path.abspath(os.path.dirname(__file__)), 'proto'))
from .proto import multi_lang_general_model_service_pb2_grpc
class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
MultiLangGeneralModelServiceServicer):
def __init__(self, model_config_path_list, is_multi_model, endpoints):
self.is_multi_model_ = is_multi_model
self.model_config_path_list = model_config_path_list
self.endpoints_ = endpoints
self._init_bclient(self.model_config_path_list, self.endpoints_)
self._parse_model_config(self.model_config_path_list)
def _init_bclient(self, model_config_path_list, endpoints, timeout_ms=None):
file_path_list = []
for single_model_config in model_config_path_list:
if os.path.isdir(single_model_config):
file_path_list.append("{}/serving_server_conf.prototxt".format(
single_model_config))
elif os.path.isfile(single_model_config):
file_path_list.append(single_model_config)
from paddle_serving_client import Client
self.bclient_ = Client()
if timeout_ms is not None:
self.bclient_.set_rpc_timeout_ms(timeout_ms)
self.bclient_.load_client_config(file_path_list)
self.bclient_.connect(endpoints)
def _parse_model_config(self, model_config_path_list):
if isinstance(model_config_path_list, str):
model_config_path_list = [model_config_path_list]
elif isinstance(model_config_path_list, list):
pass
file_path_list = []
for single_model_config in model_config_path_list:
if os.path.isdir(single_model_config):
file_path_list.append("{}/serving_server_conf.prototxt".format(
single_model_config))
elif os.path.isfile(single_model_config):
file_path_list.append(single_model_config)
model_conf = m_config.GeneralModelConfig()
f = open(file_path_list[0], 'r')
model_conf = google.protobuf.text_format.Merge(
str(f.read()), model_conf)
self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
self.feed_types_ = {}
self.feed_shapes_ = {}
self.lod_tensor_set_ = set()
for i, var in enumerate(model_conf.feed_var):
self.feed_types_[var.alias_name] = var.feed_type
self.feed_shapes_[var.alias_name] = var.shape
if var.is_lod_tensor:
self.lod_tensor_set_.add(var.alias_name)
if len(file_path_list) > 1:
model_conf = m_config.GeneralModelConfig()
f = open(file_path_list[-1], 'r')
model_conf = google.protobuf.text_format.Merge(
str(f.read()), model_conf)
self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
self.fetch_types_ = {}
for i, var in enumerate(model_conf.fetch_var):
self.fetch_types_[var.alias_name] = var.fetch_type
if var.is_lod_tensor:
self.lod_tensor_set_.add(var.alias_name)
def _flatten_list(self, nested_list):
for item in nested_list:
if isinstance(item, (list, tuple)):
for sub_item in self._flatten_list(item):
yield sub_item
else:
yield item
def _unpack_inference_request(self, request):
feed_names = list(request.feed_var_names)
fetch_names = list(request.fetch_var_names)
is_python = request.is_python
log_id = request.log_id
feed_batch = []
for feed_inst in request.insts:
feed_dict = {}
for idx, name in enumerate(feed_names):
var = feed_inst.tensor_array[idx]
v_type = self.feed_types_[name]
data = None
if is_python:
if v_type == 0: # int64
data = np.frombuffer(var.data, dtype="int64")
elif v_type == 1: # float32
data = np.frombuffer(var.data, dtype="float32")
elif v_type == 2: # int32
data = np.frombuffer(var.data, dtype="int32")
else:
raise Exception("error type.")
else:
if v_type == 0: # int64
data = np.array(list(var.int64_data), dtype="int64")
elif v_type == 1: # float32
data = np.array(list(var.float_data), dtype="float32")
elif v_type == 2: # int32
data = np.array(list(var.int_data), dtype="int32")
else:
raise Exception("error type.")
data.shape = list(feed_inst.tensor_array[idx].shape)
feed_dict[name] = np.ascontiguousarray(data)
if len(var.lod) > 0:
feed_dict["{}.lod".format(name)] = var.lod
feed_batch.append(feed_dict)
return feed_batch, fetch_names, is_python, log_id
def _pack_inference_response(self, ret, fetch_names, is_python):
resp = multi_lang_general_model_service_pb2.InferenceResponse()
if ret is None:
resp.err_code = 1
return resp
results, tag = ret
resp.tag = tag
resp.err_code = 0
if not self.is_multi_model_:
results = {'general_infer_0': results}
for model_name, model_result in results.items():
model_output = multi_lang_general_model_service_pb2.ModelOutput()
inst = multi_lang_general_model_service_pb2.FetchInst()
for idx, name in enumerate(fetch_names):
tensor = multi_lang_general_model_service_pb2.Tensor()
v_type = self.fetch_types_[name]
if is_python:
tensor.data = model_result[name].tobytes()
else:
if v_type == 0: # int64
tensor.int64_data.extend(model_result[name].reshape(-1)
.tolist())
elif v_type == 1: # float32
tensor.float_data.extend(model_result[name].reshape(-1)
.tolist())
elif v_type == 2: # int32
tensor.int_data.extend(model_result[name].reshape(-1)
.tolist())
else:
raise Exception("error type.")
tensor.shape.extend(list(model_result[name].shape))
if "{}.lod".format(name) in model_result:
tensor.lod.extend(model_result["{}.lod".format(name)]
.tolist())
inst.tensor_array.append(tensor)
model_output.insts.append(inst)
model_output.engine_name = model_name
resp.outputs.append(model_output)
return resp
def SetTimeout(self, request, context):
# This process and the Inference process cannot run at the same time.
# For performance reasons, no thread lock is added for now.
timeout_ms = request.timeout_ms
self._init_bclient(self.model_config_path_list, self.endpoints_,
timeout_ms)
resp = multi_lang_general_model_service_pb2.SimpleResponse()
resp.err_code = 0
return resp
def Inference(self, request, context):
feed_batch, fetch_names, is_python, log_id \
= self._unpack_inference_request(request)
ret = self.bclient_.predict(
feed=feed_batch,
fetch=fetch_names,
batch=True,
need_variant_tag=True,
log_id=log_id)
return self._pack_inference_response(ret, fetch_names, is_python)
def GetClientConfig(self, request, context):
#model_config_path_list is list right now.
#dict should be added when graphMaker is used.
resp = multi_lang_general_model_service_pb2.GetClientConfigResponse()
model_config_str = []
for single_model_config in self.model_config_path_list:
if os.path.isdir(single_model_config):
with open("{}/serving_server_conf.prototxt".format(
single_model_config)) as f:
model_config_str.append(str(f.read()))
elif os.path.isfile(single_model_config):
with open(single_model_config) as f:
model_config_str.append(str(f.read()))
resp.client_config_str = model_config_str[0]
return resp
...@@ -23,13 +23,87 @@ import json ...@@ -23,13 +23,87 @@ import json
import base64 import base64
import time import time
from multiprocessing import Process from multiprocessing import Process
from flask import Flask, request
import sys import sys
if sys.version_info.major == 2: if sys.version_info.major == 2:
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
elif sys.version_info.major == 3: elif sys.version_info.major == 3:
from http.server import BaseHTTPRequestHandler, HTTPServer from http.server import BaseHTTPRequestHandler, HTTPServer
from contextlib import closing
import socket
# web_service.py is still used by Pipeline.
def port_is_available(port):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
sock.settimeout(2)
result = sock.connect_ex(('0.0.0.0', port))
if result != 0:
return True
else:
return False
def format_gpu_to_strlist(unformatted_gpus):
gpus_strlist = []
if isinstance(unformatted_gpus, int):
gpus_strlist = [str(unformatted_gpus)]
elif isinstance(unformatted_gpus, list):
if unformatted_gpus == [""]:
gpus_strlist = ["-1"]
elif len(unformatted_gpus) == 0:
gpus_strlist = ["-1"]
else:
gpus_strlist = [str(x) for x in unformatted_gpus]
elif isinstance(unformatted_gpus, str):
if unformatted_gpus == "":
gpus_strlist = ["-1"]
else:
gpus_strlist = [unformatted_gpus]
elif unformatted_gpus == None:
gpus_strlist = ["-1"]
else:
raise ValueError("error input of set_gpus")
# check cuda visible
if "CUDA_VISIBLE_DEVICES" in os.environ:
env_gpus = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
for op_gpus_str in gpus_strlist:
op_gpu_list = op_gpus_str.split(",")
# op_gpu_list == ["-1"] means this op use CPU
# so don`t check cudavisible.
if op_gpu_list == ["-1"]:
continue
for ids in op_gpu_list:
if ids not in env_gpus:
print("gpu_ids is not in CUDA_VISIBLE_DEVICES.")
exit(-1)
# check gpuid is valid
for op_gpus_str in gpus_strlist:
op_gpu_list = op_gpus_str.split(",")
use_gpu = False
for ids in op_gpu_list:
if int(ids) < -1:
raise ValueError("The input of gpuid error.")
if int(ids) >= 0:
use_gpu = True
if int(ids) == -1 and use_gpu:
raise ValueError("You can not use CPU and GPU in one model.")
return gpus_strlist
def is_gpu_mode(unformatted_gpus):
gpus_strlist = format_gpu_to_strlist(unformatted_gpus)
for op_gpus_str in gpus_strlist:
op_gpu_list = op_gpus_str.split(",")
for ids in op_gpu_list:
if int(ids) >= 0:
return True
return False
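A few illustrative cases for the two helpers above (a sketch; it assumes CUDA_VISIBLE_DEVICES is unset, so the visibility check is skipped):

# Every accepted input form is normalised to a list of comma-separated id strings,
# with "-1" standing for CPU; one entry corresponds to one model/op.
assert format_gpu_to_strlist(2) == ["2"]
assert format_gpu_to_strlist("") == ["-1"]
assert format_gpu_to_strlist(None) == ["-1"]
assert format_gpu_to_strlist(["0,1", "2"]) == ["0,1", "2"]

# is_gpu_mode() is True as soon as any entry contains a real GPU id.
assert is_gpu_mode(["-1"]) is False
assert is_gpu_mode(["0,1"]) is True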
def serve_args(): def serve_args():
parser = argparse.ArgumentParser("serve") parser = argparse.ArgumentParser("serve")
...@@ -38,8 +112,17 @@ def serve_args(): ...@@ -38,8 +112,17 @@ def serve_args():
parser.add_argument( parser.add_argument(
"--port", type=int, default=9292, help="Port of the starting gpu") "--port", type=int, default=9292, help="Port of the starting gpu")
parser.add_argument( parser.add_argument(
"--device", type=str, default="gpu", help="Type of device") "--device", type=str, default="cpu", help="Type of device")
parser.add_argument("--gpu_ids", type=str, default="", help="gpu ids") parser.add_argument(
"--gpu_ids", type=str, default="", nargs="+", help="gpu ids")
parser.add_argument(
"--op_num", type=int, default=0, nargs="+", help="Number of each op")
parser.add_argument(
"--op_max_batch",
type=int,
default=32,
nargs="+",
help="Max batch of each op")
parser.add_argument( parser.add_argument(
"--model", type=str, default="", nargs="+", help="Model for serving") "--model", type=str, default="", nargs="+", help="Model for serving")
parser.add_argument( parser.add_argument(
...@@ -47,8 +130,6 @@ def serve_args(): ...@@ -47,8 +130,6 @@ def serve_args():
type=str, type=str,
default="workdir", default="workdir",
help="Working dir of current service") help="Working dir of current service")
parser.add_argument(
"--name", type=str, default="None", help="Default service name")
parser.add_argument( parser.add_argument(
"--use_mkl", default=False, action="store_true", help="Use MKL") "--use_mkl", default=False, action="store_true", help="Use MKL")
parser.add_argument( parser.add_argument(
...@@ -78,11 +159,6 @@ def serve_args(): ...@@ -78,11 +159,6 @@ def serve_args():
default=False, default=False,
action="store_true", action="store_true",
help="Use encryption model") help="Use encryption model")
parser.add_argument(
"--use_multilang",
default=False,
action="store_true",
help="Use Multi-language-service")
parser.add_argument( parser.add_argument(
"--use_trt", default=False, action="store_true", help="Use TensorRT") "--use_trt", default=False, action="store_true", help="Use TensorRT")
parser.add_argument( parser.add_argument(
...@@ -99,94 +175,27 @@ def serve_args(): ...@@ -99,94 +175,27 @@ def serve_args():
type=str, type=str,
default=None, default=None,
help="container_id for authentication") help="container_id for authentication")
parser.add_argument(
"--gpu_multi_stream",
default=False,
action="store_true",
help="Use gpu_multi_stream")
return parser.parse_args() return parser.parse_args()
def start_standard_model(serving_port): # pylint: disable=doc-string-missing def start_gpu_card_model(gpu_mode, port, args): # pylint: disable=doc-string-missing
args = serve_args()
thread_num = args.thread
model = args.model
port = serving_port
workdir = args.workdir
device = args.device
mem_optim = args.mem_optim_off is False
ir_optim = args.ir_optim
max_body_size = args.max_body_size
use_mkl = args.use_mkl
use_encryption_model = args.use_encryption_model
use_multilang = args.use_multilang
if model == "":
print("You must specify your serving model")
exit(-1)
for single_model_config in args.model:
if os.path.isdir(single_model_config):
pass
elif os.path.isfile(single_model_config):
raise ValueError("The input of --model should be a dir not file.")
import paddle_serving_server as serving
op_maker = serving.OpMaker()
op_seq_maker = serving.OpSeqMaker()
read_op = op_maker.create('general_reader')
op_seq_maker.add_op(read_op)
for idx, single_model in enumerate(model):
infer_op_name = "general_infer"
#Temporary support for the OCR model; it will be completely revised later.
#If you want to use this, the C++ server must be compiled with the WITH_OPENCV option.
if len(model) == 2 and idx == 0 and model[0] == 'ocr_det_model':
infer_op_name = "general_detection"
general_infer_op = op_maker.create(infer_op_name)
op_seq_maker.add_op(general_infer_op)
general_response_op = op_maker.create('general_response')
op_seq_maker.add_op(general_response_op)
server = None
if use_multilang:
server = serving.MultiLangServer()
else:
server = serving.Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(thread_num)
server.set_memory_optimize(mem_optim)
server.set_ir_optimize(ir_optim)
server.use_mkl(use_mkl)
server.set_max_body_size(max_body_size)
server.set_port(port)
server.set_precision(args.precision)
server.set_use_calib(args.use_calib)
server.use_encryption_model(use_encryption_model)
if args.product_name != None:
server.set_product_name(args.product_name)
if args.container_id != None:
server.set_container_id(args.container_id)
server.load_model_config(model)
server.prepare_server(workdir=workdir, port=port, device=device)
server.run_server()
device = "cpu"
if gpu_mode == True:
device = "gpu"
def start_gpu_card_model(index, gpuid, port, args): # pylint: disable=doc-string-missing
workdir = args.workdir
gpuid = int(gpuid)
device = "gpu"
if gpuid == -1:
device = "cpu"
elif gpuid >= 0:
port = port + index
thread_num = args.thread thread_num = args.thread
model = args.model model = args.model
mem_optim = args.mem_optim_off is False mem_optim = args.mem_optim_off is False
ir_optim = args.ir_optim ir_optim = args.ir_optim
use_mkl = args.use_mkl use_mkl = args.use_mkl
max_body_size = args.max_body_size max_body_size = args.max_body_size
use_multilang = args.use_multilang workdir = "{}_{}".format(args.workdir, port)
if gpuid >= 0:
workdir = "{}_{}".format(args.workdir, gpuid)
if model == "": if model == "":
print("You must specify your serving model") print("You must specify your serving model")
...@@ -204,7 +213,11 @@ def start_gpu_card_model(index, gpuid, port, args): # pylint: disable=doc-strin ...@@ -204,7 +213,11 @@ def start_gpu_card_model(index, gpuid, port, args): # pylint: disable=doc-strin
op_seq_maker.add_op(read_op) op_seq_maker.add_op(read_op)
for idx, single_model in enumerate(model): for idx, single_model in enumerate(model):
infer_op_name = "general_infer" infer_op_name = "general_infer"
if len(model) == 2 and idx == 0: # Currently the OCR Det model depends on the third-party OpenCV library.
# OpenCV is only linked in and GeneralDetectionOp compiled when OCR is used,
# so this is special-cased here: unless the condition below holds, the added op defaults to GeneralInferOp.
# In the future the config may no longer be generated by this Python script.
if len(model) == 2 and idx == 0 and single_model == "ocr_det_model":
infer_op_name = "general_detection" infer_op_name = "general_detection"
else: else:
infer_op_name = "general_infer" infer_op_name = "general_infer"
...@@ -214,10 +227,7 @@ def start_gpu_card_model(index, gpuid, port, args): # pylint: disable=doc-strin ...@@ -214,10 +227,7 @@ def start_gpu_card_model(index, gpuid, port, args): # pylint: disable=doc-strin
general_response_op = op_maker.create('general_response') general_response_op = op_maker.create('general_response')
op_seq_maker.add_op(general_response_op) op_seq_maker.add_op(general_response_op)
if use_multilang: server = serving.Server()
server = serving.MultiLangServer()
else:
server = serving.Server()
server.set_op_sequence(op_seq_maker.get_op_sequence()) server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(thread_num) server.set_num_threads(thread_num)
server.use_mkl(use_mkl) server.use_mkl(use_mkl)
...@@ -226,8 +236,19 @@ def start_gpu_card_model(index, gpuid, port, args): # pylint: disable=doc-strin ...@@ -226,8 +236,19 @@ def start_gpu_card_model(index, gpuid, port, args): # pylint: disable=doc-strin
server.set_memory_optimize(mem_optim) server.set_memory_optimize(mem_optim)
server.set_ir_optimize(ir_optim) server.set_ir_optimize(ir_optim)
server.set_max_body_size(max_body_size) server.set_max_body_size(max_body_size)
if args.use_trt:
if args.use_trt and device == "gpu":
server.set_trt() server.set_trt()
server.set_ir_optimize(True)
if args.gpu_multi_stream and device == "gpu":
server.set_gpu_multi_stream()
if args.op_num:
server.set_op_num(args.op_num)
if args.op_max_batch:
server.set_op_max_batch(args.op_max_batch)
if args.use_lite: if args.use_lite:
server.set_lite() server.set_lite()
...@@ -241,54 +262,27 @@ def start_gpu_card_model(index, gpuid, port, args): # pylint: disable=doc-strin ...@@ -241,54 +262,27 @@ def start_gpu_card_model(index, gpuid, port, args): # pylint: disable=doc-strin
if args.container_id != None: if args.container_id != None:
server.set_container_id(args.container_id) server.set_container_id(args.container_id)
if gpu_mode == True:
server.set_gpuid(args.gpu_ids)
server.load_model_config(model) server.load_model_config(model)
server.prepare_server( server.prepare_server(
workdir=workdir, workdir=workdir,
port=port, port=port,
device=device, device=device,
use_encryption_model=args.use_encryption_model) use_encryption_model=args.use_encryption_model)
if gpuid >= 0:
server.set_gpuid(gpuid)
server.run_server() server.run_server()
def start_multi_card(args, serving_port=None): # pylint: disable=doc-string-missing def start_multi_card(args, serving_port=None): # pylint: disable=doc-string-missing
gpus = ""
if serving_port == None: if serving_port == None:
serving_port = args.port serving_port = args.port
if args.gpu_ids == "":
gpus = []
else:
gpus = args.gpu_ids.split(",")
if "CUDA_VISIBLE_DEVICES" in os.environ:
env_gpus = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
for ids in gpus:
if ids not in env_gpus:
print("gpu_ids is not in CUDA_VISIBLE_DEVICES.")
exit(-1)
else:
env_gpus = []
if args.use_lite: if args.use_lite:
print("run using paddle-lite.") print("run using paddle-lite.")
start_gpu_card_model(-1, -1, serving_port, args) start_gpu_card_model(False, serving_port, args)
elif len(gpus) <= 0:
print("gpu_ids not set, going to run cpu service.")
start_gpu_card_model(-1, -1, serving_port, args)
else: else:
gpu_processes = [] start_gpu_card_model(is_gpu_mode(args.gpu_ids), serving_port, args)
for i, gpu_id in enumerate(gpus):
p = Process(
target=start_gpu_card_model,
args=(
i,
gpu_id,
serving_port,
args, ))
gpu_processes.append(p)
for p in gpu_processes:
p.start()
for p in gpu_processes:
p.join()
class MainService(BaseHTTPRequestHandler): class MainService(BaseHTTPRequestHandler):
...@@ -370,7 +364,9 @@ class MainService(BaseHTTPRequestHandler): ...@@ -370,7 +364,9 @@ class MainService(BaseHTTPRequestHandler):
if __name__ == "__main__": if __name__ == "__main__":
# args.device is not used at all;
# it is kept only for interface compatibility,
# so --device should not be recommended in the documentation.
args = serve_args() args = serve_args()
for single_model_config in args.model: for single_model_config in args.model:
if os.path.isdir(single_model_config): if os.path.isdir(single_model_config):
...@@ -378,54 +374,14 @@ if __name__ == "__main__": ...@@ -378,54 +374,14 @@ if __name__ == "__main__":
elif os.path.isfile(single_model_config): elif os.path.isfile(single_model_config):
raise ValueError("The input of --model should be a dir not file.") raise ValueError("The input of --model should be a dir not file.")
if args.name == "None": if args.use_encryption_model:
from .web_service import port_is_available p_flag = False
if args.use_encryption_model: p = None
p_flag = False serving_port = 0
p = None server = HTTPServer(('0.0.0.0', int(args.port)), MainService)
serving_port = 0 print(
server = HTTPServer(('localhost', int(args.port)), MainService) 'Starting encryption server, waiting for key from client, use <Ctrl-C> to stop'
print( )
'Starting encryption server, waiting for key from client, use <Ctrl-C> to stop' server.serve_forever()
)
server.serve_forever()
else:
start_multi_card(args)
else: else:
from .web_service import WebService start_multi_card(args)
web_service = WebService(name=args.name)
web_service.load_model_config(args.model)
gpu_ids = args.gpu_ids
if gpu_ids == "":
if "CUDA_VISIBLE_DEVICES" in os.environ:
gpu_ids = os.environ["CUDA_VISIBLE_DEVICES"]
if len(gpu_ids) > 0:
web_service.set_gpus(gpu_ids)
web_service.prepare_server(
workdir=args.workdir,
port=args.port,
device=args.device,
use_lite=args.use_lite,
use_xpu=args.use_xpu,
ir_optim=args.ir_optim,
thread_num=args.thread,
precision=args.precision,
use_calib=args.use_calib)
web_service.run_rpc_service()
app_instance = Flask(__name__)
@app_instance.before_first_request
def init():
web_service._launch_web_service()
service_name = "/" + web_service.name + "/prediction"
@app_instance.route(service_name, methods=["POST"])
def run():
return web_service.get_prediction(request)
app_instance.run(host="0.0.0.0",
port=web_service.port,
threaded=False,
processes=4)
...@@ -16,10 +16,9 @@ import os ...@@ -16,10 +16,9 @@ import os
import tarfile import tarfile
import socket import socket
import paddle_serving_server as paddle_serving_server import paddle_serving_server as paddle_serving_server
from paddle_serving_server.rpc_service import MultiLangServerServiceServicer from paddle_serving_server.serve import format_gpu_to_strlist
from .proto import server_configure_pb2 as server_sdk from .proto import server_configure_pb2 as server_sdk
from .proto import general_model_config_pb2 as m_config from .proto import general_model_config_pb2 as m_config
from .proto import multi_lang_general_model_service_pb2_grpc
import google.protobuf.text_format import google.protobuf.text_format
import time import time
from .version import version_tag, version_suffix, device_type from .version import version_tag, version_suffix, device_type
...@@ -32,7 +31,6 @@ if sys.platform.startswith('win') is False: ...@@ -32,7 +31,6 @@ if sys.platform.startswith('win') is False:
import shutil import shutil
import platform import platform
import numpy as np import numpy as np
import grpc
import sys import sys
import collections import collections
import subprocess import subprocess
...@@ -41,6 +39,8 @@ from multiprocessing import Pool, Process ...@@ -41,6 +39,8 @@ from multiprocessing import Pool, Process
from concurrent import futures from concurrent import futures
# The whole file is about to be discarded.
# A default config file will be used to start the C++ server instead.
class Server(object): class Server(object):
def __init__(self): def __init__(self):
""" """
...@@ -81,8 +81,11 @@ class Server(object): ...@@ -81,8 +81,11 @@ class Server(object):
self.use_local_bin = False self.use_local_bin = False
self.mkl_flag = False self.mkl_flag = False
self.device = "cpu" self.device = "cpu"
self.gpuid = 0 self.gpuid = []
self.op_num = [0]
self.op_max_batch = [32]
self.use_trt = False self.use_trt = False
self.gpu_multi_stream = False
self.use_lite = False self.use_lite = False
self.use_xpu = False self.use_xpu = False
self.model_config_paths = collections.OrderedDict() self.model_config_paths = collections.OrderedDict()
...@@ -137,11 +140,13 @@ class Server(object): ...@@ -137,11 +140,13 @@ class Server(object):
def set_ir_optimize(self, flag=False): def set_ir_optimize(self, flag=False):
self.ir_optimization = flag self.ir_optimization = flag
# Multi-Server does not have this Function.
def set_product_name(self, product_name=None): def set_product_name(self, product_name=None):
if product_name == None: if product_name == None:
raise ValueError("product_name can't be None.") raise ValueError("product_name can't be None.")
self.product_name = product_name self.product_name = product_name
# Multi-Server does not have this Function.
def set_container_id(self, container_id): def set_container_id(self, container_id):
if container_id == None: if container_id == None:
raise ValueError("container_id can't be None.") raise ValueError("container_id can't be None.")
...@@ -163,12 +168,21 @@ class Server(object): ...@@ -163,12 +168,21 @@ class Server(object):
def set_device(self, device="cpu"): def set_device(self, device="cpu"):
self.device = device self.device = device
def set_gpuid(self, gpuid=0): def set_gpuid(self, gpuid):
self.gpuid = gpuid self.gpuid = format_gpu_to_strlist(gpuid)
def set_op_num(self, op_num):
self.op_num = op_num
def set_op_max_batch(self, op_max_batch):
self.op_max_batch = op_max_batch
def set_trt(self): def set_trt(self):
self.use_trt = True self.use_trt = True
def set_gpu_multi_stream(self):
self.gpu_multi_stream = True
def set_lite(self): def set_lite(self):
self.use_lite = True self.use_lite = True
...@@ -176,9 +190,33 @@ class Server(object): ...@@ -176,9 +190,33 @@ class Server(object):
self.use_xpu = True self.use_xpu = True
def _prepare_engine(self, model_config_paths, device, use_encryption_model): def _prepare_engine(self, model_config_paths, device, use_encryption_model):
self.device = device
if self.model_toolkit_conf == None: if self.model_toolkit_conf == None:
self.model_toolkit_conf = [] self.model_toolkit_conf = []
# Generally, self.gpuid is a list of strings (possibly empty).
# len(self.gpuid) == 0 means no gpuid was specified.
# In that case, if self.device == "gpu" or use_trt/gpu_multi_stream is set,
# we assume you forgot to set gpuid and default it to ["0"].
if len(self.gpuid) == 0 or self.gpuid == ["-1"]:
if self.device == "gpu" or self.use_trt or self.gpu_multi_stream:
self.gpuid = ["0"]
self.device = "gpu"
else:
self.gpuid = ["-1"]
if isinstance(self.op_num, int):
self.op_num = [self.op_num]
if len(self.op_num) == 0:
self.op_num.append(0)
if isinstance(self.op_max_batch, int):
self.op_max_batch = [self.op_max_batch]
if len(self.op_max_batch) == 0:
self.op_max_batch.append(32)
index = 0
for engine_name, model_config_path in model_config_paths.items(): for engine_name, model_config_path in model_config_paths.items():
engine = server_sdk.EngineDesc() engine = server_sdk.EngineDesc()
engine.name = engine_name engine.name = engine_name
...@@ -186,18 +224,39 @@ class Server(object): ...@@ -186,18 +224,39 @@ class Server(object):
engine.reloadable_meta = model_config_path + "/fluid_time_file" engine.reloadable_meta = model_config_path + "/fluid_time_file"
os.system("touch {}".format(engine.reloadable_meta)) os.system("touch {}".format(engine.reloadable_meta))
engine.reloadable_type = "timestamp_ne" engine.reloadable_type = "timestamp_ne"
engine.runtime_thread_num = 0 engine.runtime_thread_num = self.op_num[index % len(self.op_num)]
engine.batch_infer_size = 0 engine.batch_infer_size = self.op_max_batch[index %
engine.enable_batch_align = 0 len(self.op_max_batch)]
engine.enable_batch_align = 1
engine.model_dir = model_config_path engine.model_dir = model_config_path
engine.enable_memory_optimization = self.memory_optimization engine.enable_memory_optimization = self.memory_optimization
engine.enable_ir_optimization = self.ir_optimization engine.enable_ir_optimization = self.ir_optimization
engine.use_trt = self.use_trt engine.use_trt = self.use_trt
engine.gpu_multi_stream = self.gpu_multi_stream
engine.use_lite = self.use_lite engine.use_lite = self.use_lite
engine.use_xpu = self.use_xpu engine.use_xpu = self.use_xpu
engine.use_gpu = False engine.use_gpu = False
if self.device == "gpu":
if len(self.gpuid) == 0:
raise ValueError("CPU: self.gpuid = -1, GPU: must set it ")
op_gpu_list = self.gpuid[index % len(self.gpuid)].split(",")
for ids in op_gpu_list:
engine.gpu_ids.extend([int(ids)])
if self.device == "gpu" or self.use_trt or self.gpu_multi_stream:
engine.use_gpu = True engine.use_gpu = True
# This handles mixed use of GPU and CPU:
# model-1 may use GPU with device == "gpu",
# while gpuid[1] == "-1" means model-2 runs on CPU,
# so the GPU-related flags must be turned off for that engine.
# op_gpu_list = gpuid[index].split(",")
# is the gpu id list for this particular engine.
if len(op_gpu_list) == 1:
if int(op_gpu_list[0]) == -1:
engine.use_gpu = False
engine.gpu_multi_stream = False
engine.use_trt = False
if os.path.exists('{}/__params__'.format(model_config_path)): if os.path.exists('{}/__params__'.format(model_config_path)):
engine.combined_model = True engine.combined_model = True
...@@ -208,6 +267,7 @@ class Server(object): ...@@ -208,6 +267,7 @@ class Server(object):
engine.type = "PADDLE_INFER" engine.type = "PADDLE_INFER"
self.model_toolkit_conf.append(server_sdk.ModelToolkitConf()) self.model_toolkit_conf.append(server_sdk.ModelToolkitConf())
self.model_toolkit_conf[-1].engines.extend([engine]) self.model_toolkit_conf[-1].engines.extend([engine])
index = index + 1
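In effect, gpuid, op_num and op_max_batch are assigned to the engines round-robin by index, as the index % len(...) lookups above show. A standalone sketch of just that indexing rule, with hypothetical engine names (not the real EngineDesc objects):

gpuid = ["0,1", "-1"]   # model 0 on GPUs 0 and 1, model 1 on CPU
op_num = [4]            # a single value is reused for every engine
op_max_batch = [32, 8]

for index, engine_name in enumerate(["general_infer_0", "general_infer_1"]):
    gpus = gpuid[index % len(gpuid)].split(",")
    threads = op_num[index % len(op_num)]
    max_batch = op_max_batch[index % len(op_max_batch)]
    print(engine_name, gpus, threads, max_batch)
# general_infer_0 ['0', '1'] 4 32
# general_infer_1 ['-1'] 4 8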
def _prepare_infer_service(self, port): def _prepare_infer_service(self, port):
if self.infer_service_conf == None: if self.infer_service_conf == None:
...@@ -332,7 +392,11 @@ class Server(object): ...@@ -332,7 +392,11 @@ class Server(object):
self.mkl_flag = flag self.mkl_flag = flag
def check_avx(self): def check_avx(self):
p = subprocess.Popen(['cat /proc/cpuinfo | grep avx 2>/dev/null'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) p = subprocess.Popen(
['cat /proc/cpuinfo | grep avx 2>/dev/null'],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
shell=True)
out, err = p.communicate() out, err = p.communicate()
if err == b'' and len(out) > 0: if err == b'' and len(out) > 0:
return True return True
...@@ -428,9 +492,17 @@ class Server(object): ...@@ -428,9 +492,17 @@ class Server(object):
def prepare_server(self, def prepare_server(self,
workdir=None, workdir=None,
port=9292, port=9292,
device="cpu", device=None,
use_encryption_model=False, use_encryption_model=False,
cube_conf=None): cube_conf=None):
# If `device` is not set, use self.device;
# self.device may still hold its default value
# or may have been changed by set_device.
if device == None:
device = self.device
# if `device` is set, let self.device = device.
else:
self.device = device
if workdir == None: if workdir == None:
workdir = "./tmp" workdir = "./tmp"
os.system("mkdir -p {}".format(workdir)) os.system("mkdir -p {}".format(workdir))
...@@ -484,237 +556,39 @@ class Server(object): ...@@ -484,237 +556,39 @@ class Server(object):
else: else:
print("Use local bin : {}".format(self.bin_path)) print("Use local bin : {}".format(self.bin_path))
#self.check_cuda() #self.check_cuda()
# Todo: merge CPU and GPU code, remove device to model_toolkit command = "{} " \
if self.device == "cpu" or self.device == "arm": "-enable_model_toolkit " \
command = "{} " \ "-inferservice_path {} " \
"-enable_model_toolkit " \ "-inferservice_file {} " \
"-inferservice_path {} " \ "-max_concurrency {} " \
"-inferservice_file {} " \ "-num_threads {} " \
"-max_concurrency {} " \ "-port {} " \
"-num_threads {} " \ "-precision {} " \
"-port {} " \ "-use_calib {} " \
"-precision {} " \ "-reload_interval_s {} " \
"-use_calib {} " \ "-resource_path {} " \
"-reload_interval_s {} " \ "-resource_file {} " \
"-resource_path {} " \ "-workflow_path {} " \
"-resource_file {} " \ "-workflow_file {} " \
"-workflow_path {} " \ "-bthread_concurrency {} " \
"-workflow_file {} " \ "-max_body_size {} ".format(
"-bthread_concurrency {} " \ self.bin_path,
"-max_body_size {} ".format( self.workdir,
self.bin_path, self.infer_service_fn,
self.workdir, self.max_concurrency,
self.infer_service_fn, self.num_threads,
self.max_concurrency, self.port,
self.num_threads, self.precision,
self.port, self.use_calib,
self.precision, self.reload_interval_s,
self.use_calib, self.workdir,
self.reload_interval_s, self.resource_fn,
self.workdir, self.workdir,
self.resource_fn, self.workflow_fn,
self.workdir, self.num_threads,
self.workflow_fn, self.max_body_size)
self.num_threads,
self.max_body_size)
else:
command = "{} " \
"-enable_model_toolkit " \
"-inferservice_path {} " \
"-inferservice_file {} " \
"-max_concurrency {} " \
"-num_threads {} " \
"-port {} " \
"-precision {} " \
"-use_calib {} " \
"-reload_interval_s {} " \
"-resource_path {} " \
"-resource_file {} " \
"-workflow_path {} " \
"-workflow_file {} " \
"-bthread_concurrency {} " \
"-gpuid {} " \
"-max_body_size {} ".format(
self.bin_path,
self.workdir,
self.infer_service_fn,
self.max_concurrency,
self.num_threads,
self.port,
self.precision,
self.use_calib,
self.reload_interval_s,
self.workdir,
self.resource_fn,
self.workdir,
self.workflow_fn,
self.num_threads,
self.gpuid,
self.max_body_size)
print("Going to Run Comand") print("Going to Run Comand")
print(command) print(command)
os.system(command) os.system(command)
class MultiLangServer(object):
def __init__(self):
self.bserver_ = Server()
self.worker_num_ = 4
self.body_size_ = 64 * 1024 * 1024
self.concurrency_ = 100000
self.is_multi_model_ = False # for model ensemble, which is not useful right now.
def set_max_concurrency(self, concurrency):
self.concurrency_ = concurrency
self.bserver_.set_max_concurrency(concurrency)
def set_device(self, device="cpu"):
self.device = device
def set_num_threads(self, threads):
self.worker_num_ = threads
self.bserver_.set_num_threads(threads)
def set_max_body_size(self, body_size):
self.bserver_.set_max_body_size(body_size)
if body_size >= self.body_size_:
self.body_size_ = body_size
else:
print(
"max_body_size is less than default value, will use default value in service."
)
def use_encryption_model(self, flag=False):
self.encryption_model = flag
def set_port(self, port):
self.gport_ = port
def set_precision(self, precision="fp32"):
self.precision = precision
def set_use_calib(self, use_calib=False):
self.use_calib = use_calib
def set_reload_interval(self, interval):
self.bserver_.set_reload_interval(interval)
def set_op_sequence(self, op_seq):
self.bserver_.set_op_sequence(op_seq)
def set_op_graph(self, op_graph):
self.bserver_.set_op_graph(op_graph)
def use_mkl(self, flag):
self.bserver_.use_mkl(flag)
def set_memory_optimize(self, flag=False):
self.bserver_.set_memory_optimize(flag)
def set_ir_optimize(self, flag=False):
self.bserver_.set_ir_optimize(flag)
def set_gpuid(self, gpuid=0):
self.bserver_.set_gpuid(gpuid)
def load_model_config(self,
server_config_dir_paths,
client_config_path=None):
if isinstance(server_config_dir_paths, str):
server_config_dir_paths = [server_config_dir_paths]
elif isinstance(server_config_dir_paths, list):
pass
else:
raise Exception("The type of model_config_paths must be str or list"
", not {}.".format(type(server_config_dir_paths)))
for single_model_config in server_config_dir_paths:
if os.path.isdir(single_model_config):
pass
elif os.path.isfile(single_model_config):
raise ValueError(
"The input of --model should be a dir not file.")
self.bserver_.load_model_config(server_config_dir_paths)
if client_config_path is None:
#now dict is not useful.
if isinstance(server_config_dir_paths, dict):
self.is_multi_model_ = True
client_config_path = []
for server_config_path_items in list(
server_config_dir_paths.items()):
client_config_path.append(server_config_path_items[1])
elif isinstance(server_config_dir_paths, list):
self.is_multi_model_ = False
client_config_path = server_config_dir_paths
else:
raise Exception(
"The type of model_config_paths must be str or list or "
"dict({op: model_path}), not {}.".format(
type(server_config_dir_paths)))
if isinstance(client_config_path, str):
client_config_path = [client_config_path]
elif isinstance(client_config_path, list):
pass
else: # dict is not support right now.
raise Exception(
"The type of client_config_path must be str or list or "
"dict({op: model_path}), not {}.".format(
type(client_config_path)))
if len(client_config_path) != len(server_config_dir_paths):
raise Warning(
"The len(client_config_path) is {}, != len(server_config_dir_paths) {}."
.format(len(client_config_path), len(server_config_dir_paths)))
self.bclient_config_path_list = client_config_path
def prepare_server(self,
workdir=None,
port=9292,
device="cpu",
use_encryption_model=False,
cube_conf=None):
if not self._port_is_available(port):
raise SystemExit("Port {} is already used".format(port))
default_port = 12000
self.port_list_ = []
for i in range(1000):
if default_port + i != port and self._port_is_available(default_port
+ i):
self.port_list_.append(default_port + i)
break
self.bserver_.prepare_server(
workdir=workdir,
port=self.port_list_[0],
device=device,
use_encryption_model=use_encryption_model,
cube_conf=cube_conf)
self.set_port(port)
def _launch_brpc_service(self, bserver):
bserver.run_server()
def _port_is_available(self, port):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
sock.settimeout(2)
result = sock.connect_ex(('0.0.0.0', port))
return result != 0
def run_server(self):
p_bserver = Process(
target=self._launch_brpc_service, args=(self.bserver_, ))
p_bserver.start()
options = [('grpc.max_send_message_length', self.body_size_),
('grpc.max_receive_message_length', self.body_size_)]
server = grpc.server(
futures.ThreadPoolExecutor(max_workers=self.worker_num_),
options=options,
maximum_concurrent_rpcs=self.concurrency_)
multi_lang_general_model_service_pb2_grpc.add_MultiLangGeneralModelServiceServicer_to_server(
MultiLangServerServiceServicer(
self.bclient_config_path_list, self.is_multi_model_,
["0.0.0.0:{}".format(self.port_list_[0])]), server)
server.add_insecure_port('[::]:{}'.format(self.gport_))
server.start()
p_bserver.join()
server.wait_for_termination()
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#!flask/bin/python #!flask/bin/python
# pylint: disable=doc-string-missing # pylint: disable=doc-string-missing
# Now, this is only for Pipeline.
from flask import Flask, request, abort from flask import Flask, request, abort
from contextlib import closing from contextlib import closing
from multiprocessing import Pool, Process, Queue from multiprocessing import Pool, Process, Queue
...@@ -26,6 +27,7 @@ import numpy as np ...@@ -26,6 +27,7 @@ import numpy as np
import os import os
from paddle_serving_server import pipeline from paddle_serving_server import pipeline
from paddle_serving_server.pipeline import Op from paddle_serving_server.pipeline import Op
from paddle_serving_server.serve import format_gpu_to_strlist
def port_is_available(port): def port_is_available(port):
...@@ -44,7 +46,7 @@ class WebService(object): ...@@ -44,7 +46,7 @@ class WebService(object):
# pipeline # pipeline
self._server = pipeline.PipelineServer(self.name) self._server = pipeline.PipelineServer(self.name)
self.gpus = [] # deprecated self.gpus = ["-1"] # deprecated
self.rpc_service_list = [] # deprecated self.rpc_service_list = [] # deprecated
def get_pipeline_response(self, read_op): def get_pipeline_response(self, read_op):
...@@ -91,7 +93,7 @@ class WebService(object): ...@@ -91,7 +93,7 @@ class WebService(object):
f = open(file_path_list[0], 'r') f = open(file_path_list[0], 'r')
model_conf = google.protobuf.text_format.Merge( model_conf = google.protobuf.text_format.Merge(
str(f.read()), model_conf) str(f.read()), model_conf)
self.feed_vars = {var.name: var for var in model_conf.feed_var} self.feed_vars = {var.alias_name: var for var in model_conf.feed_var}
if len(file_path_list) > 1: if len(file_path_list) > 1:
model_conf = m_config.GeneralModelConfig() model_conf = m_config.GeneralModelConfig()
...@@ -99,31 +101,58 @@ class WebService(object): ...@@ -99,31 +101,58 @@ class WebService(object):
model_conf = google.protobuf.text_format.Merge( model_conf = google.protobuf.text_format.Merge(
str(f.read()), model_conf) str(f.read()), model_conf)
self.fetch_vars = {var.name: var for var in model_conf.fetch_var} self.fetch_vars = {var.alias_name: var for var in model_conf.fetch_var}
if client_config_path == None: if client_config_path == None:
self.client_config_path = file_path_list self.client_config_path = file_path_list
# after this function, self.gpus should be a list of str or [].
def set_gpus(self, gpus): def set_gpus(self, gpus):
print("This API will be deprecated later. Please do not use it") print("This API will be deprecated later. Please do not use it")
self.gpus = [int(x) for x in gpus.split(",")] self.gpus = format_gpu_to_strlist(gpus)
# This function can be called by the user or by create_rpc_config.
# If called by the user, the user may either call set_gpus or pass `gpus`.
# `gpus` == None means it was not set at all, and self.gpus is used instead;
# otherwise `gpus` takes precedence, i.e. when both set_gpus and `gpus`
# are given, `gpus` wins.
def default_rpc_service(self, def default_rpc_service(self,
workdir="conf", workdir,
port=9292, port=9292,
gpuid=0, gpus=None,
thread_num=2, thread_num=2,
mem_optim=True, mem_optim=True,
use_lite=False, use_lite=False,
use_xpu=False, use_xpu=False,
ir_optim=False, ir_optim=False,
precision="fp32", precision="fp32",
use_calib=False): use_calib=False,
device = "gpu" use_trt=False,
if gpuid == -1: gpu_multi_stream=False,
op_num=None,
op_max_batch=None):
device = "cpu"
server = Server()
# Only when `gpus` is None, meaning it was not set at all,
# do we fall back to self.gpus.
if gpus == None:
gpus = self.gpus
gpus = format_gpu_to_strlist(gpus)
server.set_gpuid(gpus)
if len(gpus) == 0 or gpus == ["-1"]:
if use_lite: if use_lite:
device = "arm" device = "arm"
else: else:
device = "cpu" device = "cpu"
else:
device = "gpu"
op_maker = OpMaker() op_maker = OpMaker()
op_seq_maker = OpSeqMaker() op_seq_maker = OpSeqMaker()
...@@ -142,7 +171,6 @@ class WebService(object): ...@@ -142,7 +171,6 @@ class WebService(object):
general_response_op = op_maker.create('general_response') general_response_op = op_maker.create('general_response')
op_seq_maker.add_op(general_response_op) op_seq_maker.add_op(general_response_op)
server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence()) server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(thread_num) server.set_num_threads(thread_num)
server.set_memory_optimize(mem_optim) server.set_memory_optimize(mem_optim)
...@@ -151,6 +179,19 @@ class WebService(object): ...@@ -151,6 +179,19 @@ class WebService(object):
server.set_precision(precision) server.set_precision(precision)
server.set_use_calib(use_calib) server.set_use_calib(use_calib)
if use_trt and device == "gpu":
server.set_trt()
server.set_ir_optimize(True)
if gpu_multi_stream and device == "gpu":
server.set_gpu_multi_stream()
if op_num:
server.set_op_num(op_num)
if op_max_batch:
server.set_op_max_batch(op_max_batch)
if use_lite: if use_lite:
server.set_lite() server.set_lite()
if use_xpu: if use_xpu:
...@@ -158,86 +199,87 @@ class WebService(object): ...@@ -158,86 +199,87 @@ class WebService(object):
server.load_model_config(self.server_config_dir_paths server.load_model_config(self.server_config_dir_paths
) #brpc Server support server_config_dir_paths ) #brpc Server support server_config_dir_paths
if gpuid >= 0:
server.set_gpuid(gpuid)
server.prepare_server(workdir=workdir, port=port, device=device) server.prepare_server(workdir=workdir, port=port, device=device)
return server return server
def _launch_rpc_service(self, service_idx): def _launch_rpc_service(self, service_idx):
self.rpc_service_list[service_idx].run_server() self.rpc_service_list[service_idx].run_server()
# If this function is used, self.gpus must already be set;
# if not, the default value self.gpus = ["-1"] is used,
# so we always pass gpus=self.gpus.
def create_rpc_config(self): def create_rpc_config(self):
if len(self.gpus) == 0: self.rpc_service_list.append(
# init cpu service self.default_rpc_service(
self.rpc_service_list.append( self.workdir,
self.default_rpc_service( self.port_list[0],
self.workdir, self.gpus,
self.port_list[0], thread_num=self.thread_num,
-1, mem_optim=self.mem_optim,
thread_num=self.thread_num, use_lite=self.use_lite,
mem_optim=self.mem_optim, use_xpu=self.use_xpu,
use_lite=self.use_lite, ir_optim=self.ir_optim,
use_xpu=self.use_xpu, precision=self.precision,
ir_optim=self.ir_optim, use_calib=self.use_calib,
precision=self.precision, use_trt=self.use_trt,
use_calib=self.use_calib)) gpu_multi_stream=self.gpu_multi_stream,
else: op_num=self.op_num,
for i, gpuid in enumerate(self.gpus): op_max_batch=self.op_max_batch))
self.rpc_service_list.append(
self.default_rpc_service(
"{}_{}".format(self.workdir, i),
self.port_list[i],
gpuid,
thread_num=self.thread_num,
mem_optim=self.mem_optim,
use_lite=self.use_lite,
use_xpu=self.use_xpu,
ir_optim=self.ir_optim,
precision=self.precision,
use_calib=self.use_calib))
def prepare_server(self, def prepare_server(self,
workdir="", workdir,
port=9393, port=9393,
device="gpu", device="cpu",
precision="fp32", precision="fp32",
use_calib=False, use_calib=False,
use_lite=False, use_lite=False,
use_xpu=False, use_xpu=False,
ir_optim=False, ir_optim=False,
gpuid=0,
thread_num=2, thread_num=2,
mem_optim=True): mem_optim=True,
use_trt=False,
gpu_multi_stream=False,
op_num=None,
op_max_batch=None,
gpuid=None):
print("This API will be deprecated later. Please do not use it") print("This API will be deprecated later. Please do not use it")
self.workdir = workdir self.workdir = workdir
self.port = port self.port = port
self.thread_num = thread_num self.thread_num = thread_num
self.device = device # self.device is not used at all.
# device is set by gpuid.
self.precision = precision self.precision = precision
self.use_calib = use_calib self.use_calib = use_calib
self.use_lite = use_lite self.use_lite = use_lite
self.use_xpu = use_xpu self.use_xpu = use_xpu
self.ir_optim = ir_optim self.ir_optim = ir_optim
self.mem_optim = mem_optim self.mem_optim = mem_optim
self.gpuid = gpuid
self.port_list = [] self.port_list = []
self.use_trt = use_trt
self.gpu_multi_stream = gpu_multi_stream
self.op_num = op_num
self.op_max_batch = op_max_batch
# If gpuid != None, gpuid takes precedence;
# otherwise self.gpus is kept unchanged
# (it may already have been set via set_gpus).
if gpuid != None:
self.gpus = format_gpu_to_strlist(gpuid)
else:
pass
default_port = 12000 default_port = 12000
for i in range(1000): for i in range(1000):
if port_is_available(default_port + i): if port_is_available(default_port + i):
self.port_list.append(default_port + i) self.port_list.append(default_port + i)
if len(self.port_list) > len(self.gpus):
break break
def _launch_web_service(self): def _launch_web_service(self):
gpu_num = len(self.gpus)
self.client = Client() self.client = Client()
self.client.load_client_config(self.client_config_path) self.client.load_client_config(self.client_config_path)
endpoints = "" endpoints = ""
if gpu_num > 0: endpoints = "127.0.0.1:{}".format(self.port_list[0])
for i in range(gpu_num):
endpoints += "127.0.0.1:{},".format(self.port_list[i])
else:
endpoints = "127.0.0.1:{}".format(self.port_list[0])
self.client.connect([endpoints]) self.client.connect([endpoints])
def get_prediction(self, request): def get_prediction(self, request):
...@@ -322,12 +364,13 @@ class WebService(object): ...@@ -322,12 +364,13 @@ class WebService(object):
if gpu: if gpu:
# if user forget to call function `set_gpus` to set self.gpus. # if user forget to call function `set_gpus` to set self.gpus.
# default self.gpus = [0]. # default self.gpus = [0].
if len(self.gpus) == 0: if len(self.gpus) == 0 or self.gpus == ["-1"]:
self.gpus.append(0) self.gpus = ["0"]
# Right now the local Predictor only supports one card;
# no matter how many gpu ids are in gpus, only the first one is used.
gpu_id = (self.gpus[0].split(","))[0]
self.client.load_model_config( self.client.load_model_config(
self.server_config_dir_paths[0], self.server_config_dir_paths[0], use_gpu=True, gpu_id=gpu_id)
use_gpu=True,
gpu_id=self.gpus[0])
else: else:
self.client.load_model_config( self.client.load_model_config(
self.server_config_dir_paths[0], use_gpu=False) self.server_config_dir_paths[0], use_gpu=False)
......
...@@ -16,7 +16,7 @@ from time import time as _time ...@@ -16,7 +16,7 @@ from time import time as _time
import time import time
import threading import threading
import multiprocessing import multiprocessing
from paddle_serving_client import MultiLangClient, Client from paddle_serving_client import Client
from concurrent import futures from concurrent import futures
import logging import logging
import func_timeout import func_timeout
...@@ -330,8 +330,9 @@ class Op(object): ...@@ -330,8 +330,9 @@ class Op(object):
if self.client_type == 'brpc': if self.client_type == 'brpc':
client = Client() client = Client()
client.load_client_config(client_config) client.load_client_config(client_config)
elif self.client_type == 'grpc': # To be replaced by brpc-http once testing is complete.
client = MultiLangClient() # elif self.client_type == 'grpc':
# client = MultiLangClient()
elif self.client_type == 'local_predictor': elif self.client_type == 'local_predictor':
if self.local_predictor is None: if self.local_predictor is None:
raise ValueError("local predictor not yet created") raise ValueError("local predictor not yet created")
...@@ -474,10 +475,13 @@ class Op(object): ...@@ -474,10 +475,13 @@ class Op(object):
fetch=self._fetch_names, fetch=self._fetch_names,
batch=True, batch=True,
log_id=typical_logid) log_id=typical_logid)
# To be replaced by HttpClient later.
'''
if isinstance(self.client, MultiLangClient): if isinstance(self.client, MultiLangClient):
if call_result is None or call_result["serving_status_code"] != 0: if call_result is None or call_result["serving_status_code"] != 0:
return None return None
call_result.pop("serving_status_code") call_result.pop("serving_status_code")
'''
return call_result return call_result
def postprocess(self, input_data, fetch_data, log_id=0): def postprocess(self, input_data, fetch_data, log_id=0):
......
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
@@ -21,17 +21,33 @@ option cc_generic_services = true;
 message Tensor {
   repeated bytes data = 1;
-  optional int32 elem_type = 2;
-  repeated int32 shape = 3;
+  repeated int32 int_data = 2;
+  repeated int64 int64_data = 3;
+  repeated float float_data = 4;
+  optional int32 elem_type =
+      5; // 0 means int64, 1 means float32, 2 means int32, 3 means bytes(string)
+  repeated int32 shape = 6;        // shape should include batch
+  repeated int32 lod = 7;          // only for fetch tensor currently
+  optional string name = 8;        // get from the Model prototxt
+  optional string alias_name = 9;  // get from the Model prototxt
 };

-message FeedInst { repeated Tensor tensor_array = 1; };
-
-message FetchInst { repeated Tensor tensor_array = 1; };
-
-message Request { repeated FeedInst insts = 1; };
-
-message Response { repeated FetchInst insts = 1; };
+message Request {
+  repeated Tensor tensor = 1;
+  repeated string fetch_var_names = 2;
+  optional bool profile_server = 3 [ default = false ];
+  required uint64 log_id = 4 [ default = 0 ];
+};
+
+message Response {
+  repeated ModelOutput outputs = 1;
+  repeated int64 profile_time = 2;
+};
+
+message ModelOutput {
+  repeated Tensor tensor = 1;
+  optional string engine_name = 2;
+}

 service GeneralModelService {
   rpc inference(Request) returns (Response);
...
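For readers who talk to this service directly, here is a rough sketch of how the reworked `Request`/`Tensor` messages might be filled from Python. The import path for the compiled proto is an assumption (it depends on how the .proto is built); the field usage follows the definitions above.

```python
import numpy as np
# Hypothetical import path for the generated module; adjust to your build.
# from paddle_serving_server.proto import general_model_service_pb2 as pb

def build_request(pb, feed_name, fetch_names, array, log_id=0):
    """Pack one float32 ndarray into the flattened Request layout above."""
    req = pb.Request()
    req.log_id = log_id
    req.fetch_var_names.extend(fetch_names)

    tensor = req.tensor.add()
    tensor.name = feed_name            # taken from the model prototxt
    tensor.alias_name = feed_name
    tensor.elem_type = 1               # 1 means float32 (see the elem_type comment)
    tensor.shape.extend(array.shape)   # shape includes the batch dimension
    tensor.float_data.extend(array.astype(np.float32).ravel().tolist())
    return req
```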
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
@@ -41,7 +41,7 @@ build_whl_list=(build_cpu_server build_gpu_server build_client build_app)
 rpc_model_list=(grpc_fit_a_line grpc_yolov4 pipeline_imagenet bert_rpc_gpu bert_rpc_cpu ResNet50_rpc \
 lac_rpc cnn_rpc bow_rpc lstm_rpc fit_a_line_rpc deeplabv3_rpc mobilenet_rpc unet_rpc resnetv2_rpc \
 criteo_ctr_rpc_cpu criteo_ctr_rpc_gpu ocr_rpc yolov4_rpc_gpu faster_rcnn_hrnetv2p_w18_1x_encrypt \
-low_precision_resnet50_int8 ocr_c++_service)
+faster_rcnn_model_rpc low_precision_resnet50_int8 ocr_c++_service)
 http_model_list=(fit_a_line_http lac_http cnn_http bow_http lstm_http ResNet50_http bert_http \
 pipeline_ocr_cpu_http)
@@ -120,31 +120,66 @@ function check() {
     fi
 }

+function check_gpu_memory() {
+    gpu_memory=`nvidia-smi --id=$1 --format=csv,noheader --query-gpu=memory.used | awk '{print $1}'`
+    echo -e "${GREEN_COLOR}-------id-$1 gpu_memory_used: ${gpu_memory}${RES}"
+    if [ ${gpu_memory} -le 100 ]; then
+        echo "-------GPU-$1 is not used"
+        status="GPU-$1 is not used"
+    else
+        echo "-------GPU_memory used is expected"
+    fi
+}
+
 function check_result() {
     if [ $? == 0 ]; then
         echo -e "${GREEN_COLOR}$1 execute normally${RES}"
         if [ $1 == "server" ]; then
             sleep $2
-            tail ${dir}server_log.txt | tee -a ${log_dir}server_total.txt
+            cat ${dir}server_log.txt | tee -a ${log_dir}server_total.txt
         fi
         if [ $1 == "client" ]; then
-            tail ${dir}client_log.txt | tee -a ${log_dir}client_total.txt
+            cat ${dir}client_log.txt | tee -a ${log_dir}client_total.txt
             grep -E "${error_words}" ${dir}client_log.txt > /dev/null
             if [ $? == 0 ]; then
+                if [ "${status}" != "" ]; then
+                    status="${status}|Failed"
+                else
+                    status="Failed"
+                fi
                 echo -e "${RED_COLOR}$1 error command${RES}\n" | tee -a ${log_dir}server_total.txt ${log_dir}client_total.txt
-                echo -e "--------------pipeline.log:----------------\n"
+                echo "--------------server log:--------------"
+                cat ${dir}server_log.txt
+                echo "--------------client log:--------------"
+                cat ${dir}client_log.txt
+                echo "--------------pipeline.log:----------------"
                 cat PipelineServingLogs/pipeline.log
-                echo -e "-------------------------------------------\n"
+                echo "-------------------------------------------\n"
                 error_log $2
             else
+                if [ "${status}" != "" ]; then
+                    error_log $2
+                fi
                 echo -e "${GREEN_COLOR}$2${RES}\n" | tee -a ${log_dir}server_total.txt ${log_dir}client_total.txt
             fi
         fi
     else
         echo -e "${RED_COLOR}$1 error command${RES}\n" | tee -a ${log_dir}server_total.txt ${log_dir}client_total.txt
-        tail ${dir}client_log.txt | tee -a ${log_dir}client_total.txt
+        echo "--------------server log:--------------"
+        cat ${dir}server_log.txt
+        echo "--------------client log:--------------"
+        cat ${dir}client_log.txt
+        echo "--------------pipeline.log:----------------"
+        cat PipelineServingLogs/pipeline.log
+        echo "-------------------------------------------\n"
+        if [ "${status}" != "" ]; then
+            status="${status}|Failed"
+        else
+            status="Failed"
+        fi
         error_log $2
     fi
+    status=""
 }

 function error_log() {
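The new `check_gpu_memory` helper (and the `status` bookkeeping it feeds into `check_result`) only records whether a card looks untouched. For reference, a rough Python equivalent of the memory probe; this is an illustration only, the CI keeps the shell version.

```python
import subprocess

def check_gpu_memory(gpu_id, threshold_mb=100):
    """Return False if the card reports <= threshold_mb MiB in use (same check as above)."""
    out = subprocess.check_output(
        ["nvidia-smi", f"--id={gpu_id}", "--format=csv,noheader",
         "--query-gpu=memory.used"],
        text=True,
    )
    used_mb = int(out.split()[0])   # e.g. "1530 MiB" -> 1530
    if used_mb <= threshold_mb:
        print(f"-------GPU-{gpu_id} is not used")
        return False
    print("-------GPU_memory used is expected")
    return True
```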
@@ -163,7 +198,7 @@ function error_log() {
     echo "deployment: ${deployment// /_}" | tee -a ${log_dir}error_models.txt
     echo "py_version: ${py_version}" | tee -a ${log_dir}error_models.txt
     echo "cuda_version: ${cuda_version}" | tee -a ${log_dir}error_models.txt
-    echo "status: Failed" | tee -a ${log_dir}error_models.txt
+    echo "status: ${status}" | tee -a ${log_dir}error_models.txt
     echo -e "-----------------------------\n\n" | tee -a ${log_dir}error_models.txt
     prefix=${arg//\//_}
     for file in ${dir}*
@@ -192,7 +227,7 @@ function link_data() {
 function before_hook() {
     setproxy
     cd ${build_path}/python
-    ${py_version} -m pip install --upgrade pip
+    ${py_version} -m pip install --upgrade pip==21.1.3
     ${py_version} -m pip install requests
     ${py_version} -m pip install -r requirements.txt
     ${py_version} -m pip install numpy==1.16.4
@@ -325,7 +360,7 @@ function low_precision_resnet50_int8 () {
     ${py_version} -m paddle_serving_client.convert --dirname ResNet50_quant
     echo -e "${GREEN_COLOR}low_precision_resnet50_int8_GPU_RPC server started${RES}" | tee -a ${log_dir}server_total.txt
     ${py_version} -m paddle_serving_server.serve --model serving_server --port 9393 --gpu_ids 0 --use_trt --precision int8 > ${dir}server_log.txt 2>&1 &
-    check_result server 10
+    check_result server 15
     echo -e "${GREEN_COLOR}low_precision_resnet50_int8_GPU_RPC client started${RES}" | tee -a ${log_dir}client_total.txt
     ${py_version} resnet50_client.py > ${dir}client_log.txt 2>&1
     check_result client "low_precision_resnet50_int8_GPU_RPC server test completed"
@@ -341,7 +376,7 @@ function faster_rcnn_hrnetv2p_w18_1x_encrypt() {
     ${py_version} encrypt.py
     unsetproxy
     echo -e "${GREEN_COLOR}faster_rcnn_hrnetv2p_w18_1x_ENCRYPTION_GPU_RPC server started${RES}" | tee -a ${log_dir}server_total.txt
-    ${py_version} -m paddle_serving_server.serve --model encrypt_server/ --port 9494 --use_trt --gpu_ids 0 --use_encryption_model > ${dir}server_log.txt 2>&1 &
+    ${py_version} -m paddle_serving_server.serve --model encrypt_server/ --port 9494 --gpu_ids 0 --use_encryption_model > ${dir}server_log.txt 2>&1 &
     check_result server 3
     echo -e "${GREEN_COLOR}faster_rcnn_hrnetv2p_w18_1x_ENCRYPTION_GPU_RPC client started${RES}" | tee -a ${log_dir}client_total.txt
     ${py_version} test_encryption.py 000000570688.jpg > ${dir}client_log.txt 2>&1
@@ -379,6 +414,7 @@ function bert_rpc_gpu() {
     ls -hlst
     ${py_version} -m paddle_serving_server.serve --model bert_seq128_model/ --port 8860 --gpu_ids 0 > ${dir}server_log.txt 2>&1 &
     check_result server 15
+    check_gpu_memory 0
     nvidia-smi
     head data-c.txt | ${py_version} bert_client.py --model bert_seq128_client/serving_client_conf.prototxt > ${dir}client_log.txt 2>&1
     check_result client "bert_GPU_RPC server test completed"
@@ -429,6 +465,7 @@ function ResNet50_rpc() {
     sed -i 's/9696/8863/g' resnet50_rpc_client.py
     ${py_version} -m paddle_serving_server.serve --model ResNet50_vd_model --port 8863 --gpu_ids 0 > ${dir}server_log.txt 2>&1 &
     check_result server 8
+    check_gpu_memory 0
     nvidia-smi
     ${py_version} resnet50_rpc_client.py ResNet50_vd_client_config/serving_client_conf.prototxt > ${dir}client_log.txt 2>&1
     check_result client "ResNet50_GPU_RPC server test completed"
@@ -446,6 +483,7 @@ function ResNet101_rpc() {
     sed -i "22cclient.connect(['127.0.0.1:8864'])" image_rpc_client.py
     ${py_version} -m paddle_serving_server.serve --model ResNet101_vd_model --port 8864 --gpu_ids 0 > ${dir}server_log.txt 2>&1 &
     check_result server 8
+    check_gpu_memory 0
     nvidia-smi
     ${py_version} image_rpc_client.py ResNet101_vd_client_config/serving_client_conf.prototxt > ${dir}client_log.txt 2>&1
     check_result client "ResNet101_GPU_RPC server test completed"
@@ -536,10 +574,11 @@ function faster_rcnn_model_rpc() {
     data_dir=${data}detection/faster_rcnn_r50_fpn_1x_coco/
     link_data ${data_dir}
     sed -i 's/9494/8870/g' test_client.py
-    ${py_version} -m paddle_serving_server.serve --model serving_server --port 8870 --gpu_ids 0 --thread 2 --use_trt > ${dir}server_log.txt 2>&1 &
+    ${py_version} -m paddle_serving_server.serve --model serving_server --port 8870 --gpu_ids 1 --thread 2 > ${dir}server_log.txt 2>&1 &
     echo "faster rcnn running ..."
     nvidia-smi
     check_result server 10
+    check_gpu_memory 1
     ${py_version} test_client.py 000000570688.jpg > ${dir}client_log.txt 2>&1
     nvidia-smi
     check_result client "faster_rcnn_GPU_RPC server test completed"
@@ -556,6 +595,7 @@ function cascade_rcnn_rpc() {
     sed -i "s/9292/8879/g" test_client.py
     ${py_version} -m paddle_serving_server.serve --model serving_server --port 8879 --gpu_ids 0 --thread 2 > ${dir}server_log.txt 2>&1 &
     check_result server 8
+    check_gpu_memory 0
     nvidia-smi
     ${py_version} test_client.py > ${dir}client_log.txt 2>&1
     nvidia-smi
@@ -573,6 +613,7 @@ function deeplabv3_rpc() {
     sed -i "s/9494/8880/g" deeplabv3_client.py
     ${py_version} -m paddle_serving_server.serve --model deeplabv3_server --gpu_ids 0 --port 8880 --thread 2 > ${dir}server_log.txt 2>&1 &
     check_result server 10
+    check_gpu_memory 0
     nvidia-smi
     ${py_version} deeplabv3_client.py > ${dir}client_log.txt 2>&1
     nvidia-smi
@@ -590,6 +631,7 @@ function mobilenet_rpc() {
     sed -i "s/9393/8881/g" mobilenet_tutorial.py
     ${py_version} -m paddle_serving_server.serve --model mobilenet_v2_imagenet_model --gpu_ids 0 --port 8881 > ${dir}server_log.txt 2>&1 &
     check_result server 8
+    check_gpu_memory 0
     nvidia-smi
     ${py_version} mobilenet_tutorial.py > ${dir}client_log.txt 2>&1
     nvidia-smi
@@ -605,8 +647,9 @@ function unet_rpc() {
     data_dir=${data}unet_for_image_seg/
     link_data ${data_dir}
     sed -i "s/9494/8882/g" seg_client.py
-    ${py_version} -m paddle_serving_server.serve --model unet_model --gpu_ids 0 --port 8882 > ${dir}server_log.txt 2>&1 &
+    ${py_version} -m paddle_serving_server.serve --model unet_model --gpu_ids 1 --port 8882 > ${dir}server_log.txt 2>&1 &
     check_result server 8
+    check_gpu_memory 1
     nvidia-smi
     ${py_version} seg_client.py > ${dir}client_log.txt 2>&1
     nvidia-smi
@@ -624,6 +667,7 @@ function resnetv2_rpc() {
     sed -i 's/9393/8883/g' resnet50_v2_tutorial.py
     ${py_version} -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --gpu_ids 0 --port 8883 > ${dir}server_log.txt 2>&1 &
     check_result server 10
+    check_gpu_memory 0
     nvidia-smi
     ${py_version} resnet50_v2_tutorial.py > ${dir}client_log.txt 2>&1
     nvidia-smi
@@ -671,8 +715,9 @@ function criteo_ctr_rpc_gpu() {
     data_dir=${data}criteo_ctr/
     link_data ${data_dir}
     sed -i "s/8885/8886/g" test_client.py
-    ${py_version} -m paddle_serving_server.serve --model ctr_serving_model/ --port 8886 --gpu_ids 0 > ${dir}server_log.txt 2>&1 &
+    ${py_version} -m paddle_serving_server.serve --model ctr_serving_model/ --port 8886 --gpu_ids 1 > ${dir}server_log.txt 2>&1 &
     check_result server 8
+    check_gpu_memory 1
     nvidia-smi
     ${py_version} test_client.py ctr_client_conf/serving_client_conf.prototxt raw_data/part-0 > ${dir}client_log.txt 2>&1
     nvidia-smi
@@ -691,6 +736,7 @@ function yolov4_rpc_gpu() {
     ${py_version} -m paddle_serving_server.serve --model yolov4_model --port 8887 --gpu_ids 0 > ${dir}server_log.txt 2>&1 &
     nvidia-smi
     check_result server 8
+    check_gpu_memory 0
     ${py_version} test_client.py 000000570688.jpg > ${dir}client_log.txt 2>&1
     nvidia-smi
     check_result client "yolov4_GPU_RPC server test completed"
@@ -708,6 +754,7 @@ function senta_rpc_cpu() {
     ${py_version} -m paddle_serving_server.serve --model yolov4_model --port 8887 --gpu_ids 0 > ${dir}server_log.txt 2>&1 &
     nvidia-smi
     check_result server 8
+    check_gpu_memory 0
     ${py_version} test_client.py 000000570688.jpg > ${dir}client_log.txt 2>&1
     nvidia-smi
     check_result client "senta_GPU_RPC server test completed"
@@ -783,13 +830,14 @@ function ResNet50_http() {
     cd ${build_path}/python/examples/imagenet
     ${py_version} resnet50_web_service.py ResNet50_vd_model gpu 8876 > ${dir}server_log.txt 2>&1 &
     check_result server 10
+    check_gpu_memory 0
     curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"image": "https://paddle-serving.bj.bcebos.com/imagenet-example/daisy.jpg"}], "fetch": ["score"]}' http://127.0.0.1:8876/image/prediction > ${dir}client_log.txt 2>&1
     check_result client "ResNet50_GPU_HTTP server test completed"
     kill_server_process
 }

 function bert_http() {
-    dir=${log_dir}http_model/ResNet50_http/
+    dir=${log_dir}http_model/bert_http/
     check_dir ${dir}
     unsetproxy
     cd ${build_path}/python/examples/bert
@@ -804,42 +852,45 @@ function bert_http() {
 }

 function grpc_fit_a_line() {
-    dir=${log_dir}rpc_model/grpc_fit_a_line/
-    check_dir ${dir}
-    unsetproxy
-    cd ${build_path}/python/examples/grpc_impl_example/fit_a_line
-    data_dir=${data}fit_a_line/
-    link_data ${data_dir}
-    ${py_version} test_server.py uci_housing_model/ > ${dir}server_log.txt 2>&1 &
-    check_result server 5
-    echo "sync predict" > ${dir}client_log.txt 2>&1
-    ${py_version} test_sync_client.py >> ${dir}client_log.txt 2>&1
-    check_result client "grpc_impl_example_fit_a_line_sync_CPU_gRPC server sync test completed"
-    echo "async predict" >> ${dir}client_log.txt 2>&1
-    ${py_version} test_asyn_client.py >> ${dir}client_log.txt 2>&1
-    check_result client "grpc_impl_example_fit_a_line_asyn_CPU_gRPC server asyn test completed"
-    echo "batch predict" >> ${dir}client_log.txt 2>&1
-    ${py_version} test_batch_client.py >> ${dir}client_log.txt 2>&1
-    check_result client "grpc_impl_example_fit_a_line_batch_CPU_gRPC server batch test completed"
-    echo "timeout predict" >> ${dir}client_log.txt 2>&1
-    ${py_version} test_timeout_client.py >> ${dir}client_log.txt 2>&1
-    check_result client "grpc_impl_example_fit_a_line_timeout_CPU_gRPC server timeout test completed"
-    kill_server_process
+    echo "pass"
+    # dir=${log_dir}rpc_model/grpc_fit_a_line/
+    # check_dir ${dir}
+    # unsetproxy
+    # cd ${build_path}/python/examples/grpc_impl_example/fit_a_line
+    # data_dir=${data}fit_a_line/
+    # link_data ${data_dir}
+    # ${py_version} test_server.py uci_housing_model/ > ${dir}server_log.txt 2>&1 &
+    # check_result server 5
+    # echo "sync predict" > ${dir}client_log.txt 2>&1
+    # ${py_version} test_sync_client.py >> ${dir}client_log.txt 2>&1
+    # check_result client "grpc_impl_example_fit_a_line_sync_CPU_gRPC server sync test completed"
+    # echo "async predict" >> ${dir}client_log.txt 2>&1
+    # ${py_version} test_asyn_client.py >> ${dir}client_log.txt 2>&1
+    # check_result client "grpc_impl_example_fit_a_line_asyn_CPU_gRPC server asyn test completed"
+    # echo "batch predict" >> ${dir}client_log.txt 2>&1
+    # ${py_version} test_batch_client.py >> ${dir}client_log.txt 2>&1
+    # check_result client "grpc_impl_example_fit_a_line_batch_CPU_gRPC server batch test completed"
+    # echo "timeout predict" >> ${dir}client_log.txt 2>&1
+    # ${py_version} test_timeout_client.py >> ${dir}client_log.txt 2>&1
+    # check_result client "grpc_impl_example_fit_a_line_timeout_CPU_gRPC server timeout test completed"
+    # kill_server_process
 }

 function grpc_yolov4() {
-    dir=${log_dir}rpc_model/grpc_yolov4/
-    cd ${build_path}/python/examples/grpc_impl_example/yolov4
-    check_dir ${dir}
-    data_dir=${data}yolov4/
-    link_data ${data_dir}
-    echo -e "${GREEN_COLOR}grpc_impl_example_yolov4_GPU_gRPC server started${RES}"
-    ${py_version} -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0 --use_multilang > ${dir}server_log.txt 2>&1 &
-    check_result server 10
-    echo -e "${GREEN_COLOR}grpc_impl_example_yolov4_GPU_gRPC client started${RES}"
-    ${py_version} test_client.py 000000570688.jpg > ${dir}client_log.txt 2>&1
-    check_result client "grpc_yolov4_GPU_GRPC server test completed"
-    kill_server_process
+    echo "pass"
+    # dir=${log_dir}rpc_model/grpc_yolov4/
+    # cd ${build_path}/python/examples/grpc_impl_example/yolov4
+    # check_dir ${dir}
+    # data_dir=${data}yolov4/
+    # link_data ${data_dir}
+    # echo -e "${GREEN_COLOR}grpc_impl_example_yolov4_GPU_gRPC server started${RES}"
+    # ${py_version} -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0 --use_multilang > ${dir}server_log.txt 2>&1 &
+    # check_result server 15
+    # check_gpu_memory 0
+    # echo -e "${GREEN_COLOR}grpc_impl_example_yolov4_GPU_gRPC client started${RES}"
+    # ${py_version} test_client.py 000000570688.jpg > ${dir}client_log.txt 2>&1
+    # check_result client "grpc_yolov4_GPU_GRPC server test completed"
+    # kill_server_process
 }
 function ocr_c++_service() {
@@ -857,6 +908,7 @@ function ocr_c++_service() {
     echo -e "${GREEN_COLOR}OCR_C++_Service_GPU_RPC server started${RES}"
     $py_version -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 --gpu_id 0 > ${dir}server_log.txt 2>&1 &
     check_result server 8
+    check_gpu_memory 0
     echo -e "${GREEN_COLOR}OCR_C++_Service_GPU_RPC client started${RES}"
     echo "------------------first:"
     $py_version ocr_cpp_client.py ocr_det_client ocr_rec_client
...