diff --git a/cmake/paddlepaddle.cmake b/cmake/paddlepaddle.cmake
index d0cbac9f329b2890c7510f85bf66810862a5ae4f..b3481462ef01097257e49c4bfd389b64e90f3f3e 100644
--- a/cmake/paddlepaddle.cmake
+++ b/cmake/paddlepaddle.cmake
@@ -171,11 +171,23 @@ LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/mklml/lib)
 SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib")
 LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib)

+SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PADDLE_INSTALL_DIR}/third_party/install/paddle2onnx/lib")
+LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/paddle2onnx/lib)
+
+SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PADDLE_INSTALL_DIR}/third_party/install/onnxruntime/lib")
+LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/onnxruntime/lib)
+
 if (NOT WITH_MKLML)
     ADD_LIBRARY(openblas STATIC IMPORTED GLOBAL)
     SET_PROPERTY(TARGET openblas PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/openblas/lib/libopenblas.a)
 endif()

+ADD_LIBRARY(paddle2onnx STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET paddle2onnx PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/paddle2onnx/lib/libpaddle2onnx.so)
+
+ADD_LIBRARY(onnxruntime STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET onnxruntime PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/onnxruntime/lib/libonnxruntime.so.1.10.0)
+
 ADD_LIBRARY(paddle_inference STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET paddle_inference PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_inference.so)
 if (WITH_ASCEND_CL)
diff --git a/core/configure/proto/server_configure.proto b/core/configure/proto/server_configure.proto
index d1d9ee256d10d031f5482685cf062646cd683500..f5edd23fd104c9e4fb65b308aa119e2ea0db10d4 100644
--- a/core/configure/proto/server_configure.proto
+++ b/core/configure/proto/server_configure.proto
@@ -49,6 +49,20 @@ message EngineDesc {
   optional bool gpu_multi_stream = 20;
   optional bool use_ascend_cl = 21;

+  /*
+   * "gpu_memory_mb": initial GPU memory (MB) allocated via
+   * config.EnableUseGpu()
+   * "cpu_math_thread_num": number of CPU math library threads set via
+   * config.SetCpuMathLibraryNumThreads()
+   * "trt_workspace_size": TensorRT workspace size passed to
+   * config.EnableTensorRtEngine(), default 1 << 25
+   * "trt_use_static": if true, serialize the TensorRT optimization
+   * information to disk and reload it instead of rebuilding the engine.
+   */
+  optional int32 gpu_memory_mb = 22 [ default = 100 ];
+  optional int32 cpu_math_thread_num = 23 [ default = 1 ];
+  optional int32 trt_workspace_size = 24 [ default = 33554432 ];
+  optional bool trt_use_static = 25 [ default = false ];
+
   /*
    * "runtime_thread_num": n == 0 means don`t use Asynchronous task scheduling
    * mode.
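Reviewer note: the four new EngineDesc fields map one-to-one onto Paddle Inference knobs (the C++ calls are applied in paddle_engine.h further down). A minimal sketch of the same mapping through the Python `paddle.inference` API, for illustration only and not part of this diff; the model paths are placeholders:

```python
# Illustrative mapping of the new EngineDesc fields onto paddle.inference.Config.
# The model/params paths below are hypothetical placeholders.
from paddle.inference import Config

config = Config("serving_model/inference.pdmodel",
                "serving_model/inference.pdiparams")
config.set_cpu_math_library_num_threads(1)   # cpu_math_thread_num, default 1
config.enable_use_gpu(100, 0)                # gpu_memory_mb (MB), default 100; GPU id 0
config.enable_tensorrt_engine(
    workspace_size=1 << 25,                  # trt_workspace_size, default 33554432
    use_static=False)                        # trt_use_static: cache serialized TRT engines on disk
```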
diff --git a/core/general-server/op/general_remote_op.cpp b/core/general-server/op/general_remote_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2e77067ddad9190d58b741361efff9f1e704f9b0
--- /dev/null
+++ b/core/general-server/op/general_remote_op.cpp
@@ -0,0 +1,126 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "core/general-server/op/general_remote_op.h"
+#include <string>
+#include <vector>
+#include "core/util/include/timer.h"
+
+// paddle inference 2.1 supports: FLOAT32, INT64, INT32, UINT8, INT8
+// will support: FLOAT16
+// 2 GB; unsigned 64-bit literal so the product does not overflow int.
+#define BRPC_MAX_BODY_SIZE (2ULL * 1024 * 1024 * 1024)
+const std::string LODABALANCE = "";
+
+namespace baidu {
+namespace paddle_serving {
+namespace serving {
+
+using baidu::paddle_serving::Timer;
+using baidu::paddle_serving::predictor::general_model::Tensor;
+using baidu::paddle_serving::predictor::general_model::Request;
+using baidu::paddle_serving::predictor::general_model::Response;
+
+brpc::Channel BRPCStub::brpc_channels[MAX_MP_NUM];
+
+brpc::ChannelOptions BRPCStub::options;
+std::atomic<int> BRPCStub::inited(0);
+
+int GeneralRemoteOp::inference() {
+  LOG(INFO) << "Enter GeneralRemoteOp::inference()";
+  int expected = 0;
+  std::vector<std::string> op_address = address();
+  if (BRPCStub::inited.compare_exchange_strong(expected, 1)) {
+    BRPCStub::options.protocol = "baidu_std";
+    BRPCStub::options.connection_type = "short";
+    BRPCStub::options.timeout_ms = 80000 /*milliseconds*/;
+    BRPCStub::options.max_retry = 100;
+    brpc::fLU64::FLAGS_max_body_size = BRPC_MAX_BODY_SIZE;
+
+    LOG(ERROR) << "address size: " << op_address.size();
+    for (int i = 0; i < op_address.size(); ++i) {
+      LOG(INFO) << i + 1 << " address is " << op_address[i].c_str();
+      BRPCStub::brpc_channels[i].Init(
+          op_address[i].c_str(), LODABALANCE.c_str(), &BRPCStub::options);
+    }
+
+    BRPCStub::inited++;
+  }
+  while (BRPCStub::inited < 2) {
+  }
+
+  Timer timeline;
+  int64_t start = timeline.TimeStampUS();
+  timeline.Start();
+  VLOG(2) << "Going to run Remote inference";
+
+  Request* req = (Request*)(get_request_message());
+  Response* res = mutable_data<Response>();
+  uint64_t log_id = req->log_id();
+
+  brpc::Controller brpc_controllers[MAX_MP_NUM];
+  brpc::CallId brpc_callids[MAX_MP_NUM];
+  Response brpc_response_tmp;
+
+  size_t i = 0;
+  // Init BRPC controllers, callids and stubs
+  for (i = 0; i < op_address.size(); ++i) {
+    brpc_controllers[i].set_log_id(log_id);
+    brpc_callids[i] = brpc_controllers[i].call_id();
+  }
+  for (i = 0; i < op_address.size(); ++i) {
+    baidu::paddle_serving::predictor::general_model::GeneralModelService_Stub
+        stub(&BRPCStub::brpc_channels[i]);
+    LOG(INFO) << "Sent one request to slave server " << i;
+    if (0 == i) {
+      stub.inference(&brpc_controllers[i], req, res, brpc::DoNothing());
+      continue;
+    }
+    stub.inference(
+        &brpc_controllers[i], req, &brpc_response_tmp, brpc::DoNothing());
+  }
+
+  LOG(INFO) << "All requests are sent, waiting for all responses.";
+
+  // Wait for all RPCs to finish.
+  for (i = 0; i < op_address.size(); ++i) {
+    brpc::Join(brpc_callids[i]);
+  }
+
+  // Print RPC results
+  for (i = 0; i < op_address.size(); ++i) {
+    LOG(INFO) << "brpc_controller_" << i
+              << " status:" << brpc_controllers[i].Failed();
+    if (!brpc_controllers[i].Failed()) {
+      LOG(INFO) << "Received response from "
+                << brpc_controllers[i].remote_side()
+                << " Latency=" << brpc_controllers[i].latency_us() << "us";
+    } else {
+      LOG(ERROR) << brpc_controllers[i].ErrorText();
+    }
+  }
+  LOG(INFO) << "All brpc remote calls have been joined.";
+
+  res->set_log_id(log_id);
+  res->set_profile_server(req->profile_server());
+  int64_t end = timeline.TimeStampUS();
+  res->add_profile_time(start);
+  res->add_profile_time(end);
+
+  return 0;
+}
+
+DEFINE_OP(GeneralRemoteOp);
+}  // namespace serving
+}  // namespace paddle_serving
+}  // namespace baidu
diff --git a/core/general-server/op/general_remote_op.h b/core/general-server/op/general_remote_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..94bfcb9f671866432c572ea67ccbdaf48344fcea
--- /dev/null
+++ b/core/general-server/op/general_remote_op.h
@@ -0,0 +1,58 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <brpc/channel.h>
+#include <brpc/controller.h>
+#include <atomic>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "core/general-server/general_model_service.pb.h"
+
+#include "core/sdk-cpp/builtin_format.pb.h"
+#include "core/sdk-cpp/general_model_service.pb.h"
+#include "core/sdk-cpp/include/common.h"
+#include "core/sdk-cpp/include/predictor_sdk.h"
+
+#define MAX_MP_NUM 16
+
+namespace baidu {
+namespace paddle_serving {
+namespace serving {
+
+using baidu::paddle_serving::predictor::general_model::Request;
+using baidu::paddle_serving::predictor::general_model::Response;
+
+class GeneralRemoteOp
+    : public baidu::paddle_serving::predictor::OpWithChannel<
+          baidu::paddle_serving::predictor::general_model::Response> {
+ public:
+  DECLARE_OP(GeneralRemoteOp);
+  int inference();
+};
+
+class BRPCStub {
+ public:
+  static brpc::Channel brpc_channels[MAX_MP_NUM];
+  static brpc::ChannelOptions options;
+  static std::atomic<int> inited;
+};
+
+}  // namespace serving
+}  // namespace paddle_serving
+}  // namespace baidu
diff --git a/core/predictor/common/constant.cpp b/core/predictor/common/constant.cpp
index b0acb886950face9383518cb7da227137a9c14be..5df873017ef2406e96e0b1316c4c5062d4208552 100644
--- a/core/predictor/common/constant.cpp
+++ b/core/predictor/common/constant.cpp
@@ -20,7 +20,7 @@ namespace predictor {

 DEFINE_bool(use_parallel_infer_service, false, "");
 DEFINE_int32(el_log_level, 16, "");
-DEFINE_int32(idle_timeout_s, 16, "");
+DEFINE_int32(idle_timeout_s, 80, "");
 DEFINE_int32(port, 8010, "");
 DEFINE_string(workflow_path, "./conf", "");
 DEFINE_string(workflow_file, "workflow.prototxt", "");
diff --git a/core/predictor/framework/bsf-inl.h b/core/predictor/framework/bsf-inl.h
index fc468bfe9a3fd42c472ada774c6eb77b4de7dac7..f1885ae357c910f1590502b4aec7e4a6de0289f3 100644
--- a/core/predictor/framework/bsf-inl.h
+++ b/core/predictor/framework/bsf-inl.h
@@ -341,7 +341,7 @@ bool TaskExecutor::move_task_to_batch(
     LOG(INFO) << "Hit auto padding, merge " << padding_task_count
               << " tasks into 1 batch.";
   }
-  LOG(INFO) << "Number of tasks remaining in _task_queue is"
+  LOG(INFO) << "Number of tasks remaining in _task_queue is "
             << _task_queue.size();
   return true;
 }
diff --git a/paddle_inference/paddle/include/paddle_engine.h b/paddle_inference/paddle/include/paddle_engine.h
index 1fbb7222c0f32c7598b24c51f076d47e863f25b6..2d76730555acb6ed0408584db1334e842db126c3 100644
--- a/paddle_inference/paddle/include/paddle_engine.h
+++ b/paddle_inference/paddle/include/paddle_engine.h
@@ -241,10 +241,10 @@ class PaddleInferenceEngine : public EngineCore {
     }

     config.SwitchSpecifyInputNames(true);
-    config.SetCpuMathLibraryNumThreads(1);
+    config.SetCpuMathLibraryNumThreads(engine_conf.cpu_math_thread_num());
     if (engine_conf.has_use_gpu() && engine_conf.use_gpu()) {
       // 2000MB GPU memory
-      config.EnableUseGpu(50, gpu_id);
+      config.EnableUseGpu(engine_conf.gpu_memory_mb(), gpu_id);
       if (engine_conf.has_gpu_multi_stream() &&
           engine_conf.gpu_multi_stream()) {
         config.EnableGpuMultiStream();
@@ -267,17 +267,17 @@ class PaddleInferenceEngine : public EngineCore {
     if (engine_conf.has_use_trt() && engine_conf.use_trt()) {
       config.SwitchIrOptim(true);
       if (!engine_conf.has_use_gpu() || !engine_conf.use_gpu()) {
-        config.EnableUseGpu(50, gpu_id);
+        config.EnableUseGpu(engine_conf.gpu_memory_mb(), gpu_id);
         if (engine_conf.has_gpu_multi_stream() &&
             engine_conf.gpu_multi_stream()) {
           config.EnableGpuMultiStream();
         }
       }
-      config.EnableTensorRtEngine(1 << 25,
+      config.EnableTensorRtEngine(engine_conf.trt_workspace_size(),
                                   max_batch,
                                   local_min_subgraph_size,
                                   precision_type,
-                                  false,
+                                  engine_conf.trt_use_static(),
                                   FLAGS_use_calib);
       std::map<std::string, std::vector<int>> min_input_shape;
       std::map<std::string, std::vector<int>> max_input_shape;
@@ -413,7 +413,11 @@ class PaddleInferenceEngine : public EngineCore {
             << ", use_ascend_cl: " << engine_conf.has_use_ascend_cl()
             << ", use_xpu: " << engine_conf.use_xpu()
             << ", enable_memory_optimization: "
-            << engine_conf.enable_memory_optimization();
+            << engine_conf.enable_memory_optimization()
+            << ", gpu_memory_mb: " << engine_conf.gpu_memory_mb()
+            << ", cpu_math_thread_num: " << engine_conf.cpu_math_thread_num()
+            << ", trt_workspace_size: " << engine_conf.trt_workspace_size()
+            << ", trt_use_static: " << engine_conf.trt_use_static();

     VLOG(2) << "create paddle predictor sucess, path: " << model_path;
     return 0;
diff --git a/python/paddle_serving_server/serve.py b/python/paddle_serving_server/serve.py
index 13f3739d19543486b6079450c7fbd86e14be6554..8ac473d36b11bd41909975692215caa166bc94e0 100755
--- a/python/paddle_serving_server/serve.py
+++ b/python/paddle_serving_server/serve.py
@@ -280,6 +280,27 @@ def serve_args():
         default="",
         nargs="+",
         help="min_subgraph_size")
+    parser.add_argument(
+        "--gpu_memory_mb",
+        type=int,
+        default=50,
+        help="Initial GPU memory pool size, in MB")
+    parser.add_argument(
+        "--cpu_math_thread_num",
+        type=int,
+        default=1,
+        help="Number of CPU math library threads")
+    parser.add_argument(
+        "--trt_workspace_size",
+        type=int,
+        default=33554432,
+        help="TensorRT workspace size in bytes, default 1 << 25")
+    parser.add_argument(
+        "--trt_use_static",
+        default=False,
+        action="store_true",
+        help="Serialize TensorRT optimization information to disk and reuse it")
+
     return parser.parse_args()

@@ -396,10 +417,14 @@ def start_gpu_card_model(gpu_mode, port, args):  # pylint: disable=doc-string-missing
     server.set_dist_endpoints(args.dist_endpoints.split(","))
     server.set_dist_subgraph_index(args.dist_subgraph_index)
     server.set_min_subgraph_size(args.min_subgraph_size)
+    server.set_gpu_memory_mb(args.gpu_memory_mb)
+    server.set_cpu_math_thread_num(args.cpu_math_thread_num)

     if args.use_trt and device == "gpu":
         server.set_trt()
         server.set_ir_optimize(True)
+        server.set_trt_workspace_size(args.trt_workspace_size)
+        server.set_trt_use_static(args.trt_use_static)
         if is_ocr:
             info = set_ocr_dynamic_shape_info()
             server.set_trt_dynamic_shape_info(info)
diff --git a/python/paddle_serving_server/server.py b/python/paddle_serving_server/server.py
index 266efc3e2f1ca0b383d14c2d0c1f6236347888d3..c0090b75c11fbf5bffa43d67f30b99cd335bdfba 100755
--- a/python/paddle_serving_server/server.py
+++ b/python/paddle_serving_server/server.py
@@ -119,6 +119,10 @@ class Server(object):
         self.dist_master_serving = False
         self.min_subgraph_size = []
         self.trt_dynamic_shape_info = []
+        self.gpu_memory_mb = 50
+        self.cpu_math_thread_num = 1
+        self.trt_workspace_size = 33554432  # 1 << 25
+        self.trt_use_static = False

     def get_fetch_list(self, infer_node_idx=-1):
         fetch_names = [
@@ -289,6 +293,18 @@ class Server(object):
     def set_trt_dynamic_shape_info(self, info):
         self.trt_dynamic_shape_info = info

+    def set_gpu_memory_mb(self, gpu_memory_mb):
+        self.gpu_memory_mb = gpu_memory_mb
+
+    def set_cpu_math_thread_num(self, cpu_math_thread_num):
+        self.cpu_math_thread_num = cpu_math_thread_num
+
+    def set_trt_workspace_size(self, trt_workspace_size):
+        self.trt_workspace_size = trt_workspace_size
+
+    def set_trt_use_static(self, trt_use_static):
+        self.trt_use_static = trt_use_static
+
     def _prepare_engine(self, model_config_paths, device, use_encryption_model):
         self.device = device
         if self.model_toolkit_conf == None:
@@ -342,6 +358,10 @@ class Server(object):
             engine.use_xpu = self.use_xpu
             engine.use_ascend_cl = self.use_ascend_cl
             engine.use_gpu = False
+            engine.gpu_memory_mb = self.gpu_memory_mb
+            engine.cpu_math_thread_num = self.cpu_math_thread_num
+            engine.trt_workspace_size = self.trt_workspace_size
+            engine.trt_use_static = self.trt_use_static

             # use distributed model.
             if self.dist_subgraph_index >= 0:
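For completeness, a minimal sketch of driving the new options through the Python Server API touched above. Only the `set_*` methods below are introduced by this diff; `load_model_config`, `prepare_server` and `run_server` are assumed to be the pre-existing Server workflow, and the model directory, workdir and port are placeholders:

```python
# Sketch: configure the new engine options via paddle_serving_server's Server.
# Paths, port and the chosen values are illustrative, not prescriptive.
from paddle_serving_server.server import Server

server = Server()
server.load_model_config("serving_server")      # placeholder model config dir
server.set_gpu_memory_mb(200)                   # initial GPU memory pool, MB
server.set_cpu_math_thread_num(4)               # CPU math library threads
server.set_trt()                                # enable TensorRT
server.set_ir_optimize(True)
server.set_trt_workspace_size(1 << 26)          # TensorRT workspace in bytes
server.set_trt_use_static(True)                 # reuse serialized TRT optimization info
server.prepare_server(workdir="workdir", port=9393, device="gpu")
server.run_server()
```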