Commit c79f06d3 authored by nhzlx

cherry-pick from feature/anakin-engine: add batch interface for pd-anakin #16178

Parent 69d37f81
@@ -90,10 +90,12 @@ class AnakinOpConverter {
     for (int i = 0; i < var_shape.size(); i++) {
       input_shape.push_back(var_shape[i]);
     }
-    input_shape[0] = 1;
+    input_shape[0] = engine->GetMaxBatch();
     engine->SetInputShape(input, input_shape);
   }
+  // engine->Graph()->RegistAllOut();
   engine->Optimize();
   engine->InitGraph();
 }
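Why this matters: Anakin compiles a static graph, so tensor allocations are fixed once Optimize()/InitGraph() run; the converter therefore pins the batch dimension to the engine's configured maximum instead of hard-coding 1. A minimal sketch of the resulting contract, assuming a hypothetical NCHW input named "image":

    // Illustrative sketch, not part of the patch: the first (batch) dimension
    // must be the largest batch the engine will ever see; smaller batches are
    // served by reshaping within this allocation at Execute() time.
    std::vector<int> input_shape = {engine->GetMaxBatch(), 3, 224, 224};
    engine->SetInputShape("image", input_shape);
    engine->Optimize();
    engine->InitGraph();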
......
@@ -34,10 +34,12 @@ namespace anakin {
 template <typename TargetT, Precision PrecisionType, OpRunType RunType>
 AnakinEngine<TargetT, PrecisionType, RunType>::AnakinEngine(bool need_summary,
-                                                            int device)
+                                                            int device,
+                                                            int max_batch_size)
     : graph_(new AnakinGraphT<TargetT, PrecisionType>()),
       net_(new AnakinNetT<TargetT, PrecisionType, RunType>(need_summary)) {
   device_ = device;
+  max_batch_size_ = max_batch_size;
 }

 template <typename TargetT, Precision PrecisionType, OpRunType RunType>
@@ -71,8 +73,8 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
   for (const auto &input : inputs) {
     auto *tensor = input.second;
     auto *data = tensor->data<float>();
     auto fluid_input_shape = framework::vectorize2int(tensor->dims());
     auto *anakin_input = net_->get_in(input.first);
     auto net_shape = anakin_input->shape();
     if (tensor->numel() > net_shape.count()) {
@@ -84,11 +86,13 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
     anakin_input->reshape(fluid_input_shape);
     net_shape = anakin_input->shape();
     ::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0,
-                                                       net_shape);
-    anakin_input->share_from(tmp_anakin_tensor);
+                                                       // net_shape);
+                                                       fluid_input_shape);
+    anakin_input->copy_from(tmp_anakin_tensor);
   }
+  cudaDeviceSynchronize();
   net_->prediction();

   for (const auto &output : outputs) {
     platform::CUDAPlace gpu_place(device_);
@@ -98,12 +102,10 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
     auto anakin_output_shape = anakin_output->valid_shape();
     tensor->Resize(framework::make_ddim(anakin_output_shape));
     auto *fluid_data = tensor->mutable_data<float>(gpu_place);
     memory::Copy(gpu_place, static_cast<void *>(fluid_data), gpu_place,
                  static_cast<void *>(anakin_data),
                  tensor->numel() * sizeof(float), stream);
   }
   cudaDeviceSynchronize();
 }
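Two behavioral changes in Execute() deserve a note: inputs are now deep-copied into the engine with copy_from rather than aliased with share_from (the engine's input buffer, sized for the max batch, can outlive and exceed any single input), and a cudaDeviceSynchronize() fences the copies before net_->prediction(). A hedged caller-side sketch with hypothetical tensor names, assuming an engine built with max_batch_size = 8:

    // Illustrative sketch: any batch up to GetMaxBatch() may be fed, since
    // Execute() reshapes the Anakin input to the actual dims before copying.
    std::map<std::string, framework::LoDTensor *> inputs{{"image", &image_tensor}};
    std::map<std::string, framework::LoDTensor *> outputs{{"prob", &prob_tensor}};
    engine->Execute(inputs, outputs, stream);  // image_tensor holds a batch <= 8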
......
@@ -55,7 +55,8 @@ class AnakinEngine {
   using GraphT = ::anakin::graph::Graph<TargetT, PrecisionType>;

  public:
-  explicit AnakinEngine(bool need_summary = false, int device = 0);
+  explicit AnakinEngine(bool need_summary = false, int device = 0,
+                        int max_batch_size = 1);
   ~AnakinEngine();
   void InitGraph();
   void SetInputShape(const std::string &name, std::vector<int> shape);
@@ -70,10 +71,12 @@ class AnakinEngine {
                       "Add operation's attribution.");
   }
   NetT *Net() { return net_.get(); }
+  GraphT *Graph() { return graph_.get(); }
   std::unique_ptr<AnakinEngine> Clone();
   void Freeze();
   void Optimize();
   void Save(std::string path) { graph_->save(path); }
+  int GetMaxBatch() { return max_batch_size_; }
   // void SaveSerializedData(std::string& data) { graph_->save_to_string(data);
   // }
   // void LoadSerializedData(const std::string& data) {
@@ -83,6 +86,7 @@ class AnakinEngine {
                cudaStream_t stream);

  private:
+  int max_batch_size_;
   int device_;
   std::unique_ptr<GraphT> graph_;
   std::unique_ptr<NetT> net_;
@@ -100,10 +104,11 @@ class AnakinEngineManager {
     return engines_.at(name).get();
   }

-  AnakinNvEngineT *Create(bool need_summary, int device,
+  AnakinNvEngineT *Create(bool need_summary, int device, int max_batch_size,
                           std::string engine_name) {
     std::unique_lock<std::mutex> lk(mut_);
-    auto *p = new AnakinEngine<NV, Precision::FP32>(need_summary, device);
+    auto *p = new AnakinEngine<NV, Precision::FP32>(need_summary, device,
+                                                    max_batch_size);
     engines_[engine_name].reset(p);
     return p;
   }
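The widened Create() signature is consumed by the Anakin subgraph pass further down in this commit; for reference, a call with placeholder device and batch values looks like:

    // Placeholder values, mirroring the anakin_subgraph_pass hunk below.
    auto *engine =
        inference::Singleton<anakin::AnakinEngineManager>::Global().Create(
            /*need_summary=*/true, /*device=*/0, /*max_batch_size=*/8,
            "engine_key_0");
    // engine->GetMaxBatch() now reports 8.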
......
@@ -150,6 +150,7 @@ struct Argument {
   DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine,
                       bool);

+  DECL_ARGUMENT_FIELD(anakin_max_batch_size, AnakinMaxBatchSize, int);
   DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool);

   // Memory optimized related.
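DECL_ARGUMENT_FIELD generates the accessor pair this commit relies on elsewhere (SetAnakinMaxBatchSize in analysis_predictor.cc, anakin_max_batch_size() in ir_pass_manager.cc). A simplified expansion sketch, omitting the validity bookkeeping the real macro also emits:

    // Rough expansion of
    // DECL_ARGUMENT_FIELD(anakin_max_batch_size, AnakinMaxBatchSize, int):
    int &anakin_max_batch_size() { return anakin_max_batch_size_; }
    void SetAnakinMaxBatchSize(const int &x) { anakin_max_batch_size_ = x; }
    // ...plus a private member:
    int anakin_max_batch_size_;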
......
@@ -77,6 +77,7 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("engine_opt_info", new std::map<std::string, std::string>(
                                        argument->engine_opt_info()));
       pass->Set("predictor_id", new int(argument->predictor_id()));
+      pass->Set("max_batch_size", new int(argument->anakin_max_batch_size()));
     }

     if (pass_name == "tensorrt_subgraph_pass") {
@@ -91,6 +92,10 @@ void IRPassManager::CreatePasses(Argument *argument,
                              AnalysisConfig::Precision::kInt8;
       pass->Set("enable_int8", new bool(enable_int8));

+      bool use_static_engine = argument->tensorrt_use_static_engine();
+      bool model_from_memory = argument->model_from_memory();
+      if ((!model_from_memory && use_static_engine)) {
         std::string model_opt_cache_dir =
             argument->Has("model_dir")
                 ? argument->model_dir()
@@ -98,9 +103,9 @@ void IRPassManager::CreatePasses(Argument *argument,
         pass->Set(
             "model_opt_cache_dir",
             new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
+      }
       pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
-      pass->Set("use_static_engine",
-                new bool(argument->tensorrt_use_static_engine()));
+      pass->Set("use_static_engine", new bool(use_static_engine));
       pass->Set("model_from_memory", new bool(argument->model_from_memory()));
       pass->Set("engine_opt_info", new std::map<std::string, std::string>(
                                        argument->engine_opt_info()));
......
@@ -256,10 +256,11 @@ void AnakinSubgraphPass::CreateAnakinOp(
       input_names_with_id, output_names_with_id, std::to_string(predictor_id));
   SetAttr(op_desc->Proto(), "engine_key", engine_key);

+  int max_batch_size = Get<int>("max_batch_size");
   auto *anakin_engine =
       inference::Singleton<anakin::AnakinEngineManager>::Global().Create(
-          true, Get<int>("gpu_device_id"), engine_key);
+          true, Get<int>("gpu_device_id"), max_batch_size, engine_key);

   auto *scope = param_scope();
   std::unordered_set<std::string> param_set(params.begin(), params.end());
......
@@ -245,8 +245,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
       trt_engine_serialized_data.empty()) {
     std::copy(params.begin(), params.end(),
               std::back_inserter(*repetitive_params));
+    if (use_static_engine && !load_from_memory) {
       trt_engine_serialized_data = GetTrtEngineSerializedData(
           Get<std::string>("model_opt_cache_dir"), engine_key);
+    }

   if (trt_engine_serialized_data.empty()) {
     LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
@@ -267,10 +270,13 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
     trt_engine_serialized_data =
         std::string((const char *)serialized_engine_data->data(),
                     serialized_engine_data->size());
+    if (use_static_engine && !load_from_memory) {
       SaveTrtEngineSerializedDataToFile(
           GetTrtEngineSerializedPath(Get<std::string>("model_opt_cache_dir"),
                                      engine_key),
           trt_engine_serialized_data);
+    }
   } else {
     LOG(INFO) << "Load TRT Optimized Info from "
               << GetTrtEngineSerializedPath(
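Net effect of the two guards added in this file: the serialized-engine cache under model_opt_cache_dir is read and written only when a static engine was requested and the model was not loaded from memory. Condensed, the new control flow is roughly the following, with cache_dir, key, path and data standing in for the real arguments:

    // Illustrative condensation, not verbatim code from the patch.
    bool cache_usable = use_static_engine && !load_from_memory;
    if (cache_usable) {
      trt_engine_serialized_data = GetTrtEngineSerializedData(cache_dir, key);
    }
    if (trt_engine_serialized_data.empty()) {
      // ...build and serialize the engine...
      if (cache_usable) SaveTrtEngineSerializedDataToFile(path, data);
    }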
......
@@ -109,6 +109,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(use_mkldnn_);
   CP_MEMBER(mkldnn_enabled_op_types_);

+  CP_MEMBER(use_anakin_);
+  CP_MEMBER(anakin_max_batchsize_);
+
   // Ir related.
   CP_MEMBER(enable_ir_optim_);
   CP_MEMBER(use_feed_fetch_ops_);
@@ -352,7 +355,8 @@ void AnalysisConfig::SwitchIrDebug(int x) {
   ir_debug_ = x;
   Update();
 }
-void AnalysisConfig::EnableAnakinEngine() {
+void AnalysisConfig::EnableAnakinEngine(int max_batch_size) {
+  anakin_max_batchsize_ = max_batch_size;
   use_anakin_ = true;
   Update();
 }
......
@@ -379,6 +379,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
   }

   if (config_.use_gpu() && config_.anakin_engine_enabled()) {
+    argument_.SetAnakinMaxBatchSize(config_.anakin_max_batchsize_);
     LOG(INFO) << "Anakin subgraph engine is enabled";
   }
......
@@ -145,7 +145,7 @@ struct AnalysisConfig {
   /**
    * \brief Turn on the usage of Anakin sub-graph engine.
    */
-  void EnableAnakinEngine();
+  void EnableAnakinEngine(int max_batch_size = 1);

   /** A boolean state indicating whether the Anakin sub-graph engine is used.
    */
@@ -270,6 +270,7 @@ struct AnalysisConfig {
   mutable std::unique_ptr<PassStrategy> pass_builder_;

   bool use_anakin_{false};
+  int anakin_max_batchsize_;
   std::map<std::string, std::string> engine_opt_info_;
 };
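End to end, the new knob flows AnalysisConfig::EnableAnakinEngine → Argument::SetAnakinMaxBatchSize → the "max_batch_size" pass attribute → AnakinEngineManager::Create. A hedged user-facing sketch; the model path and batch value are placeholders:

    AnalysisConfig config;
    config.SetModel("./mobilenet_v1");            // placeholder model directory
    config.EnableUseGpu(100 /*MB pool*/, 0);      // Anakin path requires GPU
    config.EnableAnakinEngine(/*max_batch_size=*/8);
    auto predictor = CreatePaddlePredictor(config);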
......