From c407dfa3cb5607683e78c5f2aaecc1065ad07d1a Mon Sep 17 00:00:00 2001 From: nhzlx <zlx_hg@163.com> Date: Wed, 20 Mar 2019 13:32:33 +0000 Subject: [PATCH] cherry-pick from feature/anakin-engine: refine paddle-anakin to new interface. #16276 --- paddle/fluid/framework/ir/CMakeLists.txt | 2 +- .../ir/transpose_flatten_concat_fuse_pass.cc | 6 +++ .../inference/anakin/convert/op_converter.h | 21 ++------ paddle/fluid/inference/anakin/engine.cc | 3 +- paddle/fluid/inference/anakin/engine.h | 11 ++-- .../inference/api/paddle_pass_builder.cc | 2 +- .../fluid/operators/anakin/anakin_engine_op.h | 53 +++++++++++++------ 7 files changed, 56 insertions(+), 42 deletions(-) diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 49fa323fc66..87c69d3accb 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -79,7 +79,7 @@ pass_library(anakin_fillconstant_elementwisemul_fuse inference) # be detected by our pass. The index here represents the number of structures in the # pattern. We use index 3 ~ 6, because these quantities of structures are # common in the models. -foreach (index RANGE 3 6) +foreach (index RANGE 2 6) file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n") endforeach() diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc index fda43948d56..cab69c408de 100644 --- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc @@ -12,7 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include <memory> #include <string> +#include <unordered_set> #include <vector> #include "paddle/fluid/framework/ir/graph_viz_pass.h" @@ -123,6 +125,7 @@ std::unique_ptr<ir::Graph> TransposeFlattenConcatFusePass<times>::ApplyImpl( } template class TransposeFlattenConcatFusePass<1>; +template class TransposeFlattenConcatFusePass<2>; template class TransposeFlattenConcatFusePass<3>; template class TransposeFlattenConcatFusePass<4>; template class TransposeFlattenConcatFusePass<5>; @@ -135,6 +138,9 @@ template class TransposeFlattenConcatFusePass<6>; REGISTER_PASS(transpose_flatten_concat_fuse_pass, paddle::framework::ir::TransposeFlattenConcatFusePass<1>); +REGISTER_PASS(transpose_flatten2_concat_fuse_pass, + paddle::framework::ir::TransposeFlattenConcatFusePass<2>); + REGISTER_PASS(transpose_flatten3_concat_fuse_pass, paddle::framework::ir::TransposeFlattenConcatFusePass<3>); diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h index 2eb7f24ce54..4603681e1e8 100644 --- a/paddle/fluid/inference/anakin/convert/op_converter.h +++ b/paddle/fluid/inference/anakin/convert/op_converter.h @@ -117,27 +117,14 @@ class AnakinOpConverter { } temp_max_input_shape[input] = input_shape; engine->SetInputShape(input, input_shape); - // engine->Graph()->RegistVar(input); // For share from data. + engine->Graph()->RegistVar(input); // For share from data. } engine->SetMaxInputShape(temp_max_input_shape); - engine->Optimize(); + + // For anakin share with fluid tensor. 
+ engine->AllocTmpMem(); engine->InitGraph(); - /* - for(auto& input : inputs) { - platform::CUDAPlace gpu_place(engine->GetDevice()); - auto input_var = scope->Var(); - auto input_tensor = input_var->GetMutable<framework::LoDTensor>(); - auto input_max_shape = temp_max_input_shape[input]; - input_tensor->Resize(framework::make_ddim(input_max_shape)); - auto input_data = input_tensor->mutable_data<float>(gpu_place); - auto* anakin_input = engine->Net()->get_in(input); - - ::anakin::saber::Tensor<::anakin::saber::NV> tmp_anakin_tensor(input_data, - ::anakin::saber::NV(), 0, input_max_shape); - anakin_input->share_from(tmp_anakin_tensor); - } - */ } void SetEngine(AnakinNvEngine *engine) { engine_ = engine; } diff --git a/paddle/fluid/inference/anakin/engine.cc b/paddle/fluid/inference/anakin/engine.cc index 176bc1254b5..543ac9d6385 100644 --- a/paddle/fluid/inference/anakin/engine.cc +++ b/paddle/fluid/inference/anakin/engine.cc @@ -97,15 +97,14 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute( anakin_input = net_->get_in(input.first); } */ - anakin_input->reshape(fluid_input_shape); ::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0, fluid_input_shape); anakin_input->copy_from(tmp_anakin_tensor); } - cudaDeviceSynchronize(); net_->prediction(); + cudaDeviceSynchronize(); for (const auto &output : outputs) { platform::CUDAPlace gpu_place(device_); auto *tensor = output.second; diff --git a/paddle/fluid/inference/anakin/engine.h b/paddle/fluid/inference/anakin/engine.h index 3835ead1946..4845ffdf5b9 100644 --- a/paddle/fluid/inference/anakin/engine.h +++ b/paddle/fluid/inference/anakin/engine.h @@ -84,17 +84,20 @@ class AnakinEngine { int GetMaxBatchSize() { return max_batch_size_; } void Freeze(); void Optimize(); + void AllocTmpMem() { + PADDLE_ENFORCE(net_->alloc_memory_first(*graph_), + "anakin alloc temp memory first failed"); + } void Save(std::string path) { graph_->save(path); } + + bool IsInit() { return initialized_; } int 
GetDevice() { return device_; } - // void SaveSerializedData(std::string& data) { graph_->save_to_string(data); - // } - // void LoadSerializedData(const std::string& data) { - // graph_->load_from_string(data); } void Execute(const std::map<std::string, framework::LoDTensor *> &inputs, const std::map<std::string, framework::LoDTensor *> &outputs, cudaStream_t stream); private: + bool initialized_{false}; int max_batch_size_; std::map<std::string, std::vector<int>> max_input_shape_; int device_; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 8db636274fb..182aa1b6b16 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -99,7 +99,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { #endif }); - for (int i = 6; i >= 3; i--) { + for (int i = 6; i >= 2; i--) { passes_.push_back("transpose_flatten" + std::to_string(i) + "_concat_fuse_pass"); } diff --git a/paddle/fluid/operators/anakin/anakin_engine_op.h b/paddle/fluid/operators/anakin/anakin_engine_op.h index bbe9a221b2c..5da3cc17776 100644 --- a/paddle/fluid/operators/anakin/anakin_engine_op.h +++ b/paddle/fluid/operators/anakin/anakin_engine_op.h @@ -97,23 +97,7 @@ class AnakinEngineOp : public framework::OperatorBase { if (param_names_.count(x)) continue; auto &t = inference::analysis::GetFromScope<framework::LoDTensor>(scope, x); - /* - auto t_shape = framework::vectorize(t.dims()); - auto *anakin_input = engine->Net()->get_in(x); - auto net_shape = anakin_input->shape(); - size_t anakin_net_input_size = net_shape.count() * sizeof(float); - size_t fluid_input_size = t.memory_size(); - - if (fluid_input_size < anakin_net_input_size) { - framework::LoDTensor temp_t; - auto t_dims = t.dims(); - temp_t.Resize(t_dims); - TensorCopySync(t, dev_place, &temp_t); - t.Resize(framework::make_ddim(net_shape)); - t.mutable_data<float>(dev_place); - TensorCopySync(temp_t, 
dev_place, &t); - } - */ + inputs.insert({x, &t}); } @@ -136,6 +120,41 @@ class AnakinEngineOp : public framework::OperatorBase { inference::Singleton<inference::anakin::AnakinEngineManager>::Global() .Get(engine_key_); } + // BUG here: detected that the tensor data pointer here will change sometimes. + // Will fix it later. + /* + // For sharing with the tensor from fluid, we do the net init in the first net + predict. + if (!anakin_engine_->IsInit()) { + auto temp_max_input_shape = anakin_engine_->GetMaxInputShape(); + anakin_engine_->AllocTmpMem(); + for(auto& input : Inputs("Xs")) { + if (param_names_.count(input)) continue; + platform::CUDAPlace + gpu_place(boost::get<platform::CUDAPlace>(dev_place).device); + auto *input_var = scope.FindVar(input); + auto input_tensor = input_var->GetMutable<framework::LoDTensor>(); + auto input_max_shape = temp_max_input_shape[input]; + + framework::LoDTensor temp_t; + auto t_dims = input_tensor->dims(); + temp_t.Resize(t_dims); + TensorCopySync(*input_tensor, dev_place, &temp_t); + input_tensor->Resize(framework::make_ddim(input_max_shape)); + input_tensor->mutable_data<float>(dev_place); + TensorCopySync(temp_t, dev_place, input_tensor); + + auto* input_data = input_tensor->mutable_data<float>(gpu_place); + auto* anakin_input = anakin_engine_->Net()->get_in(input); + + ::anakin::saber::Tensor<::anakin::saber::NV> + tmp_anakin_tensor(input_data, + ::anakin::saber::NV(), 0, input_max_shape); + anakin_input->share_from(tmp_anakin_tensor); + } + anakin_engine_->InitGraph(); + } + */ return anakin_engine_; } -- GitLab