From c407dfa3cb5607683e78c5f2aaecc1065ad07d1a Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 20 Mar 2019 13:32:33 +0000 Subject: [PATCH] cherry-pick from feature/anakin-engine: refine paddle-anakin to new interface. #16276 --- paddle/fluid/framework/ir/CMakeLists.txt | 2 +- .../ir/transpose_flatten_concat_fuse_pass.cc | 6 +++ .../inference/anakin/convert/op_converter.h | 21 ++------ paddle/fluid/inference/anakin/engine.cc | 3 +- paddle/fluid/inference/anakin/engine.h | 11 ++-- .../inference/api/paddle_pass_builder.cc | 2 +- .../fluid/operators/anakin/anakin_engine_op.h | 53 +++++++++++++------ 7 files changed, 56 insertions(+), 42 deletions(-) diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 49fa323fc66..87c69d3accb 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -79,7 +79,7 @@ pass_library(anakin_fillconstant_elementwisemul_fuse inference) # be detected by our pass. The index here represents the number of structures in the # pattern. We use index 3 ~ 6, because these quantities of structures are # common in the models. -foreach (index RANGE 3 6) +foreach (index RANGE 2 6) file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n") endforeach() diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc index fda43948d56..cab69c408de 100644 --- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc @@ -12,7 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include #include +#include #include #include "paddle/fluid/framework/ir/graph_viz_pass.h" @@ -123,6 +125,7 @@ std::unique_ptr TransposeFlattenConcatFusePass::ApplyImpl( } template class TransposeFlattenConcatFusePass<1>; +template class TransposeFlattenConcatFusePass<2>; template class TransposeFlattenConcatFusePass<3>; template class TransposeFlattenConcatFusePass<4>; template class TransposeFlattenConcatFusePass<5>; @@ -135,6 +138,9 @@ template class TransposeFlattenConcatFusePass<6>; REGISTER_PASS(transpose_flatten_concat_fuse_pass, paddle::framework::ir::TransposeFlattenConcatFusePass<1>); +REGISTER_PASS(transpose_flatten2_concat_fuse_pass, + paddle::framework::ir::TransposeFlattenConcatFusePass<2>); + REGISTER_PASS(transpose_flatten3_concat_fuse_pass, paddle::framework::ir::TransposeFlattenConcatFusePass<3>); diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h index 2eb7f24ce54..4603681e1e8 100644 --- a/paddle/fluid/inference/anakin/convert/op_converter.h +++ b/paddle/fluid/inference/anakin/convert/op_converter.h @@ -117,27 +117,14 @@ class AnakinOpConverter { } temp_max_input_shape[input] = input_shape; engine->SetInputShape(input, input_shape); - // engine->Graph()->RegistVar(input); // For share from data. + engine->Graph()->RegistVar(input); // For share from data. } engine->SetMaxInputShape(temp_max_input_shape); - engine->Optimize(); + + // For anakin share with fluid tensor. 
+ engine->AllocTmpMem(); engine->InitGraph(); - /* - for(auto& input : inputs) { - platform::CUDAPlace gpu_place(engine->GetDevice()); - auto input_var = scope->Var(); - auto input_tensor = input_var->GetMutable(); - auto input_max_shape = temp_max_input_shape[input]; - input_tensor->Resize(framework::make_ddim(input_max_shape)); - auto input_data = input_tensor->mutable_data(gpu_place); - auto* anakin_input = engine->Net()->get_in(input); - - ::anakin::saber::Tensor<::anakin::saber::NV> tmp_anakin_tensor(input_data, - ::anakin::saber::NV(), 0, input_max_shape); - anakin_input->share_from(tmp_anakin_tensor); - } - */ } void SetEngine(AnakinNvEngine *engine) { engine_ = engine; } diff --git a/paddle/fluid/inference/anakin/engine.cc b/paddle/fluid/inference/anakin/engine.cc index 176bc1254b5..543ac9d6385 100644 --- a/paddle/fluid/inference/anakin/engine.cc +++ b/paddle/fluid/inference/anakin/engine.cc @@ -97,15 +97,14 @@ void AnakinEngine::Execute( anakin_input = net_->get_in(input.first); } */ - anakin_input->reshape(fluid_input_shape); ::anakin::saber::Tensor tmp_anakin_tensor(data, TargetT(), 0, fluid_input_shape); anakin_input->copy_from(tmp_anakin_tensor); } - cudaDeviceSynchronize(); net_->prediction(); + cudaDeviceSynchronize(); for (const auto &output : outputs) { platform::CUDAPlace gpu_place(device_); auto *tensor = output.second; diff --git a/paddle/fluid/inference/anakin/engine.h b/paddle/fluid/inference/anakin/engine.h index 3835ead1946..4845ffdf5b9 100644 --- a/paddle/fluid/inference/anakin/engine.h +++ b/paddle/fluid/inference/anakin/engine.h @@ -84,17 +84,20 @@ class AnakinEngine { int GetMaxBatchSize() { return max_batch_size_; } void Freeze(); void Optimize(); + void AllocTmpMem() { + PADDLE_ENFORCE(net_->alloc_memory_first(*graph_), + "anakin alloc temp memory first failed"); + } void Save(std::string path) { graph_->save(path); } + + bool IsInit() { return initialized_; } int GetDevice() { return device_; } - // void 
SaveSerializedData(std::string& data) { graph_->save_to_string(data); - // } - // void LoadSerializedData(const std::string& data) { - // graph_->load_from_string(data); } void Execute(const std::map &inputs, const std::map &outputs, cudaStream_t stream); private: + bool initialized_{false}; int max_batch_size_; std::map> max_input_shape_; int device_; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 8db636274fb..182aa1b6b16 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -99,7 +99,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { #endif }); - for (int i = 6; i >= 3; i--) { + for (int i = 6; i >= 2; i--) { passes_.push_back("transpose_flatten" + std::to_string(i) + "_concat_fuse_pass"); } diff --git a/paddle/fluid/operators/anakin/anakin_engine_op.h b/paddle/fluid/operators/anakin/anakin_engine_op.h index bbe9a221b2c..5da3cc17776 100644 --- a/paddle/fluid/operators/anakin/anakin_engine_op.h +++ b/paddle/fluid/operators/anakin/anakin_engine_op.h @@ -97,23 +97,7 @@ class AnakinEngineOp : public framework::OperatorBase { if (param_names_.count(x)) continue; auto &t = inference::analysis::GetFromScope(scope, x); - /* - auto t_shape = framework::vectorize(t.dims()); - auto *anakin_input = engine->Net()->get_in(x); - auto net_shape = anakin_input->shape(); - size_t anakin_net_input_size = net_shape.count() * sizeof(float); - size_t fluid_input_size = t.memory_size(); - - if (fluid_input_size < anakin_net_input_size) { - framework::LoDTensor temp_t; - auto t_dims = t.dims(); - temp_t.Resize(t_dims); - TensorCopySync(t, dev_place, &temp_t); - t.Resize(framework::make_ddim(net_shape)); - t.mutable_data(dev_place); - TensorCopySync(temp_t, dev_place, &t); - } - */ + inputs.insert({x, &t}); } @@ -136,6 +120,41 @@ class AnakinEngineOp : public framework::OperatorBase { inference::Singleton::Global() .Get(engine_key_); } + 
// BUG here, detect that the tensor data pointer here will change sometime. + // Will fix it later. + /* + // For share with the tensor from fluid, we do the net init in the first net + predict. + if (!anakin_engine_->IsInit()) { + auto temp_max_input_shape = anakin_engine_->GetMaxInputShape(); + anakin_engine_->AllocTmpMem(); + for(auto& input : Inputs("Xs")) { + if (param_names_.count(input)) continue; + platform::CUDAPlace + gpu_place(boost::get(dev_place).device); + auto *input_var = scope.FindVar(input); + auto input_tensor = input_var->GetMutable(); + auto input_max_shape = temp_max_input_shape[input]; + + framework::LoDTensor temp_t; + auto t_dims = input_tensor->dims(); + temp_t.Resize(t_dims); + TensorCopySync(*input_tensor, dev_place, &temp_t); + input_tensor->Resize(framework::make_ddim(input_max_shape)); + input_tensor->mutable_data(dev_place); + TensorCopySync(temp_t, dev_place, input_tensor); + + auto* input_data = input_tensor->mutable_data(gpu_place); + auto* anakin_input = anakin_engine_->Net()->get_in(input); + + ::anakin::saber::Tensor<::anakin::saber::NV> + tmp_anakin_tensor(input_data, + ::anakin::saber::NV(), 0, input_max_shape); + anakin_input->share_from(tmp_anakin_tensor); + } + anakin_engine_->InitGraph(); + } + */ return anakin_engine_; } -- GitLab