From 789112e80c74193f265dac3a1a25460804ca389f Mon Sep 17 00:00:00 2001
From: jackzhang235
Date: Thu, 9 Apr 2020 13:55:04 +0000
Subject: [PATCH] support changeable input dims

---
 lite/kernels/mlu/bridges/batch_norm_op.cc |  15 +--
 lite/kernels/mlu/bridges/graph.h          |  47 +++++++---
 lite/kernels/mlu/bridges/tensor.cc        |   1 +
 lite/kernels/mlu/bridges/tensor.h         |   3 +
 lite/kernels/mlu/subgraph_compute.h       | 109 +++++++++++++++-------
 5 files changed, 122 insertions(+), 53 deletions(-)

diff --git a/lite/kernels/mlu/bridges/batch_norm_op.cc b/lite/kernels/mlu/bridges/batch_norm_op.cc
index 61f098ec8b..0eadd6b62c 100644
--- a/lite/kernels/mlu/bridges/batch_norm_op.cc
+++ b/lite/kernels/mlu/bridges/batch_norm_op.cc
@@ -61,12 +61,13 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
 
   int co = static_cast<int>(mean_dims[0]);
 
+  std::vector<float> variance_trans(co);
+  std::vector<float> mean_trans(co);
   for (int i = 0; i < co; ++i) {
-    variance->mutable_data<float>()[i] =
+    variance_trans[i] =
         scale->data<float>()[i] / sqrtf(variance->data<float>()[i] + epsilon);
-    mean->mutable_data<float>()[i] =
-        mean->data<float>()[i] -
-        bias->data<float>()[i] / variance->data<float>()[i];
+    mean_trans[i] =
+        mean->data<float>()[i] - bias->data<float>()[i] / variance_trans[i];
   }
 
   auto input_tensor = graph->GetNode(x_var_name);
@@ -77,8 +78,10 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
                                      mean_tensor->mlu_tensor(),
                                      variance_tensor->mlu_tensor()));
 
-  graph->BindConstData(variance_var_name, variance);
-  graph->BindConstData(mean_var_name, mean);
+  graph->BindConstRawData(
+      variance_var_name, variance_trans.data(), variance_trans.size(), true);
+  graph->BindConstRawData(
+      mean_var_name, mean_trans.data(), mean_trans.size(), true);
 
   graph->FuseOp(bn_op);
   CNML_CALL(cnmlDestroyBaseOp(&bn_op));
diff --git a/lite/kernels/mlu/bridges/graph.h b/lite/kernels/mlu/bridges/graph.h
index 0583a0c953..c4fb10bdb5 100644
--- a/lite/kernels/mlu/bridges/graph.h
+++ b/lite/kernels/mlu/bridges/graph.h
@@ -89,6 +89,14 @@ class Graph {
     output_tensors_.push_back(tensor);
   }
 
+  std::vector<std::shared_ptr<MLUTensor>>* MutableInputs() {
+    return &input_tensors_;
+  }
+
+  std::vector<std::shared_ptr<MLUTensor>>* MutableOutputs() {
+    return &output_tensors_;
+  }
+
   void FuseOp(cnmlBaseOp_t op) { CNML_CALL(cnmlFuseOp(op, fusion_op_)); }
 
   void Compile(cnmlCoreVersion_t core_version, int core_number) {
@@ -100,15 +108,18 @@ class Graph {
     CNML_CALL(cnmlSetFusionOpCorenum(fusion_op_, core_number));
     CNML_CALL(cnmlSetFusionOpCoreVersion(fusion_op_, core_version));
     CNML_CALL(cnmlCompileFusionOp_V2(fusion_op_));
-    for (auto in : input_tensors_) {
-      input_addrs_.push_back(in->mlu_data());
-    }
-    for (auto out : output_tensors_) {
-      output_addrs_.push_back(out->mlu_data());
-    }
   }
 
   void Compute(cnrtInvokeFuncParam_t forward_param, cnrtQueue_t que) {
+    input_addrs_.resize(input_tensors_.size());
+    output_addrs_.resize(output_tensors_.size());
+    for (size_t i = 0; i < input_addrs_.size(); ++i) {
+      input_addrs_[i] = input_tensors_[i]->mlu_data();
+    }
+    for (size_t i = 0; i < output_addrs_.size(); ++i) {
+      output_addrs_[i] = output_tensors_[i]->mlu_data();
+    }
+
 #if PRINT_HW_TIME
     thread_local float hw_time;
     CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que));
@@ -159,7 +170,7 @@ class Graph {
       CNML_CALL(cnmlBindConstData_V2(
           nodes_[tensor_name]->mlu_tensor(), alloc_data, false));
     } else if (fp_type_ == CNML_DATA_FLOAT16) {
-      void* data_fp16 = RegisterConstData<::paddle::lite::fluid::float16>(len);
+      void* data_fp16 = RegisterConstData<paddle::lite::fluid::float16>(len);
       CNRT_CALL(
           cnrtCastDataType(const_cast<void*>(static_cast<const void*>(data)),
                            CNRT_FLOAT32,
@@ -174,7 +185,7 @@ class Graph {
     }
   }
 
-  void BindConstData(std::string tensor_name, ::paddle::lite::Tensor* tensor) {
+  void BindConstData(std::string tensor_name, paddle::lite::Tensor* tensor) {
     const float* data = tensor->data<float>();
     size_t len = tensor->data_size();
     if (fp_type_ == CNML_DATA_FLOAT32) {
@@ -183,10 +194,14 @@ class Graph {
           const_cast<void*>(static_cast<const void*>(data)),
           false));
     } else if (fp_type_ == CNML_DATA_FLOAT16) {
-      auto* data_fp16 = tensor->mutable_data<::paddle::lite::fluid::float16>();
-      for (size_t i = 0; i < len; ++i) {
-        data_fp16[i] = static_cast<::paddle::lite::fluid::float16>(data[i]);
-      }
+      void* data_fp16 = RegisterConstData<paddle::lite::fluid::float16>(len);
+      CNRT_CALL(
+          cnrtCastDataType(const_cast<void*>(static_cast<const void*>(data)),
+                           CNRT_FLOAT32,
+                           data_fp16,
+                           CNRT_FLOAT16,
+                           len,
+                           nullptr));
       CNML_CALL(cnmlBindConstData_V2(nodes_[tensor_name]->mlu_tensor(),
                                      static_cast<void*>(data_fp16),
                                      false));
@@ -207,12 +222,13 @@ class Graph {
     CNML_CALL(cnmlDestroyQuantizedParam(&quant_param));
   }
 
-  void SetFPType(::paddle::lite_api::PrecisionType type) {
+  void SetFPType(paddle::lite_api::PrecisionType type) {
+    origin_fp_type_ = type;
     switch (type) {
-      case ::paddle::lite_api::PrecisionType::kFP16:
+      case paddle::lite_api::PrecisionType::kFP16:
         fp_type_ = CNML_DATA_FLOAT16;
         break;
-      case ::paddle::lite_api::PrecisionType::kFloat:
+      case paddle::lite_api::PrecisionType::kFloat:
         fp_type_ = CNML_DATA_FLOAT32;
         break;
       default:
@@ -224,6 +240,7 @@ class Graph {
 
  private:
   cnmlDataType_t fp_type_{CNML_DATA_FLOAT32};
+  paddle::lite_api::PrecisionType origin_fp_type_{PRECISION(kFloat)};
   std::unordered_map<std::string, std::shared_ptr<MLUTensor>> nodes_;
   std::vector<cnmlTensor_t> inputs_;
   std::vector<cnmlTensor_t> outputs_;
diff --git a/lite/kernels/mlu/bridges/tensor.cc b/lite/kernels/mlu/bridges/tensor.cc
index c426069c7d..4fd1d573f1 100644
--- a/lite/kernels/mlu/bridges/tensor.cc
+++ b/lite/kernels/mlu/bridges/tensor.cc
@@ -46,6 +46,7 @@ void MLUTensor::remember(const std::vector<int64_t>& shape,
                          cnmlDataOrder_t shape_order) {
   tensor_type_ = tensor_type;
   mlu_dtype_ = mlu_dtype;
+  origin_shape_.assign(shape.begin(), shape.end());
 
   int size = 4;
   if (shape.size() > 4 || shape_order == CNML_ARRAY) {
diff --git a/lite/kernels/mlu/bridges/tensor.h b/lite/kernels/mlu/bridges/tensor.h
index 2a4cc23a73..24e181a47b 100644
--- a/lite/kernels/mlu/bridges/tensor.h
+++ b/lite/kernels/mlu/bridges/tensor.h
@@ -51,6 +51,8 @@ class MLUTensor {
 
   void set_mlu_dtype(cnmlDataType_t type) { mlu_dtype_ = type; }
 
+  const std::vector<int64_t>& get_origin_shape() const { return origin_shape_; }
+
   ~MLUTensor();
 
   void ToFile(std::string file_name);
@@ -59,6 +61,7 @@ class MLUTensor {
   cnmlTensor_t mlu_tensor_;
 
   std::vector<int64_t> shape_;
+  std::vector<int64_t> origin_shape_;
   cnmlTensorType_t tensor_type_;
   cnmlDataType_t mlu_dtype_;
   int dim_{0};
diff --git a/lite/kernels/mlu/subgraph_compute.h b/lite/kernels/mlu/subgraph_compute.h
index 7d9db21133..00ec78a0b4 100644
--- a/lite/kernels/mlu/subgraph_compute.h
+++ b/lite/kernels/mlu/subgraph_compute.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <map>
 #include <memory>
 #include <string>
 #include <vector>
@@ -40,11 +41,10 @@ class SubgraphEngine : public subgraph::Engine {
                  const std::vector<std::string>& input_names,
                  const std::vector<std::string>& output_names,
                  Scope* scope,
-                 ::paddle::lite_api::PrecisionType type)
+                 paddle::lite_api::PrecisionType type)
       : subgraph::Engine(
-            ctx, block_idx, block_desc, input_names, output_names, scope) {
-    graph_.SetFPType(type);
-  }
+            ctx, block_idx, block_desc, input_names, output_names, scope),
+        fp_type_(type) {}
 
   int Build() {
     // In order to attach all of the ops of the block desc, we need to build
@@ -72,24 +72,44 @@ class SubgraphEngine : public subgraph::Engine {
     return 0;
   }
 
+  bool InputShapeChanged() {
+    std::vector<std::vector<int64_t>> new_shape;
+    for (auto origin_itensor : origin_itensors_) {
+      new_shape.push_back(origin_itensor->dims().Vectorize());
+    }
+    inputs_shape_ = new_shape;
+    if (shape_graph_map_.count(inputs_shape_) > 0) {
+      return false;
+    }
+    return true;
+  }
+
  protected:
   int BuildDeviceProgram() override {
     int status = 0;
+    auto graph = std::make_shared<paddle::lite::subgraph::mlu::Graph>();
+    graph->SetFPType(fp_type_);
+    std::vector<std::vector<int64_t>> new_shape;
+    origin_itensors_.clear();
+    origin_otensors_.clear();
+
     // Convert all of input data vars and added into the MLU IR graph
+    status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED;
     for (auto& input_name : input_names_) {
       auto input_tensor = scope_->FindMutableTensor(input_name);
+
+      origin_itensors_.push_back(input_tensor);
+      new_shape.push_back(input_tensor->dims().Vectorize());
+
       CHECK(input_tensor);
-      auto input_node =
-          graph_.AddNode(input_name,
-                         input_tensor->dims().Vectorize(),
-                         CNML_TENSOR,
-                         CNML_NCHW,
-                         graph_.FPType(),
-                         const_cast<void*>(input_tensor->raw_data()));
+      auto input_node = graph->AddNode(input_name,
+                                       input_tensor->dims().Vectorize(),
+                                       CNML_TENSOR,
+                                       CNML_NCHW,
+                                       graph->FPType());
       CHECK(input_node);
       // MLU doesn't support dynamic dimensions/shapes, so need to rebuild
       // the program when the shape of any input tensor is changed.
-      status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED;
     }
     LOG(INFO) << "START TO CONVERT ";
     // Convert all of ops and its weights and added into the MLU IR graph
@@ -106,7 +126,7 @@ class SubgraphEngine : public subgraph::Engine {
       }
       auto kernel = inst.kernel();
       status |= bridges.Select(op_type, TARGET(kMLU))(
-          reinterpret_cast<void*>(&graph_),
+          reinterpret_cast<void*>(graph.get()),
          const_cast<OpLite*>(op),
          const_cast<KernelBase*>(kernel));
      if (subgraph::CHECK_FAILED(status)) {
@@ -115,33 +135,51 @@ class SubgraphEngine : public subgraph::Engine {
        return subgraph::FAILED;
      }
    }
    // Obtain the output nodes of the MLU IR graph and build the graph to MLU
    // runtime
-    std::vector<std::string> valid_output_names;
    for (auto& output_name : output_names_) {
-      if (graph_.HasNode(output_name)) {
-        graph_.AddOutput(graph_.GetNode(output_name));
+      if (graph->HasNode(output_name)) {
+        graph->AddOutput(graph->GetNode(output_name));
        auto output_tensor = scope_->FindMutableTensor(output_name);
-        void* p_data = static_cast<void*>(
-            output_tensor->mutable_data<typename subgraph::mlu::FPTypeTraits<Precision>::T>(
-                TARGET(kMLU)));
-        auto node = graph_.GetNode(output_name);
-        CHECK(p_data);
-        node->set_mlu_ptr(p_data);
-        valid_output_names.push_back(output_name);
+        origin_otensors_.push_back(output_tensor);
+
+        // auto node = graph->GetNode(output_name);
+        // CHECK(p_data);
+        // node->set_mlu_ptr(p_data);
      }
    }
    for (auto& input_name : input_names_) {
-      graph_.AddInput(graph_.GetNode(input_name));
+      graph->AddInput(graph->GetNode(input_name));
    }
-    CHECK(!valid_output_names.empty()) << "[MLU] no valid output names";
+
+    CHECK(!origin_otensors_.empty()) << "[MLU] no valid output names";
    auto& mlu_context = this->ctx_->template As<MLUContext>();
    auto core_version = mlu_context.MLUCoreVersion();
    auto core_number = mlu_context.MLUCoreNumber();
-    graph_.Compile(core_version, core_number);
+    graph->Compile(core_version, core_number);
+    shape_graph_map_[new_shape] = graph;
    return status;
  }

  int LaunchDeviceProgram() override {
+    // prepare input and output memory
+    auto graph = shape_graph_map_[inputs_shape_];
+    auto* graph_input = graph->MutableInputs();
+    auto* graph_output = graph->MutableOutputs();
+    CHECK_EQ(graph_input->size(), origin_itensors_.size());
+    CHECK_EQ(graph_output->size(), origin_otensors_.size());
+
+    for (size_t i = 0; i < origin_itensors_.size(); ++i) {
+      graph_input->at(i)->set_mlu_ptr(
+          const_cast<void*>(origin_itensors_[i]->raw_data()));
+    }
+    for (size_t i = 0; i < origin_otensors_.size(); ++i) {
+      origin_otensors_[i]->Resize(graph_output->at(i)->get_origin_shape());
+      void* p_data = static_cast<void*>(
+          origin_otensors_[i]
+              ->mutable_data<typename subgraph::mlu::FPTypeTraits<Precision>::T>(
+                  TARGET(kMLU)));
+      graph_output->at(i)->set_mlu_ptr(p_data);
+    }
+
     auto& mlu_context = this->ctx_->template As<MLUContext>();
     auto exec_queue = mlu_context.exec_queue();
     u32_t affinity = mlu_context.affinity();
@@ -150,11 +188,13 @@ class SubgraphEngine : public subgraph::Engine {
     forward_param.data_parallelism = &data_param;
     forward_param.affinity = &affinity;
     forward_param.end = CNRT_PARAM_END;
-    graph_.Compute(forward_param, exec_queue);
+
+    graph->Compute(forward_param, exec_queue);
 
     // // =========== DUMP ===================
     // for (auto input_name : input_names_) {
-    //   auto input_tensor = graph_.GetNode(input_name);
+    //   auto input_tensor =
+    //       shape_graph_map_[inputs_shape_]->GetNode(input_name);
     //   auto dump_name = input_name;
     //   while (dump_name.find("/") != std::string::npos) {
     //     dump_name = dump_name.replace(dump_name.find("/"), 1, "_");
@@ -163,8 +203,9 @@ class SubgraphEngine : public subgraph::Engine {
     //   input_tensor->ToFile(dump_name);
     // }
     // for (auto output_name : output_names_) {
-    //   if (graph_.HasNode(output_name)) {
-    //     auto output_tensor = graph_.GetNode(output_name);
+    //   if (shape_graph_map_[inputs_shape_]->HasNode(output_name)) {
+    //     auto output_tensor =
+    //         shape_graph_map_[inputs_shape_]->GetNode(output_name);
     //     auto dump_name = output_name;
     //     while (dump_name.find("/") != std::string::npos) {
     //       dump_name = dump_name.replace(dump_name.find("/"), 1, "_");
@@ -180,7 +221,11 @@ class SubgraphEngine : public subgraph::Engine {
     return 0;
   }
 
-  paddle::lite::subgraph::mlu::Graph graph_;
+  paddle::lite_api::PrecisionType fp_type_;
+  std::vector<std::vector<int64_t>> inputs_shape_{};
+  std::map<std::vector<std::vector<int64_t>>,
+           std::shared_ptr<paddle::lite::subgraph::mlu::Graph>>
+      shape_graph_map_{};
 };
 
 template <PrecisionType Precision>
-- 
GitLab
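Editor's note (not part of the commit): the core mechanism of this patch is a cache keyed by the current input shapes (inputs_shape_ -> shape_graph_map_), so the MLU program is compiled once per distinct combination of input dimensions and reused whenever the same shapes come back, with input/output device pointers rebound on every launch. The sketch below illustrates only that lookup-or-build pattern in isolation; CompiledGraph, ShapeKey, and GetOrBuild() are hypothetical stand-ins for illustration and are not Paddle-Lite or CNML APIs.

#include <cstdint>
#include <iostream>
#include <map>
#include <memory>
#include <vector>

// Stand-in for a compiled device program (subgraph::mlu::Graph in the patch).
struct CompiledGraph {
  std::vector<std::vector<int64_t>> shapes;  // input shapes it was built for
};

// One entry per input tensor, each entry holding that tensor's dimensions.
using ShapeKey = std::vector<std::vector<int64_t>>;

class Engine {
 public:
  // Return the program compiled for `shapes`, building it on first sight.
  std::shared_ptr<CompiledGraph> GetOrBuild(const ShapeKey& shapes) {
    auto it = cache_.find(shapes);
    if (it != cache_.end()) {
      return it->second;  // shapes seen before: reuse the compiled program
    }
    auto graph = std::make_shared<CompiledGraph>();  // otherwise build anew
    graph->shapes = shapes;
    cache_[shapes] = graph;
    return graph;
  }

 private:
  // Mirrors shape_graph_map_: std::map works because std::vector compares
  // lexicographically, giving a strict weak ordering over shape keys.
  std::map<ShapeKey, std::shared_ptr<CompiledGraph>> cache_;
};

int main() {
  Engine engine;
  auto g1 = engine.GetOrBuild({{1, 3, 224, 224}});
  auto g2 = engine.GetOrBuild({{4, 3, 224, 224}});  // new batch size -> new graph
  auto g3 = engine.GetOrBuild({{1, 3, 224, 224}});  // same shapes -> cached graph
  std::cout << (g1 == g3) << " " << (g1 == g2) << "\n";  // prints "1 0"
  return 0;
}

Rebinding the MLU data pointers at launch time (rather than at compile time, as the old Graph::Compile did) is what makes the cached programs safe to share: the compiled graph stays shape-specific, while the host-side tensors it reads and writes can move between runs.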