support changable input dims

789112e8 · jackzhang235 · jackzhang235 · 63da1451 · 789112e8 · 789112e8
5 changed file
--- a/lite/kernels/mlu/bridges/batch_norm_op.cc
+++ b/lite/kernels/mlu/bridges/batch_norm_op.cc
@@ -61,12 +61,13 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {

  int co = static_cast<int>(mean_dims[0]);

+  std::vector<float> variance_trans(co);
+  std::vector<float> mean_trans(co);
  for (int i = 0; i < co; ++i) {
-    variance->mutable_data<float>()[i] =
+    variance_trans[i] =
        scale->data<float>()[i] / sqrtf(variance->data<float>()[i] + epsilon);
-    mean->mutable_data<float>()[i] =
-        mean->data<float>()[i] -
-        bias->data<float>()[i] / variance->data<float>()[i];
+    mean_trans[i] =
+        mean->data<float>()[i] - bias->data<float>()[i] / variance_trans[i];
  }

  auto input_tensor = graph->GetNode(x_var_name);
@@ -77,8 +78,10 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
                                         mean_tensor->mlu_tensor(),
                                         variance_tensor->mlu_tensor()));

-  graph->BindConstData(variance_var_name, variance);
-  graph->BindConstData(mean_var_name, mean);
+  graph->BindConstRawData(
+      variance_var_name, variance_trans.data(), variance_trans.size(), true);
+  graph->BindConstRawData(
+      mean_var_name, mean_trans.data(), mean_trans.size(), true);
  graph->FuseOp(bn_op);

  CNML_CALL(cnmlDestroyBaseOp(&bn_op));

--- a/lite/kernels/mlu/bridges/graph.h
+++ b/lite/kernels/mlu/bridges/graph.h
@@ -89,6 +89,14 @@ class Graph {
    output_tensors_.push_back(tensor);
  }

+  std::vector<std::shared_ptr<MLUTensor>>* MutableInputs() {
+    return &input_tensors_;
+  }
+
+  std::vector<std::shared_ptr<MLUTensor>>* MutableOutputs() {
+    return &output_tensors_;
+  }
+
  void FuseOp(cnmlBaseOp_t op) { CNML_CALL(cnmlFuseOp(op, fusion_op_)); }

  void Compile(cnmlCoreVersion_t core_version, int core_number) {
@@ -100,15 +108,18 @@ class Graph {
    CNML_CALL(cnmlSetFusionOpCorenum(fusion_op_, core_number));
    CNML_CALL(cnmlSetFusionOpCoreVersion(fusion_op_, core_version));
    CNML_CALL(cnmlCompileFusionOp_V2(fusion_op_));
-    for (auto in : input_tensors_) {
-      input_addrs_.push_back(in->mlu_data());
-    }
-    for (auto out : output_tensors_) {
-      output_addrs_.push_back(out->mlu_data());
-    }
  }

  void Compute(cnrtInvokeFuncParam_t forward_param, cnrtQueue_t que) {
+    input_addrs_.resize(input_tensors_.size());
+    output_addrs_.resize(output_tensors_.size());
+    for (size_t i = 0; i < input_addrs_.size(); ++i) {
+      input_addrs_[i] = input_tensors_[i]->mlu_data();
+    }
+    for (size_t i = 0; i < output_addrs_.size(); ++i) {
+      output_addrs_[i] = output_tensors_[i]->mlu_data();
+    }
+
 #if PRINT_HW_TIME
    thread_local float hw_time;
    CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que));
@@ -159,7 +170,7 @@ class Graph {
      CNML_CALL(cnmlBindConstData_V2(
          nodes_[tensor_name]->mlu_tensor(), alloc_data, false));
    } else if (fp_type_ == CNML_DATA_FLOAT16) {
-      void* data_fp16 = RegisterConstData<::paddle::lite::fluid::float16>(len);
+      void* data_fp16 = RegisterConstData<paddle::lite::fluid::float16>(len);
      CNRT_CALL(
          cnrtCastDataType(const_cast<void*>(static_cast<const void*>(data)),
                           CNRT_FLOAT32,
@@ -174,7 +185,7 @@ class Graph {
    }
  }

-  void BindConstData(std::string tensor_name, ::paddle::lite::Tensor* tensor) {
+  void BindConstData(std::string tensor_name, paddle::lite::Tensor* tensor) {
    const float* data = tensor->data<float>();
    size_t len = tensor->data_size();
    if (fp_type_ == CNML_DATA_FLOAT32) {
@@ -183,10 +194,14 @@ class Graph {
          const_cast<void*>(static_cast<const void*>(data)),
          false));
    } else if (fp_type_ == CNML_DATA_FLOAT16) {
-      auto* data_fp16 = tensor->mutable_data<::paddle::lite::fluid::float16>();
-      for (size_t i = 0; i < len; ++i) {
-        data_fp16[i] = static_cast<::paddle::lite::fluid::float16>(data[i]);
-      }
+      void* data_fp16 = RegisterConstData<paddle::lite::fluid::float16>(len);
+      CNRT_CALL(
+          cnrtCastDataType(const_cast<void*>(static_cast<const void*>(data)),
+                           CNRT_FLOAT32,
+                           data_fp16,
+                           CNRT_FLOAT16,
+                           len,
+                           nullptr));
      CNML_CALL(cnmlBindConstData_V2(nodes_[tensor_name]->mlu_tensor(),
                                     static_cast<void*>(data_fp16),
                                     false));
@@ -207,12 +222,13 @@ class Graph {
    CNML_CALL(cnmlDestroyQuantizedParam(&quant_param));
  }

-  void SetFPType(::paddle::lite_api::PrecisionType type) {
+  void SetFPType(paddle::lite_api::PrecisionType type) {
+    origin_fp_type_ = type;
    switch (type) {
-      case ::paddle::lite_api::PrecisionType::kFP16:
+      case paddle::lite_api::PrecisionType::kFP16:
        fp_type_ = CNML_DATA_FLOAT16;
        break;
-      case ::paddle::lite_api::PrecisionType::kFloat:
+      case paddle::lite_api::PrecisionType::kFloat:
        fp_type_ = CNML_DATA_FLOAT32;
        break;
      default:
@@ -224,6 +240,7 @@ class Graph {

 private:
  cnmlDataType_t fp_type_{CNML_DATA_FLOAT32};
+  paddle::lite_api::PrecisionType origin_fp_type_{PRECISION(kFloat)};
  std::unordered_map<std::string, std::shared_ptr<MLUTensor>> nodes_;
  std::vector<cnmlTensor_t> inputs_;
  std::vector<cnmlTensor_t> outputs_;

--- a/lite/kernels/mlu/bridges/tensor.cc
+++ b/lite/kernels/mlu/bridges/tensor.cc
@@ -46,6 +46,7 @@ void MLUTensor::remember(const std::vector<int>& shape,
                         cnmlDataOrder_t shape_order) {
  tensor_type_ = tensor_type;
  mlu_dtype_ = mlu_dtype;
+  origin_shape_.assign(shape.begin(), shape.end());

  int size = 4;
  if (shape.size() > 4 || shape_order == CNML_ARRAY) {

--- a/lite/kernels/mlu/bridges/tensor.h
+++ b/lite/kernels/mlu/bridges/tensor.h
@@ -51,6 +51,8 @@ class MLUTensor {

  void set_mlu_dtype(cnmlDataType_t type) { mlu_dtype_ = type; }

+  const std::vector<int64_t>& get_origin_shape() const { return origin_shape_; }
+
  ~MLUTensor();

  void ToFile(std::string file_name);
@@ -59,6 +61,7 @@ class MLUTensor {
  cnmlTensor_t mlu_tensor_;

  std::vector<int> shape_;
+  std::vector<int64_t> origin_shape_;
  cnmlTensorType_t tensor_type_;
  cnmlDataType_t mlu_dtype_;
  int dim_{0};

--- a/lite/kernels/mlu/subgraph_compute.h
+++ b/lite/kernels/mlu/subgraph_compute.h
@@ -14,6 +14,7 @@

 #pragma once

+#include <map>
 #include <memory>
 #include <string>
 #include <vector>
@@ -40,11 +41,10 @@ class SubgraphEngine : public subgraph::Engine {
                 const std::vector<std::string>& input_names,
                 const std::vector<std::string>& output_names,
                 Scope* scope,
-                 ::paddle::lite_api::PrecisionType type)
+                 paddle::lite_api::PrecisionType type)
      : subgraph::Engine(
-            ctx, block_idx, block_desc, input_names, output_names, scope) {
-    graph_.SetFPType(type);
-  }
+            ctx, block_idx, block_desc, input_names, output_names, scope),
+        fp_type_(type) {}

  int Build() {
    // In order to attach all of the ops of the block desc, we need to build
@@ -72,24 +72,44 @@ class SubgraphEngine : public subgraph::Engine {
    return 0;
  }

+  bool InputShapeChanged() {
+    std::vector<std::vector<int64_t>> new_shape;
+    for (auto origin_itensor : origin_itensors_) {
+      new_shape.push_back(origin_itensor->dims().Vectorize());
+    }
+    inputs_shape_ = new_shape;
+    if (shape_graph_map_.count(inputs_shape_) > 0) {
+      return false;
+    }
+    return true;
+  }
+
 protected:
  int BuildDeviceProgram() override {
    int status = 0;
+    auto graph = std::make_shared<paddle::lite::subgraph::mlu::Graph>();
+    graph->SetFPType(fp_type_);
+    std::vector<std::vector<int64_t>> new_shape;
+    origin_itensors_.clear();
+    origin_otensors_.clear();
+
    // Convert all of input data vars and added into the MLU IR graph
+    status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED;
    for (auto& input_name : input_names_) {
      auto input_tensor = scope_->FindMutableTensor(input_name);
+
+      origin_itensors_.push_back(input_tensor);
+      new_shape.push_back(input_tensor->dims().Vectorize());
+
      CHECK(input_tensor);
-      auto input_node =
-          graph_.AddNode(input_name,
-                         input_tensor->dims().Vectorize(),
-                         CNML_TENSOR,
-                         CNML_NCHW,
-                         graph_.FPType(),
-                         const_cast<void*>(input_tensor->raw_data()));
+      auto input_node = graph->AddNode(input_name,
+                                       input_tensor->dims().Vectorize(),
+                                       CNML_TENSOR,
+                                       CNML_NCHW,
+                                       graph->FPType());
      CHECK(input_node);
      // MLU doesn't support dynamic dimensions/shapes, so need to rebuild
      // the program when the shape of any input tensor is changed.
-      status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED;
    }
    LOG(INFO) << "START TO CONVERT ";
    // Convert all of ops and its weights and added into the MLU IR graph
@@ -106,7 +126,7 @@ class SubgraphEngine : public subgraph::Engine {
      }
      auto kernel = inst.kernel();
      status |= bridges.Select(op_type, TARGET(kMLU))(
-          reinterpret_cast<void*>(&graph_),
+          reinterpret_cast<void*>(graph.get()),
          const_cast<OpLite*>(op),
          const_cast<KernelBase*>(kernel));
      if (subgraph::CHECK_FAILED(status)) {
@@ -115,33 +135,51 @@ class SubgraphEngine : public subgraph::Engine {
    }
    // Obtain the output nodes of the MLU IR graph and build the graph to MLU
    // runtime
-    std::vector<std::string> valid_output_names;
    for (auto& output_name : output_names_) {
-      if (graph_.HasNode(output_name)) {
-        graph_.AddOutput(graph_.GetNode(output_name));
+      if (graph->HasNode(output_name)) {
+        graph->AddOutput(graph->GetNode(output_name));
        auto output_tensor = scope_->FindMutableTensor(output_name);
-        void* p_data = static_cast<void*>(
-            output_tensor->mutable_data<typename ::paddle::lite::subgraph::mlu::
-                                            FPTypeTraits<Precision>::T>(
-                TARGET(kMLU)));
-        auto node = graph_.GetNode(output_name);
-        CHECK(p_data);
-        node->set_mlu_ptr(p_data);
-        valid_output_names.push_back(output_name);
+        origin_otensors_.push_back(output_tensor);
+
+        // auto node = graph->GetNode(output_name);
+        // CHECK(p_data);
+        // node->set_mlu_ptr(p_data);
      }
    }
    for (auto& input_name : input_names_) {
-      graph_.AddInput(graph_.GetNode(input_name));
+      graph->AddInput(graph->GetNode(input_name));
    }
-    CHECK(!valid_output_names.empty()) << "[MLU] no valid output names";
+
+    CHECK(!origin_otensors_.empty()) << "[MLU] no valid output names";
    auto& mlu_context = this->ctx_->template As<MLUContext>();
    auto core_version = mlu_context.MLUCoreVersion();
    auto core_number = mlu_context.MLUCoreNumber();
-    graph_.Compile(core_version, core_number);
+    graph->Compile(core_version, core_number);
+    shape_graph_map_[new_shape] = graph;
    return status;
  }

  int LaunchDeviceProgram() override {
+    // prepare input and output memory
+    auto graph = shape_graph_map_[inputs_shape_];
+    auto* graph_input = graph->MutableInputs();
+    auto* graph_output = graph->MutableOutputs();
+    CHECK_EQ(graph_input->size(), origin_itensors_.size());
+    CHECK_EQ(graph_output->size(), origin_otensors_.size());
+
+    for (size_t i = 0; i < origin_itensors_.size(); ++i) {
+      graph_input->at(i)->set_mlu_ptr(
+          const_cast<void*>(origin_itensors_[i]->raw_data()));
+    }
+    for (size_t i = 0; i < origin_otensors_.size(); ++i) {
+      origin_otensors_[i]->Resize(graph_output->at(i)->get_origin_shape());
+      void* p_data = static_cast<void*>(
+          origin_otensors_[i]
+              ->mutable_data<typename paddle::lite::subgraph::mlu::FPTypeTraits<
+                  Precision>::T>(TARGET(kMLU)));
+      graph_output->at(i)->set_mlu_ptr(p_data);
+    }
+
    auto& mlu_context = this->ctx_->template As<MLUContext>();
    auto exec_queue = mlu_context.exec_queue();
    u32_t affinity = mlu_context.affinity();
@@ -150,11 +188,13 @@ class SubgraphEngine : public subgraph::Engine {
    forward_param.data_parallelism = &data_param;
    forward_param.affinity = &affinity;
    forward_param.end = CNRT_PARAM_END;
-    graph_.Compute(forward_param, exec_queue);
+
+    graph->Compute(forward_param, exec_queue);

    // // =========== DUMP ===================
    // for (auto input_name : input_names_) {
-    //   auto input_tensor = graph_.GetNode(input_name);
+    //   auto input_tensor =
+    //   shape_graph_map_[inputs_shape_]->GetNode(input_name);
    //   auto dump_name = input_name;
    //   while (dump_name.find("/") != std::string::npos) {
    //     dump_name = dump_name.replace(dump_name.find("/"), 1, "_");
@@ -163,8 +203,9 @@ class SubgraphEngine : public subgraph::Engine {
    //   input_tensor->ToFile(dump_name);
    // }
    // for (auto output_name : output_names_) {
-    //   if (graph_.HasNode(output_name)) {
-    //     auto output_tensor = graph_.GetNode(output_name);
+    //   if (shape_graph_map_[inputs_shape_]->HasNode(output_name)) {
+    //     auto output_tensor =
+    //     shape_graph_map_[inputs_shape_]->GetNode(output_name);
    //     auto dump_name = output_name;
    //     while (dump_name.find("/") != std::string::npos) {
    //       dump_name = dump_name.replace(dump_name.find("/"), 1, "_");
@@ -180,7 +221,11 @@ class SubgraphEngine : public subgraph::Engine {
    return 0;
  }

-  paddle::lite::subgraph::mlu::Graph graph_;
+  paddle::lite_api::PrecisionType fp_type_;
+  std::vector<std::vector<int64_t>> inputs_shape_{};
+  std::map<std::vector<std::vector<int64_t>>,
+           std::shared_ptr<paddle::lite::subgraph::mlu::Graph>>
+      shape_graph_map_{};
 };

 template <PrecisionType Precision>