From ef5c01653051ce276c64e6abaa0ee06505d02e70 Mon Sep 17 00:00:00 2001
From: zhaoying
Date: Wed, 1 Apr 2020 10:40:23 +0800
Subject: [PATCH] (ref): 1. disable the conv-activation fuse pass for MLU
 builds 2. set FcFuser's with_relu param to false because the MLU fc kernel
 does not support relu 3. change the fc filter shape from 2-D to 4-D when the
 input is 4-D 4. add a ToFile function to MLUTensor for debugging convenience
 5. relax the 4-D input restriction in elementwise_ops 6. add transpose2d to
 utility.cc

---
 lite/core/mir/fusion/fc_fuse_pass.cc        |  5 ++
 lite/core/optimizer.h                       |  9 ++-
 lite/kernels/mlu/bridges/conv_op.cc         | 37 +++--------
 lite/kernels/mlu/bridges/elementwise_ops.cc |  2 +-
 lite/kernels/mlu/bridges/fc_op.cc           | 73 +++++++++++++++++----
 lite/kernels/mlu/bridges/tensor.cc          | 54 +++++++++++++++
 lite/kernels/mlu/bridges/tensor.h           |  4 ++
 lite/kernels/mlu/bridges/utility.cc         | 15 +++++
 lite/kernels/mlu/bridges/utility.h          |  3 +
 lite/kernels/mlu/subgraph_compute.h         | 26 ++++++++
 10 files changed, 182 insertions(+), 46 deletions(-)

diff --git a/lite/core/mir/fusion/fc_fuse_pass.cc b/lite/core/mir/fusion/fc_fuse_pass.cc
index 46695be396..6b1ef03198 100644
--- a/lite/core/mir/fusion/fc_fuse_pass.cc
+++ b/lite/core/mir/fusion/fc_fuse_pass.cc
@@ -24,8 +24,13 @@ namespace mir {
 
 void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
 #ifdef LITE_WITH_X86
+#ifdef LITE_WITH_MLU
+  fusion::FcFuser fuser(false);
+  fuser(graph.get());
+#else
   fusion::FcFuser fuser(true);
   fuser(graph.get());
+#endif
 #endif
 
   fusion::FcFuser fuser2(false);
diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h
index 025e5e769c..4348f9eeaa 100644
--- a/lite/core/optimizer.h
+++ b/lite/core/optimizer.h
@@ -60,9 +60,12 @@ class Optimizer {
          "lite_conv_elementwise_fuse_pass",  // conv-elemwise-bn
          "lite_conv_bn_fuse_pass",           //
          "lite_conv_elementwise_fuse_pass",  // conv-bn-elemwise
-         // TODO(Superjomn) Refine the fusion related design to select fusion
-         // kernels for devices automatically.
-         "lite_conv_activation_fuse_pass",   //
+// TODO(Superjomn) Refine the fusion related design to select fusion
+// kernels for devices automatically.
+#ifndef LITE_WITH_MLU  // MLU cannot handle the conv-conv pattern because the
+                       // kernel picker expects an int8 conv2d kernel
+         "lite_conv_activation_fuse_pass",   //
+#endif
          "lite_var_conv_2d_activation_fuse_pass",  //
          "lite_fc_fuse_pass",                      //
          "lite_shuffle_channel_fuse_pass",         //
diff --git a/lite/kernels/mlu/bridges/conv_op.cc b/lite/kernels/mlu/bridges/conv_op.cc
index 2db9cfbd78..67682c5d6c 100644
--- a/lite/kernels/mlu/bridges/conv_op.cc
+++ b/lite/kernels/mlu/bridges/conv_op.cc
@@ -74,10 +74,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
                                padding_algorithm,
                                input_dims,
                                filter_dims);
-  bool is_group_mode = false;
-  if (groups > 1) {
-    is_group_mode = true;
-  }
+  bool is_group_mode = groups > 1;
   bool is_depthwise_mode = false;
 
   if (filter_dims[0] == groups && filter_dims[1] == 1 && dilations[0] == 1 &&
@@ -86,26 +83,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
     is_group_mode = false;
   }
 
-  // ================ DEBUG =======================
-
-  VLOG(4) << "conv2d op input_var_name : " << input_var_name << std::endl;
-  VLOG(4) << "conv2d op : filter_var_name " << filter_var_name << std::endl;
-  VLOG(4) << "conv2d op : output_var_name " << output_var_name << std::endl;
-  VLOG(4) << "conv2d op : groups " << groups << std::endl;
-  VLOG(4) << "conv2d op : is_depthwise_mode " << is_depthwise_mode<< std::endl;
-  VLOG(4) << "conv2d op : is_group_mode " << is_group_mode << std::endl;
-
-  // ================ DEBUG EDN =======================
-
-  const auto output_shape_nhwc = DimNCHW2NHWC(output_shape);
-  const auto output_tensor = graph->AddNode(output_var_name,
-                                            output_shape,
-                                            CNML_TENSOR,
-                                            CNML_NHWC,
-                                            graph->FPType());
-  scope->FindVar(output_var_name)
-      ->GetMutable<::paddle::lite::Tensor>()
-      ->Resize(output_shape_nhwc);
-
+  const auto output_tensor = graph->AddNode(
+      output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType());
   std::vector<int64_t> cnml_filter_shape = {
       filter_dims[0], filter_dims[1], filter_dims[2], filter_dims[3]};
   if (is_depthwise_mode) {
@@ -118,11 +97,11 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   }
 
   // Create filter node
-  std::shared_ptr<MLUTensor> filter_tensor = graph->AddNode(filter_var_name,
-                                                            cnml_filter_shape,
-                                                            CNML_FILTER,
-                                                            CNML_NCHW,
-                                                            graph->FPType());
+  const auto filter_tensor = graph->AddNode(filter_var_name,
+                                            cnml_filter_shape,
+                                            CNML_FILTER,
+                                            CNML_NCHW,
+                                            graph->FPType());
   const auto weight_scale =
       op_info->GetAttr<std::vector<float>>("weight_scale");
diff --git a/lite/kernels/mlu/bridges/elementwise_ops.cc b/lite/kernels/mlu/bridges/elementwise_ops.cc
index 41526a0100..f58b68290c 100644
--- a/lite/kernels/mlu/bridges/elementwise_ops.cc
+++ b/lite/kernels/mlu/bridges/elementwise_ops.cc
@@ -23,7 +23,7 @@ namespace mlu {
 
 std::vector<int64_t> CvtYShape(const Tensor& x, Tensor* y, int axis) {
   auto x_dims = x.dims();
-  CHECK_EQ(x_dims.size(), 4UL) << "[MLU] Only support 4-dimension x";
+  // CHECK_EQ(x_dims.size(), 4UL) << "[MLU] Only support 4-dimension x";
   auto y_dims = y->dims();
   CHECK_GE(x_dims.size(), y_dims.size());
diff --git a/lite/kernels/mlu/bridges/fc_op.cc b/lite/kernels/mlu/bridges/fc_op.cc
index 286feec8d4..b74b18bdbc 100644
--- a/lite/kernels/mlu/bridges/fc_op.cc
+++ b/lite/kernels/mlu/bridges/fc_op.cc
@@ -45,9 +45,28 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   CHECK_EQ(w_dims.size(), 2UL);
 
   // Create w node
-  std::vector<int64_t> w_shape{w_dims[1], w_dims[0]};
+  std::vector<int64_t> cnml_w_shape;
+  if (x_dims.size() == 4) {
+    if (x_dims[1] * x_dims[2] * x_dims[3] == w_dims[0]) {
+      cnml_w_shape = {
+          static_cast<int64_t>(w_dims[1]),
+          static_cast<int64_t>(x_dims[1]),  // input_c
+          static_cast<int64_t>(x_dims[2]),  // input_h
+          static_cast<int64_t>(x_dims[3]),  // input_w
+      };
+    } else {
+      LOG(FATAL)
+          << "in fc op, we expect input_h * input_w * input_c == filter_c"
+          << " but we got input_c = " << x_dims[1] << " input_h = " << x_dims[2]
+          << " input_w = " << x_dims[3] << " filter_c = " << w_dims[0]
+          << std::endl;
+    }
+  } else {
+    cnml_w_shape = {w_dims[1], w_dims[0]};
+  }
+
   auto w_tensor = graph->AddNode(
-      w_var_name, w_shape, CNML_FILTER, CNML_NCHW, graph->FPType());
+      w_var_name, cnml_w_shape, CNML_FILTER, CNML_NCHW, graph->FPType());
 
   auto input_scale = op_info->GetAttr<float>("input_scale");
@@ -88,18 +107,46 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   if (w->precision() == PrecisionType::kUnk ||
       w->precision() == PrecisionType::kInt8) {
     std::vector<float> w_dequant(w->data_size());
-    dequant(w_dequant.data(),
-            w->mutable_data<int8_t>(),
-            1,
-            w_dims[1],
-            w_dims[0],
-            weight_scale);
-    for (int i = 0; i < w_dims[1]; i++) {
-      for (int j = 0; j < w_dims[0]; j++) {
-        w->mutable_data<float>()[i * w_dims[0] + j] =
-            w_dequant[i + j * w_dims[1]];
-      }
+    if (cnml_w_shape.size() == 2) {
+      dequant(w_dequant.data(),
+              w->mutable_data<int8_t>(),
+              1,
+              cnml_w_shape[0],
+              cnml_w_shape[1],
+              weight_scale);
+      transpose2d(w_dequant.data(),
+                  w->mutable_data<float>(),
+                  {static_cast<int>(cnml_w_shape[0]),
+                   static_cast<int>(cnml_w_shape[1])});
+    } else if (cnml_w_shape.size() == 4) {
+      dequant(w_dequant.data(),
+              w->mutable_data<int8_t>(),
+              1,
+              cnml_w_shape[0],
+              cnml_w_shape[1] * cnml_w_shape[2] * cnml_w_shape[3],
+              weight_scale);
+
+      int c_o_num = cnml_w_shape[0];
+      int c_i_num = cnml_w_shape[1];
+      int h_i_num = cnml_w_shape[2];
+      int w_i_num = cnml_w_shape[3];
+
+      // chw == ci * hi * wi == w_dims[0]
+      // first trans: [chw, co] -> [co, chw]
+      std::vector<float> first_trans_output(w_dequant.size());
+      int chw = c_i_num * h_i_num * w_i_num;
+      transpose2d(w_dequant.data(), first_trans_output.data(), {chw, c_o_num});
+
+      // second trans: [co, ci, hi, wi] -> [co, hi, wi, ci]
+      transpose(first_trans_output.data(),
+                w->mutable_data<float>(),
+                {c_o_num, c_i_num, h_i_num, w_i_num},
+                {0, 2, 3, 1});
+    } else {
+      LOG(FATAL) << "expect w_shape.size == 2 or 4, but got "
+                 << cnml_w_shape.size() << std::endl;
     }
+
     w->set_precision(PrecisionType::kFloat);
   } else if (w->precision() != PrecisionType::kFloat) {
     LOG(FATAL) << "UnSupported weight precision!";
diff --git a/lite/kernels/mlu/bridges/tensor.cc b/lite/kernels/mlu/bridges/tensor.cc
index be7e1f09be..c426069c7d 100644
--- a/lite/kernels/mlu/bridges/tensor.cc
+++ b/lite/kernels/mlu/bridges/tensor.cc
@@ -16,6 +16,7 @@
 #include
 #include
 #include
+#include <fstream>
 #include
 
 namespace paddle {
@@ -258,6 +259,59 @@ cnmlTensor_t MLUTensor::mlu_tensor() { return mlu_tensor_; }
 
+void MLUTensor::ToFile(std::string file_name) {
+  if (mlu_ptr_) {
+    VLOG(5) << "to dump mlu ptr: " << mlu_ptr_ << " to: " << file_name
+            << std::endl;
+    int count = 1;
+    for (size_t i = 0; i < shape_.size(); i++) {
+      count *= shape_[i];
+    }
+    VLOG(6) << " dump count: " << count << std::endl;
+    VLOG(6) << " dump shape: " << std::endl;
+    for (size_t i = 0; i < shape_.size(); i++) {
+      VLOG(6) << shape_[i] << " ";
+    }
+
+    VLOG(6) << std::endl;
+
+    std::vector<float> cpu_data_fp32(count);
+    // fp16 to fp32
+    if (mlu_dtype_ == CNML_DATA_FLOAT16) {
+      VLOG(6) << " convert fp16 to fp32 " << std::endl;
+      std::vector<uint16_t> cpu_data_fp16(count);
+      cnrtMemcpy(cpu_data_fp16.data(),
+                 mlu_ptr_,
+                 count * sizeof(uint16_t),
+                 CNRT_MEM_TRANS_DIR_DEV2HOST);
+      for (size_t i = 0; i < count; i++) {
+        cnrtConvertHalfToFloat(&(cpu_data_fp32[i]), cpu_data_fp16[i]);
+      }
+    } else {
+      cnrtMemcpy(cpu_data_fp32.data(),
+                 mlu_ptr_,
+                 count * sizeof(float),
+                 CNRT_MEM_TRANS_DIR_DEV2HOST);
+    }
+
+    // trans to nchw
+    std::vector<float> cpu_data_trans(count);
+    transpose(
+        cpu_data_fp32.data(), cpu_data_trans.data(), shape_, {0, 3, 1, 2});
+
+    // to file
+    std::ofstream of;
+    of.open(file_name, std::ios::out);
+    for (size_t i = 0; i < count; i++) {
+      of << cpu_data_trans[i] << std::endl;
+    }
+    of.close();
+  } else {
+    LOG(FATAL) << "mlu ptr is null, cannot dump mlu content to: " << file_name
+               << std::endl;
+  }
+}
+
 MLUTensor::~MLUTensor() {
   if (mlu_tensor_ != nullptr) {
     CNML_CALL(cnmlDestroyTensor(&mlu_tensor_));
diff --git a/lite/kernels/mlu/bridges/tensor.h b/lite/kernels/mlu/bridges/tensor.h
index 12dc97a772..2a4cc23a73 100644
--- a/lite/kernels/mlu/bridges/tensor.h
+++ b/lite/kernels/mlu/bridges/tensor.h
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#include
+#include <string>
 #include
 
 #include "lite/kernels/mlu/bridges/utility.h"
@@ -51,6 +53,8 @@ class MLUTensor {
 
   ~MLUTensor();
 
+  void ToFile(std::string file_name);
+
  private:
   cnmlTensor_t mlu_tensor_;
diff --git a/lite/kernels/mlu/bridges/utility.cc b/lite/kernels/mlu/bridges/utility.cc
index f18a46518c..a8a19ec94a 100644
--- a/lite/kernels/mlu/bridges/utility.cc
+++ b/lite/kernels/mlu/bridges/utility.cc
@@ -20,6 +20,21 @@ namespace lite {
 namespace subgraph {
 namespace mlu {
 
+void transpose2d(float* input_data,
+                 float* output_data,
+                 std::vector<int> input_shape) {
+  CHECK_EQ(input_shape.size(), 2);
+  int old_index = -1;
+  int new_index = -1;
+  for (size_t i = 0; i < input_shape[0]; i++) {
+    for (size_t j = 0; j < input_shape[1]; j++) {
+      old_index = i * input_shape[1] + j;
+      new_index = j * input_shape[0] + i;
+      output_data[new_index] = input_data[old_index];
+    }
+  }
+}
+
 void transpose(float* input_data,
                float* output_data,
                std::vector<int> input_shape,
diff --git a/lite/kernels/mlu/bridges/utility.h b/lite/kernels/mlu/bridges/utility.h
index fa8fb1597c..cc28ea892e 100644
--- a/lite/kernels/mlu/bridges/utility.h
+++ b/lite/kernels/mlu/bridges/utility.h
@@ -29,6 +29,9 @@ namespace lite {
 namespace subgraph {
 namespace mlu {
 
+void transpose2d(float* input_data,
+                 float* output_data,
+                 std::vector<int> input_shape);
 void transpose(float* input_data,
                float* output_data,
                std::vector<int> input_shape,
diff --git a/lite/kernels/mlu/subgraph_compute.h b/lite/kernels/mlu/subgraph_compute.h
index 51a9c0ffe0..7d9db21133 100644
--- a/lite/kernels/mlu/subgraph_compute.h
+++ b/lite/kernels/mlu/subgraph_compute.h
@@ -151,6 +151,32 @@ class SubgraphEngine : public subgraph::Engine {
     forward_param.affinity = &affinity;
     forward_param.end = CNRT_PARAM_END;
     graph_.Compute(forward_param, exec_queue);
+
+    // // =========== DUMP ===================
+    // for (auto input_name : input_names_) {
+    //   auto input_tensor = graph_.GetNode(input_name);
+    //   auto dump_name = input_name;
+    //   while (dump_name.find("/") != std::string::npos) {
+    //     dump_name = dump_name.replace(dump_name.find("/"), 1, "_");
+    //   }
+    //   VLOG(6) << "dump_name: " << dump_name;
+    //   input_tensor->ToFile(dump_name);
+    // }
+    // for (auto output_name : output_names_) {
+    //   if (graph_.HasNode(output_name)) {
+    //     auto output_tensor = graph_.GetNode(output_name);
+    //     auto dump_name = output_name;
+    //     while (dump_name.find("/") != std::string::npos) {
+    //       dump_name = dump_name.replace(dump_name.find("/"), 1, "_");
+    //     }
+    //     VLOG(6) << "dump_name: " << dump_name;
+    //     output_tensor->ToFile(dump_name);
+    //   } else {
+    //     VLOG(6) << "graph does not have " << output_name << " as output"
output" + // << std::endl; + // } + // } + // // =========== DUMP END ================ return 0; } -- GitLab