Commit ef5c0165 authored by zhaoying, committed by jackzhang235

(ref):

    1. disable the conv activation pass by default
    2. set fc_fuser's with_relu param to false, since the MLU fc kernel does
       not support relu
    3. change the fc filter shape from 2-dim to 4-dim when the input dim == 4
       (see the sketch below the commit metadata)
    4. add a ToFile func to the MLU tensor for debugging convenience
    5. enable 4-dim input in elementwise_ops
    6. add transpose2d in utility.cc
Parent 4d35336b
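As a rough illustration of item 3 in the message above (not code from this commit), the sketch below shows one way the fc filter shape can be chosen when the input is 4-dim: the 2-dim weight {K, N} becomes a 4-dim filter {N, C, H, W} with K == C * H * W. The helper name ChooseCnmlWShape and the error handling are made up.

```cpp
#include <cstdint>
#include <stdexcept>
#include <vector>

// Hypothetical helper mirroring the shape logic added to fc_op.cc below.
// w_dims = {K, N} (input size, output size); x_dims = {n, c, h, w}.
std::vector<int64_t> ChooseCnmlWShape(const std::vector<int64_t>& w_dims,
                                      const std::vector<int64_t>& x_dims) {
  if (x_dims.size() == 4) {
    if (x_dims[1] * x_dims[2] * x_dims[3] != w_dims[0]) {
      throw std::runtime_error(
          "fc expects input_c * input_h * input_w == filter_c");
    }
    // 4-dim filter: {output_size, input_c, input_h, input_w}
    return {w_dims[1], x_dims[1], x_dims[2], x_dims[3]};
  }
  // 2-dim case keeps the transposed {output_size, input_size} layout.
  return {w_dims[1], w_dims[0]};
}
```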
......@@ -24,8 +24,13 @@ namespace mir {
void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
#ifdef LITE_WITH_X86
#ifdef LITE_WITH_MLU
fusion::FcFuser fuser(false);
fuser(graph.get());
#else
fusion::FcFuser fuser(true);
fuser(graph.get());
#endif
#endif
fusion::FcFuser fuser2(false);
......
......@@ -60,9 +60,12 @@ class Optimizer {
"lite_conv_elementwise_fuse_pass", // conv-elemwise-bn
"lite_conv_bn_fuse_pass", //
"lite_conv_elementwise_fuse_pass", // conv-bn-elemwise
// TODO(Superjomn) Refine the fusion related design to select fusion
// kernels for devices automatically.
"lite_conv_activation_fuse_pass", //
// TODO(Superjomn) Refine the fusion related design to select fusion
// kernels for devices automatically.
#ifndef LITE_WITH_MLU  // MLU cannot handle the conv-conv pattern because the
                       // kernel picker expects an int8 conv2d kernel
"lite_conv_activation_fuse_pass", //
#endif
"lite_var_conv_2d_activation_fuse_pass", //
"lite_fc_fuse_pass", //
"lite_shuffle_channel_fuse_pass", //
......
......@@ -74,10 +74,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
padding_algorithm,
input_dims,
filter_dims);
bool is_group_mode = false;
if (groups > 1) {
is_group_mode = true;
}
bool is_group_mode = groups > 1;
bool is_depthwise_mode = false;
if (filter_dims[0] == groups && filter_dims[1] == 1 && dilations[0] == 1 &&
......@@ -86,26 +83,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
is_group_mode = false;
}
// ================ DEBUG =======================
VLOG(4) << "conv2d op input_var_name : " << input_var_name << std::endl;
VLOG(4) << "conv2d op : filter_var_name " << filter_var_name << std::endl;
VLOG(4) << "conv2d op : output_var_name " << output_var_name << std::endl;
VLOG(4) << "conv2d op : groups " << groups << std::endl;
VLOG(4) << "conv2d op : is_depthwise_mode " << is_depthwise_mode<< std::endl;
VLOG(4) << "conv2d op : is_group_mode " << is_group_mode << std::endl;
// ================ DEBUG EDN =======================
const auto output_shape_nhwc = DimNCHW2NHWC(output_shape);
const auto output_tensor = graph->AddNode(output_var_name,
output_shape,
CNML_TENSOR,
CNML_NHWC,
graph->FPType());
scope->FindVar(output_var_name)
->GetMutable<::paddle::lite::Tensor>()
->Resize(output_shape_nhwc);
const auto output_tensor = graph->AddNode(
output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType());
std::vector<int64_t> cnml_filter_shape = {
filter_dims[0], filter_dims[1], filter_dims[2], filter_dims[3]};
if (is_depthwise_mode) {
......@@ -118,11 +97,11 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
}
// Create filter node
std::shared_ptr<MLUTensor> filter_tensor = graph->AddNode(filter_var_name,
cnml_filter_shape,
CNML_FILTER,
CNML_NCHW,
graph->FPType());
const auto filter_tensor = graph->AddNode(filter_var_name,
cnml_filter_shape,
CNML_FILTER,
CNML_NCHW,
graph->FPType());
const auto weight_scale =
op_info->GetAttr<std::vector<float>>("weight_scale");
......
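For readers skimming the conv hunk above, here is a hedged, self-contained restatement of the group/depthwise selection around is_group_mode and is_depthwise_mode; the tail of the truncated if-condition and the SelectConvMode wrapper are assumptions, not code from the commit.

```cpp
#include <cstdint>
#include <vector>

struct ConvMode {
  bool is_group_mode;
  bool is_depthwise_mode;
};

// Depthwise when each group handles exactly one input channel and dilation
// is 1 (the second dilation check is assumed from the truncated hunk);
// otherwise plain group mode whenever groups > 1.
ConvMode SelectConvMode(const std::vector<int64_t>& filter_dims,
                        const std::vector<int>& dilations,
                        int groups) {
  bool is_group_mode = groups > 1;
  bool is_depthwise_mode = false;
  if (filter_dims[0] == groups && filter_dims[1] == 1 && dilations[0] == 1 &&
      dilations[1] == 1) {
    is_depthwise_mode = true;
    is_group_mode = false;  // depthwise takes precedence over group mode
  }
  return {is_group_mode, is_depthwise_mode};
}

// Example: filter {32, 1, 3, 3}, dilations {1, 1}, groups 32 -> depthwise.
```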
......@@ -23,7 +23,7 @@ namespace mlu {
std::vector<int64_t> CvtYShape(const Tensor& x, Tensor* y, int axis) {
auto x_dims = x.dims();
CHECK_EQ(x_dims.size(), 4UL) << "[MLU] Only support 4-dimension x";
// CHECK_EQ(x_dims.size(), 4UL) << "[MLU] Only support 4-dimension x";
auto y_dims = y->dims();
CHECK_GE(x_dims.size(), y_dims.size());
......
......@@ -45,9 +45,28 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK_EQ(w_dims.size(), 2UL);
// Create w node
std::vector<int64_t> w_shape{w_dims[1], w_dims[0]};
std::vector<int64_t> cnml_w_shape;
if (x_dims.size() == 4) {
if (x_dims[1] * x_dims[2] * x_dims[3] == w_dims[0]) {
cnml_w_shape = {
static_cast<int>(w_dims[1]),
static_cast<int>(x_dims[1]), // input_c
static_cast<int>(x_dims[2]), // input_h
static_cast<int>(x_dims[3]), // input_w
};
} else {
LOG(FATAL)
<< "in fc op, we expect input_h * input_w * input_c == filter_c"
<< " but we got input_c = " << x_dims[1] << " input_h = " << x_dims[2]
<< " input_w = " << x_dims[3] << " filter_c = " << w_dims[0]
<< std::endl;
}
} else {
cnml_w_shape = {w_dims[1], w_dims[0]};
}
auto w_tensor = graph->AddNode(
w_var_name, w_shape, CNML_FILTER, CNML_NCHW, graph->FPType());
w_var_name, cnml_w_shape, CNML_FILTER, CNML_NCHW, graph->FPType());
auto input_scale = op_info->GetAttr<float>("input_scale");
......@@ -88,18 +107,46 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (w->precision() == PrecisionType::kUnk ||
w->precision() == PrecisionType::kInt8) {
std::vector<float> w_dequant(w->data_size());
dequant(w_dequant.data(),
w->mutable_data<int8_t>(),
1,
w_dims[1],
w_dims[0],
weight_scale);
for (int i = 0; i < w_dims[1]; i++) {
for (int j = 0; j < w_dims[0]; j++) {
w->mutable_data<float>()[i * w_dims[0] + j] =
w_dequant[i + j * w_dims[1]];
}
if (cnml_w_shape.size() == 2) {
dequant(w_dequant.data(),
w->mutable_data<int8_t>(),
1,
cnml_w_shape[0],
cnml_w_shape[1],
weight_scale);
transpose2d(w_dequant.data(),
w->mutable_data<float>(),
{static_cast<int>(cnml_w_shape[0]),
static_cast<int>(cnml_w_shape[1])});
} else if (cnml_w_shape.size() == 4) {
dequant(w_dequant.data(),
w->mutable_data<int8_t>(),
1,
cnml_w_shape[0],
cnml_w_shape[1] * cnml_w_shape[2] * cnml_w_shape[3],
weight_scale);
int c_o_num = cnml_w_shape[0];
int c_i_num = cnml_w_shape[1];
int h_i_num = cnml_w_shape[2];
int w_i_num = cnml_w_shape[3];
// chw == ci * hi * wi == w_dims[0]
// first trans [chw, co] -> [co,chw]
std::vector<float> first_trans_output(w_dequant.size());
int chw = c_i_num * h_i_num * w_i_num;
transpose2d(w_dequant.data(), first_trans_output.data(), {chw, c_o_num});
// second trans [co,ci,hi,wi] -> [co,hi,wi,ci]
transpose(first_trans_output.data(),
w->mutable_data<float>(),
{c_o_num, c_i_num, h_i_num, w_i_num},
{0, 2, 3, 1});
} else {
LOG(FATAL) << "expect w_shape.size == 2 or 4, but got "
<< cnml_w_shape.size() << std::endl;
}
w->set_precision(PrecisionType::kFloat);
} else if (w->precision() != PrecisionType::kFloat) {
LOG(FATAL) << "UnSupported weight precision!";
......
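The 4-dim branch above reorders the dequantized fc weight in two steps: first [chw, co] -> [co, chw], then [co, ci, hi, wi] -> [co, hi, wi, ci]. The toy program below walks the same permutation chain on made-up sizes; it does not call the bridge helpers, and every name in it is illustrative.

```cpp
#include <iostream>
#include <vector>

// Toy 2-D transpose: `in` is rows x cols, `out` becomes cols x rows.
void Transpose2D(const std::vector<float>& in, std::vector<float>* out,
                 int rows, int cols) {
  for (int i = 0; i < rows; ++i)
    for (int j = 0; j < cols; ++j)
      (*out)[j * rows + i] = in[i * cols + j];
}

// Permutation {0, 2, 3, 1}: [co, ci, hi, wi] -> [co, hi, wi, ci].
void TransposeToCoHiWiCi(const std::vector<float>& in, std::vector<float>* out,
                         int co, int ci, int hi, int wi) {
  for (int o = 0; o < co; ++o)
    for (int c = 0; c < ci; ++c)
      for (int h = 0; h < hi; ++h)
        for (int w = 0; w < wi; ++w)
          (*out)[((o * hi + h) * wi + w) * ci + c] =
              in[((o * ci + c) * hi + h) * wi + w];
}

int main() {
  // Made-up sizes: co = 2 output channels, ci = 2, hi = 1, wi = 2.
  const int co = 2, ci = 2, hi = 1, wi = 2, chw = ci * hi * wi;
  std::vector<float> w_chw_co(chw * co);  // dequantized weight in [chw, co]
  for (size_t i = 0; i < w_chw_co.size(); ++i)
    w_chw_co[i] = static_cast<float>(i);

  std::vector<float> w_co_chw(chw * co), w_final(chw * co);
  Transpose2D(w_chw_co, &w_co_chw, chw, co);                // [chw, co] -> [co, chw]
  TransposeToCoHiWiCi(w_co_chw, &w_final, co, ci, hi, wi);  // -> [co, hi, wi, ci]

  for (float v : w_final) std::cout << v << " ";  // reordered weight buffer
  std::cout << std::endl;
  return 0;
}
```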
......@@ -16,6 +16,7 @@
#include <glog/logging.h>
#include <algorithm>
#include <climits>
#include <string>
#include <vector>
namespace paddle {
......@@ -258,6 +259,59 @@ cnmlTensor_t MLUTensor::mlu_tensor() {
return mlu_tensor_;
}
void MLUTensor::ToFile(std::string file_name) {
if (mlu_ptr_) {
VLOG(5) << "to dump mlu ptr: " << mlu_ptr_ << " to: " << file_name
<< std::endl;
int count = 1;
for (size_t i = 0; i < shape_.size(); i++) {
count *= shape_[i];
}
VLOG(6) << " dump count: " << count << std::endl;
VLOG(6) << " dump shape: " << std::endl;
for (size_t i = 0; i < shape_.size(); i++) {
VLOG(6) << shape_[i] << " ";
}
VLOG(6) << std::endl;
std::vector<float> cpu_data_fp32(count);
// fp16 to fp32
if (mlu_dtype_ == CNML_DATA_FLOAT16) {
VLOG(6) << " convert fp16 to fp32 " << std::endl;
std::vector<uint16_t> cpu_data_fp16(count);
cnrtMemcpy(cpu_data_fp16.data(),
mlu_ptr_,
count * sizeof(uint16_t),
CNRT_MEM_TRANS_DIR_DEV2HOST);
for (size_t i = 0; i < count; i++) {
cnrtConvertHalfToFloat(&(cpu_data_fp32[i]), cpu_data_fp16[i]);
}
} else {
cnrtMemcpy(cpu_data_fp32.data(),
mlu_ptr_,
count * sizeof(float),
CNRT_MEM_TRANS_DIR_DEV2HOST);
}
// trans to nchw
std::vector<float> cpu_data_trans(count);
transpose(
cpu_data_fp32.data(), cpu_data_trans.data(), shape_, {0, 3, 1, 2});
// to file
std::ofstream of;
of.open(file_name, std::ios::out);
for (size_t i = 0; i < count; i++) {
of << cpu_data_trans[i] << std::endl;
}
of.close();
} else {
LOG(FATAL) << "mlu ptr is null ,can not dump mlu content to : " << file_name
<< std::endl;
}
}
MLUTensor::~MLUTensor() {
if (mlu_tensor_ != nullptr) {
CNML_CALL(cnmlDestroyTensor(&mlu_tensor_));
......
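One possible way to drive the new ToFile from engine-side debugging code, in the spirit of the commented-out dump block near the end of this commit; the DumpNodeIfPresent wrapper, the include path, and the Graph type reference are assumptions rather than part of the change.

```cpp
#include <string>

#include "lite/kernels/mlu/bridges/graph.h"  // assumed header for the bridge Graph

// Hypothetical debug helper: dump one graph node to a file named after it.
void DumpNodeIfPresent(paddle::lite::subgraph::mlu::Graph* graph,
                       const std::string& name) {
  if (!graph->HasNode(name)) return;
  auto tensor = graph->GetNode(name);
  std::string dump_name = name;
  // Tensor names may contain '/', which is awkward in file names, so flatten
  // them the same way the commented-out dump code does.
  for (auto& ch : dump_name) {
    if (ch == '/') ch = '_';
  }
  tensor->ToFile(dump_name);  // one float per line, transposed to NCHW
}
```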
......@@ -14,6 +14,8 @@
#pragma once
#include <fstream>
#include <string>
#include <vector>
#include "lite/kernels/mlu/bridges/utility.h"
......@@ -51,6 +53,8 @@ class MLUTensor {
~MLUTensor();
void ToFile(std::string file_name);
private:
cnmlTensor_t mlu_tensor_;
......
......@@ -20,6 +20,21 @@ namespace lite {
namespace subgraph {
namespace mlu {
void transpose2d(float* input_data,
float* output_data,
std::vector<int> input_shape) {
CHECK_EQ(input_shape.size(), 2);
int old_index = -1;
int new_index = -1;
for (size_t i = 0; i < input_shape[0]; i++) {
for (size_t j = 0; j < input_shape[1]; j++) {
old_index = i * input_shape[1] + j;
new_index = j * input_shape[0] + i;
output_data[new_index] = input_data[old_index];
}
}
}
void transpose(float* input_data,
float* output_data,
std::vector<int> input_shape,
......
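A quick usage check for the new transpose2d helper; the test harness and values below are illustrative, not part of the commit.

```cpp
#include <cassert>
#include <vector>

#include "lite/kernels/mlu/bridges/utility.h"  // declares the new transpose2d

int main() {
  // A 2 x 3 row-major matrix:
  //   1 2 3
  //   4 5 6
  std::vector<float> in = {1, 2, 3, 4, 5, 6};
  std::vector<float> out(6);
  paddle::lite::subgraph::mlu::transpose2d(in.data(), out.data(), {2, 3});
  // transpose2d writes the 3 x 2 transpose: 1 4 2 5 3 6
  assert((out == std::vector<float>{1, 4, 2, 5, 3, 6}));
  return 0;
}
```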
......@@ -29,6 +29,9 @@ namespace lite {
namespace subgraph {
namespace mlu {
void transpose2d(float* input_data,
float* output_data,
std::vector<int> input_shape);
void transpose(float* input_data,
float* output_data,
std::vector<int> input_shape,
......
......@@ -151,6 +151,32 @@ class SubgraphEngine : public subgraph::Engine {
forward_param.affinity = &affinity;
forward_param.end = CNRT_PARAM_END;
graph_.Compute(forward_param, exec_queue);
// // =========== DUMP ===================
// for (auto input_name : input_names_) {
// auto input_tensor = graph_.GetNode(input_name);
// auto dump_name = input_name;
// while (dump_name.find("/") != std::string::npos) {
// dump_name = dump_name.replace(dump_name.find("/"), 1, "_");
// }
// VLOG(6) << "dump_name: " << dump_name;
// input_tensor->ToFile(dump_name);
// }
// for (auto output_name : output_names_) {
// if (graph_.HasNode(output_name)) {
// auto output_tensor = graph_.GetNode(output_name);
// auto dump_name = output_name;
// while (dump_name.find("/") != std::string::npos) {
// dump_name = dump_name.replace(dump_name.find("/"), 1, "_");
// }
// VLOG(6) << "dump_name: " << dump_name;
// output_tensor->ToFile(dump_name);
// } else {
// VLOG(6) << "graph does not have " << output_name << " as output"
// << std::endl;
// }
// }
// // =========== DUMP END ================
return 0;
}
......