From ef5c01653051ce276c64e6abaa0ee06505d02e70 Mon Sep 17 00:00:00 2001
From: zhaoying
Date: Wed, 1 Apr 2020 10:40:23 +0800
Subject: [PATCH] (ref): 1. disable the conv-activation fuse pass for MLU
 builds 2. set FcFuser's with_relu param to false because the MLU fc kernel
 does not support relu 3. change the fc filter shape from 2-D to 4-D when the
 input is 4-D 4. add a ToFile function to MLUTensor for debugging convenience
 5. relax the 4-D input restriction in elementwise_ops 6. add transpose2d to
 utility.cc

---
 lite/core/mir/fusion/fc_fuse_pass.cc        |  5 ++
 lite/core/optimizer.h                       |  9 ++-
 lite/kernels/mlu/bridges/conv_op.cc         | 37 +++--------
 lite/kernels/mlu/bridges/elementwise_ops.cc |  2 +-
 lite/kernels/mlu/bridges/fc_op.cc           | 73 +++++++++++++++++----
 lite/kernels/mlu/bridges/tensor.cc          | 54 +++++++++++++++
 lite/kernels/mlu/bridges/tensor.h           |  4 ++
 lite/kernels/mlu/bridges/utility.cc         | 15 +++++
 lite/kernels/mlu/bridges/utility.h          |  3 +
 lite/kernels/mlu/subgraph_compute.h         | 26 ++++++++
 10 files changed, 182 insertions(+), 46 deletions(-)

diff --git a/lite/core/mir/fusion/fc_fuse_pass.cc b/lite/core/mir/fusion/fc_fuse_pass.cc
index 46695be396..6b1ef03198 100644
--- a/lite/core/mir/fusion/fc_fuse_pass.cc
+++ b/lite/core/mir/fusion/fc_fuse_pass.cc
@@ -24,8 +24,13 @@ namespace mir {
 
 void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
 #ifdef LITE_WITH_X86
+#ifdef LITE_WITH_MLU
+  fusion::FcFuser fuser(false);
+  fuser(graph.get());
+#else
   fusion::FcFuser fuser(true);
   fuser(graph.get());
+#endif
 #endif
 
   fusion::FcFuser fuser2(false);
diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h
index 025e5e769c..4348f9eeaa 100644
--- a/lite/core/optimizer.h
+++ b/lite/core/optimizer.h
@@ -60,9 +60,12 @@ class Optimizer {
          "lite_conv_elementwise_fuse_pass",  // conv-elemwise-bn
          "lite_conv_bn_fuse_pass",           //
          "lite_conv_elementwise_fuse_pass",  // conv-bn-elemwise
-         // TODO(Superjomn) Refine the fusion related design to select fusion
-         // kernels for devices automatically.
-         "lite_conv_activation_fuse_pass",   //
+// TODO(Superjomn) Refine the fusion related design to select fusion
+// kernels for devices automatically.
+#ifndef LITE_WITH_MLU  // MLU cannot handle the conv-conv pattern because the
+                       // kernel picker expects an int8 conv2d kernel
+         "lite_conv_activation_fuse_pass",   //
+#endif
          "lite_var_conv_2d_activation_fuse_pass",  //
          "lite_fc_fuse_pass",                      //
          "lite_shuffle_channel_fuse_pass",         //
diff --git a/lite/kernels/mlu/bridges/conv_op.cc b/lite/kernels/mlu/bridges/conv_op.cc
index 2db9cfbd78..67682c5d6c 100644
--- a/lite/kernels/mlu/bridges/conv_op.cc
+++ b/lite/kernels/mlu/bridges/conv_op.cc
@@ -74,10 +74,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
                                padding_algorithm,
                                input_dims,
                                filter_dims);
-  bool is_group_mode = false;
-  if (groups > 1) {
-    is_group_mode = true;
-  }
+  bool is_group_mode = groups > 1;
   bool is_depthwise_mode = false;
 
   if (filter_dims[0] == groups && filter_dims[1] == 1 && dilations[0] == 1 &&
@@ -86,26 +83,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
     is_group_mode = false;
   }
 
-  // ================ DEBUG =======================
-
-  VLOG(4) << "conv2d op input_var_name : " << input_var_name << std::endl;
-  VLOG(4) << "conv2d op : filter_var_name " << filter_var_name << std::endl;
-  VLOG(4) << "conv2d op : output_var_name " << output_var_name << std::endl;
-  VLOG(4) << "conv2d op : groups " << groups << std::endl;
-  VLOG(4) << "conv2d op : is_depthwise_mode " << is_depthwise_mode<< std::endl;
-  VLOG(4) << "conv2d op : is_group_mode " << is_group_mode << std::endl;
-
-  // ================ DEBUG EDN =======================
-
-  const auto output_shape_nhwc = DimNCHW2NHWC(output_shape);
-  const auto output_tensor = graph->AddNode(output_var_name,
-                                            output_shape,
-                                            CNML_TENSOR,
-                                            CNML_NHWC,
-                                            graph->FPType());
-  scope->FindVar(output_var_name)
-      ->GetMutable<::paddle::lite::Tensor>()
-      ->Resize(output_shape_nhwc);
-
+  const auto output_tensor = graph->AddNode(
+      output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType());
   std::vector<int64_t> cnml_filter_shape = {
       filter_dims[0], filter_dims[1], filter_dims[2], filter_dims[3]};
   if (is_depthwise_mode) {
@@ -118,11 +97,11 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   }
 
   // Create filter node
-  std::shared_ptr<MLUTensor> filter_tensor = graph->AddNode(filter_var_name,
-                                                            cnml_filter_shape,
-                                                            CNML_FILTER,
-                                                            CNML_NCHW,
-                                                            graph->FPType());
+  const auto filter_tensor = graph->AddNode(filter_var_name,
+                                            cnml_filter_shape,
+                                            CNML_FILTER,
+                                            CNML_NCHW,
+                                            graph->FPType());
   const auto weight_scale =
       op_info->GetAttr<std::vector<float>>("weight_scale");
diff --git a/lite/kernels/mlu/bridges/elementwise_ops.cc b/lite/kernels/mlu/bridges/elementwise_ops.cc
index 41526a0100..f58b68290c 100644
--- a/lite/kernels/mlu/bridges/elementwise_ops.cc
+++ b/lite/kernels/mlu/bridges/elementwise_ops.cc
@@ -23,7 +23,7 @@ namespace mlu {
 
 std::vector<int64_t> CvtYShape(const Tensor& x, Tensor* y, int axis) {
   auto x_dims = x.dims();
-  CHECK_EQ(x_dims.size(), 4UL) << "[MLU] Only support 4-dimension x";
+  // CHECK_EQ(x_dims.size(), 4UL) << "[MLU] Only support 4-dimension x";
   auto y_dims = y->dims();
   CHECK_GE(x_dims.size(), y_dims.size());
diff --git a/lite/kernels/mlu/bridges/fc_op.cc b/lite/kernels/mlu/bridges/fc_op.cc
index 286feec8d4..b74b18bdbc 100644
--- a/lite/kernels/mlu/bridges/fc_op.cc
+++ b/lite/kernels/mlu/bridges/fc_op.cc
@@ -45,9 +45,28 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   CHECK_EQ(w_dims.size(), 2UL);
 
   // Create w node
-  std::vector<int64_t> w_shape{w_dims[1], w_dims[0]};
+  std::vector<int64_t> cnml_w_shape;
+  if (x_dims.size() == 4) {
+    if (x_dims[1] * x_dims[2] * x_dims[3] == w_dims[0]) {
+      cnml_w_shape = {
+          static_cast<int64_t>(w_dims[1]),
+          static_cast<int64_t>(x_dims[1]),  // input_c
+          static_cast<int64_t>(x_dims[2]),  // input_h
+          static_cast<int64_t>(x_dims[3]),  // input_w
+      };
+    } else {
+      LOG(FATAL)
+          << "in fc op, we expect input_h * input_w * input_c == filter_c"
+          << " but we got input_c = " << x_dims[1] << " input_h = " << x_dims[2]
+          << " input_w = " << x_dims[3] << " filter_c = " << w_dims[0]
+          << std::endl;
+    }
+  } else {
+    cnml_w_shape = {w_dims[1], w_dims[0]};
+  }
+
   auto w_tensor = graph->AddNode(
-      w_var_name, w_shape, CNML_FILTER, CNML_NCHW, graph->FPType());
+      w_var_name, cnml_w_shape, CNML_FILTER, CNML_NCHW, graph->FPType());
 
   auto input_scale = op_info->GetAttr<float>("input_scale");
@@ -88,18 +107,46 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   if (w->precision() == PrecisionType::kUnk ||
       w->precision() == PrecisionType::kInt8) {
     std::vector<float> w_dequant(w->data_size());
-    dequant(w_dequant.data(),
-            w->mutable_data<int8_t>(),
-            1,
-            w_dims[1],
-            w_dims[0],
-            weight_scale);
-    for (int i = 0; i < w_dims[1]; i++) {
-      for (int j = 0; j < w_dims[0]; j++) {
-        w->mutable_data<float>()[i * w_dims[0] + j] =
-            w_dequant[i + j * w_dims[1]];
-      }
+    if (cnml_w_shape.size() == 2) {
+      dequant(w_dequant.data(),
+              w->mutable_data<int8_t>(),
+              1,
+              cnml_w_shape[0],
+              cnml_w_shape[1],
+              weight_scale);
+      transpose2d(w_dequant.data(),
+                  w->mutable_data<float>(),
+                  {static_cast<int>(cnml_w_shape[0]),
+                   static_cast<int>(cnml_w_shape[1])});
+    } else if (cnml_w_shape.size() == 4) {
+      dequant(w_dequant.data(),
+              w->mutable_data<int8_t>(),
+              1,
+              cnml_w_shape[0],
+              cnml_w_shape[1] * cnml_w_shape[2] * cnml_w_shape[3],
+              weight_scale);
+
+      int c_o_num = cnml_w_shape[0];
+      int c_i_num = cnml_w_shape[1];
+      int h_i_num = cnml_w_shape[2];
+      int w_i_num = cnml_w_shape[3];
+
+      // chw == ci * hi * wi == w_dims[0]
+      // first trans: [chw, co] -> [co, chw]
+      std::vector<float> first_trans_output(w_dequant.size());
+      int chw = c_i_num * h_i_num * w_i_num;
+      transpose2d(w_dequant.data(), first_trans_output.data(), {chw, c_o_num});
+
+      // second trans: [co, ci, hi, wi] -> [co, hi, wi, ci]
+      transpose(first_trans_output.data(),
+                w->mutable_data<float>(),
+                {c_o_num, c_i_num, h_i_num, w_i_num},
+                {0, 2, 3, 1});
+    } else {
+      LOG(FATAL) << "expect w_shape.size == 2 or 4, but got "
+                 << cnml_w_shape.size() << std::endl;
     }
+
     w->set_precision(PrecisionType::kFloat);
   } else if (w->precision() != PrecisionType::kFloat) {
     LOG(FATAL) << "UnSupported weight precision!";
diff --git a/lite/kernels/mlu/bridges/tensor.cc b/lite/kernels/mlu/bridges/tensor.cc
index be7e1f09be..c426069c7d 100644
--- a/lite/kernels/mlu/bridges/tensor.cc
+++ b/lite/kernels/mlu/bridges/tensor.cc
@@ -16,6 +16,7 @@
 #include
 #include
 #include
+#include <fstream>
 #include
 
 namespace paddle {
@@ -258,6 +259,59 @@ cnmlTensor_t MLUTensor::mlu_tensor() { return mlu_tensor_; }
 
+void MLUTensor::ToFile(std::string file_name) {
+  if (mlu_ptr_) {
+    VLOG(5) << "to dump mlu ptr: " << mlu_ptr_ << " to: " << file_name
+            << std::endl;
+    int count = 1;
+    for (size_t i = 0; i < shape_.size(); i++) {
+      count *= shape_[i];
+    }
+    VLOG(6) << " dump count: " << count << std::endl;
+    VLOG(6) << " dump shape: " << std::endl;
+    for (size_t i = 0; i < shape_.size(); i++) {
+      VLOG(6) << shape_[i] << " ";
+    }
+
+    VLOG(6) << std::endl;
+
+    std::vector<float> cpu_data_fp32(count);
+    // fp16 to fp32
+    if (mlu_dtype_ == CNML_DATA_FLOAT16) {
+      VLOG(6) << " convert fp16 to fp32 " << std::endl;
+      std::vector<uint16_t> cpu_data_fp16(count);
+      cnrtMemcpy(cpu_data_fp16.data(),
+                 mlu_ptr_,
+                 count * sizeof(uint16_t),
+                 CNRT_MEM_TRANS_DIR_DEV2HOST);
+      for (size_t i = 0; i < count; i++) {
+        cnrtConvertHalfToFloat(&(cpu_data_fp32[i]), cpu_data_fp16[i]);
+      }
+    } else {
+      cnrtMemcpy(cpu_data_fp32.data(),
+                 mlu_ptr_,
+                 count * sizeof(float),
+                 CNRT_MEM_TRANS_DIR_DEV2HOST);
+    }
+
+    // trans to nchw
+    std::vector<float> cpu_data_trans(count);
+    transpose(
+        cpu_data_fp32.data(), cpu_data_trans.data(), shape_, {0, 3, 1, 2});
+
+    // to file
+    std::ofstream of;
+    of.open(file_name, std::ios::out);
+    for (size_t i = 0; i < count; i++) {
+      of << cpu_data_trans[i] << std::endl;
+    }
+    of.close();
+  } else {
+    LOG(FATAL) << "mlu ptr is null, cannot dump mlu content to: " << file_name
+               << std::endl;
+  }
+}
+
 MLUTensor::~MLUTensor() {
   if (mlu_tensor_ != nullptr) {
     CNML_CALL(cnmlDestroyTensor(&mlu_tensor_));
diff --git a/lite/kernels/mlu/bridges/tensor.h b/lite/kernels/mlu/bridges/tensor.h
index 12dc97a772..2a4cc23a73 100644
--- a/lite/kernels/mlu/bridges/tensor.h
+++ b/lite/kernels/mlu/bridges/tensor.h
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#include
+#include <string>
 #include
 
 #include "lite/kernels/mlu/bridges/utility.h"
@@ -51,6 +53,8 @@ class MLUTensor {
 
   ~MLUTensor();
 
+  void ToFile(std::string file_name);
+
  private:
   cnmlTensor_t mlu_tensor_;
diff --git a/lite/kernels/mlu/bridges/utility.cc b/lite/kernels/mlu/bridges/utility.cc
index f18a46518c..a8a19ec94a 100644
--- a/lite/kernels/mlu/bridges/utility.cc
+++ b/lite/kernels/mlu/bridges/utility.cc
@@ -20,6 +20,21 @@ namespace lite {
 namespace subgraph {
 namespace mlu {
 
+void transpose2d(float* input_data,
+                 float* output_data,
+                 std::vector<int> input_shape) {
+  CHECK_EQ(input_shape.size(), 2);
+  int old_index = -1;
+  int new_index = -1;
+  for (size_t i = 0; i < input_shape[0]; i++) {
+    for (size_t j = 0; j < input_shape[1]; j++) {
+      old_index = i * input_shape[1] + j;
+      new_index = j * input_shape[0] + i;
+      output_data[new_index] = input_data[old_index];
+    }
+  }
+}
+
 void transpose(float* input_data,
                float* output_data,
                std::vector<int> input_shape,
diff --git a/lite/kernels/mlu/bridges/utility.h b/lite/kernels/mlu/bridges/utility.h
index fa8fb1597c..cc28ea892e 100644
--- a/lite/kernels/mlu/bridges/utility.h
+++ b/lite/kernels/mlu/bridges/utility.h
@@ -29,6 +29,9 @@ namespace lite {
 namespace subgraph {
 namespace mlu {
 
+void transpose2d(float* input_data,
+                 float* output_data,
+                 std::vector<int> input_shape);
 void transpose(float* input_data,
                float* output_data,
                std::vector<int> input_shape,
diff --git a/lite/kernels/mlu/subgraph_compute.h b/lite/kernels/mlu/subgraph_compute.h
index 51a9c0ffe0..7d9db21133 100644
--- a/lite/kernels/mlu/subgraph_compute.h
+++ b/lite/kernels/mlu/subgraph_compute.h
@@ -151,6 +151,32 @@ class SubgraphEngine : public subgraph::Engine {
     forward_param.affinity = &affinity;
     forward_param.end = CNRT_PARAM_END;
     graph_.Compute(forward_param, exec_queue);
+
+    // // =========== DUMP ===================
+    // for (auto input_name : input_names_) {
+    //   auto input_tensor = graph_.GetNode(input_name);
+    //   auto dump_name = input_name;
+    //   while (dump_name.find("/") != std::string::npos) {
+    //     dump_name = dump_name.replace(dump_name.find("/"), 1, "_");
+    //   }
+    //   VLOG(6) << "dump_name: " << dump_name;
+    //   input_tensor->ToFile(dump_name);
+    // }
+    // for (auto output_name : output_names_) {
+    //   if (graph_.HasNode(output_name)) {
+    //     auto output_tensor = graph_.GetNode(output_name);
+    //     auto dump_name = output_name;
+    //     while (dump_name.find("/") != std::string::npos) {
+    //       dump_name = dump_name.replace(dump_name.find("/"), 1, "_");
+    //     }
+    //     VLOG(6) << "dump_name: " << dump_name;
+    //     output_tensor->ToFile(dump_name);
+    //   } else {
+    //     VLOG(6) << "graph does not have " << output_name << " as output"
output" + // << std::endl; + // } + // } + // // =========== DUMP END ================ return 0; } -- GitLab