Commit ef5c0165 authored by zhaoying, committed by jackzhang235

(ref):

    1. disable the conv activation pass by default
    2. set fc_fuser's with_relu param to false, since the MLU fc kernel does
       not support relu
    3. change the fc filter shape from 2-dim to 4-dim when the input dim == 4
       (see the sketch below the commit metadata)
    4. add a ToFile func to the MLU tensor for debugging convenience
    5. enable 4-dim input in elementwise_ops
    6. add transpose2d in utility.cc
Parent 4d35336b
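As a rough illustration of item 3 in the message above (not code from this commit), the sketch below shows one way the fc filter shape can be chosen when the input is 4-dim: the 2-dim weight {K, N} becomes a 4-dim filter {N, C, H, W} with K == C * H * W. The helper name ChooseCnmlWShape and the error handling are made up.

```cpp
#include <cstdint>
#include <stdexcept>
#include <vector>

// Hypothetical helper mirroring the shape logic added to fc_op.cc below.
// w_dims = {K, N} (input size, output size); x_dims = {n, c, h, w}.
std::vector<int64_t> ChooseCnmlWShape(const std::vector<int64_t>& w_dims,
                                      const std::vector<int64_t>& x_dims) {
  if (x_dims.size() == 4) {
    if (x_dims[1] * x_dims[2] * x_dims[3] != w_dims[0]) {
      throw std::runtime_error(
          "fc expects input_c * input_h * input_w == filter_c");
    }
    // 4-dim filter: {output_size, input_c, input_h, input_w}
    return {w_dims[1], x_dims[1], x_dims[2], x_dims[3]};
  }
  // 2-dim case keeps the transposed {output_size, input_size} layout.
  return {w_dims[1], w_dims[0]};
}
```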
......@@ -24,8 +24,13 @@ namespace mir {
void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
#ifdef LITE_WITH_X86
#ifdef LITE_WITH_MLU
fusion::FcFuser fuser(false);
fuser(graph.get());
#else
fusion::FcFuser fuser(true);
fuser(graph.get());
#endif
#endif
fusion::FcFuser fuser2(false);
......
......@@ -60,9 +60,12 @@ class Optimizer {
"lite_conv_elementwise_fuse_pass", // conv-elemwise-bn
"lite_conv_bn_fuse_pass", //
"lite_conv_elementwise_fuse_pass", // conv-bn-elemwise
// TODO(Superjomn) Refine the fusion related design to select fusion
// kernels for devices automatically.
"lite_conv_activation_fuse_pass", //
// TODO(Superjomn) Refine the fusion related design to select fusion
// kernels for devices automatically.
#ifndef LITE_WITH_MLU  // MLU cannot handle the conv-conv pattern because the
                       // kernel picker expects an int8 conv2d kernel
"lite_conv_activation_fuse_pass", //
#endif
"lite_var_conv_2d_activation_fuse_pass", //
"lite_fc_fuse_pass", //
"lite_shuffle_channel_fuse_pass", //
......
......@@ -74,10 +74,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
padding_algorithm,
input_dims,
filter_dims);
bool is_group_mode = false;
if (groups > 1) {
is_group_mode = true;
}
bool is_group_mode = groups > 1;
bool is_depthwise_mode = false;
if (filter_dims[0] == groups && filter_dims[1] == 1 && dilations[0] == 1 &&
......@@ -86,26 +83,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
is_group_mode = false;
}
// ================ DEBUG =======================
VLOG(4) << "conv2d op input_var_name : " << input_var_name << std::endl;
VLOG(4) << "conv2d op : filter_var_name " << filter_var_name << std::endl;
VLOG(4) << "conv2d op : output_var_name " << output_var_name << std::endl;
VLOG(4) << "conv2d op : groups " << groups << std::endl;
VLOG(4) << "conv2d op : is_depthwise_mode " << is_depthwise_mode<< std::endl;
VLOG(4) << "conv2d op : is_group_mode " << is_group_mode << std::endl;
// ================ DEBUG EDN =======================
const auto output_shape_nhwc = DimNCHW2NHWC(output_shape);
const auto output_tensor = graph->AddNode(output_var_name,
output_shape,
CNML_TENSOR,
CNML_NHWC,
graph->FPType());
scope->FindVar(output_var_name)
->GetMutable<::paddle::lite::Tensor>()
->Resize(output_shape_nhwc);
const auto output_tensor = graph->AddNode(
output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType());
std::vector<int64_t> cnml_filter_shape = {
filter_dims[0], filter_dims[1], filter_dims[2], filter_dims[3]};
if (is_depthwise_mode) {
......@@ -118,11 +97,11 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
}
// Create filter node
std::shared_ptr<MLUTensor> filter_tensor = graph->AddNode(filter_var_name,
cnml_filter_shape,
CNML_FILTER,
CNML_NCHW,
graph->FPType());
const auto filter_tensor = graph->AddNode(filter_var_name,
cnml_filter_shape,
CNML_FILTER,
CNML_NCHW,
graph->FPType());
const auto weight_scale =
op_info->GetAttr<std::vector<float>>("weight_scale");
......
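For readers skimming the conv hunk above, here is a hedged, self-contained restatement of the group/depthwise selection around is_group_mode and is_depthwise_mode; the tail of the truncated if-condition and the SelectConvMode wrapper are assumptions, not code from the commit.

```cpp
#include <cstdint>
#include <vector>

struct ConvMode {
  bool is_group_mode;
  bool is_depthwise_mode;
};

// Depthwise when each group handles exactly one input channel and dilation
// is 1 (the second dilation check is assumed from the truncated hunk);
// otherwise plain group mode whenever groups > 1.
ConvMode SelectConvMode(const std::vector<int64_t>& filter_dims,
                        const std::vector<int>& dilations,
                        int groups) {
  bool is_group_mode = groups > 1;
  bool is_depthwise_mode = false;
  if (filter_dims[0] == groups && filter_dims[1] == 1 && dilations[0] == 1 &&
      dilations[1] == 1) {
    is_depthwise_mode = true;
    is_group_mode = false;  // depthwise takes precedence over group mode
  }
  return {is_group_mode, is_depthwise_mode};
}

// Example: filter {32, 1, 3, 3}, dilations {1, 1}, groups 32 -> depthwise.
```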
......@@ -23,7 +23,7 @@ namespace mlu {
std::vector<int64_t> CvtYShape(const Tensor& x, Tensor* y, int axis) {
auto x_dims = x.dims();
CHECK_EQ(x_dims.size(), 4UL) << "[MLU] Only support 4-dimension x";
// CHECK_EQ(x_dims.size(), 4UL) << "[MLU] Only support 4-dimension x";
auto y_dims = y->dims();
CHECK_GE(x_dims.size(), y_dims.size());
......
......@@ -45,9 +45,28 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK_EQ(w_dims.size(), 2UL);
// Create w node
std::vector<int64_t> w_shape{w_dims[1], w_dims[0]};
std::vector<int64_t> cnml_w_shape;
if (x_dims.size() == 4) {
if (x_dims[1] * x_dims[2] * x_dims[3] == w_dims[0]) {
cnml_w_shape = {
static_cast<int>(w_dims[1]),
static_cast<int>(x_dims[1]), // input_c
static_cast<int>(x_dims[2]), // input_h
static_cast<int>(x_dims[3]), // input_w
};
} else {
LOG(FATAL)
<< "in fc op, we expect input_h * input_w * input_c == filter_c"
<< " but we got input_c = " << x_dims[1] << " input_h = " << x_dims[2]
<< " input_w = " << x_dims[3] << " filter_c = " << w_dims[0]
<< std::endl;
}
} else {
cnml_w_shape = {w_dims[1], w_dims[0]};
}
auto w_tensor = graph->AddNode(
w_var_name, w_shape, CNML_FILTER, CNML_NCHW, graph->FPType());
w_var_name, cnml_w_shape, CNML_FILTER, CNML_NCHW, graph->FPType());
auto input_scale = op_info->GetAttr<float>("input_scale");
......@@ -88,18 +107,46 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (w->precision() == PrecisionType::kUnk ||
w->precision() == PrecisionType::kInt8) {
std::vector<float> w_dequant(w->data_size());
dequant(w_dequant.data(),
w->mutable_data<int8_t>(),
1,
w_dims[1],
w_dims[0],
weight_scale);
for (int i = 0; i < w_dims[1]; i++) {
for (int j = 0; j < w_dims[0]; j++) {
w->mutable_data<float>()[i * w_dims[0] + j] =
w_dequant[i + j * w_dims[1]];
}
if (cnml_w_shape.size() == 2) {
dequant(w_dequant.data(),
w->mutable_data<int8_t>(),
1,
cnml_w_shape[0],
cnml_w_shape[1],
weight_scale);
transpose2d(w_dequant.data(),
w->mutable_data<float>(),
{static_cast<int>(cnml_w_shape[0]),
static_cast<int>(cnml_w_shape[1])});
} else if (cnml_w_shape.size() == 4) {
dequant(w_dequant.data(),
w->mutable_data<int8_t>(),
1,
cnml_w_shape[0],
cnml_w_shape[1] * cnml_w_shape[2] * cnml_w_shape[3],
weight_scale);
int c_o_num = cnml_w_shape[0];
int c_i_num = cnml_w_shape[1];
int h_i_num = cnml_w_shape[2];
int w_i_num = cnml_w_shape[3];
// chw == ci * hi * wi == w_dims[0]
// first trans [chw, co] -> [co,chw]
std::vector<float> first_trans_output(w_dequant.size());
int chw = c_i_num * h_i_num * w_i_num;
transpose2d(w_dequant.data(), first_trans_output.data(), {chw, c_o_num});
// second trans [co,ci,hi,wi] -> [co,hi,wi,ci]
transpose(first_trans_output.data(),
w->mutable_data<float>(),
{c_o_num, c_i_num, h_i_num, w_i_num},
{0, 2, 3, 1});
} else {
LOG(FATAL) << "expect w_shape.size == 2 or 4, but got "
<< cnml_w_shape.size() << std::endl;
}
w->set_precision(PrecisionType::kFloat);
} else if (w->precision() != PrecisionType::kFloat) {
LOG(FATAL) << "UnSupported weight precision!";
......
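The 4-dim branch above reorders the dequantized fc weight in two steps: first [chw, co] -> [co, chw], then [co, ci, hi, wi] -> [co, hi, wi, ci]. The toy program below walks the same permutation chain on made-up sizes; it does not call the bridge helpers, and every name in it is illustrative.

```cpp
#include <iostream>
#include <vector>

// Toy 2-D transpose: `in` is rows x cols, `out` becomes cols x rows.
void Transpose2D(const std::vector<float>& in, std::vector<float>* out,
                 int rows, int cols) {
  for (int i = 0; i < rows; ++i)
    for (int j = 0; j < cols; ++j)
      (*out)[j * rows + i] = in[i * cols + j];
}

// Permutation {0, 2, 3, 1}: [co, ci, hi, wi] -> [co, hi, wi, ci].
void TransposeToCoHiWiCi(const std::vector<float>& in, std::vector<float>* out,
                         int co, int ci, int hi, int wi) {
  for (int o = 0; o < co; ++o)
    for (int c = 0; c < ci; ++c)
      for (int h = 0; h < hi; ++h)
        for (int w = 0; w < wi; ++w)
          (*out)[((o * hi + h) * wi + w) * ci + c] =
              in[((o * ci + c) * hi + h) * wi + w];
}

int main() {
  // Made-up sizes: co = 2 output channels, ci = 2, hi = 1, wi = 2.
  const int co = 2, ci = 2, hi = 1, wi = 2, chw = ci * hi * wi;
  std::vector<float> w_chw_co(chw * co);  // dequantized weight in [chw, co]
  for (size_t i = 0; i < w_chw_co.size(); ++i)
    w_chw_co[i] = static_cast<float>(i);

  std::vector<float> w_co_chw(chw * co), w_final(chw * co);
  Transpose2D(w_chw_co, &w_co_chw, chw, co);                // [chw, co] -> [co, chw]
  TransposeToCoHiWiCi(w_co_chw, &w_final, co, ci, hi, wi);  // -> [co, hi, wi, ci]

  for (float v : w_final) std::cout << v << " ";  // reordered weight buffer
  std::cout << std::endl;
  return 0;
}
```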
......@@ -16,6 +16,7 @@
#include <glog/logging.h>
#include <algorithm>
#include <climits>
#include <string>
#include <vector>
namespace paddle {
......@@ -258,6 +259,59 @@ cnmlTensor_t MLUTensor::mlu_tensor() {
return mlu_tensor_;
}
void MLUTensor::ToFile(std::string file_name) {
if (mlu_ptr_) {
VLOG(5) << "to dump mlu ptr: " << mlu_ptr_ << " to: " << file_name
<< std::endl;
int count = 1;
for (size_t i = 0; i < shape_.size(); i++) {
count *= shape_[i];
}
VLOG(6) << " dump count: " << count << std::endl;
VLOG(6) << " dump shape: " << std::endl;
for (size_t i = 0; i < shape_.size(); i++) {
VLOG(6) << shape_[i] << " ";
}
VLOG(6) << std::endl;
std::vector<float> cpu_data_fp32(count);
// fp16 to fp32
if (mlu_dtype_ == CNML_DATA_FLOAT16) {
VLOG(6) << " convert fp16 to fp32 " << std::endl;
std::vector<uint16_t> cpu_data_fp16(count);
cnrtMemcpy(cpu_data_fp16.data(),
mlu_ptr_,
count * sizeof(uint16_t),
CNRT_MEM_TRANS_DIR_DEV2HOST);
for (size_t i = 0; i < count; i++) {
cnrtConvertHalfToFloat(&(cpu_data_fp32[i]), cpu_data_fp16[i]);
}
} else {
cnrtMemcpy(cpu_data_fp32.data(),
mlu_ptr_,
count * sizeof(float),
CNRT_MEM_TRANS_DIR_DEV2HOST);
}
// trans to nchw
std::vector<float> cpu_data_trans(count);
transpose(
cpu_data_fp32.data(), cpu_data_trans.data(), shape_, {0, 3, 1, 2});
// to file
std::ofstream of;
of.open(file_name, std::ios::out);
for (size_t i = 0; i < count; i++) {
of << cpu_data_trans[i] << std::endl;
}
of.close();
} else {
LOG(FATAL) << "mlu ptr is null ,can not dump mlu content to : " << file_name
<< std::endl;
}
}
MLUTensor::~MLUTensor() {
if (mlu_tensor_ != nullptr) {
CNML_CALL(cnmlDestroyTensor(&mlu_tensor_));
......
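One possible way to drive the new ToFile from engine-side debugging code, in the spirit of the commented-out dump block near the end of this commit; the DumpNodeIfPresent wrapper, the include path, and the Graph type reference are assumptions rather than part of the change.

```cpp
#include <string>

#include "lite/kernels/mlu/bridges/graph.h"  // assumed header for the bridge Graph

// Hypothetical debug helper: dump one graph node to a file named after it.
void DumpNodeIfPresent(paddle::lite::subgraph::mlu::Graph* graph,
                       const std::string& name) {
  if (!graph->HasNode(name)) return;
  auto tensor = graph->GetNode(name);
  std::string dump_name = name;
  // Tensor names may contain '/', which is awkward in file names, so flatten
  // them the same way the commented-out dump code does.
  for (auto& ch : dump_name) {
    if (ch == '/') ch = '_';
  }
  tensor->ToFile(dump_name);  // one float per line, transposed to NCHW
}
```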
......@@ -14,6 +14,8 @@
#pragma once
#include <fstream>
#include <string>
#include <vector>
#include "lite/kernels/mlu/bridges/utility.h"
......@@ -51,6 +53,8 @@ class MLUTensor {
~MLUTensor();
void ToFile(std::string file_name);
private:
cnmlTensor_t mlu_tensor_;
......
......@@ -20,6 +20,21 @@ namespace lite {
namespace subgraph {
namespace mlu {
void transpose2d(float* input_data,
float* output_data,
std::vector<int> input_shape) {
CHECK_EQ(input_shape.size(), 2);
int old_index = -1;
int new_index = -1;
for (size_t i = 0; i < input_shape[0]; i++) {
for (size_t j = 0; j < input_shape[1]; j++) {
old_index = i * input_shape[1] + j;
new_index = j * input_shape[0] + i;
output_data[new_index] = input_data[old_index];
}
}
}
void transpose(float* input_data,
float* output_data,
std::vector<int> input_shape,
......
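A quick usage check for the new transpose2d helper; the test harness and values below are illustrative, not part of the commit.

```cpp
#include <cassert>
#include <vector>

#include "lite/kernels/mlu/bridges/utility.h"  // declares the new transpose2d

int main() {
  // A 2 x 3 row-major matrix:
  //   1 2 3
  //   4 5 6
  std::vector<float> in = {1, 2, 3, 4, 5, 6};
  std::vector<float> out(6);
  paddle::lite::subgraph::mlu::transpose2d(in.data(), out.data(), {2, 3});
  // transpose2d writes the 3 x 2 transpose: 1 4 2 5 3 6
  assert((out == std::vector<float>{1, 4, 2, 5, 3, 6}));
  return 0;
}
```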
......@@ -29,6 +29,9 @@ namespace lite {
namespace subgraph {
namespace mlu {
void transpose2d(float* input_data,
float* output_data,
std::vector<int> input_shape);
void transpose(float* input_data,
float* output_data,
std::vector<int> input_shape,
......
......@@ -151,6 +151,32 @@ class SubgraphEngine : public subgraph::Engine {
forward_param.affinity = &affinity;
forward_param.end = CNRT_PARAM_END;
graph_.Compute(forward_param, exec_queue);
// // =========== DUMP ===================
// for (auto input_name : input_names_) {
// auto input_tensor = graph_.GetNode(input_name);
// auto dump_name = input_name;
// while (dump_name.find("/") != std::string::npos) {
// dump_name = dump_name.replace(dump_name.find("/"), 1, "_");
// }
// VLOG(6) << "dump_name: " << dump_name;
// input_tensor->ToFile(dump_name);
// }
// for (auto output_name : output_names_) {
// if (graph_.HasNode(output_name)) {
// auto output_tensor = graph_.GetNode(output_name);
// auto dump_name = output_name;
// while (dump_name.find("/") != std::string::npos) {
// dump_name = dump_name.replace(dump_name.find("/"), 1, "_");
// }
// VLOG(6) << "dump_name: " << dump_name;
// output_tensor->ToFile(dump_name);
// } else {
// VLOG(6) << "graph does not have " << output_name << " as output"
// << std::endl;
// }
// }
// // =========== DUMP END ================
return 0;
}
......