Unverified commit d14e57f7, authored by Leonardo-Ding, committed via GitHub

[ARM] optimize depthwise int8 f3s1 arm neon kernel,test=develop (#4125)

Parent 42cefe1b
The source diff for one file is too large to display; you can view the blob instead.
@@ -106,6 +106,42 @@ void conv_depthwise_3x3s1_int8(Dtype* dout,
                               int padh,
                               ARMContext* ctx);
+void conv_depthwise_3x3s1_int8_int8_impl(int8_t* dout,
+                                         const int8_t* din,
+                                         const int8_t* weights,
+                                         const float* scale,
+                                         const float* bias,
+                                         bool flag_bias,
+                                         int flag_act,
+                                         float* alpha,
+                                         int num,
+                                         int chin,
+                                         int hin,
+                                         int win,
+                                         int hout,
+                                         int wout,
+                                         int padw,
+                                         int padh,
+                                         ARMContext* ctx);
+
+void conv_depthwise_3x3s1_int8_float_impl(float* dout,
+                                          const int8_t* din,
+                                          const int8_t* weights,
+                                          const float* scale,
+                                          const float* bias,
+                                          bool flag_bias,
+                                          int flag_act,
+                                          float* alpha,
+                                          int num,
+                                          int chin,
+                                          int hin,
+                                          int win,
+                                          int hout,
+                                          int wout,
+                                          int padw,
+                                          int padh,
+                                          ARMContext* ctx);
+
template <typename Dtype>
void conv_depthwise_3x3s2_int8(Dtype* dout,
                               const int8_t* din,
......
@@ -814,24 +814,52 @@ void conv_depthwise_3x3_int8_fp32(const void* din,
      alpha[3] = local_alpha;
    }
  }
+ bool support_act_type = flag_act <= 1;
+ bool support_pad_type =
+     (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]) &&
+     (paddings[0] == paddings[2]) && (paddings[0] == 0 || paddings[0] == 1);
+ bool support_stride_type = (param.strides[0] == 1 && param.strides[1] == 1);
+ bool support_width_type = w_in > 9 ? true : false;
  if (stride == 1) {
+   if (!support_act_type || !support_pad_type || !support_stride_type ||
+       !support_width_type) {
      conv_depthwise_3x3s1_int8(reinterpret_cast<float*>(dout),
                                reinterpret_cast<const int8_t*>(din),
                                reinterpret_cast<const int8_t*>(weights),
                                scale,
                                bias,
                                flag_bias,
                                flag_act,
                                alpha,
                                num,
                                ch_in,
                                h_in,
                                w_in,
                                h_out,
                                w_out,
                                pad_w,
                                pad_h,
                                ctx);
+   } else {
+     conv_depthwise_3x3s1_int8_float_impl(
+         reinterpret_cast<float*>(dout),
+         reinterpret_cast<const int8_t*>(din),
+         reinterpret_cast<const int8_t*>(weights),
+         scale,
+         bias,
+         flag_bias,
+         flag_act,
+         alpha,
+         num,
+         ch_in,
+         h_in,
+         w_in,
+         h_out,
+         w_out,
+         pad_w,
+         pad_h,
+         ctx);
+   }
  } else if (stride == 2) {
    conv_depthwise_3x3s2_int8(reinterpret_cast<float*>(dout),
                              reinterpret_cast<const int8_t*>(din),
@@ -897,24 +925,52 @@ void conv_depthwise_3x3_int8_int8(const void* din,
      alpha[3] = local_alpha;
    }
  }
+ bool support_act_type = flag_act <= 1;
+ bool support_pad_type =
+     (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]) &&
+     (paddings[0] == paddings[2]) && (paddings[0] == 0 || paddings[0] == 1);
+ bool support_stride_type = (param.strides[0] == 1 && param.strides[1] == 1);
+ bool support_width_type = w_in > 9 ? true : false;
  if (stride == 1) {
+   if (!support_act_type || !support_pad_type || !support_stride_type ||
+       !support_width_type) {
      conv_depthwise_3x3s1_int8(reinterpret_cast<int8_t*>(dout),
                                reinterpret_cast<const int8_t*>(din),
                                reinterpret_cast<const int8_t*>(weights),
                                scale,
                                bias,
                                flag_bias,
                                flag_act,
                                alpha,
                                num,
                                ch_in,
                                h_in,
                                w_in,
                                h_out,
                                w_out,
                                pad_w,
                                pad_h,
                                ctx);
+   } else {
+     conv_depthwise_3x3s1_int8_int8_impl(
+         reinterpret_cast<int8_t*>(dout),
+         reinterpret_cast<const int8_t*>(din),
+         reinterpret_cast<const int8_t*>(weights),
+         scale,
+         bias,
+         flag_bias,
+         flag_act,
+         alpha,
+         num,
+         ch_in,
+         h_in,
+         w_in,
+         h_out,
+         w_out,
+         pad_w,
+         pad_h,
+         ctx);
+   }
  } else if (stride == 2) {
    conv_depthwise_3x3s2_int8(reinterpret_cast<int8_t*>(dout),
                              reinterpret_cast<const int8_t*>(din),
......
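Both dispatch sites above gate the new f3s1 kernels on the same four conditions: the activation is none or ReLU (flag_act <= 1), all four paddings are equal and either 0 or 1, both strides are 1, and the input width is greater than 9. A condensed sketch of that predicate follows; the helper name use_dw_3x3s1_int8_fast_path is illustrative only and is not part of the patch, which inlines the checks at each call site.

#include <vector>

// Illustrative summary of the fast-path check added above (not in the patch).
// flag_act: 0 = no activation, 1 = ReLU; anything else falls back to the
// generic conv_depthwise_3x3s1_int8 kernel.
static bool use_dw_3x3s1_int8_fast_path(int flag_act,
                                        const std::vector<int>& paddings,
                                        const std::vector<int>& strides,
                                        int w_in) {
  bool support_act = flag_act <= 1;
  bool support_pad = paddings[0] == paddings[1] &&
                     paddings[2] == paddings[3] &&
                     paddings[0] == paddings[2] &&
                     (paddings[0] == 0 || paddings[0] == 1);
  bool support_stride = strides[0] == 1 && strides[1] == 1;
  bool support_width = w_in > 9;
  return support_act && support_pad && support_stride && support_width;
}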
@@ -31,7 +31,6 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
  auto paddings = *param.paddings;
  // select dw conv kernel
  if (kw == 3) {
-   // VLOG(5) << "invoke 3x3 dw conv fp32";
    bool pads_less = ((paddings[1] < 2) && (paddings[3] < 2));
    if (pads_less && paddings[0] == paddings[2] &&
        (paddings[0] == 0 || paddings[0] == 1)) {
@@ -54,7 +53,6 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
    kernel_func_name_ = "conv_depthwise_3x3_fp32";
#endif
  } else if (kw == 5) {
-   // VLOG(5) << "invoke 5x5 dw conv fp32";
    auto strides = param.strides;
    if ((strides[0] == 1 && strides[1] == 1) ||
        (strides[0] == 2 && strides[1] == 2)) {
@@ -104,23 +102,44 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() {
      w_scale_[i] = scale[i] * in_scale;
    }
  }
+ auto paddings = *param.paddings;
+ auto strides = param.strides;
+ auto x_dims = param.x->dims();
+ int iw = x_dims[3];
+ int ih = x_dims[2];
+ auto act_param = param.activation_param;
+ bool has_act = act_param.has_active;
+ lite_api::ActivationType act_type = act_param.active_type;
+ // no activation and relu activation is supported now
+ bool support_act_type =
+     (has_act == false) ||
+     (has_act == true && act_type == lite_api::ActivationType::kRelu);
+ bool support_pad_type =
+     (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]) &&
+     (paddings[0] == paddings[2]) && (paddings[0] == 0 || paddings[0] == 1);
+ bool support_stride_type = (strides[0] == 1 && strides[1] == 1);
+ bool support_width_type = iw > 9 ? true : false;
  /// select dw conv kernel
  if (kw == 3) {
    // trans weights
-   // VLOG(5) << "invoke 3x3 dw conv int8 kernel fp32 out";
    impl_ = lite::arm::math::conv_depthwise_3x3_int8_fp32;
#ifdef LITE_WITH_PROFILE
    kernel_func_name_ = "conv_depthwise_3x3_int8_fp32";
#endif
+   if (!support_act_type || !support_pad_type || !support_stride_type ||
+       !support_width_type) {
      int cround = ROUNDUP(w_dims[0], 8);
      weights_.Resize({cround / 8, 1, kh * kw, 8});
      auto wptr = param.filter->data<int8_t>();
      auto wptr_new = weights_.mutable_data<int8_t>();
      lite::arm::math::conv_trans_weights_numc(wptr, wptr_new, oc, 1, 8, 9);
      flag_trans_weights_ = true;
+   } else {
+     flag_trans_weights_ = false;
+   }
  } else if (kw == 5) {
    // trans weights
-   // VLOG(5) << "invoke 5x5 dw conv int8 kernel fp32 out";
    impl_ = lite::arm::math::conv_depthwise_5x5_int8_fp32;
#ifdef LITE_WITH_PROFILE
    kernel_func_name_ = "conv_depthwise_5x5_int8_fp32";
@@ -175,23 +194,45 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
    param.activation_param.Relu_clipped_coef =
        param.activation_param.Relu_clipped_coef / param.output_scale;
  }
+ auto paddings = *param.paddings;
+ auto strides = param.strides;
+ auto x_dims = param.x->dims();
+ int iw = x_dims[3];
+ int ih = x_dims[2];
+ auto act_param = param.activation_param;
+ bool has_act = act_param.has_active;
+ lite_api::ActivationType act_type = act_param.active_type;
+ // no activation and relu activation is supported now
+ bool support_act_type =
+     (has_act == false) ||
+     (has_act == true && act_type == lite_api::ActivationType::kRelu);
+ bool support_pad_type =
+     (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]) &&
+     (paddings[0] == paddings[2]) && (paddings[0] == 0 || paddings[0] == 1);
+ bool support_stride_type = (strides[0] == 1 && strides[1] == 1);
+ bool support_width_type = iw > 9 ? true : false;
  /// select dw conv kernel
  if (kw == 3) {
    // trans weights
-   // VLOG(5) << "invoke 3x3 dw conv int8 kernel int8 out";
    impl_ = lite::arm::math::conv_depthwise_3x3_int8_int8;
#ifdef LITE_WITH_PROFILE
    kernel_func_name_ = "conv_depthwise_3x3_int8_int8";
#endif
+   if (!support_act_type || !support_pad_type || !support_stride_type ||
+       !support_width_type) {
      int cround = ROUNDUP(w_dims[0], 8);
      weights_.Resize({cround / 8, 1, kh * kw, 8});
      auto wptr = param.filter->data<int8_t>();
      auto wptr_new = weights_.mutable_data<int8_t>();
      lite::arm::math::conv_trans_weights_numc(wptr, wptr_new, oc, 1, 8, 9);
      flag_trans_weights_ = true;
+   } else {
+     flag_trans_weights_ = false;
+   }
  } else if (kw == 5) {
    // trans weights
-   // VLOG(5) << "invoke 5x5 dw conv int8 kernel int8 out";
    impl_ = lite::arm::math::conv_depthwise_5x5_int8_int8;
#ifdef LITE_WITH_PROFILE
    kernel_func_name_ = "conv_depthwise_5x5_int8_int8";
@@ -283,7 +324,7 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
  auto w_dims = param.filter->dims();
  auto o_dims = param.output->dims();
- int iw = x_dims[3];  // nchw
+ int iw = x_dims[3];
  int ih = x_dims[2];
  int ic = x_dims[1];
  int bs = x_dims[0];
@@ -333,7 +374,7 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
  auto w_dims = param.filter->dims();
  auto o_dims = param.output->dims();
- int iw = x_dims[3];  // nchw
+ int iw = x_dims[3];
  int ih = x_dims[2];
  int ic = x_dims[1];
  int bs = x_dims[0];
......
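In the two PrepareForRun hunks above, only the fallback path repacks the int8 depthwise filter into blocks of 8 output channels; when the f3s1 fast path is taken, the filter keeps its original layout and flag_trans_weights_ stays false. The shape arithmetic behind the fallback repack is sketched below as a standalone snippet; the channel count is an example value, not something from the patch.

#include <cstdio>

// Illustrative only: the shape math behind the fallback weight repack above.
// ROUNDUP(oc, 8) pads the output-channel count to a multiple of 8 so the
// generic NEON kernel can process 8 depthwise channels per block; the new
// f3s1 kernels consume the original filter layout and skip this step.
int main() {
  const int oc = 35;                      // example output-channel count
  const int kh = 3, kw = 3;               // 3x3 depthwise filter
  const int cround = ((oc + 7) / 8) * 8;  // ROUNDUP(oc, 8) -> 40
  // The fallback path resizes weights_ to {cround / 8, 1, kh * kw, 8} and
  // fills it with conv_trans_weights_numc(wptr, wptr_new, oc, 1, 8, 9).
  std::printf("repacked weight dims: {%d, 1, %d, 8}\n", cround / 8, kh * kw);
  return 0;
}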
@@ -125,7 +125,7 @@ void release_param(ConvParam* param) {
#ifdef LITE_WITH_ARM
#include "lite/backends/arm/math/funcs.h"
-void test_conv_int8(const std::vector<DDim>& input_dims,
+void test_conv_int8(const DDim& dim_in,
                    const DDim& weight_dim,
                    int group,
                    const std::vector<int>& strides,
@@ -237,241 +237,234 @@ void test_conv_int8(const std::vector<DDim>& input_dims,
    conv_int8_fp32.SetContext(std::move(ctx2));
    /// set param and context
-   for (auto& dim_in : input_dims) {
    param_int8_out.x->Resize(dim_in);
    DDim out_tmp_dims = compute_out_dim(dim_in, param_int8_out);
    if (out_tmp_dims[2] < 1 || out_tmp_dims[3] < 1) {
-     continue;
+     return;
    }
    param_fp32_out.x->Resize(dim_in);
    param_int8_out.output->Resize(out_tmp_dims);
    param_fp32_out.output->Resize(out_tmp_dims);
-     break;
-   }
    conv_int8_int8.SetParam(param_int8_out);
    conv_int8_fp32.SetParam(param_fp32_out);
    /// prepare for run
    conv_int8_int8.PrepareForRun();
    conv_int8_fp32.PrepareForRun();
-   for (auto& dim_in : input_dims) {
    CHECK_EQ(weight_dim[1] * group, dim_in[1])
        << "input channel must equal to weights channel";
    DDim dim_out = compute_out_dim(dim_in, param_int8_out);
    if (dim_out[2] < 1 || dim_out[3] < 1) {
      continue;
    }
    delete param_fp32_out.output;
    param_fp32_out.output = new Tensor;
    param_fp32_out.output->set_precision(PRECISION(kFloat));
    delete param_int8_out.output;
    param_int8_out.output = new Tensor;
    param_int8_out.output->set_precision(PRECISION(kInt8));

    param_int8_out.x->Resize(dim_in);
    param_int8_out.output->Resize(dim_out);
    param_fp32_out.x->Resize(dim_in);
    param_fp32_out.output->Resize(dim_out);

    Tensor tin_fp32;
    tin_fp32.Resize(dim_in);
    tin_fp32.set_precision(PRECISION(kFloat));
    Tensor tout_basic_fp32;
    Tensor tout_basic_int8;

    paddle::lite::fill_tensor_rand(*param_int8_out.x, -127, 127);
    param_fp32_out.x->CopyDataFrom(*param_int8_out.x);

    auto din_fp32 = tin_fp32.mutable_data<float>();
    paddle::lite::arm::math::int8_to_fp32(param_int8_out.x->data<int8_t>(),
                                          din_fp32,
                                          scale_in.data(),
                                          1,
                                          1,
                                          dim_in.production());

    if (FLAGS_check_result) {
      tout_basic_fp32.set_precision(PRECISION(kFloat));
      tout_basic_fp32.Resize(dim_out);
      tout_basic_int8.set_precision(PRECISION(kInt8));
      tout_basic_int8.Resize(dim_out);
      fill_tensor_const(tout_basic_fp32, 0.f);
      auto dout_basic_fp32 = tout_basic_fp32.mutable_data<float>();
      auto dout_basic_int8 = tout_basic_int8.mutable_data<int8_t>();
      conv_basic<float, float>(din_fp32,
                               dout_basic_fp32,
                               dim_in[0],
                               dim_out[1],
                               dim_out[2],
                               dim_out[3],
                               dim_in[1],
                               dim_in[2],
                               dim_in[3],
                               wptr_fp32,
                               bptr_fp32,
                               group,
                               weight_dim[3],
                               weight_dim[2],
                               strides[1],
                               strides[0],
                               dilas[1],
                               dilas[0],
                               pads[2],
                               pads[0],
                               flag_bias,
                               flag_act,
                               six,
                               alpha);
      paddle::lite::arm::math::fp32_to_int8(dout_basic_fp32,
                                            dout_basic_int8,
                                            scale_out.data(),
                                            1,
                                            1,
                                            dim_out.production());
    }
    double gops = 2.0 * dim_out.production() * dim_in[1] * weight_dim[2] *
                  weight_dim[3] / group;
    /// warm up
    for (int i = 0; i < FLAGS_warmup; ++i) {
-     conv_int8_int8.Launch();
+     conv_int8_fp32.Launch();
    }
    /// compute fp32 output
    Timer t0;
    for (int i = 0; i < FLAGS_repeats; ++i) {
      t0.Start();
      conv_int8_fp32.Launch();
      t0.Stop();
    }
    LOG(INFO) << "int8 conv, fp32 output: output shape" << dim_out
              << ",running time, avg: " << t0.LapTimes().Avg() << " ms"
              << ", min time: " << t0.LapTimes().Min() << " ms"
              << ", total GOPS: " << 1e-9 * gops
              << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg()
              << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min();

    // compute int8 output
    t0.Reset();
    for (int i = 0; i < FLAGS_repeats; ++i) {
      t0.Start();
      conv_int8_int8.Launch();
      t0.Stop();
    }
    LOG(INFO) << "int8 conv, int8 output: output shape" << dim_out
              << ",running time, avg: " << t0.LapTimes().Avg()
              << ", min time: " << t0.LapTimes().Min()
              << ", total GOPS: " << 1e-9 * gops
              << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg()
              << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min();

    /// compare result fp32 output
    if (FLAGS_check_result) {
      double max_ratio = 0;
      double max_diff = 0;
      tensor_cmp_host(
          tout_basic_fp32, *param_fp32_out.output, max_ratio, max_diff);
      LOG(INFO) << "FP32 compare result, max diff: " << max_diff
                << ", max ratio: " << max_ratio;
      if (std::abs(max_ratio) > 1e-5f) {
        if (max_diff > 5e-5f) {
          LOG(WARNING) << "basic result";
          print_tensor(tout_basic_fp32);
          LOG(WARNING) << "lite result";
          print_tensor(*param_fp32_out.output);
          Tensor tdiff;
          tdiff.Resize(tout_basic_fp32.dims());
          tdiff.set_precision(PRECISION(kFloat));
          tensor_diff(tout_basic_fp32, *param_fp32_out.output, tdiff);
          print_tensor(tdiff);
          release_param(&param_int8_out);
          release_param(&param_fp32_out);
          LOG(FATAL) << "test int8 conv, fp32 out: input: " << dim_in
                     << ", output: " << dim_out
                     << ", weight dim: " << weight_dim << ", pad: " << pads[0]
                     << ", " << pads[1] << ", " << pads[2] << ", " << pads[3]
                     << ", stride: " << strides[0] << ", " << strides[1]
                     << ", dila_: " << dilas[0] << ", " << dilas[1]
                     << ", group: " << group
                     << ", bias: " << (flag_bias ? "true" : "false")
                     << ", act: " << flag_act << ", threads: " << th
                     << ", power_mode: " << cls << " failed!!\n";
        }
      }
    }
    // compare result int8 output
    if (FLAGS_check_result) {
      double max_ratio = 0;
      double max_diff = 0;
      // ! int8
      tensor_cmp_host(
          tout_basic_int8, *param_int8_out.output, max_ratio, max_diff);
      LOG(INFO) << "int8 compare result, max diff: " << max_diff
                << ", max ratio: " << max_ratio;
      if (fabs(max_diff) > 0) {
        Tensor tdiff;
        tdiff.Resize(tout_basic_int8.dims());
        tdiff.set_precision(PRECISION(kInt8));
        tensor_diff(tout_basic_int8, *param_int8_out.output, tdiff);
        auto ptr = tdiff.data<int8_t>();
        auto ptr_basic_fp32 = tout_basic_fp32.data<float>();
        float count = 0;
        bool check = true;
        for (int i = 0; i < tdiff.numel(); ++i) {
          if (abs(ptr[i]) > 1) {
            check = false;
            LOG(ERROR) << "basic float data: " << ptr_basic_fp32[i]
                       << ", after scale: "
                       << ptr_basic_fp32[i] / scale_out[0];
            break;
          }
          if (ptr[i] != 0) {
            LOG(ERROR) << "basic float data: " << ptr_basic_fp32[i]
                       << ", after scale: "
                       << ptr_basic_fp32[i] / scale_out[0];
            count += 1;
          }
        }
        check = check &&
                count < std::max(10, static_cast<int>(0.01 * tdiff.numel()));
        if (!check) {
          LOG(WARNING) << "int8 basic result";
          print_tensor(tout_basic_int8);
          LOG(WARNING) << "int8 lite result";
          print_tensor(*param_int8_out.output);
          LOG(WARNING) << "int8 diff tensor";
          print_tensor(tdiff);
          release_param(&param_int8_out);
          release_param(&param_fp32_out);
          LOG(FATAL) << "test int8 conv, int8 out: input: " << dim_in
                     << ", output: " << dim_out
                     << ", weight dim: " << weight_dim << ", pad: " << pads[0]
                     << ", " << pads[1] << ", " << pads[2] << ", " << pads[3]
                     << ", stride: " << strides[0] << ", " << strides[1]
                     << ", dila_: " << dilas[0] << ", " << dilas[1]
                     << ", bias: " << (flag_bias ? "true" : "false")
                     << ", act: " << flag_act << ", threads: " << th
                     << ", power_mode: " << cls << " failed!!\n";
        }
      }
    }
    LOG(INFO) << "test int8 conv: input: " << dim_in
              << ", output: " << dim_out << ", weight dim: " << weight_dim
              << ", pad: " << pads[0] << ", " << pads[1] << ", " << pads[2]
              << ", " << pads[3] << ", stride: " << strides[0] << ", "
              << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1]
              << ", bias: " << (flag_bias ? "true" : "false")
              << ", act: " << flag_act << ", threads: " << th
              << ", power_mode: " << cls << " successed!!\n";
-     }
    }
  }
  release_param(&param_int8_out);
  release_param(&param_fp32_out);
}
#else
-void test_conv_int8(const std::vector<DDim>& input_dims,
+void test_conv_int8(const DDim& dims_in,
                    const DDim& weight_dim,
                    int group,
                    const std::vector<int>& strides,
@@ -493,25 +486,24 @@ TEST(TestConv3x3DWInt8, test_conv3x3_depthwise) {
  for (auto& flag_bias : {false, true}) {
    for (auto& flag_act : {0, 1, 2, 4}) {
      for (auto& c : {1, 3, 5, 8, 16, 32}) {
-       std::vector<DDim> dims;
        DDim weights_dim({c, 1, 3, 3});
        for (auto& batch : {1, 2}) {
          for (auto& h : {1, 3, 15, 33}) {
-           dims.push_back(DDim({batch, c, h, h}));
+           DDim dims({batch, c, h, h});
+           test_conv_int8(dims,
+                          weights_dim,
+                          c,
+                          {stride, stride},
+                          {pad, pad, pad, pad},
+                          {1, 1},
+                          flag_bias,
+                          flag_act,
+                          {FLAGS_threads},
+                          {FLAGS_power_mode},
+                          FLAGS_clipped_coef,
+                          FLAGS_leakey_relu_alpha);
          }
        }
-       test_conv_int8(dims,
-                      weights_dim,
-                      c,
-                      {stride, stride},
-                      {pad, pad, pad, pad},
-                      {1, 1},
-                      flag_bias,
-                      flag_act,
-                      {4},
-                      {FLAGS_power_mode},
-                      FLAGS_clipped_coef,
-                      FLAGS_leakey_relu_alpha);
      }
    }
  }
@@ -529,25 +521,24 @@ TEST(TestConv5x5DWInt8, test_conv5x5_depthwise) {
  for (auto& flag_bias : {false, true}) {
    for (auto& flag_act : {0, 1, 2, 4}) {
      for (auto& c : {1, 5, 15, 33}) {
-       std::vector<DDim> dims;
        DDim weights_dim({c, 1, 5, 5});
        for (auto& batch : {1, 2}) {
          for (auto& h : {1, 3, 15, 33, 112, 224}) {
-           dims.push_back(DDim({batch, c, h, h}));
+           DDim dims({batch, c, h, h});
            test_conv_int8(dims,
                           weights_dim,
                           c,
                           {stride, stride},
                           {pad, pad, pad, pad},
                           {1, 1},
                           flag_bias,
                           flag_act,
                           {1, 4},
                           {FLAGS_power_mode},
                           FLAGS_clipped_coef,
                           FLAGS_leakey_relu_alpha);
          }
        }
      }
    }
  }
@@ -565,28 +556,27 @@ TEST(TestConv1x1s1Int8, test_conv1x1s1) {
  for (auto& g : {1, 2}) {
    for (auto& flag_bias : {false, true}) {
      for (auto& flag_act : {0, 1, 2, 4}) {
-       std::vector<DDim> dims;
        if (cin % g != 0 || cout % g != 0) {
          continue;
        }
        DDim weights_dim({cout, cin / g, 1, 1});
        for (auto& batch : {1, 2}) {
          for (auto& h : {1, 9, 16, 33}) {
-           dims.push_back(DDim({batch, cin, h, h}));
+           DDim dims({batch, cin, h, h});
            test_conv_int8(dims,
                           weights_dim,
                           g,
                           {1, 1},
                           {0, 0, 0, 0},
                           {1, 1},
                           flag_bias,
                           flag_act,
                           {4},
                           {FLAGS_power_mode},
                           FLAGS_clipped_coef,
                           FLAGS_leakey_relu_alpha);
          }
        }
      }
    }
  }
@@ -606,29 +596,29 @@ TEST(TestConv3x3s1Int8, test_conv_3x3s1) {
  for (auto& pad_left : {1, 2}) {
    for (auto& pad_right : {1, 2}) {
      for (auto& flag_bias : {false, true}) {
-       for (auto& flag_act : {0, 1, 2, 4}) {
+       for (auto& flag_act : {0, 1}) {
-         std::vector<DDim> dims;
          DDim weights_dim({cout, cin, 3, 3});
          for (auto& batch : {1, 2}) {
            for (auto& h : {1, 7, 17, 33}) {
-             dims.push_back(DDim({batch, cin, h, h}));
+             DDim dims({batch, cin, h, h});
              if (cin == 1 && cout == 1) {
                continue;
              }
              test_conv_int8(
                  dims,
                  weights_dim,
                  1,
                  {1, 1},
                  {pad_top, pad_bottom, pad_left, pad_right},
                  {1, 1},
                  flag_bias,
                  flag_act,
                  {4},
                  {FLAGS_power_mode},
                  FLAGS_clipped_coef,
                  FLAGS_leakey_relu_alpha);
            }
          }
        }
      }
    }
  }
@@ -652,25 +642,25 @@ TEST(TestConv3x3s2Int8, test_conv_3x3s2) {
  for (auto& pad_right : {1, 2}) {
    for (auto& flag_bias : {false, true}) {
      for (auto& flag_act : {0, 1, 2, 4}) {
-       std::vector<DDim> dims;
        DDim weights_dim({cout, cin, 3, 3});
        for (auto& batch : {1, 2}) {
          for (auto& h : {1, 7, 19, 33}) {
-           dims.push_back(DDim({batch, cin, h, h}));
+           DDim dims({batch, cin, h, h});
            test_conv_int8(
                dims,
                weights_dim,
                1,
                {2, 2},
                {pad_top, pad_bottom, pad_left, pad_right},
                {1, 1},
                flag_bias,
                flag_act,
                {4},
                {FLAGS_power_mode},
                FLAGS_clipped_coef,
                FLAGS_leakey_relu_alpha);
          }
        }
      }
    }
  }
@@ -702,26 +692,27 @@ TEST(TestConvRandInt8, test_conv_rand) {
        if (cin % g != 0 || cout % g != 0) {
          break;
        }
-       std::vector<DDim> dims;
        DDim weights_dim({cout, cin / g, kh, kw});
        for (auto& batch : {1, 2}) {
          for (auto& h : {1, 3, 5, 19}) {
-           dims.push_back(DDim({batch, cin, h, h}));
+           DDim dims({batch, cin, h, h});
            test_conv_int8(
                dims,
                weights_dim,
                g,
                {stride, stride},
                {pad_top, pad_bottom, pad_left, pad_right},
                {dila, dila},
                flag_bias,
                flag_act,
                {4},
                {FLAGS_power_mode},
                FLAGS_clipped_coef,
                FLAGS_leakey_relu_alpha);
          }
        }
      }
    }
  }
......
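After the refactor above, test_conv_int8 takes one input shape per call instead of a vector of shapes, and each TEST case drives it once per generated DDim. A minimal usage sketch for a 3x3 depthwise case follows; the concrete shape and flag values are illustrative, and the parameter names in the comments are inferred from the call sites in this file rather than taken from the patch.

// Illustrative call of the refactored helper: one DDim per invocation.
DDim dim_in({1, 32, 112, 112});   // example NCHW input shape
DDim weights_dim({32, 1, 3, 3});  // depthwise 3x3 filter, one per channel
test_conv_int8(dim_in,
               weights_dim,
               32,                  // group == channel count for depthwise
               {1, 1},              // strides
               {1, 1, 1, 1},        // paddings
               {1, 1},              // dilations
               true,                // flag_bias
               1,                   // flag_act: ReLU
               {4},                 // thread counts to sweep
               {FLAGS_power_mode},  // power modes to sweep
               FLAGS_clipped_coef,
               FLAGS_leakey_relu_alpha);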