提交 a6b1e4fa 编写于 作者: X Xiaoyang LI 提交者: cyj1986

fix conv_transpose error (#2165)

* fix conv_transpose error

* fix build error, enable basic test of conv_transpose, test=develop
上级 dfce4621
......@@ -306,7 +306,7 @@ function(cc_library TARGET_NAME)
if(${source_file} MATCHES "__generated_code__.cc")
list(APPEND full_path_src ${source_file})
else()
if(NOT ${source_file} MATCHES "framework.pb.cc")
if(NOT ${source_file} MATCHES "framework.pb.cc" AND NOT ${source_file} MATCHES "__generated_code__.cc")
list(APPEND full_path_src ${CMAKE_CURRENT_SOURCE_DIR}/${source_file})
endif()
endif()
......
......@@ -49,10 +49,11 @@ void Conv2DTransposeCompute::PrepareForRun() {
lite::Tensor tmp_weights;
lite::arm::math::prepackA(
&tmp_weights, *(param.filter), 1., m, k, group, true, &ctx);
&tmp_weights, *(param.filter), 1.f, m, k, group, true, &ctx);
param.filter->Resize(tmp_weights.dims());
param.filter->CopyDataFrom(tmp_weights);
param.filter->Resize(w_dims);
is_first_epoch_ = false;
}
void Conv2DTransposeCompute::Run() {
......@@ -96,7 +97,7 @@ void Conv2DTransposeCompute::Run() {
const float* din_batch = din + i * chin * hin * win;
float* dout_batch = dout + i * chout * hout * wout;
float* col_data = static_cast<float*>(ctx.workspace_data<float>()) +
ctx.l2_cache_size() / sizeof(float);
ctx.llc_size() / sizeof(float);
if (flag_1x1s1p1) {
col_data = dout_batch;
}
......@@ -112,7 +113,7 @@ void Conv2DTransposeCompute::Run() {
weights_group,
din_group,
n,
0.,
0.f,
coldata_group,
n,
nullptr,
......
......@@ -2,5 +2,6 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH
lite_cc_test(sgemm_compute_test SRCS sgemm_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(gemm_int8_compute_test SRCS gemm_int8_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(conv_compute_test SRCS conv_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(conv_transpose_compute_test SRCS conv_transpose_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(conv_int8_compute_test SRCS conv_int8_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include "lite/core/context.h"
#include "lite/operators/op_params.h"
#include "lite/tests/utils/naive_math_impl.h"
#include "lite/tests/utils/tensor_utils.h"
#include "lite/tests/utils/timer.h"
#ifdef LITE_WITH_ARM
#include "lite/kernels/arm/conv_transpose_compute.h"
#endif // LITE_WITH_ARM
// Command-line flags (gflags) that configure the tests in this file.

// --- Execution control ---
// Passed as the only entry of `cluster_id`; test_conv_transpose_fp32 casts it
// to paddle::lite_api::PowerMode.
DEFINE_int32(cluster, 3, "cluster id");
// Thread count used by the custom-size test.
DEFINE_int32(threads, 1, "threads num");
// Number of untimed kernel launches before the timed loop.
DEFINE_int32(warmup, 0, "warmup times");
// Number of timed kernel launches.
DEFINE_int32(repeats, 1, "repeats times");
// Enables the parameter sweep in TestConvRand; that sweep is skipped when
// false.
DEFINE_bool(basic_test, false, "do all tests");
// When true, the kernel output is compared against deconv_basic.
DEFINE_bool(check_result, true, "check the result");

// --- Problem description for the custom-size test (TestConvCustom) ---
DEFINE_int32(batch, 1, "batch size");
DEFINE_int32(in_channel, 32, "input channel");
DEFINE_int32(in_height, 32, "input height");
DEFINE_int32(in_width, 32, "input width");
DEFINE_int32(out_channel, 64, "output channel");
// Both in_channel and out_channel must be divisible by group (checked in
// TestConvCustom).
DEFINE_int32(group, 1, "group");
DEFINE_int32(kernel_h, 2, "kernel height");
DEFINE_int32(kernel_w, 2, "kernel width");
DEFINE_int32(pad_h, 0, "pad height");
DEFINE_int32(pad_w, 0, "pad width");
DEFINE_int32(stride_h, 2, "stride height");
DEFINE_int32(stride_w, 2, "stride width");
DEFINE_int32(dila_h, 1, "dilation height");
DEFINE_int32(dila_w, 1, "dilation width");
DEFINE_bool(flag_relu, false, "do relu");
DEFINE_bool(flag_bias, false, "with bias");
// Short aliases for the Paddle-Lite types used throughout this test file.
using DDim = paddle::lite::DDim;
using Tensor = paddle::lite::Tensor;
using ConvParam = paddle::lite::operators::ConvParam;
/// Computes the output shape of a transposed convolution.
///
/// The result starts as a copy of `dim_in`. Dimension 1 becomes
/// `filter_dims[1] * groups`; each of the two spatial dimensions (indices 2
/// and 3) becomes
///   (in - 1) * stride + dilation * (kernel - 1) + 1 - 2 * pad.
///
/// @param dim_in input shape; indices 1..3 are read.
/// @param param  supplies the filter dims, groups, strides, paddings and
///               dilations.
/// @return the derived output shape (spatial sizes may be < 1 for invalid
///         configurations; callers filter those out).
DDim compute_out_dim(const DDim& dim_in,
                     const paddle::lite::operators::ConvParam& param) {
  const auto filter_dims = param.filter->dims();

  DDim out_dims = dim_in;
  out_dims[1] = filter_dims[1] * param.groups;

  for (int axis = 2; axis < 4; ++axis) {
    const int spatial = axis - 2;  // index into strides/paddings/dilations
    const int kernel_extent =
        param.dilations[spatial] * (filter_dims[axis] - 1) + 1;
    const int output_len = (dim_in[axis] - 1) * param.strides[spatial] +
                           kernel_extent - 2 * param.paddings[spatial];
    out_dims[axis] = output_len;
  }
  return out_dims;
}
#ifdef LITE_WITH_ARM
/// Runs the ARM fp32 Conv2DTransposeCompute kernel and, when
/// --check_result is set, compares its output against the naive
/// deconv_basic reference.
///
/// The kernel is re-created for every (cluster, thread count) pair and then
/// launched once per valid entry of `input_dims`.
///
/// @param input_dims  candidate input shapes; entries whose computed output
///                    height or width is < 1 are skipped.
/// @param weight_dim  filter shape; weight_dim[0] must equal the input
///                    channel count (checked with CHECK_EQ).
/// @param group       number of groups.
/// @param strides     {stride_h, stride_w}.
/// @param pads        {pad_h, pad_w}.
/// @param dilas       {dilation_h, dilation_w}.
/// @param flag_bias   allocate a bias tensor and add it.
/// @param flag_relu   fuse ReLU.
/// @param thread_num  thread counts to test.
/// @param cluster_id  values cast to paddle::lite_api::PowerMode.
void test_conv_transpose_fp32(const std::vector<DDim>& input_dims,
const DDim& weight_dim,
int group,
const std::vector<int>& strides,
const std::vector<int>& pads,
const std::vector<int>& dilas,
bool flag_bias,
bool flag_relu,
const std::vector<int>& thread_num,
const std::vector<int>& cluster_id) {
// NOTE(review): this inner guard is redundant, the whole function is already
// inside #ifdef LITE_WITH_ARM.
#ifdef LITE_WITH_ARM
paddle::lite::DeviceInfo::Init();
#endif
// Build the operator parameters. Tensors are raw-allocated here and released
// at the end of the function.
ConvParam param;
param.x = new Tensor;
param.x->set_precision(PRECISION(kFloat));
param.filter = new Tensor;
param.filter->Resize(weight_dim);
param.filter->set_precision(PRECISION(kFloat));
if (flag_bias) {
param.bias = new Tensor;
// One bias value per weight_dim[0].
// NOTE(review): compute_out_dim derives the output channel count as
// filter_dims[1] * groups, and deconv_basic indexes bias by output channel -
// confirm weight_dim[0] is the intended bias length.
param.bias->Resize({weight_dim[0]});
param.bias->set_precision(PRECISION(kFloat));
}
param.strides = strides;
param.paddings = pads;
param.dilations = dilas;
param.fuse_relu = flag_relu;
param.groups = group;
param.output = new Tensor;
param.output->set_precision(PRECISION(kFloat));
// Weights and bias are filled with a constant (random fill is disabled).
// paddle::lite::fill_tensor_rand(*param.filter, -1.f, 1.f);
paddle::lite::fill_tensor_const(*param.filter, 1.f);
if (flag_bias) {
// paddle::lite::fill_tensor_rand(*param.bias, -1.f, 1.f);
paddle::lite::fill_tensor_const(*param.bias, 1.f);
}
// Keep a private copy of the weights for the reference implementation.
// Presumably needed because PrepareForRun() repacks param.filter in place -
// TODO confirm against the kernel.
Tensor tmp_weights;
tmp_weights.Resize(weight_dim);
tmp_weights.CopyDataFrom(*param.filter);
auto wptr = tmp_weights.data<float>();
auto bias_ptr = flag_bias ? param.bias->data<float>() : nullptr;
for (auto& cls : cluster_id) {
for (auto& th : thread_num) {
paddle::lite::kernels::arm::Conv2DTransposeCompute conv_t;
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(cls), th);
/// set param and context
// Shape x/output from the first input dim that yields a valid output so the
// tensors have dimensions before PrepareForRun() is called.
for (auto& dim_in : input_dims) {
param.x->Resize(dim_in);
DDim out_tmp_dims = compute_out_dim(dim_in, param);
if (out_tmp_dims[2] < 1 || out_tmp_dims[3] < 1) {
continue;
}
param.output->Resize(out_tmp_dims);
break;
}
conv_t.SetParam(param);
conv_t.SetContext(std::move(ctx1));
/// prepare for run
conv_t.PrepareForRun();
for (auto& dim_in : input_dims) {
CHECK_EQ(weight_dim[0], dim_in[1])
<< "input channel must equal to weights channel";
DDim dim_out = compute_out_dim(dim_in, param);
// Skip configurations whose output would be empty.
if (dim_out[2] < 1 || dim_out[3] < 1) {
continue;
}
param.x->Resize(dim_in);
param.output->Resize(dim_out);
// paddle::lite::fill_tensor_rand(*param.x, -1.f, 1.f);
paddle::lite::fill_tensor_const(*param.x, 1.f);
auto din = param.x->data<float>();
Tensor tout_basic;
if (FLAGS_check_result) {
// Compute the expected output with the naive reference.
tout_basic.set_precision(PRECISION(kFloat));
tout_basic.Resize(dim_out);
fill_tensor_const(tout_basic, 0.f);
auto dout_basic = tout_basic.mutable_data<float>();
// deconv_basic takes width before height for kernel/stride/dilation/pad,
// hence the [3]/[2] and [1]/[0] ordering below.
deconv_basic<float, float>(din,
dout_basic,
dim_in[0],
dim_out[1],
dim_out[2],
dim_out[3],
dim_in[1],
dim_in[2],
dim_in[3],
wptr,
bias_ptr,
group,
weight_dim[3],
weight_dim[2],
strides[1],
strides[0],
dilas[1],
dilas[0],
pads[1],
pads[0],
flag_bias,
flag_relu);
}
/// warm up
for (int i = 0; i < FLAGS_warmup; ++i) {
conv_t.Launch();
}
/// compute
lite::test::Timer t0;
for (int i = 0; i < FLAGS_repeats; ++i) {
t0.start();
conv_t.Launch();
t0.end();
}
// Operation count: 2 (multiply + add) per weight element per input pixel
// per batch item.
float gops =
2.f * tmp_weights.numel() * dim_in[0] * dim_in[2] * dim_in[3];
LOG(INFO) << "conv fp32: input shape: " << dim_in << ", output shape"
<< dim_out << ",running time, avg: " << t0.get_average_ms()
<< ", min time: " << t0.get_min_time()
<< ", total GOPS: " << 1e-9 * gops
<< " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms()
<< " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time();
if (FLAGS_check_result) {
double max_ratio = 0;
double max_diff = 0;
tensor_cmp_host(tout_basic, *param.output, max_ratio, max_diff);
LOG(INFO) << "compare result, max diff: " << max_diff
<< ", max ratio: " << max_ratio;
// Fail only when both the relative (> 1e-3) and the absolute (> 5e-4)
// error thresholds are exceeded; dump both tensors and their difference
// first.
if (std::abs(max_ratio) > 1e-3f) {
if (max_diff > 5e-4f) {
LOG(WARNING) << "basic result";
print_tensor(tout_basic);
LOG(WARNING) << "saber result";
print_tensor(*param.output);
Tensor tdiff;
tdiff.Resize(tout_basic.dims());
tdiff.set_precision(PRECISION(kFloat));
tensor_diff(tout_basic, *param.output, tdiff);
print_tensor(tdiff);
LOG(FATAL) << "test fp32 conv: input: " << dim_in
<< ", output: " << dim_out
<< ", weight dim: " << weight_dim
<< ", pad: " << pads[0] << ", " << pads[1]
<< ", stride: " << strides[0] << ", " << strides[1]
<< ", dila_: " << dilas[0] << ", " << dilas[1]
<< ", bias: " << (flag_bias ? "true" : "false")
<< ", relu: " << (flag_relu ? "true" : "false")
<< ", threads: " << th << ", cluster: " << cls
<< " failed!!\n";
}
}
}
// NOTE(review): this message is also logged when --check_result is false,
// i.e. when nothing was actually compared.
LOG(INFO) << "test fp32 conv: input: " << dim_in
<< ", output: " << dim_out << ", weight dim: " << weight_dim
<< ", pad: " << pads[0] << ", " << pads[1]
<< ", stride: " << strides[0] << ", " << strides[1]
<< ", dila_: " << dilas[0] << ", " << dilas[1]
<< ", bias: " << (flag_bias ? "true" : "false")
<< ", relu: " << (flag_relu ? "true" : "false")
<< ", threads: " << th << ", cluster: " << cls
<< " successed!!\n";
}
}
}
// Release the tensors allocated above.
// NOTE(review): param.bias is only allocated when flag_bias is true; this
// assumes ConvParam default-initializes bias to nullptr - TODO confirm.
delete param.x;
delete param.filter;
delete param.output;
delete param.bias;
}
#else
/// Non-ARM builds: the kernel under test is unavailable, so this is a no-op
/// stub with the same signature.
void test_conv_transpose_fp32(const std::vector<DDim>& input_dims,
const DDim& weight_dim,
int group,
const std::vector<int>& strides,
const std::vector<int>& pads,
const std::vector<int>& dilas,
bool flag_bias,
bool flag_relu,
const std::vector<int>& thread_num,
const std::vector<int>& cluster_id) {}
#endif // LITE_WITH_ARM
#if 1  /// random param conv
/// Sweeps channel counts, groups, kernel sizes, strides, pads, dilations and
/// the bias/relu flags, calling test_conv_transpose_fp32 for every
/// combination in which both channel counts are divisible by the group
/// count. Does nothing unless --basic_test is set.
TEST(TestConvRand, test_conv_transpose_rand) {
  if (!FLAGS_basic_test) {
    return;
  }
  for (auto& cin : {1, 3, 8, 16}) {
    // The candidate input shapes depend only on cin, so build them once per
    // cin instead of once per innermost iteration.
    std::vector<DDim> dims;
    for (auto& batch : {1, 2}) {
      for (auto& h : {1, 3, 19, 32, 28}) {
        dims.push_back(DDim({batch, cin, h, h}));
      }
    }
    for (auto& cout : {1, 5, 8, 16}) {
      for (auto& g : {1, 2}) {
        // Grouped (de)convolution needs both channel counts divisible by the
        // group count; reject invalid combinations before the inner sweeps.
        if (cin % g != 0 || cout % g != 0) {
          continue;
        }
        for (auto& kw : {1, 2, 3}) {
          for (auto& kh : {1, 2, 3}) {
            DDim weights_dim({cin, cout / g, kh, kw});
            for (auto& stride : {1, 2}) {
              for (auto& pad : {0, 1, 2}) {
                for (auto& dila : {1, 2}) {
                  for (auto& flag_bias : {false, true}) {
                    for (auto& flag_relu : {false, true}) {
                      test_conv_transpose_fp32(dims,
                                               weights_dim,
                                               g,
                                               {stride, stride},
                                               {pad, pad},
                                               {dila, dila},
                                               flag_bias,
                                               flag_relu,
                                               {1, 2, 4},
                                               {FLAGS_cluster});
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}
#endif  /// random param conv
#if 1  /// custom
/// Runs test_conv_transpose_fp32 once, for the single problem described by
/// the command-line flags (batch, channels, spatial size, kernel, stride,
/// pad, dilation, bias, relu, threads, cluster).
TEST(TestConvCustom, test_conv_transpose_fp32_custom_size) {
  // Both channel counts have to split evenly across the groups.
  CHECK_EQ(FLAGS_in_channel % FLAGS_group, 0)
      << "input channel must be divided by group";
  CHECK_EQ(FLAGS_out_channel % FLAGS_group, 0)
      << "num_output must be divided by group";

  std::vector<DDim> input_shapes;
  input_shapes.push_back(DDim(
      {FLAGS_batch, FLAGS_in_channel, FLAGS_in_height, FLAGS_in_width}));

  // Filter layout: {in_channel, out_channel / group, kernel_h, kernel_w}.
  const DDim filter_shape = DDim({FLAGS_in_channel,
                                  FLAGS_out_channel / FLAGS_group,
                                  FLAGS_kernel_h,
                                  FLAGS_kernel_w});

  const std::vector<int> stride_hw{FLAGS_stride_h, FLAGS_stride_w};
  const std::vector<int> pad_hw{FLAGS_pad_h, FLAGS_pad_w};
  const std::vector<int> dilation_hw{FLAGS_dila_h, FLAGS_dila_w};
  const std::vector<int> thread_counts{FLAGS_threads};
  const std::vector<int> clusters{FLAGS_cluster};

  test_conv_transpose_fp32(input_shapes,
                           filter_shape,
                           FLAGS_group,
                           stride_hw,
                           pad_hw,
                           dilation_hw,
                           FLAGS_flag_bias,
                           FLAGS_flag_relu,
                           thread_counts,
                           clusters);
}
#endif  // custom
......@@ -189,3 +189,176 @@ static void conv_basic(const Dtype1* din,
}
}
}
/// Adds a per-channel bias to a channel-major buffer and optionally applies
/// ReLU, in place.
///
/// @param tensor       buffer of `channel * channel_size` elements, modified
///                     in place.
/// @param bias         one value per channel; only read when `flag_bias` is
///                     true (may be null otherwise).
/// @param channel      number of channels.
/// @param channel_size number of elements per channel.
/// @param flag_bias    add `bias[c]` to every element of channel `c`.
/// @param flag_relu    clamp every non-positive element to zero after the
///                     bias has been added.
template <typename Dtype>
static void fill_bias_relu(Dtype* tensor,
                           const Dtype* bias,
                           int channel,
                           int channel_size,
                           bool flag_bias,
                           bool flag_relu) {
  // Use a zero of the element type. Mixing Dtype with a float literal in the
  // ReLU would round-trip integer data through float and lose precision for
  // magnitudes above 2^24.
  const Dtype zero = static_cast<Dtype>(0);
  Dtype* data = tensor;
  for (int j = 0; j < channel; ++j) {
    const Dtype bias_c = flag_bias ? bias[j] : zero;
    for (int i = 0; i < channel_size; ++i) {
      data[i] += bias_c;
      if (flag_relu && !(data[i] > zero)) {
        data[i] = zero;
      }
    }
    data += channel_size;
  }
}
/// Applies ReLU in place: every element of `tensor[0 .. size)` that is not
/// greater than zero is replaced by zero.
///
/// @param tensor buffer to clamp.
/// @param size   number of elements to process.
template <typename Dtype>
static void do_relu(Dtype* tensor, int size) {
  const Dtype zero = static_cast<Dtype>(0);
  int idx = 0;
  while (idx < size) {
    Dtype& value = tensor[idx];
    if (!(value > 0)) {
      value = zero;
    }
    ++idx;
  }
}
/// Range check done with a single unsigned comparison.
///
/// Both arguments are converted to unsigned, so a negative `a` wraps to a
/// large value; for a non-negative `b` the result is therefore equivalent to
/// `0 <= a && a < b`.
inline bool is_a_ge_zero_and_a_lt_b(int a, int b) {
  const unsigned lhs = static_cast<unsigned>(a);
  const unsigned rhs = static_cast<unsigned>(b);
  return lhs < rhs;
}
/// Scatters a column buffer back into an image, accumulating overlapping
/// contributions.
///
/// `data_im` (`channels * height * width` elements) is zeroed first. The
/// column buffer is then consumed sequentially in the order
/// channel -> kernel row -> kernel column -> output row -> output column,
/// and each value is added to the image pixel it maps to. Values that map
/// outside the image (into the padding) are skipped.
///
/// @param data_col    column buffer, read sequentially.
/// @param channels    number of image channels.
/// @param height      image height.
/// @param width       image width.
/// @param kernel_h    kernel height.
/// @param kernel_w    kernel width.
/// @param pad_h       vertical padding.
/// @param pad_w       horizontal padding.
/// @param stride_h    vertical stride.
/// @param stride_w    horizontal stride.
/// @param dilation_h  vertical dilation.
/// @param dilation_w  horizontal dilation.
/// @param data_im     destination image, overwritten.
template <typename Dtype>
static void col2im(const Dtype* data_col,
                   const int channels,
                   const int height,
                   const int width,
                   const int kernel_h,
                   const int kernel_w,
                   const int pad_h,
                   const int pad_w,
                   const int stride_h,
                   const int stride_w,
                   const int dilation_h,
                   const int dilation_w,
                   Dtype* data_im) {
  memset(data_im, 0, height * width * channels * sizeof(Dtype));

  // Extent of the column grid per (channel, kernel row, kernel col).
  const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  const int output_h = (height + 2 * pad_h - kernel_extent_h) / stride_h + 1;
  const int output_w = (width + 2 * pad_w - kernel_extent_w) / stride_w + 1;
  const int channel_size = height * width;

  const Dtype* col_ptr = data_col;
  Dtype* im_plane = data_im;
  for (int planes_left = channels; planes_left != 0;
       --planes_left, im_plane += channel_size) {
    for (int kr = 0; kr < kernel_h; ++kr) {
      for (int kc = 0; kc < kernel_w; ++kc) {
        const int first_col = kc * dilation_w - pad_w;
        int im_row = kr * dilation_h - pad_h;
        for (int rows_left = output_h; rows_left != 0;
             --rows_left, im_row += stride_h) {
          if (!is_a_ge_zero_and_a_lt_b(im_row, height)) {
            // The whole row lands in the padding: skip its column entries.
            col_ptr += output_w;
            continue;
          }
          Dtype* im_line = im_plane + im_row * width;
          int im_col = first_col;
          for (int cols_left = output_w; cols_left != 0;
               --cols_left, ++col_ptr, im_col += stride_w) {
            if (is_a_ge_zero_and_a_lt_b(im_col, width)) {
              im_line[im_col] += *col_ptr;
            }
          }
        }
      }
    }
  }
}
//! Naive reference implementation of a grouped transposed convolution
//! (deconvolution): per batch item and group, a GEMM of the transposed
//! weights with the input produces a column buffer, which col2im scatters
//! into the output; bias and ReLU are applied afterwards.
//!
//! for float, Dtype1 and Dtype2 are float
//! for int8, Dtype1 is char, Dtype2 is int
//!
//! \param din       input, num x chin x hin x win
//! \param dout      output, num x chout x hout x wout (overwritten)
//! \param weights   filter data, indexed per group in slices of
//!                  chin * chout * kernel_w * kernel_h / (group * group)
//! \param bias      one value per output channel; only read if flag_bias
//! \param group     number of groups; chin and chout are divided by it
//! \param flag_bias add bias to the output
//! \param flag_relu apply ReLU to the final output
//!
//! Note that the kernel/stride/dilation/pad arguments take the width value
//! before the height value.
template <typename Dtype1, typename Dtype2>
void deconv_basic(const Dtype1* din,
                  Dtype2* dout,
                  int num,
                  int chout,
                  int hout,
                  int wout,
                  int chin,
                  int hin,
                  int win,
                  const Dtype1* weights,
                  const Dtype2* bias,
                  int group,
                  int kernel_w,
                  int kernel_h,
                  int stride_w,
                  int stride_h,
                  int dila_w,
                  int dila_h,
                  int pad_w,
                  int pad_h,
                  bool flag_bias,
                  bool flag_relu) {
  // GEMM sizes per group: C(m x n) = W^T(m x k) * X(k x n).
  const int m = chout * kernel_w * kernel_h / group;
  const int n = hin * win;
  const int k = chin / group;
  const int group_size_in = win * hin * chin / group;
  const int group_size_coldata = m * n;
  const int group_size_weights =
      chin * chout * kernel_w * kernel_h / (group * group);

  // Fast path: with a 1x1 kernel, unit stride, unit dilation and NO padding
  // the GEMM result already has the output layout (hout == hin,
  // wout == win), so it can be written straight into dout. With pad == 1
  // the output is smaller than the GEMM result, so that case must go
  // through col2im.
  const bool flag_1x1s1p0 = (kernel_w == 1) && (kernel_h == 1) &&
                            (stride_h == 1) && (stride_w == 1) &&
                            (pad_w == 0) && (pad_h == 0) && (dila_w == 1) &&
                            (dila_h == 1);

  // Column buffer covering all groups, sized in units of the output type.
  Dtype2* workspace_ptr =
      static_cast<Dtype2*>(malloc(sizeof(Dtype2) * m * n * group));

  for (int i = 0; i < num; ++i) {
    const Dtype1* din_batch = din + i * chin * hin * win;
    Dtype2* dout_batch = dout + i * chout * hout * wout;
    Dtype2* col_data = flag_1x1s1p0 ? dout_batch : workspace_ptr;

    // Clear the slice of every group, so that no group's GEMM is handed
    // uninitialized memory as its destination.
    memset(col_data, 0, sizeof(Dtype2) * group_size_coldata * group);

    for (int g = 0; g < group; ++g) {
      const Dtype1* din_group = din_batch + g * group_size_in;
      const Dtype1* weights_group = weights + g * group_size_weights;
      Dtype2* coldata_group = col_data + g * group_size_coldata;
      // ReLU is deliberately NOT fused here: col2im sums several column
      // entries into one output pixel, so clamping has to happen after that
      // accumulation.
      basic_gemm<Dtype1, Dtype2>(true,
                                 false,
                                 m,
                                 n,
                                 k,
                                 1,
                                 weights_group,
                                 m,
                                 din_group,
                                 n,
                                 0,
                                 coldata_group,
                                 n,
                                 nullptr,
                                 false,
                                 false);
    }

    if (!flag_1x1s1p0) {
      col2im(col_data,
             chout,
             hout,
             wout,
             kernel_h,
             kernel_w,
             pad_h,
             pad_w,
             stride_h,
             stride_w,
             dila_h,
             dila_w,
             dout_batch);
    }

    //! add bias and/or relu on the fully accumulated output
    if (flag_bias) {
      fill_bias_relu(
          dout_batch, bias, chout, wout * hout, flag_bias, flag_relu);
    } else if (flag_relu) {
      do_relu(dout_batch, chout * hout * wout);
    }
  }
  free(workspace_ptr);
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册