Unverified commit a797ed0a, authored by HappyAngel, committed by GitHub

[OpenCL] fix concat compute error (#3000)

* fix conv 2-pad to 4-pad

* fix compute conv shape

* fix pad, test=develop

* change conv_depthwise_3x3s1_fp.cc name to conv3x3s1p01_depthwise_fp32.cc to distinguish it from conv3x3s1_depthwise_fp32.cc

* delete printf note in conv3x3s1, test=develop

* delete printf note, test=develop

* delete gem_sdot.h, test=develop

it is copied from __gemm_sdot_meta_.h

* update compute padding, test=develop

* fix padding size, must be 2 or 4. test=develop

* fix format in operators/conv_op.cc, test=develop

* change #if 0 to #if 1, test=develop

* put 2-pad to 4-pad in AttachImpl, test=develop

* fix clang-format error in tests/math/conv_compute_test, test=develop

* fix x86 test result error, test=develop

* add asymmetric padding test case in lite/tests/math/conv_compute.cc, test=develop

* change paddings type to support dynamically modify, test=develop

* fix x86 build error in conv_compute_test, test=develop

* fix opencl build error, test=develop

* fix opencl build error, test=develop

* fix opencl/conv_compute build error, test=develop

* fix opencl/conv_compute build error, test=develop

* fix format in kernels/opencl/conv_compute_test, test=develop

* fix build error, test=develop

fix build error in kernels/x86/conv_compute.h

* fix compute shape error in operators/conv_op.h, test=develop

* add conv_relu6 and conv_leaky_relu in conv_3x3s1_direct

* add conv_relu6 in c1, c2, c4, test=develop

* fix conflict in conv_block_utils.h, test=develop

* add relu6 and leaky_relu in conv_3x3s1_dw

* add conv_3x3s1px_dw relu6 and leaky_relu fusion, test=develop

* fix conflict in tests/math/conv_compute_arm, test=develop

* fix build error in winograd arm, test=develop

* change act_param to a pointer in conv_block_utils.h, test=develop

* fix winograd compute error with non-equal 4-padding, test=develop

* add conv relu6 and leaky_relu in conv_dw_3x3s2, test=develop

* fix format, test=develop

* fix format in conv_block_utils, test=develop

* move updatePadding from conv_op.cc to conv_op.h, test=develop

* fix format conv_op.h, test=develop

* fix build error in conv_op.h, test=develop

* remove flag_relu parameter in conv_3x3_depthwise, test=develop

* add conv relu6/leaky_relu in sgemm, test=develop

* delete some notes, test=develop

* fix format, test=develop

* fix mobile_android build error, test=develop

* change matmul, conv_transpose, mul and fc to use sgemm, test=develop

* fix build error in sgemm_test.cc, test=develop

* add act_param in gru_utils. test=develop

* fix compute error in conv_dw, test=develop

* delete conv2d_transpose test, test=develop

delete conv2d_transpose test; this test can be found in tests/math/

* fix build error, test=develop

* add conv5x5s2_dw, pooling1x1s2_max, and conv_act_fusion including relu6 and leaky_relu, test=develop

* fix fc_relu fusion error, test=develop

* fix compute error, test=develop

* delete conv5x5s2p2_dw, test=develop

* delete extra note, test=develop

* delete conv_relu6 fusion in conv_activation_fuse_pass, test=develop

* fix build error, test=develop

* fix test error in kernel, test=develop

kernel/arm/test_pool_compute

* fix non-equal padding pooling compute error, test=develop

* free space, test=develop

* fix conv5x5s2_dw compute error, test=develop

* fix format, test=develop

* fix hin compute. test=develop

* fix conv_dw leaky_relu compute error

* fix conv3x3s1_dw leakyRelu compute error

* delete note, test=develop

* fix format, test=develop

* fix format in conv_compute_test, test=develop

* add conv+relu6/leakyRelu fusion, test=develop

* fix format bug, test=develop

* fix format, test=develop

* fix format. test=develop

* fix format bug, test=develop

* fix conv+relu6/leakyRelu fusion in FP32, test=develop

* note m=397 in sgemv_int8 ut, test=develop

* note m=397 in sgemv_int8 ut, test=develop

* fix ios build error. test=develop

* add ut

* fix calib compute in int32_to_int8, test=develop

* add classify ut

* add classify demo

* add classify demo

* fix build error

* fix model_test_classify format

* fix format, test=develop

* fix v7 build error

* fix run ut error

* fix format and run ut error, test=develop

* fix build.sh, test=develop

* fix format, test=develop

* fix format. test=develop

* fix format, test=develop

* fix no-equal padding, test=develop

* fix pooling

* add no-equal padding in pooling, test=develop

* fix un-equal padding error. test=develop

* fix conv+relu6 int8 fusion error, test=develop

* fix int8 conv+relu6 error, test=develop

* fix format, test=develop

* delete pool_compute_test, test=develop

* fix image compute error, test=develop
Parent 4757e69e
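Note for reviewers: the kernels in the diff below index tensors through Paddle-Lite's default NCHW-to-image2D packing, in which groups of four channels are folded into the image width and batch is folded into the image height. A minimal host-side sketch of that coordinate mapping, assuming the default image layout (the helper name is illustrative, not part of the patch):

#include <utility>

// An NCHW tensor packed as an RGBA image2D is ceil(C/4)*W texels wide and
// N*H texels tall; each texel holds four consecutive channels of one
// (n, h, w) position. Illustrative helper, not code from this commit.
std::pair<int, int> ImageCoordOf(int n, int c, int h, int w, int H, int W) {
  int x = (c / 4) * W + w;  // channel block folded into image width
  int y = n * H + h;        // batch folded into image height
  // which lane of the texel holds channel c: c % 4 -> .x/.y/.z/.w
  return {x, y};
}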
@@ -17,48 +17,151 @@ limitations under the License. */
 __kernel void concat2(__read_only image2d_t input0,
                       __read_only image2d_t input1,
                       __write_only image2d_t output,
-                      int axis_size, int flag, int width) {
-  const int x = get_global_id(0);  // image_width cxw/4
-  const int y = get_global_id(1);  // image_height nxh
+                      int flag, int C_0, int out_C, int out_W, int width) {
+  const int out_w = get_global_id(0);   // image_width cxw/4
+  const int out_c = get_global_id(1);   // image_width cxw/4
+  const int out_nh = get_global_id(2);  // image_height nxh
   const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                             CLK_ADDRESS_CLAMP |
                             CLK_FILTER_NEAREST;
-  int xx = x / width;
-  if (flag == 0){
-    xx = y / width;
-  }
-  if (xx < axis_size){
-    CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, (int2)(x, y));
-    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
-  }else{
-    int new_val = xx - axis_size;
-    new_val *= width;
-    CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, (int2)(new_val, y));
-    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
-  }
-  // WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
+  if (flag == 1){ // by channel
+    int c_in = out_c;
+    int2 output_pos;
+    output_pos.x = out_c * out_W + out_w;
+    output_pos.y = out_nh;
+    CL_DTYPE4 output_data;
+    for (int i = 0; i < 4; i++) {
+      int c = out_c * 4 + i;
+      if (c >= out_C) {
+        break;
+      }
+      int c_in;
+      CL_DTYPE4 input_data;
+      if (c < C_0) {
+        c_in = c;
+        int2 input_pos;
+        input_pos.x = (c_in / 4) * out_W + out_w;
+        input_pos.y = out_nh;
+        input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, input_pos);
+      } else {
+        c_in = c - C_0;
+        int2 input_pos;
+        input_pos.x = (c_in / 4) * out_W + out_w;
+        input_pos.y = out_nh;
+        input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input1, sampler, input_pos);
+      }
+      int value_offset = c_in % 4;
+      float value;
+      if (value_offset == 0) {
+        value = input_data.x;
+      } else if (value_offset == 1) {
+        value = input_data.y;
+      } else if (value_offset == 2) {
+        value = input_data.z;
+      } else if (value_offset == 3) {
+        value = input_data.w;
+      }
+      if (i == 0) {
+        output_data.x = value;
+      } else if (i == 1) {
+        output_data.y = value;
+      } else if (i == 2) {
+        output_data.z = value;
+      } else if (i == 3) {
+        output_data.w = value;
+      }
+    }
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, output_data);
+  }else if (flag == 2){ // by height, width == n
+    int2 input_pos;
+    input_pos.x = out_c * out_W + out_w;
+    int h = out_nh / width;
+    CL_DTYPE4 input;
+    if (h < C_0){
+      input_pos.y = out_nh;
+      input = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, input_pos);
+    }else{
+      input_pos.y = (h - C_0) * width;
+      input = READ_IMG_TYPE(CL_DTYPE_CHAR, input1, sampler, input_pos);
+    }
+    int2 output_pos;
+    output_pos.x = out_c * out_W + out_w;
+    output_pos.y = out_nh;
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input);
+  }else if (flag == 3){ // by width, width == C
+    int2 input_pos;
+    input_pos.y = out_nh;
+    CL_DTYPE4 input;
+    if (out_w < C_0){
+      input_pos.x = out_c * out_W + out_w;
+      input = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, input_pos);
+    }else{
+      input_pos.x = out_c * out_W + (out_w - C_0);
+      input = READ_IMG_TYPE(CL_DTYPE_CHAR, input1, sampler, input_pos);
+    }
+    int2 output_pos;
+    output_pos.x = out_c * out_W + out_w;
+    output_pos.y = out_nh;
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input);
+  }
 }
 
-__kernel void concat_mul(__read_only image2d_t input0,
+__kernel void concat_mul(__read_only image2d_t input,
                          __write_only image2d_t output,
-                         int axis_size, int flag, int width, int start) {
-  const int x = get_global_id(0);  // image_width cxw/4
-  const int y = get_global_id(1);  // image_height nxh
+                         int flag, int C_0, int out_C, int out_W, int in_W, int width) {
+  const int in_w = get_global_id(0);   // image_width cxw/4
+  const int in_c = get_global_id(1);   // image_width cxw/4
+  const int in_nh = get_global_id(2);  // image_height nxh
   const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                             CLK_ADDRESS_CLAMP |
                             CLK_FILTER_NEAREST;
-  int xx = x / width;
-  if (flag == 0){
-    xx = y / width;
-  }
-  if (xx < axis_size && xx >= start){
-    xx -= start;
-    xx *= width;
-    CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, (int2)(xx, y));
-    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
+  int2 input_pos;
+  int2 output_pos;
+  input_pos.x = in_c * in_W + in_w;
+  input_pos.y = in_nh;
+  CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
+  if (flag == 1){ // by channel
+    CL_DTYPE4 output_data;
+    for (int i = 0; i < 4; i++) {
+      int c_out = C_0 + in_c * 4 + i;
+      if (c_out >= out_C) {
+        break;
+      }
+      int2 output_pos;
+      output_pos.x = (c_out / 4) * in_W + in_w;
+      output_pos.y = in_nh;
+      float val;
+      if (i == 0) {
+        val = input_data.x;
+      } else if (i == 1) {
+        val = input_data.y;
+      } else if (i == 2) {
+        val = input_data.z;
+      } else if (i == 3) {
+        val = input_data.w;
+      }
+      if (c_out % 4 == 0){
+        output_data.x = val;
+      }else if (c_out % 4 == 1){
+        output_data.y = val;
+      }else if (c_out % 4 == 2){
+        output_data.z = val;
+      }else if (c_out % 4 == 3){
+        output_data.w = val;
+      }
+      WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, output_data);
+    }
+  }else if (flag == 2){ // by height, width == n
+    int2 output_pos;
+    output_pos.x = in_c * in_W + in_w;
+    output_pos.y = in_nh + C_0 * width;
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data);
+  }else if (flag == 3){ // by width, width == C
+    int2 output_pos;
+    output_pos.y = in_nh;
+    output_pos.x = in_c * out_W + (in_w + C_0);
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data);
   }
 }
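Note: as a CPU cross-check for the flag == 1 path above, channel-axis concat of two inputs can be written directly over NCHW buffers; the kernel's per-lane gather should reproduce this reference (a test sketch, not code from the patch):

#include <cstddef>
#include <vector>

// Reference concat over axis 1 (channels) for two NCHW inputs. Output
// channel c comes from in0 when c < C0, else from in1 at c - C0 -- the
// same split concat2 applies per texel lane. Illustrative only.
std::vector<float> ConcatChannelRef(const std::vector<float>& in0, int C0,
                                    const std::vector<float>& in1, int C1,
                                    int N, int H, int W) {
  const int out_C = C0 + C1;
  std::vector<float> out(static_cast<std::size_t>(N) * out_C * H * W);
  for (int n = 0; n < N; ++n) {
    for (int c = 0; c < out_C; ++c) {
      const std::vector<float>& src = (c < C0) ? in0 : in1;
      const int c_in = (c < C0) ? c : c - C0;
      const int src_C = (c < C0) ? C0 : C1;
      for (int h = 0; h < H; ++h) {
        for (int w = 0; w < W; ++w) {
          out[((n * out_C + c) * H + h) * W + w] =
              src[((n * src_C + c_in) * H + h) * W + w];
        }
      }
    }
  }
  return out;
}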
@@ -53,11 +53,26 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
     auto in_dims = inputs[0]->dims();
     axis_size_ = out_dims[axis];
     axis_ = axis;
-    for (int i = 0; i < axis; i++) {
-      pre_size_ *= in_dims[i];
-    }
-    for (int i = axis + 1; i < in_dims.size(); i++) {
-      post_size_ *= in_dims[i];
+    switch (axis_) {
+      case 0:
+        width_ = out_dims[2];  // h
+        flag_ = 0;
+        break;
+      case 1:  // channel
+        width_ = out_dims[3];  // w
+        flag_ = 1;
+        break;
+      case 2:  // height
+        width_ = out_dims[0];  // n
+        flag_ = 2;
+        break;
+      case 3:
+      case -1:  // width
+        width_ = out_dims[1];  // c
+        flag_ = 3;
+        break;
+      default:
+        printf("this axis: %d does not support \n", axis_);
     }
     for (int i = 1; i < inputs.size(); i++) {
       auto dims = inputs[i]->dims();
@@ -81,7 +96,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
     auto& param = *param_.get_mutable<param_t>();
     const auto& x_dims = param.output->dims();
     auto image_shape = InitImageDimInfoWith(x_dims);
-    auto* out_buf = param.output->mutable_data<half_t, cl::Image2D>(
+    auto* out_buf = param.output->mutable_data<float, cl::Image2D>(
         image_shape["width"], image_shape["height"]);
     const auto& y_dims = param.output->dims();  // useless: check dim only
@@ -93,8 +108,9 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
     auto inputs = param.x;
     int arg_idx = 0;
     int width = inputs[0]->dims()[-1];
-    auto global_work_size =
-        cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
+    auto global_work_size = cl::NDRange{
+        static_cast<cl::size_type>(x_dims[-1]),
+        static_cast<cl::size_type>(image_shape["width"] / x_dims[-1]),
         static_cast<cl::size_type>(image_shape["height"])};
     VLOG(4) << TargetToStr(param.output->target());
     VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
@@ -103,42 +119,29 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
             << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
     VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
             << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
+    VLOG(4) << "width_: " << width_ << ", flag_: " << flag_;
     auto kernel = context.cl_context()->GetKernel(kernel_key.str());
-    int flag = 1;  // cxw
-    switch (axis_) {
-      case 0:
-        width = x_dims[2];  // n
-        flag = 0;
-        break;
-      case 1:
-        width = x_dims[3];  // c
-        break;
-      case 2:
-        width = x_dims[0];  // h
-        flag = 0;
-        break;
-      case 3:
-      case -1:
-        width = x_dims[1];  // w
-        break;
-      default:
-        printf("this axis: %d does not support \n", axis_);
-    }
+    int out_w = x_dims[-1];
+    int out_c = x_dims[1];
     if (inputs.size() == 2) {
-      auto* x_buf0 = inputs[0]->data<half_t, cl::Image2D>();
-      auto* x_buf1 = inputs[1]->data<half_t, cl::Image2D>();
+      auto* x_buf0 = inputs[0]->data<float, cl::Image2D>();
+      auto* x_buf1 = inputs[1]->data<float, cl::Image2D>();
       cl_int status = kernel.setArg(arg_idx, *x_buf0);
       CL_CHECK_FATAL(status);
       status = kernel.setArg(++arg_idx, *x_buf1);
       CL_CHECK_FATAL(status);
       status = kernel.setArg(++arg_idx, *out_buf);
       CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, flag_);
+      CL_CHECK_FATAL(status);
       status =
           kernel.setArg(++arg_idx, static_cast<int>(inputs[0]->dims()[axis_]));
       CL_CHECK_FATAL(status);
-      status = kernel.setArg(++arg_idx, flag);
+      status = kernel.setArg(++arg_idx, out_c);
      CL_CHECK_FATAL(status);
-      status = kernel.setArg(++arg_idx, width);
+      status = kernel.setArg(++arg_idx, out_w);
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, width_);
       CL_CHECK_FATAL(status);
       status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
           kernel,
@@ -153,18 +156,31 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
       auto start = 0;
       for (int i = 0; i < inputs.size(); i++) {
         arg_idx = 0;
-        auto* x_buf = inputs[i]->data<half_t, cl::Image2D>();
+        auto in_dims = inputs[i]->dims();
+        image_shape = InitImageDimInfoWith(in_dims);
+        auto* x_buf = inputs[i]->data<float, cl::Image2D>();
+        auto in_w = in_dims[-1];
+        VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
+                << image_shape["height"];
+        global_work_size = cl::NDRange{
+            static_cast<cl::size_type>(in_dims[-1]),
+            static_cast<cl::size_type>(image_shape["width"] / in_dims[-1]),
+            static_cast<cl::size_type>(image_shape["height"])};
         cl_int status = kernel.setArg(arg_idx, *x_buf);
         CL_CHECK_FATAL(status);
         status = kernel.setArg(++arg_idx, *out_buf);
         CL_CHECK_FATAL(status);
-        status = kernel.setArg(++arg_idx, axis_size_);
+        status = kernel.setArg(++arg_idx, flag_);
         CL_CHECK_FATAL(status);
         status = kernel.setArg(++arg_idx, start);
         CL_CHECK_FATAL(status);
-        status = kernel.setArg(++arg_idx, flag);
+        status = kernel.setArg(++arg_idx, out_c);
+        CL_CHECK_FATAL(status);
+        status = kernel.setArg(++arg_idx, out_w);
+        CL_CHECK_FATAL(status);
+        status = kernel.setArg(++arg_idx, in_w);
        CL_CHECK_FATAL(status);
-        status = kernel.setArg(++arg_idx, width);
+        status = kernel.setArg(++arg_idx, width_);
         CL_CHECK_FATAL(status);
         CL_CHECK_FATAL(status);
         status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
@@ -184,9 +200,9 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
   std::string doc() { return "Concat using cl::Image, kFP16"; }
   int axis_size_ = 1;
-  int post_size_ = 1;
-  int pre_size_ = 1;
   int axis_ = 1;
+  int flag_ = 1;
+  int width_ = 1;
   param_t* concat_param_{nullptr};
   std::string kernel_func_name_{};
   std::string build_options_{"-DCL_DTYPE_half"};
...
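Note: the new 3-D launch shape used above is {W, image_width / W, image_height}, i.e. one work-item per RGBA texel, whose middle dimension equals ceil(C/4) under the default layout. A small worked example, assuming InitImageDimInfoWith returns ((C + 3) / 4) * W by N * H:

#include <cstdio>

int main() {
  // Hypothetical output dims, NCHW = {1, 6, 4, 8}.
  int N = 1, C = 6, H = 4, W = 8;
  int image_w = ((C + 3) / 4) * W;  // 16, per the assumed image layout
  int image_h = N * H;              // 4
  // Matches the compute above: {x_dims[-1], image_w / x_dims[-1], image_h}.
  printf("NDRange = {%d, %d, %d}\n", W, image_w / W, image_h);  // {8, 2, 4}
  return 0;
}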