Unverified · Commit a797ed0a authored by HappyAngel, committed by GitHub

[OpenCL] fix concat compute error (#3000)

* fix conv 2-pad to 4-pad

* fix conv shape computation

* fix pad, test=develop

* rename conv_depthwise_3x3s1_fp.cc to conv3x3s1p01_depthwise_fp32.cc to distinguish it from conv3x3s1_depthwise_fp32.cc

* delete printf note in conv3x3s1, test=develop

* delete printf note, test=develop

* delete gem_sdot.h, test=develop

it is copied from __gemm_sdot_meta_.h

* update compute padding, test=develop

* fix padding size; it must be 2 or 4. test=develop

* fix format in operators/conv_op.cc, test=develop

* change #if 0 to #if 1, test=develop

* move the 2-pad to 4-pad conversion into AttachImpl, test=develop

* fix clang-format error in tests/math/conv_compute_test, test=develop

* fix x86 test result error, test=develop

* add asymmetric padding test case in lite/tests/math/conv_compute.cc, test=develop

* change paddings type to support dynamically modify, test=develop

* fix x86 build error in conv_compute_test, test=develop

* fix opencl build error, test=develop

* fix opencl build error, test=develop

* fix opencl/conv_compute build error, test=develop

* fix opencl/conv_compute build error, test=develop

* fix format in kernels/opencl/conv_compute_test, test=develop

* fix build error, test=develop

fix build error in kernels/x86/conv_compute.h

* fix compute shape error in operators/conv_op.h, test=develop

* add conv_relu6 and conv_leaky_relu in conv_3x3s1_direct

* add conv_relu6 in c1, c2, c4,test=develop

* fix conflict in conv_block_utils.h, test=develop

* add relu6 and leaky_relu in conv_3x3s1_dw

* add conv_3x3s1px_dw relu6 and leaky_relu fusion, test=develop

* fix conflict in tests/math/conv_compute_arm, test=develop

* fix build error in winograd arm, test=develop

* change act_param to a pointer in conv_block_utils.h, test=develop

* fix winograd compute error with unequal 4-padding, test=develop

* add conv relu6 and leaky_relu in conv_dw_3x3s2, test=develop

* fix format, test=develop

* fix format in conv_block_utils, test=develop

* move updatePadding from conv_op.cc to conv_op.h, test=develop

* fix format conv_op.h, test=develop

* fix build error in conv_op.h, test=develop

* remove flag_relu parameter in conv_3x3_depthwise, test=develop

* add conv relu6/leaky_relu in sgemm, test=develop

* delete some notes, test=develop

* fix format, test=develop

* fix mobile_android build error, test=develop

* change matmul, conv_transpose, mul and fc to use sgemm, test=develop

* fix build error in sgemm_test.cc, test=develop

* add act_param in gru_utils. test=develop

* fix compute error in conv_dw, test=develop

* delete conv2d_transpose test, test=develop

delete conv2d_transpose test; it can be found in tests/math/

* fix build error, test=develop

* add conv5x5s2_dw, pooling1x1s2_max, and conv_act_fusion including relu6 and leaky_relu, test=develop

* fix fc_relu fusion error, test=develop

* fix compute error, test=develop

* delete conv5x5s2p2_dw, test=develop

* delete extra note, test=develop

* delete conv_relu6 fusion in conv_activation_fuse_pass, test=develop

* fix build error, test=develop

* fix test error in kernel, test=develop

kernel/arm/test_pool_compute

* fix unequal padding pooling compute error, test=develop

* free space, test=develop

* fix conv5x5s2_dw compute error, test=develop

* fix format, test=develop

* fix hin compute. test=develop

* fix conv_dw leaky_relu compute error

* fix conv3x3s1_dw leaky_relu compute error

* delete note, test=develop

* fix format, test=develop

* fix format in conv_compute_test, test=develop

* add conv+relu6/leakyRelu fusion, test=develop

* fix format bug, test=develop

* fix format, test=develop

* fix format. test=develop

* fix format bug, test=develop

* fix conv+relu6/leakyRelu fusion in FP32, test=develop

* note m=397 in sgemv_int8 ut, test=develop

* note m=397 in sgemv_int8 ut, test=develop

* fix ios build error. test=develop

* add ut

* fix calib compute in int32_to_int8, test=develop

* add classify ut

* add classify demo

* add classify demo

* fix build error

* fix model_test_classify format

* fix format, test=develop

* fix v7 build error

* fix run ut error

* fix format and run ut error, test=develop

* fix build.sh, test=develop

* fix format, test=develop

* fix format. test=develop

* fix format, test=develop

* fix unequal padding, test=develop

* fix pooling

* add unequal padding support in pooling, test=develop

* fix unequal padding error, test=develop

* fix conv+relu6 int8 fusion error, test=develop

* fix int8 conv+relu6 error, test=develop

* fix format, test=develop

* delete pool_compute_test, test=develop

* fix image compute error, test=develop
Parent 4757e69e
@@ -17,48 +17,151 @@ limitations under the License. */
 __kernel void concat2(__read_only image2d_t input0,
                       __read_only image2d_t input1,
                       __write_only image2d_t output,
-                      int axis_size, int flag, int width) {
-  const int x = get_global_id(0);  // image_width cxw/4
-  const int y = get_global_id(1);  // image_height nxh
+                      int flag, int C_0, int out_C, int out_W, int width) {
+  const int out_w = get_global_id(0);   // output image width index (w)
+  const int out_c = get_global_id(1);   // output channel block index (c/4)
+  const int out_nh = get_global_id(2);  // output image height index (n*h)
   const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                             CLK_ADDRESS_CLAMP |
                             CLK_FILTER_NEAREST;
-  int xx = x / width;
-  if (flag == 0) {
-    xx = y / width;
+  if (flag == 1) {  // concat by channel
+    int c_in = out_c;
+    int2 output_pos;
+    output_pos.x = out_c * out_W + out_w;
+    output_pos.y = out_nh;
+    CL_DTYPE4 output_data;
+    for (int i = 0; i < 4; i++) {
+      int c = out_c * 4 + i;
+      if (c >= out_C) {
+        break;
+      }
+      int c_in;
+      CL_DTYPE4 input_data;
+      if (c < C_0) {
+        c_in = c;
+        int2 input_pos;
+        input_pos.x = (c_in / 4) * out_W + out_w;
+        input_pos.y = out_nh;
+        input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, input_pos);
+      } else {
+        c_in = c - C_0;
+        int2 input_pos;
+        input_pos.x = (c_in / 4) * out_W + out_w;
+        input_pos.y = out_nh;
+        input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input1, sampler, input_pos);
+      }
+      int value_offset = c_in % 4;
+      float value;
+      if (value_offset == 0) {
+        value = input_data.x;
+      } else if (value_offset == 1) {
+        value = input_data.y;
+      } else if (value_offset == 2) {
+        value = input_data.z;
+      } else if (value_offset == 3) {
+        value = input_data.w;
+      }
+      if (i == 0) {
+        output_data.x = value;
+      } else if (i == 1) {
+        output_data.y = value;
+      } else if (i == 2) {
+        output_data.z = value;
+      } else if (i == 3) {
+        output_data.w = value;
+      }
+    }
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, output_data);
+  } else if (flag == 2) {  // concat by height, width == n
+    int2 input_pos;
+    input_pos.x = out_c * out_W + out_w;
+    int h = out_nh / width;
+    CL_DTYPE4 input;
+    if (h < C_0) {
+      input_pos.y = out_nh;
+      input = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, input_pos);
+    } else {
+      input_pos.y = (h - C_0) * width;
+      input = READ_IMG_TYPE(CL_DTYPE_CHAR, input1, sampler, input_pos);
+    }
+    int2 output_pos;
+    output_pos.x = out_c * out_W + out_w;
+    output_pos.y = out_nh;
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input);
+  } else if (flag == 3) {  // concat by width, width == C
+    int2 input_pos;
+    input_pos.y = out_nh;
+    CL_DTYPE4 input;
+    if (out_w < C_0) {
+      input_pos.x = out_c * out_W + out_w;
+      input = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, input_pos);
+    } else {
+      input_pos.x = out_c * out_W + (out_w - C_0);
+      input = READ_IMG_TYPE(CL_DTYPE_CHAR, input1, sampler, input_pos);
+    }
+    int2 output_pos;
+    output_pos.x = out_c * out_W + out_w;
+    output_pos.y = out_nh;
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input);
+  }
-  if (xx < axis_size) {
-    CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, (int2)(x, y));
-    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
-  } else {
-    int new_val = xx - axis_size;
-    new_val *= width;
-    CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, (int2)(new_val, y));
-    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
-  }
-  // WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
 }
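The indexing above relies on the NCHW-to-image2d_t packing these kernels assume: four consecutive channels share one RGBA pixel, channel blocks are laid out along the image width, and the batch dimension is folded into the image height. Below is a minimal C++ sketch of that coordinate mapping, assuming exactly this layout; the helper name NCHWToImage is invented for illustration and is not part of the patch.

#include <cstdio>

struct ImageCoord {
  int x;     // column in the image2d_t
  int y;     // row in the image2d_t
  int lane;  // which component (.x/.y/.z/.w) inside the pixel
};

// Maps an NCHW element (n, c, h, w) of an N x C x H x W tensor to the
// 2-D image coordinates used by concat2/concat_mul above.
ImageCoord NCHWToImage(int n, int c, int h, int w, int H, int W) {
  ImageCoord p;
  p.x = (c / 4) * W + w;  // channel blocks laid out along image width
  p.y = n * H + h;        // batch folded into image height
  p.lane = c % 4;         // channel position within the CL_DTYPE4 pixel
  return p;
}

int main() {
  // Example: channel 6 of a 1x8x4x4 tensor lands in channel block 1, lane 2.
  ImageCoord p = NCHWToImage(0, 6, 2, 3, /*H=*/4, /*W=*/4);
  printf("x=%d y=%d lane=%d\n", p.x, p.y, p.lane);  // prints x=7 y=2 lane=2
  return 0;
}

Under this packing, concat along H is a row-offset copy (flag == 2) and concat along W is a column-offset copy (flag == 3), while concat along C needs the per-lane loop of the flag == 1 branch whenever C_0 is not a multiple of 4.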
-__kernel void concat_mul(__read_only image2d_t input0,
+__kernel void concat_mul(__read_only image2d_t input,
                          __write_only image2d_t output,
-                         int axis_size, int flag, int width, int start) {
-  const int x = get_global_id(0);  // image_width cxw/4
-  const int y = get_global_id(1);  // image_height nxh
+                         int flag, int C_0, int out_C, int out_W, int in_W, int width) {
+  const int in_w = get_global_id(0);   // input image width index (w)
+  const int in_c = get_global_id(1);   // input channel block index (c/4)
+  const int in_nh = get_global_id(2);  // input image height index (n*h)
   const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                             CLK_ADDRESS_CLAMP |
                             CLK_FILTER_NEAREST;
-  int xx = x / width;
-  if (flag == 0) {
-    xx = y / width;
-  }
-  if (xx < axis_size && xx >= start) {
-    xx -= start;
-    xx *= width;
-    CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, (int2)(xx, y));
-    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
+  int2 input_pos;
+  int2 output_pos;
+  input_pos.x = in_c * in_W + in_w;
+  input_pos.y = in_nh;
+  CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
+  if (flag == 1) {  // concat by channel
+    CL_DTYPE4 output_data;
+    for (int i = 0; i < 4; i++) {
+      int c_out = C_0 + in_c * 4 + i;
+      if (c_out >= out_C) {
+        break;
+      }
+      int2 output_pos;
+      output_pos.x = (c_out / 4) * in_W + in_w;
+      output_pos.y = in_nh;
+      float val;
+      if (i == 0) {
+        val = input_data.x;
+      } else if (i == 1) {
+        val = input_data.y;
+      } else if (i == 2) {
+        val = input_data.z;
+      } else if (i == 3) {
+        val = input_data.w;
+      }
+      if (c_out % 4 == 0) {
+        output_data.x = val;
+      } else if (c_out % 4 == 1) {
+        output_data.y = val;
+      } else if (c_out % 4 == 2) {
+        output_data.z = val;
+      } else if (c_out % 4 == 3) {
+        output_data.w = val;
+      }
+      WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, output_data);
+    }
+  } else if (flag == 2) {  // concat by height, width == n
+    int2 output_pos;
+    output_pos.x = in_c * in_W + in_w;
+    output_pos.y = in_nh + C_0 * width;
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data);
+  } else if (flag == 3) {  // concat by width, width == C
+    int2 output_pos;
+    output_pos.y = in_nh;
+    output_pos.x = in_c * out_W + (in_w + C_0);
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data);
   }
 }
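To see why the flag == 1 path of concat_mul routes each input pixel lane by lane instead of copying whole pixels, consider a channel offset C_0 that is not a multiple of 4. The following sketch (the numbers are chosen for illustration and are not from the patch) reproduces the c_out arithmetic from the loop above:

#include <cstdio>

int main() {
  const int C_0 = 6;   // channels already written by earlier inputs
  const int in_c = 0;  // first channel block of the current input
  for (int i = 0; i < 4; ++i) {
    int c_out = C_0 + in_c * 4 + i;  // absolute output channel, as in the kernel
    printf("input lane %d -> output block %d, lane %d\n", i, c_out / 4,
           c_out % 4);
  }
  // input lane 0 -> output block 1, lane 2
  // input lane 1 -> output block 1, lane 3
  // input lane 2 -> output block 2, lane 0
  // input lane 3 -> output block 2, lane 1
  return 0;
}

With C_0 = 6 the four lanes of one input pixel straddle two output pixels, so a whole-pixel copy would misplace half of the channels.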
@@ -53,11 +53,26 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
     auto in_dims = inputs[0]->dims();
     axis_size_ = out_dims[axis];
     axis_ = axis;
-    for (int i = 0; i < axis; i++) {
-      pre_size_ *= in_dims[i];
-    }
-    for (int i = axis + 1; i < in_dims.size(); i++) {
-      post_size_ *= in_dims[i];
+    switch (axis_) {
+      case 0:
+        width_ = out_dims[2];  // h
+        flag_ = 0;
+        break;
+      case 1:  // channel
+        width_ = out_dims[3];  // w
+        flag_ = 1;
+        break;
+      case 2:  // height
+        width_ = out_dims[0];  // n
+        flag_ = 2;
+        break;
+      case 3:
+      case -1:  // width
+        width_ = out_dims[1];  // c
+        flag_ = 3;
+        break;
+      default:
+        printf("axis %d is not supported\n", axis_);
     }
     for (int i = 1; i < inputs.size(); i++) {
       auto dims = inputs[i]->dims();
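The switch above fixes flag_ and width_ once, at prepare time. As a sanity check, here is a small host-side sketch that mirrors the same axis-to-(flag, width) mapping for an NCHW output shape; the function name AxisToFlagWidth is invented for illustration:

#include <array>
#include <cstdio>

void AxisToFlagWidth(const std::array<int, 4>& nchw, int axis, int* flag,
                     int* width) {
  switch (axis) {
    case 0:  *flag = 0; *width = nchw[2]; break;  // batch: width = H
    case 1:  *flag = 1; *width = nchw[3]; break;  // channel: width = W
    case 2:  *flag = 2; *width = nchw[0]; break;  // height: width = N
    case 3:
    case -1: *flag = 3; *width = nchw[1]; break;  // width: width = C
    default: *flag = -1; *width = 1; break;       // unsupported axis
  }
}

int main() {
  int flag = 0, width = 0;
  AxisToFlagWidth({2, 8, 16, 32}, /*axis=*/1, &flag, &width);
  printf("flag=%d width=%d\n", flag, width);  // concat by channel: flag=1 width=32
  return 0;
}

width_ is the stride the kernels need to unfold the fused image axes: image rows fuse n and h, and image columns fuse c/4 and w.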
@@ -81,7 +96,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
     auto& param = *param_.get_mutable<param_t>();
     const auto& x_dims = param.output->dims();
     auto image_shape = InitImageDimInfoWith(x_dims);
-    auto* out_buf = param.output->mutable_data<half_t, cl::Image2D>(
+    auto* out_buf = param.output->mutable_data<float, cl::Image2D>(
         image_shape["width"], image_shape["height"]);
     const auto& y_dims = param.output->dims();  // useless: check dim only
@@ -93,9 +108,10 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
     auto inputs = param.x;
     int arg_idx = 0;
     int width = inputs[0]->dims()[-1];
-    auto global_work_size =
-        cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
-                    static_cast<cl::size_type>(image_shape["height"])};
+    auto global_work_size = cl::NDRange{
+        static_cast<cl::size_type>(x_dims[-1]),
+        static_cast<cl::size_type>(image_shape["width"] / x_dims[-1]),
+        static_cast<cl::size_type>(image_shape["height"])};
     VLOG(4) << TargetToStr(param.output->target());
     VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
             << image_shape["height"];
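The work size is now three-dimensional, so each work-item addresses one (w, c/4, n*h) coordinate directly instead of decoding a fused 2-D index. A worked example of the arithmetic, assuming a hypothetical NCHW output of {2, 8, 16, 32}:

#include <cstdio>

int main() {
  const int N = 2, C = 8, H = 16, W = 32;  // assumed output shape
  const int image_w = (C / 4) * W;         // 64: channel blocks along width
  const int image_h = N * H;               // 32: batch folded into height
  // Mirrors the NDRange above: {x_dims[-1], image_w / x_dims[-1], image_h}.
  printf("global = {%d, %d, %d}\n", W, image_w / W, image_h);  // {32, 2, 32}
  return 0;
}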
@@ -103,42 +119,29 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
             << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
     VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
             << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
+    VLOG(4) << "width_: " << width_ << ", flag_: " << flag_;
     auto kernel = context.cl_context()->GetKernel(kernel_key.str());
-    int flag = 1;  // cxw
-    switch (axis_) {
-      case 0:
-        width = x_dims[2];  // n
-        flag = 0;
-        break;
-      case 1:
-        width = x_dims[3];  // c
-        break;
-      case 2:
-        width = x_dims[0];  // h
-        flag = 0;
-        break;
-      case 3:
-      case -1:
-        width = x_dims[1];  // w
-        break;
-      default:
-        printf("this axis: %d does not support \n", axis_);
-    }
+    int out_w = x_dims[-1];
+    int out_c = x_dims[1];
     if (inputs.size() == 2) {
-      auto* x_buf0 = inputs[0]->data<half_t, cl::Image2D>();
-      auto* x_buf1 = inputs[1]->data<half_t, cl::Image2D>();
+      auto* x_buf0 = inputs[0]->data<float, cl::Image2D>();
+      auto* x_buf1 = inputs[1]->data<float, cl::Image2D>();
       cl_int status = kernel.setArg(arg_idx, *x_buf0);
       CL_CHECK_FATAL(status);
       status = kernel.setArg(++arg_idx, *x_buf1);
       CL_CHECK_FATAL(status);
       status = kernel.setArg(++arg_idx, *out_buf);
       CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, flag_);
+      CL_CHECK_FATAL(status);
       status =
           kernel.setArg(++arg_idx, static_cast<int>(inputs[0]->dims()[axis_]));
       CL_CHECK_FATAL(status);
-      status = kernel.setArg(++arg_idx, flag);
+      status = kernel.setArg(++arg_idx, out_c);
       CL_CHECK_FATAL(status);
-      status = kernel.setArg(++arg_idx, width);
+      status = kernel.setArg(++arg_idx, out_w);
       CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, width_);
+      CL_CHECK_FATAL(status);
       status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
           kernel,
@@ -153,18 +156,31 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
       auto start = 0;
       for (int i = 0; i < inputs.size(); i++) {
         arg_idx = 0;
-        auto* x_buf = inputs[i]->data<half_t, cl::Image2D>();
+        auto in_dims = inputs[i]->dims();
+        image_shape = InitImageDimInfoWith(in_dims);
+        auto* x_buf = inputs[i]->data<float, cl::Image2D>();
+        auto in_w = in_dims[-1];
+        VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
+                << image_shape["height"];
+        global_work_size = cl::NDRange{
+            static_cast<cl::size_type>(in_dims[-1]),
+            static_cast<cl::size_type>(image_shape["width"] / in_dims[-1]),
+            static_cast<cl::size_type>(image_shape["height"])};
         cl_int status = kernel.setArg(arg_idx, *x_buf);
         CL_CHECK_FATAL(status);
         status = kernel.setArg(++arg_idx, *out_buf);
         CL_CHECK_FATAL(status);
-        status = kernel.setArg(++arg_idx, axis_size_);
+        status = kernel.setArg(++arg_idx, flag_);
         CL_CHECK_FATAL(status);
         status = kernel.setArg(++arg_idx, start);
         CL_CHECK_FATAL(status);
-        status = kernel.setArg(++arg_idx, flag);
+        status = kernel.setArg(++arg_idx, out_c);
         CL_CHECK_FATAL(status);
+        status = kernel.setArg(++arg_idx, out_w);
+        CL_CHECK_FATAL(status);
+        status = kernel.setArg(++arg_idx, in_w);
+        CL_CHECK_FATAL(status);
-        status = kernel.setArg(++arg_idx, width);
+        status = kernel.setArg(++arg_idx, width_);
         CL_CHECK_FATAL(status);
-        CL_CHECK_FATAL(status);
         status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
@@ -184,9 +200,9 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
   std::string doc() { return "Concat using cl::Image, kFP16"; }
   int axis_size_ = 1;
-  int post_size_ = 1;
-  int pre_size_ = 1;
   int axis_ = 1;
+  int flag_ = 1;
+  int width_ = 1;
   param_t* concat_param_{nullptr};
   std::string kernel_func_name_{};
   std::string build_options_{"-DCL_DTYPE_half"};
......