From 9f343ac26252447788e8f92558dad5102339a913 Mon Sep 17 00:00:00 2001 From: "ysh329@sina.com" Date: Tue, 4 Aug 2020 11:41:31 +0000 Subject: [PATCH] fix opencl concat. test=develop --- .../opencl/cl_kernel/image/concat_kernel.cl | 283 ++++++++++++++ lite/core/optimizer.h | 6 +- lite/core/profile/precision_profiler.h | 22 +- lite/kernels/opencl/concat_image_compute.cc | 367 +++++++++--------- lite/kernels/opencl/conv_image_compute.cc | 43 +- lite/kernels/opencl/test_helper.h | 2 +- 6 files changed, 513 insertions(+), 210 deletions(-) diff --git a/lite/backends/opencl/cl_kernel/image/concat_kernel.cl b/lite/backends/opencl/cl_kernel/image/concat_kernel.cl index 40cc52d54d..dd967ba71c 100644 --- a/lite/backends/opencl/cl_kernel/image/concat_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/concat_kernel.cl @@ -11,6 +11,285 @@ limitations under the License. */ #include +__kernel void concatByCWith2Inputs( + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_W, + __read_only image2d_t input_image_0, + __private const int C_0, + __read_only image2d_t input_image_1, + __private const int C_1) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + CL_DTYPE4 output_data; + + for (int i = 0; i < 4; i++) { + int c = out_c * 4 + i; + if (c >= out_C) { + break; + } + int c_in; + CL_DTYPE4 input_data; + if (c < C_0) { + c_in = c; + int2 input_pos; + input_pos.x = (c_in / 4) * out_W + out_w; + input_pos.y = out_nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_0, sampler, input_pos); + } else { + c_in = c - C_0; + int2 input_pos; + input_pos.x = (c_in / 4) * out_W + out_w; + input_pos.y = out_nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_1, sampler, input_pos); + } + int value_offset = c_in % 4; + float value; + if (value_offset == 0) { + value = input_data.x; + } else if (value_offset == 1) { + value = input_data.y; + } else if (value_offset == 2) { + value = input_data.z; + } else if (value_offset == 3) { + value = input_data.w; + } + if (i == 0) { + output_data.x = value; + } else if (i == 1) { + output_data.y = value; + } else if (i == 2) { + output_data.z = value; + } else if (i == 3) { + output_data.w = value; + } + } + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data); +} + + +__kernel void concatByCWith3Inputs( + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_W, + __read_only image2d_t input_image_0, + __private const int C_0, + __read_only image2d_t input_image_1, + __private const int C_1, + __read_only image2d_t input_image_2, + __private const int C_2) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + CL_DTYPE4 output_data; + + for (int i = 0; i < 4; i++) { + int c = out_c * 4 + i; + if (c >= out_C) { + break; + } + int c_in; + CL_DTYPE4 input_data; + if (c < C_0) { + c_in = c; + int2 input_pos; + input_pos.x = (c_in / 4) * out_W + out_w; + input_pos.y = out_nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_0, sampler, input_pos); + } else if (c < C_0 + C_1) { + c_in = c - C_0; + int2 input_pos; + input_pos.x = (c_in / 4) * out_W + out_w; + input_pos.y = out_nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_1, sampler, input_pos); + } else { + c_in = c - C_0 - C_1; + int2 input_pos; + input_pos.x = (c_in / 4) * out_W + out_w; + input_pos.y = out_nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_2, sampler, input_pos); + } + int value_offset = c_in % 4; + float value; + if (value_offset == 0) { + value = input_data.x; + } else if (value_offset == 1) { + value = input_data.y; + } else if (value_offset == 2) { + value = input_data.z; + } else if (value_offset == 3) { + value = input_data.w; + } + if (i == 0) { + output_data.x = value; + } else if (i == 1) { + output_data.y = value; + } else if (i == 2) { + output_data.z = value; + } else if (i == 3) { + output_data.w = value; + } + } + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data); +} + + +__kernel void concatByCWith4Inputs( + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_W, + __read_only image2d_t input_image_0, + __private const int C_0, + __read_only image2d_t input_image_1, + __private const int C_1, + __read_only image2d_t input_image_2, + __private const int C_2, + __read_only image2d_t input_image_3, + __private const int C_3) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + CL_DTYPE4 output_data; + + for (int i = 0; i < 4; i++) { + int c = out_c * 4 + i; + if (c >= out_C) { + break; + } + int c_in; + CL_DTYPE4 input_data; + if (c < C_0) { + c_in = c; + int2 input_pos; + input_pos.x = (c_in / 4) * out_W + out_w; + input_pos.y = out_nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_0, sampler, input_pos); + } else if (c < C_0 + C_1) { + c_in = c - C_0; + int2 input_pos; + input_pos.x = (c_in / 4) * out_W + out_w; + input_pos.y = out_nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_1, sampler, input_pos); + } else if (c < C_0 + C_1 + C_2) { + c_in = c - C_0 - C_1; + int2 input_pos; + input_pos.x = (c_in / 4) * out_W + out_w; + input_pos.y = out_nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_2, sampler, input_pos); + }else if (c < C_0 + C_1 + C_2 + C_3){ + c_in = c - C_0 - C_1 - C_2; + int2 input_pos; + input_pos.x = (c_in / 4) * out_W + out_w; + input_pos.y = out_nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_3, sampler, input_pos); + } + int value_offset = c_in % 4; + float value; + if (value_offset == 0) { + value = input_data.x; + } else if (value_offset == 1) { + value = input_data.y; + } else if (value_offset == 2) { + value = input_data.z; + } else if (value_offset == 3) { + value = input_data.w; + } + if (i == 0) { + output_data.x = value; + } else if (i == 1) { + output_data.y = value; + } else if (i == 2) { + output_data.z = value; + } else if (i == 3) { + output_data.w = value; + } + } + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data); +} + + +__kernel void concatByH(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_W, + __private const int out_H_Start) { + + const int in_c = get_global_id(0); + const int in_w = get_global_id(1); + const int in_nh = get_global_id(2); + + int2 input_pos; + input_pos.x = in_c * out_W + in_w; + input_pos.y = in_nh; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + CL_DTYPE4 input; + input = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,input_pos); + + int2 output_pos; + output_pos.x = input_pos.x; + output_pos.y = out_H_Start + input_pos.y; + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, input); + +} + + +__kernel void concatByW(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int in_W, + __private const int pre_Width, + __private const int out_Width) { + + const int in_c = get_global_id(0); + const int in_w = get_global_id(1); + const int in_nh = get_global_id(2); + + int2 input_pos; + input_pos.x = in_c * in_W + in_w; + input_pos.y = in_nh; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + CL_DTYPE4 input; + input = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,input_pos); + + int2 output_pos; + output_pos.x = input_pos.x + pre_Width + out_Width * in_c; + output_pos.y = input_pos.y; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, input); +} + + +// deprecated __kernel void concat2(__read_only image2d_t input0, __read_only image2d_t input1, __write_only image2d_t output, @@ -104,6 +383,7 @@ __kernel void concat2(__read_only image2d_t input0, } } +// deprecated __kernel void concat_mul(__read_only image2d_t input, __write_only image2d_t output, int flag, int C_0, int out_C, int out_W, int in_W, int width) { @@ -162,3 +442,6 @@ __kernel void concat_mul(__read_only image2d_t input, WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data); } } + + + diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index 42dac8e59b..d3618aa625 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -161,8 +161,10 @@ class Optimizer { "runtime_context_assign_pass", "argument_type_display_pass", - - "memory_optimize_pass"}}; +#ifndef LITE_WITH_PRECISION_PROFILE + "memory_optimize_pass" +#endif + }}; if (passes.size() == 1) { // multi_stream_analysis_pass must be in the front of diff --git a/lite/core/profile/precision_profiler.h b/lite/core/profile/precision_profiler.h index fda2b74f8f..0a862d1e32 100644 --- a/lite/core/profile/precision_profiler.h +++ b/lite/core/profile/precision_profiler.h @@ -36,6 +36,10 @@ #include "lite/backends/cuda/math/type_trans.h" #endif +#ifdef LITE_ON_TINY_PUBLISH +#include "lite/utils/replace_stl/stream.h" +#endif + namespace paddle { namespace lite { namespace profile { @@ -88,19 +92,25 @@ class PrecisionProfiler { PrecisionProfiler() {} std::string GetSummaryHeader() { +#ifdef LITE_ON_TINY_PUBLISH + using replace_stl::setw; +#else using std::setw; +#endif using std::left; using std::fixed; STL::stringstream ss; ss << "\n\n========================================= " << "Detailed Precision Profiler Summary " - << "=========================================" << std::endl; + << "=========================================" + << "\n"; ss << setw(45) << left << "operator:(kernel_info)" << " " << setw(70) << left << "output_tensor_name:(tensor_info)" << " " << setw(15) << left << "dims" << " " << setw(15) << left << "mean" << " " << setw(15) << left << "std_deviation" - << " " << setw(15) << left << "ave_grow_rate*" << std::endl; + << " " << setw(15) << left << "ave_grow_rate*" + << "\n"; // write to file with path: `log_dir` if (log_dir_ != "") { @@ -368,7 +378,11 @@ class PrecisionProfiler { } std::string GetInstPrecision(const Instruction* inst = nullptr) { +#ifdef LITE_ON_TINY_PUBLISH + using replace_stl::setw; +#else using std::setw; +#endif using std::left; using std::fixed; STL::stringstream ss; @@ -429,7 +443,7 @@ class PrecisionProfiler { << output_arg_info << " " << setw(15) << left << tout->dims() << " " << setw(15) << left << mean_str << " " << setw(15) << left << std_dev_str << " " << setw(15) << left << ave_grow_rate_str - << std::endl; + << "\n"; } else if (type->IsTensorList()) { auto touts = op_scope->FindVar(out_name)->GetMutable>(); @@ -466,7 +480,7 @@ class PrecisionProfiler { << output_arg_info << " " << setw(15) << left << tout->dims() << " " << setw(15) << left << mean_str << " " << setw(15) << left << std_dev_str << " " << setw(15) << left << ave_grow_rate_str - << std::endl; + << "\n"; } } } diff --git a/lite/kernels/opencl/concat_image_compute.cc b/lite/kernels/opencl/concat_image_compute.cc index 25830b6a08..83dc7dec87 100644 --- a/lite/kernels/opencl/concat_image_compute.cc +++ b/lite/kernels/opencl/concat_image_compute.cc @@ -38,213 +38,230 @@ class ConcatComputeImage : public KernelLiteAs(); concat_param_ = param_.get_mutable(); - if (concat_param_->x.size() == 2) { - kernel_func_name_ = "concat2"; - } else { - kernel_func_name_ = "concat_mul"; - } - VLOG(1) << "kernel_func_name_:" << kernel_func_name_; - context.cl_context()->AddKernel(kernel_func_name_, - "image/concat_kernel.cl", - build_options_, - time_stamp_); + auto input_num = concat_param_->x.size(); + auto* output = concat_param_->output; + auto output_dims_size = output->dims().size(); auto axis = concat_param_->axis; - auto inputs = concat_param_->x; - auto out_dims = concat_param_->output->dims(); - auto* axis_tensor = concat_param_->axis_tensor; - if (axis_tensor != nullptr) { - // auto* axis_tensor_data = axis_tensor->data(TARGET(kARM)); - // axis = axis_tensor_data[0]; - } - auto in_dims = inputs[0]->dims(); - axis_size_ = out_dims[axis]; - axis_ = axis; - if (out_dims.size() < 4) { - if (out_dims.size() - axis == 1) { - // width - width_ = out_dims[1]; // c - flag_ = 3; + if (output_dims_size < 4) { + if (output_dims_size - axis == 1) { + kernel_func_name_ = "concatByW"; } else { - // height - width_ = out_dims[0]; // n - flag_ = 2; - } - } else { - switch (axis_) { - case 0: - width_ = out_dims[2]; // h - flag_ = 0; - break; - case 1: // channel - width_ = out_dims[3]; // w - flag_ = 1; - break; - case 2: // height - width_ = out_dims[0]; // n - flag_ = 2; - break; - case 3: - case -1: // width - width_ = out_dims[1]; // c - flag_ = 3; - break; - default: - printf("this axis: %d does not support \n", axis_); + kernel_func_name_ = "concatByH"; } - } - - for (int i = 1; i < inputs.size(); i++) { - auto dims = inputs[i]->dims(); - // auto flag = CHECK_EQ_OR_FALSE(in_dims.size(), dims.size()); - if (in_dims.size() != dims.size()) { - printf("input shape must be same \n"); - return; - } - for (int i = 0; i < dims.size(); i++) { - if (i != axis) { - if (in_dims[i] != dims[i]) { - printf("input shape must be same \n"); - return; - } - } + } else if (output_dims_size == 4) { // output->dims.size() == 4 + if (input_num == 2) { + kernel_func_name_ = "concatByCWith2Inputs"; + } else if (input_num == 3) { + kernel_func_name_ = "concatByCWith3Inputs"; + } else if (input_num == 4) { + kernel_func_name_ = "concatByCWith4Inputs"; + } else { + LOG(FATAL) << "Unsupported input tensors number:" << input_num << "."; } + } else { // output->dims.size() > 4 + LOG(FATAL) << "Unsupported output dims " << output->dims() + << ", whose dims.size() is bigger than 4."; } + VLOG(1) << "kernel_func_name_:" << kernel_func_name_; + context.cl_context()->AddKernel(kernel_func_name_, + "image/concat_kernel.cl", + build_options_, + time_stamp_); } void Run() override { - auto& param = *param_.get_mutable(); - const auto& x_dims = param.output->dims(); - auto image_shape = InitImageDimInfoWith(x_dims); - auto* out_buf = param.output->mutable_data( - image_shape["width"], image_shape["height"]); - const auto& y_dims = param.output->dims(); // useless: check dim only + auto output_tensor_dims = concat_param_->output->dims(); + auto output_image_shape = InitImageDimInfoWith(output_tensor_dims); + auto output_image_p = + concat_param_->output->mutable_data( + output_image_shape["width"], output_image_shape["height"]); + + auto inputs = concat_param_->x; + auto axis = concat_param_->axis; auto& context = ctx_->As(); CHECK(context.cl_context() != nullptr); STL::stringstream kernel_key; kernel_key << kernel_func_name_ << build_options_ << time_stamp_; + auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - auto inputs = param.x; - int arg_idx = 0; - int width = inputs[0]->dims()[inputs[0]->dims().size() - 1]; + if (kernel_func_name_ == "concatByW" || kernel_func_name_ == "concatByH") { + auto output_tensor_w = output_tensor_dims[output_tensor_dims.size() - 1]; + if (output_tensor_dims.size() - axis == 1) { + for (size_t input_idx = 0; input_idx < inputs.size(); ++input_idx) { + auto* input = inputs[input_idx]; + auto input_tensor_dims = input->dims(); + auto input_image_shape = InitImageDimInfoWith(input_tensor_dims); + auto input_tensor_w = input_tensor_dims[input_tensor_dims.size() - 1]; + auto* input_image_p = input->data(); -#ifdef LITE_WITH_LOG - VLOG(4) << "concat input shape: "; - for (size_t i = 0; i < inputs.size(); i++) { - VLOG(4) << "inputs [" << i << "]" - << "[" << inputs[i]->dims().size() << "D]:" - << " dims:" << inputs[i]->dims()[0] << " " - << inputs[i]->dims()[1] << " " << inputs[i]->dims()[2] << " " - << inputs[i]->dims()[3]; - } + size_t input_tensor_pre_w = 0; + for (size_t ii_idx = 0; ii_idx < input_idx; ++ii_idx) { + auto input_tensor_dims = inputs[ii_idx]->dims(); + input_tensor_pre_w += + input_tensor_dims[input_tensor_dims.size() - 1]; + } - VLOG(4) << "concat output shape: "; - VLOG(4) << " out dims: " - << "[" << x_dims.size() << "D]:" << x_dims[0] << " " << x_dims[1] - << " " << x_dims[2] << " " << x_dims[3]; - VLOG(4) << "axis_: " << axis_; - VLOG(4) << "flag_: " << flag_; -#endif + int input_special_w = input_tensor_dims[output_tensor_dims.size() - + 2]; // not a good var name - auto global_work_size = - cl::NDRange{static_cast(x_dims[x_dims.size() - 1]), - static_cast(image_shape["width"] / - x_dims[x_dims.size() - 1]), - static_cast(image_shape["height"])}; + const std::vector& default_work_size = DefaultWorkSize( + input_tensor_dims, + DDim(std::vector{ + static_cast(input_image_shape["width"]), + static_cast(input_image_shape["height"])})); + cl::NDRange global_work_size = + cl::NDRange{static_cast(default_work_size[0]), + static_cast(default_work_size[1]), + static_cast(default_work_size[2])}; + cl_int status; + status = kernel.setArg(0, *input_image_p); + CL_CHECK_FATAL(status); + status = kernel.setArg(1, *output_image_p); + CL_CHECK_FATAL(status); + status = kernel.setArg(2, input_special_w); + CL_CHECK_FATAL(status); + status = kernel.setArg(3, input_tensor_pre_w); + CL_CHECK_FATAL(status); + status = kernel.setArg(4, output_tensor_w); + CL_CHECK_FATAL(status); -#ifdef LITE_WITH_LOG - VLOG(4) << TargetToStr(param.output->target()); - VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " " - << image_shape["height"]; - VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " " - << x_dims[1] << " " << x_dims[2] << " " << x_dims[3] - << "x_dims[x_dims.size() - 1]" << x_dims[x_dims.size() - 1]; - VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " " - << y_dims[1] << " " << y_dims[2] << " " << y_dims[3]; - VLOG(4) << "width_: " << width_ << ", flag_: " << flag_; - VLOG(4) << "global_work_size: " << x_dims[x_dims.size() - 1] << " " - << (image_shape["width"] / x_dims[x_dims.size() - 1]) << " " - << (image_shape["height"]); -#endif + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status); + } + } else { + size_t output_image_height_start = 0; // output image height start + for (size_t input_idx = 0; input_idx < inputs.size(); ++input_idx) { + auto* input = inputs[input_idx]; + auto input_tensor_dims = input->dims(); + auto input_image_shape = InitImageDimInfoWith(input_tensor_dims); + auto input_tensor_w = input_tensor_dims[input_tensor_dims.size() - 1]; + auto* input_image_p = input->data(); - auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - int out_w = x_dims[x_dims.size() - 1]; - int out_c = x_dims[1]; - if (inputs.size() == 2) { - auto* x_buf0 = inputs[0]->data(); - auto* x_buf1 = inputs[1]->data(); - cl_int status = kernel.setArg(arg_idx, *x_buf0); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *x_buf1); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *out_buf); + const std::vector& default_work_size = DefaultWorkSize( + input_tensor_dims, + DDim(std::vector{ + static_cast(input_image_shape["width"]), + static_cast(input_image_shape["height"])})); + cl::NDRange global_work_size = + cl::NDRange{static_cast(default_work_size[0]), + static_cast(default_work_size[1]), + static_cast(default_work_size[2])}; + + cl_int status; + status = kernel.setArg(0, *input_image_p); + CL_CHECK_FATAL(status); + status = kernel.setArg(1, *output_image_p); + CL_CHECK_FATAL(status); + status = kernel.setArg(2, output_tensor_w); + CL_CHECK_FATAL(status); + status = kernel.setArg(3, output_image_height_start); + CL_CHECK_FATAL(status); + + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status); + + // compute new output_image_height_start + if (output_tensor_dims.size() == 3) { + output_image_height_start += input_tensor_dims[1]; + } else if (output_tensor_dims.size() == 2) { + output_image_height_start += input_tensor_dims[0]; + } + } + } + } else if (kernel_func_name_ == "concatByCWith2Inputs" || + kernel_func_name_ == "concatByCWith3Inputs" || + kernel_func_name_ == "concatByCWith4Inputs") { + auto* input0 = inputs[0]; + auto* input0_image_p = input0->data(); + size_t input0_tensor_c = input0->dims()[1]; + + auto* input1 = inputs.size() >= 2 ? inputs[1] : nullptr; + auto* input1_image_p = + input1 ? input1->data() : nullptr; + size_t input1_tensor_c = input1 ? input1->dims()[1] : -1; + + auto* input2 = inputs.size() >= 3 ? inputs[2] : nullptr; + auto* input2_image_p = + input2 ? input2->data() : nullptr; + size_t input2_tensor_c = input2 ? input2->dims()[1] : -1; + + auto* input3 = inputs.size() >= 4 ? inputs[3] : nullptr; + auto* input3_image_p = + input3 ? input3->data() : nullptr; + size_t input3_tensor_c = input3 ? input3->dims()[1] : -1; + + LOG(INFO) << "input0_image_p:" << input0_image_p; + LOG(INFO) << "input0_tensor_c:" << input0_tensor_c; + LOG(INFO) << "input1_image_p:" << input1_image_p; + LOG(INFO) << "input1_tensor_c:" << input1_tensor_c; + LOG(INFO) << "input2_image_p:" << input2_image_p; + LOG(INFO) << "input2_tensor_c:" << input2_tensor_c; + LOG(INFO) << "input3_image_p:" << input3_image_p; + LOG(INFO) << "input3_tensor_c:" << input3_tensor_c; + + const std::vector& default_work_size = DefaultWorkSize( + output_tensor_dims, + DDim(std::vector{ + static_cast(output_image_shape["width"]), + static_cast(output_image_shape["height"])})); + cl::NDRange global_work_size = + cl::NDRange{static_cast(default_work_size[0]), + static_cast(default_work_size[1]), + static_cast(default_work_size[2])}; + + cl_int status; + status = kernel.setArg(0, *output_image_p); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, flag_); + status = kernel.setArg( + 1, static_cast(output_tensor_dims[1])); // output_tensor_c CL_CHECK_FATAL(status); - status = - kernel.setArg(++arg_idx, static_cast(inputs[0]->dims()[axis_])); + status = kernel.setArg( + 2, static_cast(output_tensor_dims[3])); // output_tensor_w CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, out_c); + status = kernel.setArg(3, *input0_image_p); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, out_w); + status = kernel.setArg(4, input0_tensor_c); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, width_); + status = kernel.setArg(5, *input1_image_p); CL_CHECK_FATAL(status); - - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( - kernel, - cl::NullRange, - global_work_size, - cl::NullRange, - nullptr, - nullptr); + status = kernel.setArg(6, input1_tensor_c); CL_CHECK_FATAL(status); - } else { - auto start = 0; - for (int i = 0; i < inputs.size(); i++) { - arg_idx = 0; - auto in_dims = inputs[i]->dims(); - image_shape = InitImageDimInfoWith(in_dims); - auto* x_buf = inputs[i]->data(); - int in_w = in_dims[in_dims.size() - 1]; -#ifdef LITE_WITH_LOG - VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " " - << image_shape["height"]; -#endif - global_work_size = - cl::NDRange{static_cast(in_dims[in_dims.size() - 1]), - static_cast(image_shape["width"] / - in_dims[in_dims.size() - 1]), - static_cast(image_shape["height"])}; - cl_int status = kernel.setArg(arg_idx, *x_buf); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *out_buf); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, flag_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, start); + if (inputs.size() >= 3) { + status = kernel.setArg(7, *input2_image_p); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, out_c); + status = kernel.setArg(8, input2_tensor_c); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, out_w); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, in_w); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, width_); + } + if (inputs.size() == 4) { + status = kernel.setArg(9, *input3_image_p); CL_CHECK_FATAL(status); + status = kernel.setArg(10, input3_tensor_c); CL_CHECK_FATAL(status); - - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( - kernel, - cl::NullRange, - global_work_size, - cl::NullRange, - nullptr, - nullptr); - CL_CHECK_FATAL(status); - start += inputs[i]->dims()[axis_]; } + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status); + } else { + LOG(FATAL) << "Unsupported kernel func name: " << kernel_func_name_; } } diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc index f53c464e99..3494c727eb 100644 --- a/lite/kernels/opencl/conv_image_compute.cc +++ b/lite/kernels/opencl/conv_image_compute.cc @@ -185,47 +185,29 @@ void ConvImageCompute::PrepareForRun() { impl_ = &ConvImageCompute::DepthwiseConv2d; } else if (filter_tensor_h_ == 3 && filter_tensor_w_ == 3) { -// #define CONV3x3OPT_FALL_BACK -#ifndef CONV3x3OPT_FALL_BACK // conv2d_3x3 - kernel_func_names_.push_back(input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch" - : "conv2d_3x3_opt"); - kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl"); - - CLImageConverterFolder converter; - const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - filter_image_h_ = filter_image_dims[1]; - filter_image_w_ = filter_image_dims[0]; - tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); - - half_t* filter_image_data = - tensor_hold_filter_image_->mutable_data(); - - converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); - filter_gpu_image_->mutable_data( - filter_image_w_, filter_image_h_, filter_image_data); - - impl_ = &ConvImageCompute::Conv2d3x3opt; -#else - kernel_func_names_.push_back("conv2d_3x3"); - kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl"); + if (groups_ == 1) { + kernel_func_names_.push_back( + input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch" : "conv2d_3x3_opt"); + kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl"); + impl_ = &ConvImageCompute::Conv2d3x3opt; + } else { // groups_ > 1 + kernel_func_names_.push_back("conv2d_3x3"); + kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl"); + impl_ = &ConvImageCompute::Conv2d3x3; + } CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); filter_image_h_ = filter_image_dims[1]; filter_image_w_ = filter_image_dims[0]; tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); - half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( filter_image_w_, filter_image_h_, filter_image_data); - - impl_ = &ConvImageCompute::Conv2d3x3; -#endif -#undef CONV3x3OPT_FALL_BACK } else if (filter_tensor_h_ == 5 && filter_tensor_w_ == 5) { #define CONV_5x5_OPT #ifndef CONV_5x5_OPT @@ -584,6 +566,11 @@ void ConvImageCompute::GetGlobalWorkSize() { static_cast(w_blk_), static_cast(nh_blk_)}; input_c_block_ = static_cast((input_tensor_c_ + 3) / 4); + } else if (kernel_func_names_[0] == "conv2d_3x3") { + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + } else if (kernel_func_names_[0] == "conv2d_3x3_multi_batch" || kernel_func_names_[0] == "conv2d_3x3_opt") { int w_blk_size = 5; diff --git a/lite/kernels/opencl/test_helper.h b/lite/kernels/opencl/test_helper.h index a1b875688e..2df88758d0 100644 --- a/lite/kernels/opencl/test_helper.h +++ b/lite/kernels/opencl/test_helper.h @@ -19,7 +19,7 @@ #define COMPUTE_RELATIVE_DIFF(res0, res1) abs(abs(res0 - res1) / (res1 + 1e-5)) #define IS_DIFF_PASSED(res0, res1, threshold) \ - (((COMPTUE_ABS_DIFF(res0, res1) < threshold) || \ + (((COMPUTE_ABS_DIFF(res0, res1) < threshold) || \ (COMPUTE_RELATIVE_DIFF(res0, res1) < threshold)) \ ? true \ : false) -- GitLab