From d341fccb579d6f7531d076dff4b40967c78b7f00 Mon Sep 17 00:00:00 2001 From: ysh329 Date: Tue, 14 Jul 2020 10:09:58 +0800 Subject: [PATCH] [OPENCL] remove conv redundant's for opencl kernel. test=develop (#3924) remove conv redundant's for opencl kernel. --- lite/backends/opencl/cl_context.cc | 4 +- lite/backends/opencl/cl_context.h | 4 +- .../cl_kernel/image/conv2d_1x1_opt_kernel.cl | 4 - .../cl_kernel/image/conv2d_3x3_kernel.cl | 2 - .../cl_kernel/image/conv2d_3x3_opt_kernel.cl | 4 - .../cl_kernel/image/conv2d_5x5_kernel.cl | 2 - .../cl_kernel/image/conv2d_5x5_opt_kernel.cl | 6 +- .../cl_kernel/image/conv2d_7x7_kernel.cl | 2 - .../cl_kernel/image/conv2d_7x7_opt_kernel.cl | 6 +- .../image/depthwise_conv2d_basic_kernel.cl | 2 - .../image/depthwise_conv2d_kernel.cl | 4 - lite/kernels/opencl/conv_image_compute.cc | 2175 ++++++----------- lite/kernels/opencl/conv_image_compute.h | 85 +- 13 files changed, 867 insertions(+), 1433 deletions(-) diff --git a/lite/backends/opencl/cl_context.cc b/lite/backends/opencl/cl_context.cc index 67d679fdd5..002073517b 100644 --- a/lite/backends/opencl/cl_context.cc +++ b/lite/backends/opencl/cl_context.cc @@ -119,7 +119,7 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) { } } -cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size, +cl::NDRange CLContext::LocalWorkSizeTune(cl::NDRange global_work_size, size_t max_work_size, int divisor) { int preferred_lws = 0; @@ -157,7 +157,7 @@ cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size, static_cast(gws0)}; #endif } -cl::NDRange CLContext::LocalWorkSizeTurnReverse(cl::NDRange global_work_size, +cl::NDRange CLContext::LocalWorkSizeTuneReverse(cl::NDRange global_work_size, size_t max_work_size, int divisor) { int preferred_lws = 0; diff --git a/lite/backends/opencl/cl_context.h b/lite/backends/opencl/cl_context.h index 82d15bee5e..c204a85104 100644 --- a/lite/backends/opencl/cl_context.h +++ b/lite/backends/opencl/cl_context.h @@ -62,10 +62,10 @@ class CLContext { cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size); - cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size, + cl::NDRange LocalWorkSizeTune(cl::NDRange global_work_size, size_t max_work_size, int divitor = 2); - cl::NDRange LocalWorkSizeTurnReverse(cl::NDRange global_work_size, + cl::NDRange LocalWorkSizeTuneReverse(cl::NDRange global_work_size, size_t max_work_size, int divitor = 2); bool IsArmMali(); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl index 1c808da68d..9209f0e0f8 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl @@ -6,9 +6,7 @@ __kernel void conv2d_1x1_opt( __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif #ifdef BATCH_NORM __read_only image2d_t new_scale, __read_only image2d_t new_biase, @@ -284,9 +282,7 @@ __kernel void conv2d_1x1_simple( __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif #ifdef BATCH_NORM __read_only image2d_t new_scale, __read_only image2d_t new_biase, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl index 
771765ea60..6a3aa6455d 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl @@ -19,9 +19,7 @@ __kernel void conv2d_3x3(__private const int global_size_dim0, __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int offset, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl index 79f3922e89..739f852a7c 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl @@ -19,9 +19,7 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, @@ -264,9 +262,7 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl index d856af6a1d..f08d53fa49 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl @@ -5,9 +5,7 @@ __kernel void conv2d_5x5(__private const int global_size_dim0, __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif #ifdef BATCH_NORM __read_only image2d_t new_scale, __read_only image2d_t new_biase, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl index 4ed2e07202..4cce039f27 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl @@ -20,9 +20,7 @@ __kernel void conv2d_5x5_opt(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, @@ -268,9 +266,7 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, @@ -513,4 +509,4 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch, (int2)(out_w_base_id + out_w_id4, item_h_id), output[4]); } -} \ No newline at end of file +} diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl index 4998dc9927..2a2f210601 100644 --- 
a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl @@ -5,9 +5,7 @@ __kernel void conv2d_7x7(__private const int global_size_dim0, __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif #ifdef BATCH_NORM __read_only image2d_t new_scale, __read_only image2d_t new_biase, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl index d82f4b4c96..4eadcd9f80 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl @@ -20,9 +20,7 @@ __kernel void conv2d_7x7_opt(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, @@ -268,9 +266,7 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, @@ -513,4 +509,4 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch, (int2)(out_w_base_id + out_w_id4, item_h_id), output[4]); } -} \ No newline at end of file +} diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl index 27313aea23..465b9f8f92 100755 --- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl @@ -19,9 +19,7 @@ __kernel void depth_conv2d(__private const int global_size_dim0, __private const int global_size_dim2, __read_only image2d_t input, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif #ifdef BATCH_NORM __read_only image2d_t new_scale, __read_only image2d_t new_biase, diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl index 5626fe6be7..6fbdc21f93 100755 --- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl @@ -20,9 +20,7 @@ __kernel void depth_conv2d_3x3( __private const int global_size_dim2, __read_only image2d_t input, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int offset, @@ -249,9 +247,7 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, __private const int ou_nh, __read_only image2d_t input, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc index 5b9e3b220a..083f72134e 100644 --- a/lite/kernels/opencl/conv_image_compute.cc +++ 
b/lite/kernels/opencl/conv_image_compute.cc @@ -30,92 +30,81 @@ namespace kernels { namespace opencl { void ConvImageCompute::PrepareForRun() { - const auto& param = this->Param(); - auto x_dims = param.x->dims(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); + ReInitWhenNeeded(); + + auto filter_dims = conv_param_->filter->dims(); + filter_tensor_n_ = filter_dims[0]; + filter_tensor_c_ = filter_dims[1]; + filter_tensor_h_ = filter_dims[2]; + filter_tensor_w_ = filter_dims[3]; - float* filter_cpu = param.filter->mutable_data(); auto& context = ctx_->As(); CHECK(context.cl_context() != nullptr); const bool is_mali = context.cl_context()->IsArmMali(); - filter_gpu_image_ = std::unique_ptr(new Tensor); - tensor_hold_filter_image_ = std::unique_ptr(new Tensor); - tensor_hold_bias_image_ = std::unique_ptr(new Tensor); - int bs = x_dims[0]; - int c_in = x_dims[1]; - int h_out = output_dims[2]; - int w_out = output_dims[3]; - int kernel_h = filter_dims[2]; // oihw - int kernel_w = filter_dims[3]; - auto paddings = *param.paddings; - auto dilations = *param.dilations; - int stride_h = param.strides[0]; - int stride_w = param.strides[1]; - int pad_h = paddings[0]; - int pad_w = paddings[2]; - int groups = param.groups; - bool relu_fused = param.fuse_relu; - bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); - bool zero_pad = (pad_h == 0) && (pad_w == 0); - - bool pad_equal = - ((paddings[0] == paddings[1]) && (paddings[1] == paddings[2]) && - (paddings[2] == paddings[3])); - bool stride_equal = stride_h == stride_w; - bool dilation_equal = dilations[0] == dilations[1]; + + auto paddings = *conv_param_->paddings; + pad_up_ = paddings[0]; + pad_down_ = paddings[1]; + pad_left_ = paddings[2]; + pad_right_ = paddings[3]; + + auto dilations = *conv_param_->dilations; + dilation_h_ = dilations[0]; + dilation_w_ = dilations[1]; + + stride_h_ = conv_param_->strides[0]; + stride_w_ = conv_param_->strides[1]; + + groups_ = conv_param_->groups; + relu_fused_ = conv_param_->fuse_relu; + has_bias_ = (conv_param_->bias) != nullptr; + offset_ = filter_tensor_h_ / 2 - pad_up_; + + bool pad_equal = ((pad_left_ == pad_up_) && (pad_up_ == pad_left_) && + (pad_left_ == pad_right_)); + bool stride_equal = stride_h_ == stride_w_; + bool dilation_equal = dilation_h_ == dilation_w_; VLOG(3) << "Is arm mali / " << (is_mali ? "Yes" : "No"); - VLOG(3) << "Is relu fused? / " << (relu_fused ? "Yes" : "No"); - VLOG(3) << "groups:" << groups << " stride_h:" << stride_h - << " stride_w:" << stride_w << " pad_h:" << pad_h - << " pad_w:" << pad_w << " kernel_h:" << kernel_h - << " kernel_h:" << kernel_h; - VLOG(3) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2] - << " " << x_dims[3]; - VLOG(3) << "dialtion:" << dilations[0] << " " << dilations[1]; - VLOG(3) << "output_dims:" << output_dims[0] << " " << output_dims[1] << " " - << output_dims[2] << " " << output_dims[3]; - VLOG(3) << "filter_dims:" << filter_dims[0] << " " << filter_dims[1] << " " - << filter_dims[2] << " " << filter_dims[3]; + VLOG(3) << "Is relu fused? / " << (relu_fused_ ? 
"Yes" : "No"); + VLOG(3) << "groups:" << groups_ << " stride_h_:" << stride_h_ + << " stride_w_:" << stride_w_ << " pad_left_:" << pad_left_ + << " pad_up_:" << pad_up_ << " filter_tensor_h_:" << filter_tensor_h_ + << " filter_tensor_h_:" << filter_tensor_h_; + VLOG(3) << "input_tensor_nchw:" << input_tensor_n_ << " " << input_tensor_c_ + << " " << input_tensor_h_ << " " << input_tensor_w_; + VLOG(3) << "dialtion:" << dilation_h_ << " " << dilation_w_; + VLOG(3) << "output_dims:" << output_tensor_n_ << " " << output_tensor_c_ + << " " << output_tensor_h_ << " " << output_tensor_w_; + VLOG(3) << "filter_dims:" << filter_tensor_n_ << " " << filter_tensor_c_ + << " " << filter_tensor_h_ << " " << filter_tensor_w_; VLOG(3) << "pad_equal:" << pad_equal; VLOG(3) << "stride_equal:" << stride_equal; VLOG(3) << "dilation_equal:" << dilation_equal; - VLOG(3) << "padding :" << paddings[0] << " " << paddings[1] << " " - << paddings[2] << " " << paddings[3]; + VLOG(3) << "padding :" << pad_up_ << " " << pad_down_ << " " << pad_left_ + << " " << pad_right_; CHECK(pad_equal && stride_equal && dilation_equal); + CHECK_GE(conv_param_->dilations->size(), 2); + CHECK(dilation_h_ == dilation_w_); + CHECK_GE(conv_param_->paddings->size(), 2); + CHECK(pad_left_ == pad_up_); + CHECK_GE(conv_param_->strides.size(), 2); + CHECK(stride_h_ == stride_w_); if (!is_mali) { - use_turn_ = false; + use_tune_ = false; } - // general gws.. - auto out_image_shape = InitImageDimInfoWith(output_dims); - - const std::vector& default_work_size = - DefaultWorkSize(output_dims, - DDim(std::vector{ - static_cast(out_image_shape["width"]), - static_cast(out_image_shape["height"])})); - default_c_blk_ = default_work_size[0]; - default_w_blk_ = default_work_size[1]; - default_nh_blk_ = default_work_size[2]; - c_blk_ = default_c_blk_; - w_blk_ = default_w_blk_; - nh_blk_ = default_nh_blk_; - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - - if (kernel_h == 1 && kernel_w == 1) { - // conv2d_1x1 - // if (param.x->dims()[1] % 4 == 0) { - // kernel_func_names_.push_back("conv2d_1x1_simple"); - // } else { - // kernel_func_names_.push_back("conv2d_1x1_opt"); - // } + /********************************************* + * Upload filter, bias to opencl device + *********************************************/ + float* filter_cpu = conv_param_->filter->mutable_data(); + filter_gpu_image_ = std::unique_ptr(new Tensor); + tensor_hold_filter_image_ = std::unique_ptr(new Tensor); + tensor_hold_bias_image_ = std::unique_ptr(new Tensor); - if (param.x->dims()[1] % 4 == 0) { + if (filter_tensor_h_ == 1 && filter_tensor_h_ == 1) { + if (input_tensor_c_ % 4 == 0) { kernel_func_names_.push_back("conv2d_1x1_simple"); } else { kernel_func_names_.push_back("conv2d_1x1_opt"); @@ -124,89 +113,49 @@ void ConvImageCompute::PrepareForRun() { CLImageConverterNWBlock converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - // std::vector filter_image_v(filter_image_dims[0] * - // filter_image_dims[1] * 4); // 4 : - // RGBA - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); - + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], 
filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d1x1opt; - { - // calc 1x1 gws - w_blk_ = maptofactor(default_w_blk_, 4); - c_blk_ = default_c_blk_; - nh_blk_ = default_nh_blk_; - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } #define DEPTH_CONV_USE_SPL #ifdef DEPTH_CONV_USE_SPL - } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] && - kernel_h == 3 && kernel_w == 3 && groups > 1) { + } else if (filter_tensor_c_ == 1 && input_tensor_c_ == output_tensor_c_ && + filter_tensor_h_ == 3 && filter_tensor_w_ == 3 && groups_ > 1) { // depth_conv2d_3x3s1, depth_conv2d_3x3 - if (stride_h == 1 && dilations[0] == 1) { + if (stride_h_ == 1 && dilation_h_ == 1) { kernel_func_names_.push_back("depth_conv2d_3x3s1"); impl_ = &ConvImageCompute::DepthwiseConv2d3x3s1; - { - // depthwise spl gws s1 - int c_block = (output_dims[1] + 3) / 4; - int w = output_dims[3]; - int nh = output_dims[0] * output_dims[2]; - int w_blk_size = 2; - int w_blk = (w + w_blk_size - 1) / w_blk_size; - - c_blk_ = c_block; - w_blk_ = w_blk; - nh_blk_ = nh; - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } } else { kernel_func_names_.push_back("depth_conv2d_3x3"); impl_ = &ConvImageCompute::DepthwiseConv2d3x3; - { - // depthwise spl gws - int c_block = (output_dims[1] + 3) / 4; - int w = output_dims[3]; - int nh = output_dims[0] * output_dims[2]; - - c_blk_ = c_block; - w_blk_ = w; - nh_blk_ = nh; - - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } } kernel_func_paths_.push_back("image/depthwise_conv2d_kernel.cl"); CLImageConverterNWBlock converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); #endif - } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] + } else if (filter_tensor_c_ == 1 && input_tensor_c_ == output_tensor_c_ #ifdef DEPTH_CONV_USE_SPL && - kernel_h != 3 + filter_tensor_h_ != 3 #endif #undef DEPTH_CONV_USE_SPL ) { @@ -216,75 +165,61 @@ void ConvImageCompute::PrepareForRun() { CLImageConverterNWBlock converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::DepthwiseConv2d; - } else if (kernel_w == 3 && kernel_h == 3) { + } else if (filter_tensor_h_ == 3 && filter_tensor_w_ == 3) { // 
#define CONV3x3OPT_FALL_BACK #ifndef CONV3x3OPT_FALL_BACK // conv2d_3x3 - kernel_func_names_.push_back(bs > 1 ? "conv2d_3x3_multi_batch" - : "conv2d_3x3_opt"); + kernel_func_names_.push_back(input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch" + : "conv2d_3x3_opt"); kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl"); CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d3x3opt; - - { - int w_blk_size = 5; - int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; - - int h_blk_size = 1; - int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; - - c_blk_ = default_c_blk_; - w_blk_ = w_blk; - nh_blk_ = h_blk; - - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } #else kernel_func_names_.push_back("conv2d_3x3"); kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl"); CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d3x3; - #endif #undef CONV3x3OPT_FALL_BACK - } else if (kernel_h == 5 && kernel_w == 5) { + } else if (filter_tensor_h_ == 5 && filter_tensor_w_ == 5) { #define CONV_5x5_OPT #ifndef CONV_5x5_OPT // conv2d_5x5 @@ -293,55 +228,42 @@ void ConvImageCompute::PrepareForRun() { CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d5x5; #else // conv2d_5x5_opt - kernel_func_names_.push_back(bs > 1 ? "conv2d_5x5_multi_batch" - : "conv2d_5x5_opt"); + kernel_func_names_.push_back(input_tensor_n_ > 1 ? 
"conv2d_5x5_multi_batch" + : "conv2d_5x5_opt"); kernel_func_paths_.push_back("image/conv2d_5x5_opt_kernel.cl"); CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d5x5opt; - { - int w_blk_size = 5; - int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; - - int h_blk_size = 1; - int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; - - c_blk_ = default_c_blk_; - w_blk_ = w_blk; - nh_blk_ = h_blk; - - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } #endif #undef CONV_5x5_OPT - } else if (kernel_h == 7 && kernel_w == 7) { + } else if (filter_tensor_h_ == 7 && filter_tensor_w_ == 7) { #define CONV_7x7_OPT #ifndef CONV_7x7_OPT // conv2d_7x7 @@ -350,52 +272,39 @@ void ConvImageCompute::PrepareForRun() { CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d7x7; #else // conv2d_7x7 - kernel_func_names_.push_back(bs > 1 ? "conv2d_7x7_multi_batch" - : "conv2d_7x7_opt"); + kernel_func_names_.push_back(input_tensor_n_ > 1 ? 
"conv2d_7x7_multi_batch" + : "conv2d_7x7_opt"); kernel_func_paths_.push_back("image/conv2d_7x7_opt_kernel.cl"); CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d7x7opt; - { - int w_blk_size = 5; - int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; - - int h_blk_size = 1; - int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; - - c_blk_ = default_c_blk_; - w_blk_ = w_blk; - nh_blk_ = h_blk; - - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } #endif #undef CONV_7x7_OPT } else { @@ -407,30 +316,30 @@ void ConvImageCompute::PrepareForRun() { // build options std::string build_options_single(" -DCL_DTYPE_half"); // relu options - VLOG(3) << "relu_fused:" << relu_fused - << " param.activation_param.active_type:" - << static_cast(param.activation_param.active_type) - << " param.activation_param.has_active:" - << param.activation_param.has_active; - if (param.activation_param.has_active) { - if (param.activation_param.active_type == - lite_api::ActivationType::kRelu) { // Note: judge using `relu_fused` + VLOG(3) << "relu_fused_:" << relu_fused_ + << " conv_param_->activation_param.active_type:" + << static_cast(conv_param_->activation_param.active_type) + << " conv_param_->activation_param.has_active:" + << conv_param_->activation_param.has_active; + if (conv_param_->activation_param.has_active) { + if (conv_param_->activation_param.active_type == + lite_api::ActivationType::kRelu) { // Note: judge using `relu_fused_` // also is ok build_options_single += " -DRELU"; - } else if (param.activation_param.active_type == + } else if (conv_param_->activation_param.active_type == lite_api::ActivationType::kRelu6) { build_options_single += " -DRELU6"; } else { LOG(FATAL) << "Unsupported activation type:" - << static_cast(param.activation_param.active_type); + << static_cast(conv_param_->activation_param.active_type); } } + GetGlobalWorkSize(); // bias options - const bool has_bias = param.bias != nullptr; const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - if (has_bias) { + has_bias_ && conv_param_->output->dims() == conv_param_->bias->dims(); + if (has_bias_) { bias_gpu_image_ = std::unique_ptr(new Tensor); build_options_single += is_element_wise_bias ? 
" -DBIASE_ELE" : " -DBIASE_CH"; @@ -438,21 +347,36 @@ void ConvImageCompute::PrepareForRun() { // convert cpu buffer bias --> gpu image CLImageConverterFolder bias_converter; const DDim& bias_image_dims = - bias_converter.InitImageDimInfoWith(param.bias->dims()); - + bias_converter.InitImageDimInfoWith(conv_param_->bias->dims()); + bias_image_h_ = bias_image_dims[1]; + bias_image_w_ = bias_image_dims[0]; tensor_hold_bias_image_->Resize( {1, bias_image_dims[0], bias_image_dims[1], 4}); half_t* bias_image_data = tensor_hold_bias_image_->mutable_data(); - float* bias_cpu_data = param.bias->mutable_data(); + float* bias_cpu_data = conv_param_->bias->mutable_data(); bias_converter.NCHWToImage( - bias_cpu_data, bias_image_data, param.bias->dims()); + bias_cpu_data, bias_image_data, conv_param_->bias->dims()); this->bias_gpu_image_->mutable_data( bias_image_dims[0], bias_image_dims[1], bias_image_data); // convert cpu buffer bias --> gpu image --- end ---- + } else { + bias_gpu_image_ = std::unique_ptr(new Tensor); + CLImageConverterFolder bias_converter; + tensor_hold_bias_image_->Resize({1, 1, 1, 4}); + half_t* bias_image_data = tensor_hold_bias_image_->mutable_data(); + this->bias_gpu_image_->mutable_data( + 1, 1, bias_image_data); } + // define image pointer for filter, bias + input_image_p_ = conv_param_->x->data(); + filter_image_p_ = filter_gpu_image_->data(); + bias_image_p_ = bias_gpu_image_->data(); + output_image_p_ = conv_param_->output->mutable_data( + output_image_w_, output_image_h_); + build_options_.push_back(build_options_single); for (size_t i = 0; i < kernel_func_names_.size(); i++) { @@ -478,55 +402,55 @@ void ConvImageCompute::PrepareForRun() { VLOG(4) << "max_work_group_size: " << max_work_group_size; if (max_work_group_size > 0 && use_lws_) { - double min_turn_time = DBL_MAX; + double min_tune_time = DBL_MAX; cl::NDRange best_local_work_size = context.cl_context()->LocalWorkSize( global_work_size_, max_work_group_size); VLOG(3) << "origin :local_work_size_ : " << best_local_work_size[0] << " " << best_local_work_size[1] << " " << best_local_work_size[2]; cl::NDRange last_local_work_size = cl::NDRange{ static_cast(0), static_cast(0), static_cast(0)}; - if (use_turn_) { + if (use_tune_) { for (size_t i = 1; i < 15; i++) { - if (kernel_h == 1 && kernel_w == 1) { + if (filter_tensor_h_ == 1 && filter_tensor_w_ == 1) { // todo use diff logics - local_work_size_ = context.cl_context()->LocalWorkSizeTurn( + local_work_size_ = context.cl_context()->LocalWorkSizeTune( global_work_size_, max_work_group_size, i); } else { - local_work_size_ = context.cl_context()->LocalWorkSizeTurn( + local_work_size_ = context.cl_context()->LocalWorkSizeTune( global_work_size_, max_work_group_size, i); } if (last_local_work_size[0] == local_work_size_[0] && last_local_work_size[1] == local_work_size_[1] && last_local_work_size[2] == local_work_size_[2]) { - // skiped turned lws + // skiped tuneed lws continue; } - auto turn_time = this->Turn(10); - if (min_turn_time > turn_time) { - min_turn_time = turn_time; + auto tune_time = this->Tune(10); + if (min_tune_time > tune_time) { + min_tune_time = tune_time; best_local_work_size = local_work_size_; } last_local_work_size = local_work_size_; } // reverse for (size_t i = 1; i < 15; i++) { - if (kernel_h == 1 && kernel_w == 1) { + if (filter_tensor_h_ == 1 && filter_tensor_w_ == 1) { // todo use diff logics - local_work_size_ = context.cl_context()->LocalWorkSizeTurnReverse( + local_work_size_ = context.cl_context()->LocalWorkSizeTuneReverse( 
global_work_size_, max_work_group_size, i); } else { - local_work_size_ = context.cl_context()->LocalWorkSizeTurnReverse( + local_work_size_ = context.cl_context()->LocalWorkSizeTuneReverse( global_work_size_, max_work_group_size, i); } if (last_local_work_size[0] == local_work_size_[0] && last_local_work_size[1] == local_work_size_[1] && last_local_work_size[2] == local_work_size_[2]) { - // skiped turned lws + // skiped tuneed lws continue; } - auto turn_time = this->Turn(10); - if (min_turn_time > turn_time) { - min_turn_time = turn_time; + auto tune_time = this->Tune(10); + if (min_tune_time > tune_time) { + min_tune_time = tune_time; best_local_work_size = local_work_size_; } last_local_work_size = local_work_size_; @@ -540,548 +464,316 @@ void ConvImageCompute::PrepareForRun() { } } -void ConvImageCompute::Conv2d1x1opt(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); - - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; - +void ConvImageCompute::ReInitWhenNeeded() { + conv_param_ = param_.get_mutable(); + auto x_dims = conv_param_->x->dims(); #ifdef LITE_WITH_LOG - // VLOG(4) << "out_image: " << out_image; - VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," - << global_work_size_[1] << "," << global_work_size_[2] << "}"; -#endif -#ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d_1x1 params ============"; - VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - << input_image_shape["height"]; - VLOG(4) << "input_c_block: " << input_c_block; - VLOG(4) << "input_c: " << input_c; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "offset: " << offset; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; -// VLOG(4) << "default work size{c_block, w, nh}: " -// << "{" << c_block << ", " << w << ", " << nh << "" -// << "}"; + LOG(INFO) << "is_first_epoch_for_run_:" << is_first_epoch_for_run_ + << ", last_input_dims_:" << last_input_dims_ + << ", x_dims:" << 
x_dims; #endif - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - // handle bias use buffer for channel wise , use image for element wise - const cl::Buffer* bias_buf = nullptr; - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } - - auto kernel = kernel_; - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, default_w_blk_); - CL_CHECK_FATAL(status); - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - event_); - CL_CHECK_FATAL(status); - if (is_turn) { - CLRuntime::Global()->command_queue().finish(); - } -} -void ConvImageCompute::Conv2d3x3(bool is_turn) { - auto kernel = kernel_; - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int input_channel = input_dims[1]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int output_channel = output_dims[1]; - int filter_width = filter_dims[3]; - int filter_height = filter_dims[2]; - int filter_channel = filter_dims[1]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); - - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; - - // re-calc group - int new_groups{param.groups}; - if (filter_dims[0] == output_dims[1] && filter_dims[1] == input_dims[1]) 
{ - new_groups = 1; - } else if (!(filter_dims[0] == input_dims[1] && filter_dims[1] == 1)) { - new_groups = input_channel / filter_channel; - } - /* TODO(ysh329): mobile has no case below - else { - LOG(FATAL) << "Not support conv3x3 case with" - << " input_dims:" << input_dims << " output_dims:" << - output_dims - << " filter_dims:" << filter_dims; + if (is_first_epoch_for_run_ || last_input_dims_ != x_dims) { + is_first_epoch_for_run_ = false; + last_input_dims_ = x_dims; + + input_tensor_n_ = x_dims[0]; + input_tensor_c_ = x_dims[1]; + input_tensor_h_ = x_dims[2]; + input_tensor_w_ = x_dims[3]; + auto x_image_shape = InitImageDimInfoWith(x_dims); + input_image_h_ = x_image_shape["height"]; + input_image_w_ = x_image_shape["width"]; + + auto output_dims = conv_param_->output->dims(); + output_tensor_n_ = output_dims[0]; + output_tensor_c_ = output_dims[1]; + output_tensor_h_ = output_dims[2]; + output_tensor_w_ = output_dims[3]; + auto output_image_shape = InitImageDimInfoWith(output_dims); + output_image_h_ = output_image_shape["height"]; + output_image_w_ = output_image_shape["width"]; + + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + CHECK_GE(conv_param_->x->dims().size(), 4); + CHECK_GE(conv_param_->output->dims().size(), 4); + if (kernel_func_names_.size() > 0 && + kernel_func_names_[0] == "conv2d_3x3") { + groups_ = conv_param_->groups; + if (filter_tensor_n_ == output_tensor_c_ && + filter_tensor_c_ == input_tensor_c_) { + groups_ = 1; + } else if (!(filter_tensor_n_ == input_tensor_c_ && + filter_tensor_c_ == 1)) { + groups_ = input_tensor_c_ / filter_tensor_c_; + } } - */ - - // const std::vector& default_work_size = - // DefaultWorkSize(output_dims, - // DDim(std::vector{ - // static_cast(out_image_shape["width"]), - // static_cast(out_image_shape["height"])})); - - // int c_block = default_work_size[0]; - // int w = default_work_size[1]; - // int nh = default_work_size[2]; - - // VLOG(4) << "============ conv2d params ============"; - // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - // << input_image_shape["height"]; - // VLOG(4) << "input_c_block: " << input_c_block; - // VLOG(4) << "input_c: " << input_c; - // VLOG(4) << "input_image: " << input_image; - // VLOG(4) << "input_dims: " << input_dims; - // VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - // VLOG(4) << "output_dims: " << output_dims; - // VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - // << out_image_shape["height"]; - // VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - // VLOG(4) << "has bias: " << has_bias; - // VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - // VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - // VLOG(4) << "offset: " << offset; - // VLOG(4) << "dilations.size : " << dilations.size(); - // VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; - // VLOG(4) << "param.groups(groups):" << param.groups; - // VLOG(4) << "new_groups:" << new_groups; - // VLOG(4) << "default work size{c_block, w, nh}: " - // << "{" << c_block << ", " << w << ", " << nh << "" - // << "}"; - - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } - auto& context 
= ctx_->As(); - CHECK(context.cl_context() != nullptr); - // STL::stringstream kernel_key; - // kernel_key << kernel_func_names_[0] << build_options_[0]; - // auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - // VLOG(4) << "kernel_key: " << kernel_key.str(); - // VLOG(4) << "kernel ready ... " << kernel_key.str(); - // VLOG(4) << "w: " << w; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { - VLOG(4) << "set bias_image: "; - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); + // define image pointer for input, output + input_image_p_ = conv_param_->x->data(); + output_image_p_ = conv_param_->output->mutable_data( + output_image_w_, output_image_h_); + + GetGlobalWorkSize(); } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_channel); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_channel); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, new_groups); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(input_dims[1])); - CL_CHECK_FATAL(status); - - // auto global_work_size = - // cl::NDRange{static_cast(default_work_size.data()[0]), - // static_cast(default_work_size.data()[1]), - // static_cast(default_work_size.data()[2])}; - - // VLOG(4) << "out_image: " << out_image; - // VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," - // << global_work_size[1] << "," << global_work_size[2] << "}"; - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); } -void ConvImageCompute::Conv2d3x3opt(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int input_channel = input_dims[1]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int output_channel = output_dims[1]; - CHECK_EQ(input_dims[0], output_dims[0]); - int 
batch = input_dims[0]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); -#ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d params ============"; - // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - // << input_image_shape["height"]; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "input_dims: " << input_dims; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; -#endif +void ConvImageCompute::GetGlobalWorkSize() { + if (kernel_func_names_.size() <= 0) return; + // general input_c_block + input_c_block_ = static_cast(input_image_w_ / input_tensor_w_); - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } + // general gws + auto output_dims = conv_param_->output->dims(); + const std::vector& default_work_size = + DefaultWorkSize(output_dims, + DDim(std::vector{ + static_cast(output_image_w_), + static_cast(output_image_h_)})); + default_c_blk_ = default_work_size[0]; + default_w_blk_ = default_work_size[1]; + default_nh_blk_ = default_work_size[2]; + c_blk_ = default_c_blk_; + w_blk_ = default_w_blk_; + nh_blk_ = default_nh_blk_; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { -#ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; -#endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); + if (kernel_func_names_[0] == "conv2d_1x1_simple" || + kernel_func_names_[0] == "conv2d_1x1_opt") { + w_blk_ = maptofactor(default_w_blk_, 4); + c_blk_ = default_c_blk_; + nh_blk_ = default_nh_blk_; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + + } else if (kernel_func_names_[0] == "depth_conv2d_3x3s1") { + // depthwise spl gws s1 + int c_block = (output_tensor_c_ + 3) / 4; + int w = output_tensor_w_; + int nh = output_tensor_n_ * output_tensor_h_; + int w_blk_size = 2; + int w_blk = (w + w_blk_size - 1) / w_blk_size; + + c_blk_ = c_block; + w_blk_ = w_blk; + nh_blk_ = nh; + global_work_size_ = 
cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + } else if (kernel_func_names_[0] == "depth_conv2d_3x3") { + // depthwise spl gws + int c_block = (output_tensor_c_ + 3) / 4; + int w = output_tensor_w_; + int nh = output_tensor_n_ * output_tensor_h_; + + c_blk_ = c_block; + w_blk_ = w; + nh_blk_ = nh; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + input_c_block_ = static_cast((input_tensor_c_ + 3) / 4); + } else if (kernel_func_names_[0] == "conv2d_3x3_multi_batch" || + kernel_func_names_[0] == "conv2d_3x3_opt") { + int w_blk_size = 5; + int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; + + int h_blk_size = 1; + int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; + + c_blk_ = default_c_blk_; + w_blk_ = w_blk; + nh_blk_ = h_blk; + + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + } else if (kernel_func_names_[0] == "conv2d_5x5_multi_batch" || + kernel_func_names_[0] == "conv2d_5x5_opt") { + int w_blk_size = 5; + int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; + + int h_blk_size = 1; + int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; + + c_blk_ = default_c_blk_; + w_blk_ = w_blk; + nh_blk_ = h_blk; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + } else if (kernel_func_names_[0] == "conv2d_7x7_multi_batch" || + kernel_func_names_[0] == "conv2d_7x7_opt") { + int w_blk_size = 5; + int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; + + int h_blk_size = 1; + int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; + + c_blk_ = default_c_blk_; + w_blk_ = w_blk; + nh_blk_ = h_blk; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, paddings[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, batch); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_channel); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); +} +void ConvImageCompute::Conv2d1x1opt(bool enable_tune) { #ifdef LITE_WITH_LOG - // VLOG(4) << "out_image: " << out_image; - VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," - << global_work_size_[1] << "," << global_work_size_[2] << "}"; + PrintConvInfo(); #endif + auto& context = ctx_->As(); - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - event_); - CL_CHECK_FATAL(status); - if (is_turn) { + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = 
kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, output_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(16, default_w_blk_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); + CL_CHECK_FATAL(status_); + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::Conv2d5x5(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int filter_width = filter_dims[3]; - int filter_height = filter_dims[2]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); - - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; - +void ConvImageCompute::Conv2d3x3(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d params ============"; - VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - << input_image_shape["height"]; - VLOG(4) << "input_c_block: " << input_c_block; - VLOG(4) << "input_c: " << input_c; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "input_dims: " << input_dims; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "offset: " << offset; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; + PrintConvInfo(); #endif + auto& context = ctx_->As(); - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 
2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, output_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(16, filter_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(17, filter_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(18, filter_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(19, groups_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(20, input_tensor_c_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status_); +} - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { +void ConvImageCompute::Conv2d3x3opt(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; + PrintConvInfo(); #endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); + auto& context = ctx_->As(); + + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + 
CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, pad_left_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_tensor_n_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, output_tensor_h_); + CL_CHECK_FATAL(status_); #ifdef LITE_WITH_LOG // VLOG(4) << "out_image: " << out_image; @@ -1089,697 +781,406 @@ void ConvImageCompute::Conv2d5x5(bool is_turn) { << global_work_size_[1] << "," << global_work_size_[2] << "}"; #endif - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); - if (is_turn) { + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); + CL_CHECK_FATAL(status_); + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::Conv2d5x5opt(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int input_channel = input_dims[1]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int output_channel = output_dims[1]; - CHECK_EQ(input_dims[0], output_dims[0]); - int batch = input_dims[0]; - - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - -// default_work_size[2] = h_blk; +void ConvImageCompute::Conv2d5x5(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d params ============"; - // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - // << input_image_shape["height"]; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "input_dims: " << input_dims; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << 
dilations[0] << ", " << dilations[1]; + PrintConvInfo(); #endif - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } - - auto kernel = kernel_; - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, paddings[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, batch); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_channel); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - - // VLOG(4) << "out_image: " << out_image; + auto& context = ctx_->As(); - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - event_); - CL_CHECK_FATAL(status); - if (is_turn) { + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status_); + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::Conv2d7x7(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto* 
input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int filter_width = filter_dims[3]; - int filter_height = filter_dims[2]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); - - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; - +void ConvImageCompute::Conv2d5x5opt(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d params ============"; - VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - << input_image_shape["height"]; - VLOG(4) << "input_c_block: " << input_c_block; - VLOG(4) << "input_c: " << input_c; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "input_dims: " << input_dims; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "offset: " << offset; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; + PrintConvInfo(); #endif + auto& context = ctx_->As(); - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, pad_left_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_tensor_n_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = 
kernel_.setArg(15, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); + CL_CHECK_FATAL(status_); + if (enable_tune) { + CLRuntime::Global()->command_queue().finish(); } +} - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { +void ConvImageCompute::Conv2d7x7(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; + PrintConvInfo(); #endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); + auto& context = ctx_->As(); + + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status_); + if (enable_tune) { + CLRuntime::Global()->command_queue().finish(); } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); +} +void ConvImageCompute::Conv2d7x7opt(bool enable_tune) { #ifdef LITE_WITH_LOG - // VLOG(4) << "out_image: " << out_image; - VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," - << global_work_size_[1] << "," << global_work_size_[2] << "}"; + PrintConvInfo(); #endif + auto& context = ctx_->As(); - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); - - if (is_turn) { + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); 
+ status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, pad_left_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_tensor_n_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); + CL_CHECK_FATAL(status_); + + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::Conv2d7x7opt(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int input_channel = input_dims[1]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int output_channel = output_dims[1]; - CHECK_EQ(input_dims[0], output_dims[0]); - int batch = input_dims[0]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); +void ConvImageCompute::DepthwiseConv2d3x3s1(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d 7x7 params ============"; - // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - // << input_image_shape["height"]; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "input_dims: " << input_dims; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; + PrintConvInfo(); #endif - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == 
paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } + auto& context = ctx_->As(); - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, paddings[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, batch); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_channel); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - event_); - CL_CHECK_FATAL(status); - - if (is_turn) { + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, pad_left_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); + CL_CHECK_FATAL(status_); + + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::DepthwiseConv2d3x3s1(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto x_dims = param.x->dims(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - - auto* input_img = param.x->data(); - auto* filter_img = filter_gpu_image_->data(); - - const cl::Image2D* bias_img = nullptr; - if 
(param.bias) { - bias_img = bias_gpu_image_->data(); - } - - auto image_shape = InitImageDimInfoWith(output_dims); - - auto* output_img = param.output->mutable_data( - image_shape["width"], image_shape["height"]); - - auto kernel = kernel_; - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_img); - CL_CHECK_FATAL(status); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); +void ConvImageCompute::DepthwiseConv2d3x3(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; + PrintConvInfo(); #endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *output_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(strides[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(paddings[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(dilations[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[1])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[3])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[2])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(output_dims[3])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(output_dims[2])); - CL_CHECK_FATAL(status); - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - event_); - CL_CHECK_FATAL(status); - - if (is_turn) { + auto& context = ctx_->As(); + + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status_); + + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::DepthwiseConv2d3x3(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != 
nullptr); - const auto& param = *param_.get_mutable(); - auto x_dims = param.x->dims(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - int offset = filter_dims[2] / 2 - paddings[0]; - int input_c_block = (x_dims[1] + 3) / 4; - - auto* input_img = param.x->data(); - auto* filter_img = filter_gpu_image_->data(); - - const cl::Image2D* bias_img = nullptr; - if (param.bias) { - bias_img = bias_gpu_image_->data(); - } - - auto image_shape = InitImageDimInfoWith(output_dims); - - auto* output_img = param.output->mutable_data( - image_shape["width"], image_shape["height"]); - - auto kernel = kernel_; - +void ConvImageCompute::DepthwiseConv2d(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "setArg"; - VLOG(4) << "strides = " << strides[0]; - VLOG(4) << "offset = " << offset; - VLOG(4) << "dilations = " << dilations[0]; - VLOG(4) << "input_c_block = " << input_c_block; - VLOG(4) << "x_dims[3] = " << x_dims[3]; - VLOG(4) << "x_dims[2] = " << x_dims[2]; - VLOG(4) << "output_dims[3] = " << output_dims[3]; - VLOG(4) << "output_dims[2] = " << output_dims[2]; + PrintConvInfo(); #endif + auto& context = ctx_->As(); - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_img); - CL_CHECK_FATAL(status); - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); -#ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; -#endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *output_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(strides[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(offset)); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(dilations[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(input_c_block)); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[3])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[2])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(output_dims[3])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(output_dims[2])); - CL_CHECK_FATAL(status); - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); - - if (is_turn) { + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, 
stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, filter_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(16, filter_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status_); + + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::DepthwiseConv2d(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int filter_width = filter_dims[3]; - int filter_height = filter_dims[2]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); +void ConvImageCompute::Run() { (this->*impl_)(false); } - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; +void ConvImageCompute::PrintConvInfo() { + const bool is_element_wise_bias = + has_bias_ && conv_param_->output->dims() == conv_param_->bias->dims(); -#ifdef LITE_WITH_LOG - VLOG(4) << "============ depthwise conv2d params ============"; - VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - << input_image_shape["height"]; - VLOG(4) << "input_c_block: " << input_c_block; - VLOG(4) << "input_c: " << input_c; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "filter_dims: " << filter_dims; + VLOG(4) << "input_image_shape: " << input_image_w_ << "," << input_image_h_; + // VLOG(4) << "input_image: " << input_image_p_; + VLOG(4) << "input_dims: " << conv_param_->x->dims(); + VLOG(4) << "filter_dims: " << conv_param_->filter->dims(); // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; + VLOG(4) << "output_dims: " << conv_param_->output->dims(); + VLOG(4) << "out_image_shape: " << output_image_w_ << ", " << output_image_h_; + VLOG(4) << "paddings: " << pad_left_ << "," << pad_up_; 
+ VLOG(4) << "has bias: " << has_bias_; VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "offset: " << offset; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; -#endif - - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - // handle bias use buffer for channel wise , use image for element wise - const cl::Buffer* bias_buf = nullptr; - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } - - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { -#ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; -#endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_height); - CL_CHECK_FATAL(status); - -#ifdef LITE_WITH_LOG + VLOG(4) << "strides: " << stride_h_ << "," << stride_w_; + VLOG(4) << "offset: "; + VLOG(4) << "dilations.size : " << conv_param_->dilations->size(); + VLOG(4) << "dilations: " << dilation_h_ << ", " << dilation_w_; VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," << global_work_size_[1] << "," << global_work_size_[2] << "}"; -#endif - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); } -void ConvImageCompute::Run() { (this->*impl_)(false); } - -double ConvImageCompute::Turn(int times) { +double ConvImageCompute::Tune(int times) { auto GetCurrentUS = []() -> double { struct timeval time; gettimeofday(&time, NULL); diff --git a/lite/kernels/opencl/conv_image_compute.h b/lite/kernels/opencl/conv_image_compute.h index 64276a5721..4eab7be1f1 100644 --- a/lite/kernels/opencl/conv_image_compute.h +++ b/lite/kernels/opencl/conv_image_compute.h @@ -33,6 +33,7 @@ namespace paddle { namespace lite { namespace kernels { namespace opencl { + class ConvImageCompute : public KernelLite { @@ -42,8 +43,11 @@ class ConvImageCompute : public KernelLite kernel_func_names_{}; @@ -79,19 +87,72 @@ class ConvImageCompute : public KernelLite tensor_hold_bias_image_{nullptr}; 
cl::NDRange global_work_size_ = cl::NDRange{ static_cast(1), static_cast(1), static_cast(1)}; + + // opencl kernel args int c_blk_ = 1; int w_blk_ = 1; int nh_blk_ = 1; + const cl::Image2D* input_image_p_{nullptr}; + const cl::Image2D* filter_image_p_{nullptr}; + const cl::Image2D* bias_image_p_{nullptr}; + const cl::Image2D* output_image_p_{nullptr}; + + int stride_h_{-1}; + int stride_w_{-1}; + + int dilation_h_{-1}; + int dilation_w_{-1}; + + int pad_up_{-1}; + int pad_down_{-1}; + int pad_left_{-1}; + int pad_right_{-1}; + + int offset_{-1}; + int groups_{-1}; + bool relu_fused_{false}; + bool has_bias_{false}; + + int input_tensor_n_{-1}; + int input_tensor_c_{-1}; + int input_tensor_h_{-1}; + int input_tensor_w_{-1}; + int input_image_h_{-1}; + int input_image_w_{-1}; + int input_c_block_{-1}; + + int output_tensor_n_{-1}; + int output_tensor_c_{-1}; + int output_tensor_h_{-1}; + int output_tensor_w_{-1}; + int output_image_h_{-1}; + int output_image_w_{-1}; + + int filter_tensor_n_{-1}; + int filter_tensor_c_{-1}; + int filter_tensor_h_{-1}; + int filter_tensor_w_{-1}; + int filter_image_h_{-1}; + int filter_image_w_{-1}; + + int bias_image_h_{-1}; + int bias_image_w_{-1}; + int default_c_blk_ = 1; int default_w_blk_ = 1; int default_nh_blk_ = 1; + // ================= + + DDim last_input_dims_{}; + bool is_first_epoch_for_run_{true}; cl::Kernel kernel_; + cl_int status_; cl::NDRange local_work_size_ = cl::NDRange{ static_cast(1), static_cast(1), static_cast(1)}; bool use_lws_{true}; - bool use_turn_{false}; + bool use_tune_{false}; }; } // namespace opencl -- GitLab