diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc index d664e37150fcc661e4bb97ed57a42364dd0d475d..a409690a2e37750c88395a592565b9e968e62845 100644 --- a/lite/kernels/opencl/conv_image_compute.cc +++ b/lite/kernels/opencl/conv_image_compute.cc @@ -14,8 +14,8 @@ #include "lite/kernels/opencl/conv_image_compute.h" +#include #include - #include "lite/backends/opencl/cl_image_converter.h" #include "lite/backends/opencl/cl_include.h" #include "lite/core/op_registry.h" @@ -78,9 +78,27 @@ void ConvImageCompute::PrepareForRun() { VLOG(3) << "dilation_equal:" << dilation_equal; VLOG(3) << "padding :" << paddings[0] << " " << paddings[1] << " " << paddings[2] << " " << paddings[3]; - CHECK(pad_equal && stride_equal && dilation_equal); + // general gws.. + auto out_image_shape = InitImageDimInfoWith(output_dims); + + const std::vector& default_work_size = + DefaultWorkSize(output_dims, + DDim(std::vector{ + static_cast(out_image_shape["width"]), + static_cast(out_image_shape["height"])})); + + default_c_blk_ = default_work_size[0]; + default_w_blk_ = default_work_size[1]; + default_nh_blk_ = default_work_size[2]; + c_blk_ = default_c_blk_; + w_blk_ = default_w_blk_; + nh_blk_ = default_nh_blk_; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + if (kernel_h == 1 && kernel_w == 1) { // conv2d_1x1 if (param.x->dims()[1] % 4 == 0) { @@ -99,6 +117,15 @@ void ConvImageCompute::PrepareForRun() { filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); impl_ = &ConvImageCompute::Conv2d1x1opt; + { + // calc 1x1 gws + w_blk_ = maptofactor(default_w_blk_, 4); + c_blk_ = default_c_blk_; + nh_blk_ = default_nh_blk_; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + } #define DEPTH_CONV_USE_SPL #ifdef DEPTH_CONV_USE_SPL } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] && @@ -107,9 +134,38 @@ void ConvImageCompute::PrepareForRun() { if (stride_h == 1 && dilations[0] == 1) { kernel_func_names_.push_back("depth_conv2d_3x3s1"); impl_ = &ConvImageCompute::DepthwiseConv2d3x3s1; + { + // depthwise spl gws s1 + int c_block = (output_dims[1] + 3) / 4; + int w = output_dims[3]; + int nh = output_dims[0] * output_dims[2]; + int w_blk_size = 2; + int w_blk = (w + w_blk_size - 1) / w_blk_size; + + c_blk_ = c_block; + w_blk_ = w_blk; + nh_blk_ = nh; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + } } else { kernel_func_names_.push_back("depth_conv2d_3x3"); impl_ = &ConvImageCompute::DepthwiseConv2d3x3; + { + // depthwise spl gws + int c_block = (output_dims[1] + 3) / 4; + int w = output_dims[3]; + int nh = output_dims[0] * output_dims[2]; + + c_blk_ = c_block; + w_blk_ = w; + nh_blk_ = nh; + + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + } } kernel_func_paths_.push_back("image/depthwise_conv2d_kernel.cl"); @@ -157,6 +213,22 @@ void ConvImageCompute::PrepareForRun() { filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); impl_ = &ConvImageCompute::Conv2d3x3opt; + { + int w_blk_size = 5; + int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; + + int h_blk_size = 1; + int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; + + c_blk_ = default_c_blk_; + w_blk_ = w_blk; + nh_blk_ = h_blk; + + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + } + } else if (kernel_h == 5 && kernel_w == 5) { #define CONV_5x5_OPT #ifndef CONV_5x5_OPT @@ -189,6 +261,21 @@ void ConvImageCompute::PrepareForRun() { filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); impl_ = &ConvImageCompute::Conv2d5x5opt; + { + int w_blk_size = 5; + int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; + + int h_blk_size = 1; + int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; + + c_blk_ = default_c_blk_; + w_blk_ = w_blk; + nh_blk_ = h_blk; + + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + } #endif #undef CONV_5x5_OPT } else if (kernel_h == 7 && kernel_w == 7) { @@ -223,6 +310,21 @@ void ConvImageCompute::PrepareForRun() { filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); impl_ = &ConvImageCompute::Conv2d7x7opt; + { + int w_blk_size = 5; + int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; + + int h_blk_size = 1; + int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; + + c_blk_ = default_c_blk_; + w_blk_ = w_blk; + nh_blk_ = h_blk; + + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + } #endif #undef CONV_7x7_OPT @@ -270,9 +372,36 @@ void ConvImageCompute::PrepareForRun() { context.cl_context()->AddKernel( kernel_func_names_[i], kernel_func_paths_[i], build_options_[i]); } + + VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," + << global_work_size_[1] << "," << global_work_size_[2] << "}"; + + std::stringstream kernel_key; + kernel_key << kernel_func_names_[0] << build_options_[0]; + kernel_ = context.cl_context()->GetKernel(kernel_key.str()); + VLOG(4) << "kernel_key: " << kernel_key.str(); + VLOG(4) << "kernel ready ... " << kernel_key.str(); + size_t max_work_group_size = 0; + kernel_.getWorkGroupInfo(CLRuntime::Global()->device(), + CL_KERNEL_WORK_GROUP_SIZE, + &max_work_group_size); + + VLOG(4) << "max_work_group_size: " << max_work_group_size; + + if (max_work_group_size > 0 && use_lws) { + // local_work_size_ = context.cl_context()->LocalWorkSizeConv1x1( + // global_work_size_, max_work_group_size); + local_work_size_ = context.cl_context()->LocalWorkSize(global_work_size_, + max_work_group_size); + + VLOG(4) << "local_work_size_[3D]: {" << local_work_size_[0] << "," + << local_work_size_[1] << "," << local_work_size_[2] << "}"; + } } void ConvImageCompute::Conv2d1x1opt() { + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); const auto& param = *param_.get_mutable(); auto input_dims = param.x->dims(); auto paddings = *param.paddings; @@ -302,16 +431,28 @@ void ConvImageCompute::Conv2d1x1opt() { int input_c = input_dims[1]; auto dilations = *param.dilations; - const std::vector& default_work_size = - DefaultWorkSize(output_dims, - DDim(std::vector{ - static_cast(out_image_shape["width"]), - static_cast(out_image_shape["height"])})); +// const std::vector& default_work_size = +// DefaultWorkSize(output_dims, +// DDim(std::vector{ +// static_cast(out_image_shape["width"]), +// static_cast(out_image_shape["height"])})); + +// int c_block = default_work_size[0]; +// int w = default_work_size[1]; +// int nh = default_work_size[2]; + +// int maped_w = maptofactor(w, 4); - int c_block = default_work_size[0]; - int w = default_work_size[1]; - int nh = default_work_size[2]; +// auto global_work_size_ = +// cl::NDRange{static_cast(default_work_size.data()[0]), +// static_cast(maped_w), +// static_cast(default_work_size.data()[2])}; +#ifndef LITE_SHUTDOWN_LOG + // VLOG(4) << "out_image: " << out_image; + VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," + << global_work_size_[1] << "," << global_work_size_[2] << "}"; +#endif #ifndef LITE_SHUTDOWN_LOG VLOG(4) << "============ conv2d_1x1 params ============"; VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," @@ -331,9 +472,9 @@ void ConvImageCompute::Conv2d1x1opt() { VLOG(4) << "offset: " << offset; VLOG(4) << "dilations.size : " << dilations.size(); VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; - VLOG(4) << "default work size{c_block, w, nh}: " - << "{" << c_block << ", " << w << ", " << nh << "" - << "}"; +// VLOG(4) << "default work size{c_block, w, nh}: " +// << "{" << c_block << ", " << w << ", " << nh << "" +// << "}"; #endif CHECK_GE(dilations.size(), 2); CHECK(dilations[0] == dilations[1]); @@ -350,27 +491,14 @@ void ConvImageCompute::Conv2d1x1opt() { bias_image = bias_gpu_image_.data(); } - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - std::stringstream kernel_key; - kernel_key << kernel_func_names_[0] << build_options_[0]; - auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - int maped_w = maptofactor(w, 4); - -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "kernel_key: " << kernel_key.str(); - VLOG(4) << "kernel ready ... " << kernel_key.str(); - VLOG(4) << "maped_w: " << maped_w; - VLOG(4) << "hasbias: " << has_bias; -#endif - + auto kernel = kernel_; cl_int status; int arg_idx = 0; - status = kernel.setArg(arg_idx, c_block); + status = kernel.setArg(arg_idx, c_blk_); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, maped_w); + status = kernel.setArg(++arg_idx, w_blk_); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh); + status = kernel.setArg(++arg_idx, nh_blk_); CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, *input_image); CL_CHECK_FATAL(status); @@ -401,49 +529,87 @@ void ConvImageCompute::Conv2d1x1opt() { CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, output_height); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w); + status = kernel.setArg(++arg_idx, default_w_blk_); CL_CHECK_FATAL(status); - auto global_work_size = - cl::NDRange{static_cast(default_work_size.data()[0]), - static_cast(maped_w), - static_cast(default_work_size.data()[2])}; - -#ifndef LITE_SHUTDOWN_LOG - // VLOG(4) << "out_image: " << out_image; - VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," - << global_work_size[1] << "," << global_work_size[2] << "}"; -#endif - - size_t max_work_group_size = 0; - kernel.getWorkGroupInfo(CLRuntime::Global()->device(), - CL_KERNEL_WORK_GROUP_SIZE, - &max_work_group_size); - cl::NDRange local_work_size = cl::NullRange; -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "max_work_group_size: " << max_work_group_size; -#endif - if (max_work_group_size > 0 && use_lws) { - local_work_size = context.cl_context()->LocalWorkSize(global_work_size, - max_work_group_size); -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << "," - << local_work_size[1] << "," << local_work_size[2] << "}"; -#endif - } - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, - global_work_size, - local_work_size, + global_work_size_, + local_work_size_, nullptr, event_.get()); CL_CHECK_FATAL(status); context.cl_wait_list()->emplace(out_image, event_); + +#ifdef PROFILE_CONV_KERNEL + bool use_profile = false; + auto GetCurrentUS = []() -> double { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; + }; + double start = GetCurrentUS(); + + if (use_profile) { + status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( + kernel, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_.get()); + CL_CHECK_FATAL(status); + context.cl_wait_list()->emplace(out_image, event_); + } else { + int count = 50; + double sumtime = 0; + if (!use_profile) { + count = 1; + } + for (size_t i = 0; i < count; i++) { + start = GetCurrentUS(); + status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( + kernel, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_.get()); + CL_CHECK_FATAL(status); + context.cl_wait_list()->emplace(out_image, event_); + if (use_profile) { + event_->wait(); + double duration = GetCurrentUS() - start; + sumtime += duration; + } + } + + auto dims_string = [](DDimLite dims) -> std::string { + std::ostringstream stream; + stream << "[" << dims[0] << "," << dims[1] << "," << dims[2] << "," + << dims[3] << "]"; + return stream.str(); + }; + if (use_profile) { + // LOG(INFO) << "input: " << input_dims; + // LOG(INFO) << "filter: " << filter_dims; + // LOG(INFO) << "output: " << output_dims; + + std::cout << std::setw(25) << std::left << dims_string(input_dims) + << std::setw(25) << std::left << dims_string(filter_dims) + << std::setw(25) << std::left << dims_string(output_dims) + << std::setw(25) << std::left << sumtime / count << std::endl; + } else { + dims_string(input_dims); + } + } +#endif } void ConvImageCompute::Conv2d3x3() { + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); const auto& param = *param_.get_mutable(); auto input_dims = param.x->dims(); auto paddings = *param.paddings; @@ -486,24 +652,14 @@ void ConvImageCompute::Conv2d3x3() { } else if (!(filter_dims[0] == input_dims[1] && filter_dims[1] == 1)) { new_groups = input_channel / filter_channel; } - /* TODO(ysh329): mobile has no case below - else { - LOG(FATAL) << "Not support conv3x3 case with" - << " input_dims:" << input_dims << " output_dims:" << - output_dims - << " filter_dims:" << filter_dims; - } - */ - - const std::vector& default_work_size = - DefaultWorkSize(output_dims, - DDim(std::vector{ - static_cast(out_image_shape["width"]), - static_cast(out_image_shape["height"])})); - - int c_block = default_work_size[0]; - int w = default_work_size[1]; - int nh = default_work_size[2]; +/* TODO(ysh329): mobile has no case below + else { + LOG(FATAL) << "Not support conv3x3 case with" + << " input_dims:" << input_dims << " output_dims:" << + output_dims + << " filter_dims:" << filter_dims; + } +*/ #ifndef LITE_SHUTDOWN_LOG VLOG(4) << "============ conv2d params ============"; @@ -527,9 +683,9 @@ void ConvImageCompute::Conv2d3x3() { VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; VLOG(4) << "param.groups(groups):" << param.groups; VLOG(4) << "new_groups:" << new_groups; - VLOG(4) << "default work size{c_block, w, nh}: " - << "{" << c_block << ", " << w << ", " << nh << "" - << "}"; +// VLOG(4) << "default work size{c_block, w, nh}: " +// << "{" << c_block << ", " << w << ", " << nh << "" +// << "}"; #endif CHECK_GE(dilations.size(), 2); @@ -544,26 +700,15 @@ void ConvImageCompute::Conv2d3x3() { if (has_bias) { bias_image = bias_gpu_image_.data(); } - - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - STL::stringstream kernel_key; - kernel_key << kernel_func_names_[0] << build_options_[0]; - auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "kernel_key: " << kernel_key.str(); - VLOG(4) << "kernel ready ... " << kernel_key.str(); - VLOG(4) << "w: " << w; -#endif + auto kernel = kernel_; cl_int status; int arg_idx = 0; - status = kernel.setArg(arg_idx, c_block); + status = kernel.setArg(arg_idx, c_blk_); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w); + status = kernel.setArg(++arg_idx, w_blk_); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh); + status = kernel.setArg(++arg_idx, nh_blk_); CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, *input_image); CL_CHECK_FATAL(status); @@ -607,21 +752,16 @@ void ConvImageCompute::Conv2d3x3() { status = kernel.setArg(++arg_idx, new_groups); CL_CHECK_FATAL(status); - auto global_work_size = - cl::NDRange{static_cast(default_work_size.data()[0]), - static_cast(default_work_size.data()[1]), - static_cast(default_work_size.data()[2])}; - #ifndef LITE_SHUTDOWN_LOG // VLOG(4) << "out_image: " << out_image; - VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," - << global_work_size[1] << "," << global_work_size[2] << "}"; + VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," + << global_work_size_[1] << "," << global_work_size_[2] << "}"; #endif status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, - global_work_size, + global_work_size_, cl::NullRange, nullptr, event_.get()); @@ -630,6 +770,8 @@ void ConvImageCompute::Conv2d3x3() { } void ConvImageCompute::Conv2d3x3opt() { + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); const auto& param = *param_.get_mutable(); auto input_dims = param.x->dims(); auto paddings = *param.paddings; @@ -657,24 +799,6 @@ void ConvImageCompute::Conv2d3x3opt() { const bool is_element_wise_bias = has_bias && param.output->dims() == param.bias->dims(); - const std::vector& default_work_size = - DefaultWorkSize(output_dims, - DDim(std::vector{ - static_cast(out_image_shape["width"]), - static_cast(out_image_shape["height"])})); - - int c_block = default_work_size[0]; - int w = default_work_size[1]; - int nh = default_work_size[2]; - - int w_blk_size = 5; - int w_blk = (w + w_blk_size - 1) / w_blk_size; - // default_work_size[1] = w_blk; - - int h_blk_size = 1; - int h_blk = (nh + h_blk_size - 1) / h_blk_size; -// default_work_size[2] = h_blk; - #ifndef LITE_SHUTDOWN_LOG VLOG(4) << "============ conv2d params ============"; // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," @@ -692,9 +816,6 @@ void ConvImageCompute::Conv2d3x3opt() { VLOG(4) << "strides: " << strides[0] << "," << strides[1]; VLOG(4) << "dilations.size : " << dilations.size(); VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; - VLOG(4) << "default work size{c_block, w, nh}: " - << "{" << c_block << ", " << w << ", " << nh << "" - << "}"; #endif CHECK_GE(dilations.size(), 2); @@ -710,24 +831,15 @@ void ConvImageCompute::Conv2d3x3opt() { bias_image = bias_gpu_image_.data(); } - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - STL::stringstream kernel_key; - kernel_key << kernel_func_names_[0] << build_options_[0]; - auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "kernel_key: " << kernel_key.str(); - VLOG(4) << "kernel ready ... " << kernel_key.str(); -#endif + auto kernel = kernel_; cl_int status; int arg_idx = 0; - status = kernel.setArg(arg_idx, c_block); + status = kernel.setArg(arg_idx, c_blk_); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk); + status = kernel.setArg(++arg_idx, w_blk_); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, h_blk); + status = kernel.setArg(++arg_idx, nh_blk_); CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, *input_image); CL_CHECK_FATAL(status); @@ -763,38 +875,17 @@ void ConvImageCompute::Conv2d3x3opt() { status = kernel.setArg(++arg_idx, output_height); CL_CHECK_FATAL(status); - auto global_work_size = - cl::NDRange{static_cast(default_work_size.data()[0]), - static_cast(w_blk), - static_cast(h_blk)}; #ifndef LITE_SHUTDOWN_LOG // VLOG(4) << "out_image: " << out_image; - VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," - << global_work_size[1] << "," << global_work_size[2] << "}"; -#endif - - size_t max_work_group_size = 0; - kernel.getWorkGroupInfo(CLRuntime::Global()->device(), - CL_KERNEL_WORK_GROUP_SIZE, - &max_work_group_size); - cl::NDRange local_work_size = cl::NullRange; -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "max_work_group_size: " << max_work_group_size; -#endif - if (max_work_group_size > 0 && use_lws) { - local_work_size = context.cl_context()->LocalWorkSize(global_work_size, - max_work_group_size); -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << "," - << local_work_size[1] << "," << local_work_size[2] << "}"; + VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," + << global_work_size_[1] << "," << global_work_size_[2] << "}"; #endif - } status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, - global_work_size, - local_work_size, + global_work_size_, + local_work_size_, nullptr, event_.get()); CL_CHECK_FATAL(status); @@ -802,6 +893,8 @@ void ConvImageCompute::Conv2d3x3opt() { } void ConvImageCompute::Conv2d5x5() { + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); const auto& param = *param_.get_mutable(); auto input_dims = param.x->dims(); auto paddings = *param.paddings; @@ -833,16 +926,6 @@ void ConvImageCompute::Conv2d5x5() { int input_c = input_dims[1]; auto dilations = *param.dilations; - const std::vector& default_work_size = - DefaultWorkSize(output_dims, - DDim(std::vector{ - static_cast(out_image_shape["width"]), - static_cast(out_image_shape["height"])})); - - int c_block = default_work_size[0]; - int w = default_work_size[1]; - int nh = default_work_size[2]; - #ifndef LITE_SHUTDOWN_LOG VLOG(4) << "============ conv2d params ============"; VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," @@ -863,9 +946,6 @@ void ConvImageCompute::Conv2d5x5() { VLOG(4) << "offset: " << offset; VLOG(4) << "dilations.size : " << dilations.size(); VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; - VLOG(4) << "default work size{c_block, w, nh}: " - << "{" << c_block << ", " << w << ", " << nh << "" - << "}"; #endif CHECK_GE(dilations.size(), 2); @@ -881,25 +961,15 @@ void ConvImageCompute::Conv2d5x5() { bias_image = bias_gpu_image_.data(); } - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - STL::stringstream kernel_key; - kernel_key << kernel_func_names_[0] << build_options_[0]; - auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "kernel_key: " << kernel_key.str(); - VLOG(4) << "kernel ready ... " << kernel_key.str(); - VLOG(4) << "w: " << w; -#endif + auto kernel = kernel_; cl_int status; int arg_idx = 0; - status = kernel.setArg(arg_idx, c_block); + status = kernel.setArg(arg_idx, c_blk_); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w); + status = kernel.setArg(++arg_idx, w_blk_); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh); + status = kernel.setArg(++arg_idx, nh_blk_); CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, *input_image); CL_CHECK_FATAL(status); @@ -933,21 +1003,16 @@ void ConvImageCompute::Conv2d5x5() { status = kernel.setArg(++arg_idx, output_height); CL_CHECK_FATAL(status); - auto global_work_size = - cl::NDRange{static_cast(default_work_size.data()[0]), - static_cast(default_work_size.data()[1]), - static_cast(default_work_size.data()[2])}; - #ifndef LITE_SHUTDOWN_LOG // VLOG(4) << "out_image: " << out_image; - VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," - << global_work_size[1] << "," << global_work_size[2] << "}"; + VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," + << global_work_size_[1] << "," << global_work_size_[2] << "}"; #endif status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, - global_work_size, + global_work_size_, cl::NullRange, nullptr, event_.get()); @@ -956,6 +1021,8 @@ void ConvImageCompute::Conv2d5x5() { } void ConvImageCompute::Conv2d5x5opt() { + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); const auto& param = *param_.get_mutable(); auto input_dims = param.x->dims(); auto paddings = *param.paddings; @@ -984,22 +1051,6 @@ void ConvImageCompute::Conv2d5x5opt() { const bool is_element_wise_bias = has_bias && param.output->dims() == param.bias->dims(); - const std::vector& default_work_size = - DefaultWorkSize(output_dims, - DDim(std::vector{ - static_cast(out_image_shape["width"]), - static_cast(out_image_shape["height"])})); - - int c_block = default_work_size[0]; - int w = default_work_size[1]; - int nh = default_work_size[2]; - - int w_blk_size = 5; - int w_blk = (w + w_blk_size - 1) / w_blk_size; - // default_work_size[1] = w_blk; - - int h_blk_size = 1; - int h_blk = (nh + h_blk_size - 1) / h_blk_size; // default_work_size[2] = h_blk; #ifndef LITE_SHUTDOWN_LOG VLOG(4) << "============ conv2d params ============"; @@ -1018,9 +1069,6 @@ void ConvImageCompute::Conv2d5x5opt() { VLOG(4) << "strides: " << strides[0] << "," << strides[1]; VLOG(4) << "dilations.size : " << dilations.size(); VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; - VLOG(4) << "default work size{c_block, w, nh}: " - << "{" << c_block << ", " << w << ", " << nh << "" - << "}"; #endif CHECK_GE(dilations.size(), 2); CHECK(dilations[0] == dilations[1]); @@ -1035,22 +1083,14 @@ void ConvImageCompute::Conv2d5x5opt() { bias_image = bias_gpu_image_.data(); } - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - STL::stringstream kernel_key; - kernel_key << kernel_func_names_[0] << build_options_[0]; - auto kernel = context.cl_context()->GetKernel(kernel_key.str()); -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "kernel_key: " << kernel_key.str(); - VLOG(4) << "kernel ready ... " << kernel_key.str(); -#endif + auto kernel = kernel_; cl_int status; int arg_idx = 0; - status = kernel.setArg(arg_idx, c_block); + status = kernel.setArg(arg_idx, c_blk_); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk); + status = kernel.setArg(++arg_idx, w_blk_); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, h_blk); + status = kernel.setArg(++arg_idx, nh_blk_); CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, *input_image); CL_CHECK_FATAL(status); @@ -1083,38 +1123,13 @@ void ConvImageCompute::Conv2d5x5opt() { status = kernel.setArg(++arg_idx, output_height); CL_CHECK_FATAL(status); - auto global_work_size = - cl::NDRange{static_cast(default_work_size.data()[0]), - static_cast(w_blk), - static_cast(h_blk)}; - -// VLOG(4) << "out_image: " << out_image; -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," - << global_work_size[1] << "," << global_work_size[2] << "}"; -#endif - size_t max_work_group_size = 0; - kernel.getWorkGroupInfo(CLRuntime::Global()->device(), - CL_KERNEL_WORK_GROUP_SIZE, - &max_work_group_size); - cl::NDRange local_work_size = cl::NullRange; -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "max_work_group_size: " << max_work_group_size; -#endif - if (max_work_group_size > 0 && use_lws) { - local_work_size = context.cl_context()->LocalWorkSize(global_work_size, - max_work_group_size); -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << "," - << local_work_size[1] << "," << local_work_size[2] << "}"; -#endif - } + // VLOG(4) << "out_image: " << out_image; status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, - global_work_size, - local_work_size, + global_work_size_, + local_work_size_, nullptr, event_.get()); CL_CHECK_FATAL(status); @@ -1122,6 +1137,8 @@ void ConvImageCompute::Conv2d5x5opt() { } void ConvImageCompute::Conv2d7x7() { + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); const auto& param = *param_.get_mutable(); auto input_dims = param.x->dims(); auto paddings = *param.paddings; @@ -1153,16 +1170,6 @@ void ConvImageCompute::Conv2d7x7() { int input_c = input_dims[1]; auto dilations = *param.dilations; - const std::vector& default_work_size = - DefaultWorkSize(output_dims, - DDim(std::vector{ - static_cast(out_image_shape["width"]), - static_cast(out_image_shape["height"])})); - - int c_block = default_work_size[0]; - int w = default_work_size[1]; - int nh = default_work_size[2]; - #ifndef LITE_SHUTDOWN_LOG VLOG(4) << "============ conv2d params ============"; VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," @@ -1183,9 +1190,6 @@ void ConvImageCompute::Conv2d7x7() { VLOG(4) << "offset: " << offset; VLOG(4) << "dilations.size : " << dilations.size(); VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; - VLOG(4) << "default work size{c_block, w, nh}: " - << "{" << c_block << ", " << w << ", " << nh << "" - << "}"; #endif CHECK_GE(dilations.size(), 2); @@ -1201,25 +1205,15 @@ void ConvImageCompute::Conv2d7x7() { bias_image = bias_gpu_image_.data(); } - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - STL::stringstream kernel_key; - kernel_key << kernel_func_names_[0] << build_options_[0]; - auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "kernel_key: " << kernel_key.str(); - VLOG(4) << "kernel ready ... " << kernel_key.str(); - VLOG(4) << "w: " << w; -#endif + auto kernel = kernel_; cl_int status; int arg_idx = 0; - status = kernel.setArg(arg_idx, c_block); + status = kernel.setArg(arg_idx, c_blk_); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w); + status = kernel.setArg(++arg_idx, w_blk_); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh); + status = kernel.setArg(++arg_idx, nh_blk_); CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, *input_image); CL_CHECK_FATAL(status); @@ -1253,21 +1247,16 @@ void ConvImageCompute::Conv2d7x7() { status = kernel.setArg(++arg_idx, output_height); CL_CHECK_FATAL(status); - auto global_work_size = - cl::NDRange{static_cast(default_work_size.data()[0]), - static_cast(default_work_size.data()[1]), - static_cast(default_work_size.data()[2])}; - #ifndef LITE_SHUTDOWN_LOG // VLOG(4) << "out_image: " << out_image; - VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," - << global_work_size[1] << "," << global_work_size[2] << "}"; + VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," + << global_work_size_[1] << "," << global_work_size_[2] << "}"; #endif status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, - global_work_size, + global_work_size_, cl::NullRange, nullptr, event_.get()); @@ -1275,6 +1264,8 @@ void ConvImageCompute::Conv2d7x7() { context.cl_wait_list()->emplace(out_image, event_); } void ConvImageCompute::Conv2d7x7opt() { + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); const auto& param = *param_.get_mutable(); auto input_dims = param.x->dims(); auto paddings = *param.paddings; @@ -1302,23 +1293,6 @@ void ConvImageCompute::Conv2d7x7opt() { const bool is_element_wise_bias = has_bias && param.output->dims() == param.bias->dims(); - const std::vector& default_work_size = - DefaultWorkSize(output_dims, - DDim(std::vector{ - static_cast(out_image_shape["width"]), - static_cast(out_image_shape["height"])})); - - int c_block = default_work_size[0]; - int w = default_work_size[1]; - int nh = default_work_size[2]; - - int w_blk_size = 5; - int w_blk = (w + w_blk_size - 1) / w_blk_size; - // default_work_size[1] = w_blk; - - int h_blk_size = 1; - int h_blk = (nh + h_blk_size - 1) / h_blk_size; -// default_work_size[2] = h_blk; #ifndef LITE_SHUTDOWN_LOG VLOG(4) << "============ conv2d 7x7 params ============"; // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," @@ -1336,9 +1310,6 @@ void ConvImageCompute::Conv2d7x7opt() { VLOG(4) << "strides: " << strides[0] << "," << strides[1]; VLOG(4) << "dilations.size : " << dilations.size(); VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; - VLOG(4) << "default work size{c_block, w, nh}: " - << "{" << c_block << ", " << w << ", " << nh << "" - << "}"; #endif CHECK_GE(dilations.size(), 2); CHECK(dilations[0] == dilations[1]); @@ -1353,24 +1324,15 @@ void ConvImageCompute::Conv2d7x7opt() { bias_image = bias_gpu_image_.data(); } - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - STL::stringstream kernel_key; - kernel_key << kernel_func_names_[0] << build_options_[0]; - auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "kernel_key: " << kernel_key.str(); - VLOG(4) << "kernel ready ... " << kernel_key.str(); -#endif + auto kernel = kernel_; cl_int status; int arg_idx = 0; - status = kernel.setArg(arg_idx, c_block); + status = kernel.setArg(arg_idx, c_blk_); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk); + status = kernel.setArg(++arg_idx, w_blk_); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, h_blk); + status = kernel.setArg(++arg_idx, nh_blk_); CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, *input_image); CL_CHECK_FATAL(status); @@ -1403,39 +1365,19 @@ void ConvImageCompute::Conv2d7x7opt() { status = kernel.setArg(++arg_idx, output_height); CL_CHECK_FATAL(status); - auto global_work_size = - cl::NDRange{static_cast(default_work_size.data()[0]), - static_cast(w_blk), - static_cast(h_blk)}; -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," - << global_work_size[1] << "," << global_work_size[2] << "}"; -#endif - size_t max_work_group_size = 0; - kernel.getWorkGroupInfo(CLRuntime::Global()->device(), - CL_KERNEL_WORK_GROUP_SIZE, - &max_work_group_size); - cl::NDRange local_work_size = cl::NullRange; - if (max_work_group_size > 0 && use_lws) { - local_work_size = context.cl_context()->LocalWorkSize(global_work_size, - max_work_group_size); -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << "," - << local_work_size[1] << "," << local_work_size[2] << "}"; -#endif - } - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, - global_work_size, - local_work_size, + global_work_size_, + local_work_size_, nullptr, event_.get()); CL_CHECK_FATAL(status); context.cl_wait_list()->emplace(out_image, event_); } void ConvImageCompute::DepthwiseConv2d3x3s1() { + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); const auto& param = *param_.get_mutable(); auto x_dims = param.x->dims(); auto filter_dims = param.filter->dims(); @@ -1444,8 +1386,6 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() { auto strides = param.strides; auto dilations = *param.dilations; - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); auto* input_img = param.x->data(); auto* filter_img = filter_gpu_image_.data(); @@ -1459,26 +1399,15 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() { auto* output_img = param.output->mutable_data( image_shape["width"], image_shape["height"]); - STL::stringstream kernel_key; - kernel_key << kernel_func_names_[0] << build_options_[0]; - auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - - int c_block = (output_dims[1] + 3) / 4; - int w = output_dims[3]; - int nh = output_dims[0] * output_dims[2]; - - int w_blk_size = 2; - int w_blk = (w + w_blk_size - 1) / w_blk_size; - - auto global_work_size = cl::NDRange(c_block, w_blk, nh); + auto kernel = kernel_; cl_int status; int arg_idx = 0; - status = kernel.setArg(arg_idx, static_cast(c_block)); + status = kernel.setArg(arg_idx, c_blk_); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(w_blk)); + status = kernel.setArg(++arg_idx, w_blk_); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(nh)); + status = kernel.setArg(++arg_idx, nh_blk_); CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, *input_img); CL_CHECK_FATAL(status); @@ -1516,28 +1445,11 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() { status = kernel.setArg(++arg_idx, static_cast(output_dims[2])); CL_CHECK_FATAL(status); - size_t max_work_group_size = 0; - kernel.getWorkGroupInfo(CLRuntime::Global()->device(), - CL_KERNEL_WORK_GROUP_SIZE, - &max_work_group_size); - cl::NDRange local_work_size = cl::NullRange; -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "max_work_group_size: " << max_work_group_size; -#endif - if (max_work_group_size > 0 && use_lws) { - local_work_size = context.cl_context()->LocalWorkSize(global_work_size, - max_work_group_size); -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << "," - << local_work_size[1] << "," << local_work_size[2] << "}"; -#endif - } - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, - global_work_size, - local_work_size, + global_work_size_, + local_work_size_, nullptr, event_.get()); CL_CHECK_FATAL(status); @@ -1545,6 +1457,8 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() { } void ConvImageCompute::DepthwiseConv2d3x3() { + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); const auto& param = *param_.get_mutable(); auto x_dims = param.x->dims(); auto filter_dims = param.filter->dims(); @@ -1555,8 +1469,6 @@ void ConvImageCompute::DepthwiseConv2d3x3() { int offset = filter_dims[2] / 2 - paddings[0]; int input_c_block = (x_dims[1] + 3) / 4; - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); auto* input_img = param.x->data(); auto* filter_img = filter_gpu_image_.data(); @@ -1570,21 +1482,10 @@ void ConvImageCompute::DepthwiseConv2d3x3() { auto* output_img = param.output->mutable_data( image_shape["width"], image_shape["height"]); - STL::stringstream kernel_key; - kernel_key << kernel_func_names_[0] << build_options_[0]; - auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - - int c_block = (output_dims[1] + 3) / 4; - int w = output_dims[3]; - int nh = output_dims[0] * output_dims[2]; - auto global_work_size = cl::NDRange(c_block, w, nh); + auto kernel = kernel_; #ifndef LITE_SHUTDOWN_LOG VLOG(4) << "setArg"; - VLOG(4) << "c_block = " << c_block; - VLOG(4) << "w = " << w; - VLOG(4) << "nh = " << nh; - VLOG(4) << "strides = " << strides[0]; VLOG(4) << "offset = " << offset; VLOG(4) << "dilations = " << dilations[0]; @@ -1597,11 +1498,11 @@ void ConvImageCompute::DepthwiseConv2d3x3() { cl_int status; int arg_idx = 0; - status = kernel.setArg(arg_idx, static_cast(c_block)); + status = kernel.setArg(arg_idx, c_blk_); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(w)); + status = kernel.setArg(++arg_idx, w_blk_); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(nh)); + status = kernel.setArg(++arg_idx, nh_blk_); CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, *input_img); CL_CHECK_FATAL(status); @@ -1641,7 +1542,7 @@ void ConvImageCompute::DepthwiseConv2d3x3() { status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, - global_work_size, + global_work_size_, cl::NullRange, nullptr, event_.get()); @@ -1650,6 +1551,8 @@ void ConvImageCompute::DepthwiseConv2d3x3() { } void ConvImageCompute::DepthwiseConv2d() { + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); const auto& param = *param_.get_mutable(); auto input_dims = param.x->dims(); auto paddings = *param.paddings; @@ -1681,16 +1584,6 @@ void ConvImageCompute::DepthwiseConv2d() { int input_c = input_dims[1]; auto dilations = *param.dilations; - const std::vector& default_work_size = - DefaultWorkSize(output_dims, - DDim(std::vector{ - static_cast(out_image_shape["width"]), - static_cast(out_image_shape["height"])})); - - int c_block = default_work_size[0]; - int w = default_work_size[1]; - int nh = default_work_size[2]; - #ifndef LITE_SHUTDOWN_LOG VLOG(4) << "============ depthwise conv2d params ============"; VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," @@ -1710,9 +1603,6 @@ void ConvImageCompute::DepthwiseConv2d() { VLOG(4) << "offset: " << offset; VLOG(4) << "dilations.size : " << dilations.size(); VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; - VLOG(4) << "default work size{c_block, w, nh}: " - << "{" << c_block << ", " << w << ", " << nh << "" - << "}"; #endif CHECK_GE(dilations.size(), 2); @@ -1730,25 +1620,15 @@ void ConvImageCompute::DepthwiseConv2d() { bias_image = bias_gpu_image_.data(); } - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - STL::stringstream kernel_key; - kernel_key << kernel_func_names_[0] << build_options_[0]; - auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "kernel_key: " << kernel_key.str(); - VLOG(4) << "kernel ready ... " << kernel_key.str(); - VLOG(4) << "w: " << w; -#endif + auto kernel = kernel_; cl_int status; int arg_idx = 0; - status = kernel.setArg(arg_idx, c_block); + status = kernel.setArg(arg_idx, c_blk_); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w); + status = kernel.setArg(++arg_idx, w_blk_); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh); + status = kernel.setArg(++arg_idx, nh_blk_); CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, *input_image); CL_CHECK_FATAL(status); @@ -1786,21 +1666,16 @@ void ConvImageCompute::DepthwiseConv2d() { status = kernel.setArg(++arg_idx, filter_height); CL_CHECK_FATAL(status); - auto global_work_size = - cl::NDRange{static_cast(default_work_size.data()[0]), - static_cast(default_work_size.data()[1]), - static_cast(default_work_size.data()[2])}; - #ifndef LITE_SHUTDOWN_LOG // VLOG(4) << "out_image: " << out_image; - VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," - << global_work_size[1] << "," << global_work_size[2] << "}"; + VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," + << global_work_size_[1] << "," << global_work_size_[2] << "}"; #endif status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, - global_work_size, + global_work_size_, cl::NullRange, nullptr, event_.get()); @@ -1809,7 +1684,7 @@ void ConvImageCompute::DepthwiseConv2d() { } void ConvImageCompute::Run() { (this->*impl_)(); } - +#undef PROFILE_CONV_KERNEL } // namespace opencl } // namespace kernels } // namespace lite diff --git a/lite/kernels/opencl/conv_image_compute.h b/lite/kernels/opencl/conv_image_compute.h index 57e4b91e0a842487fc5dfce4799fab244348772d..c30c271498737acf3b831d7799af1b5b316e95de 100644 --- a/lite/kernels/opencl/conv_image_compute.h +++ b/lite/kernels/opencl/conv_image_compute.h @@ -59,6 +59,19 @@ class ConvImageCompute : public KernelLite event_{new cl::Event}; Tensor filter_gpu_image_; Tensor bias_gpu_image_; + cl::NDRange global_work_size_ = cl::NDRange{ + static_cast(1), static_cast(1), static_cast(1)}; + int c_blk_ = 1; + int w_blk_ = 1; + int nh_blk_ = 1; + + int default_c_blk_ = 1; + int default_w_blk_ = 1; + int default_nh_blk_ = 1; + + cl::Kernel kernel_; + cl::NDRange local_work_size_ = cl::NDRange{ + static_cast(1), static_cast(1), static_cast(1)}; bool use_lws{true}; };