diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp index a1164273accfce42ad3217893f3e0c7578c48747..45bd249e5d6272a4803015279bd1330d00e76e15 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp @@ -1034,12 +1034,15 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, kernel.set(argIdx++, (uint16_t)output_w_); kernel.set(argIdx++, (uint16_t)output_h_); - size_t global_size[3]; - global_size[0] = output_w_; - global_size[1] = output_h_; - global_size[2] = num_output_ * num_; - - if (!kernel.run_(3, global_size, NULL, false)) + size_t wgs = kernel.workGroupSize(); + if (!wgs) + { + CV_LOG_ERROR(NULL, "DNN/OpenCL: Can't query workGroupSize of DWCONV kernel"); + return false; + } + size_t lws[1] = { wgs }; + size_t gws[1] = { roundUp((size_t)output_w_ * output_h_ * num_output_ * num_, (unsigned)lws[0]) }; + if (!kernel.run_(1, gws, lws, false)) { CV_LOG_ERROR(NULL, "DNN/OpenCL: DWCONV kernel run failed"); return false; diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl index 455f0ed7ea06af82fd097c6fa6ed874e62192c57..eb5d354020bd326c167cde8483fd9091911d8782 100644 --- a/modules/dnn/src/opencl/conv_layer_spatial.cl +++ b/modules/dnn/src/opencl/conv_layer_spatial.cl @@ -1850,10 +1850,13 @@ __kernel void DWCONV( const ushort output_width, const ushort output_height) { __global Dtype* convolved_image = convolved_image_base + convolved_image_offset; - const int outputX = get_global_id(0); - const int outputY = get_global_id(1); - const int outputZ = get_global_id(2); - if(outputX < output_width && outputY < output_height) + const int out_idx = get_global_id(0); // 1D task layout: [output_width * output_height * OUTPUT_Z] + const int plane_size = output_width * output_height; + const int out_plane_idx = out_idx % plane_size; + const int outputZ = out_idx / plane_size; + const int outputY = out_plane_idx / output_width; + const int outputX = out_plane_idx % output_width; + if (outputZ < OUTPUT_Z) { Dtype sum = 0.;