提交 f977d10a　作者: Alexander Alekhin

dnn(ocl): fix conv DWCONV workgroup

上级 4d587c34
......@@ -1034,12 +1034,15 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
 kernel.set(argIdx++, (uint16_t)output_w_);
 kernel.set(argIdx++, (uint16_t)output_h_);
-size_t global_size[3];
-global_size[0] = output_w_;
-global_size[1] = output_h_;
-global_size[2] = num_output_ * num_;
-if (!kernel.run_(3, global_size, NULL, false))
+size_t wgs = kernel.workGroupSize();
+if (!wgs)
+{
+CV_LOG_ERROR(NULL, "DNN/OpenCL: Can't query workGroupSize of DWCONV kernel");
+return false;
+}
+size_t lws[1] = { wgs };
+size_t gws[1] = { roundUp((size_t)output_w_ * output_h_ * num_output_ * num_, (unsigned)lws[0]) };
+if (!kernel.run_(1, gws, lws, false))
 {
 CV_LOG_ERROR(NULL, "DNN/OpenCL: DWCONV kernel run failed");
 return false;
......
......@@ -1850,10 +1850,13 @@ __kernel void DWCONV(
 const ushort output_width,
 const ushort output_height) {
 __global Dtype* convolved_image = convolved_image_base + convolved_image_offset;
-const int outputX = get_global_id(0);
-const int outputY = get_global_id(1);
-const int outputZ = get_global_id(2);
-if(outputX < output_width && outputY < output_height)
+const int out_idx = get_global_id(0); // 1D task layout: [output_width * output_height * OUTPUT_Z]
+const int plane_size = output_width * output_height;
+const int out_plane_idx = out_idx % plane_size;
+const int outputZ = out_idx / plane_size;
+const int outputY = out_plane_idx / output_width;
+const int outputX = out_plane_idx % output_width;
+if (outputZ < OUTPUT_Z)
 {
 Dtype sum = 0.;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册