From c240355cc6cd56083b064866d37ff437ed0eeef3 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin
Date: Tue, 15 Dec 2020 01:34:20 +0000
Subject: [PATCH] dnn(ocl): avoid mess FP16/FP32 in convolution layer

---
 modules/core/src/convert.dispatch.cpp         |  2 +-
 modules/core/src/opencl/halfconvert.cl        | 13 ++-
 modules/dnn/src/layers/convolution_layer.cpp  | 26 +++---
 modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp   |  2 -
 .../src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp  | 88 ++++++++++---------
 modules/dnn/src/opencl/conv_spatial_helper.cl |  7 +-
 6 files changed, 77 insertions(+), 61 deletions(-)

diff --git a/modules/core/src/convert.dispatch.cpp b/modules/core/src/convert.dispatch.cpp
index 4d396b5e99..c32c5bb420 100644
--- a/modules/core/src/convert.dispatch.cpp
+++ b/modules/core/src/convert.dispatch.cpp
@@ -138,7 +138,7 @@ static bool ocl_convertFp16( InputArray _src, OutputArray _dst, int sdepth, int
                               sdepth == CV_32F ? "half" : "float",
                               rowsPerWI,
                               sdepth == CV_32F ? " -D FLOAT_TO_HALF " : "");
-    ocl::Kernel k("convertFp16", ocl::core::halfconvert_oclsrc, build_opt);
+    ocl::Kernel k(sdepth == CV_32F ? "convertFp16_FP32_to_FP16" : "convertFp16_FP16_to_FP32", ocl::core::halfconvert_oclsrc, build_opt);
     if (k.empty())
         return false;
 
diff --git a/modules/core/src/opencl/halfconvert.cl b/modules/core/src/opencl/halfconvert.cl
index 506df69faf..9df602f406 100644
--- a/modules/core/src/opencl/halfconvert.cl
+++ b/modules/core/src/opencl/halfconvert.cl
@@ -47,8 +47,17 @@
 #endif
 #endif
 
-__kernel void convertFp16(__global const uchar * srcptr, int src_step, int src_offset,
-                          __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols)
+__kernel void
+#ifdef FLOAT_TO_HALF
+    convertFp16_FP32_to_FP16
+#else
+    convertFp16_FP16_to_FP32
+#endif
+(
+    __global const uchar * srcptr, int src_step, int src_offset,
+    __global uchar * dstptr, int dst_step, int dst_offset,
+    int dst_rows, int dst_cols
+)
 {
     int x = get_global_id(0);
     int y0 = get_global_id(1) * rowsPerWI;
diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp
index 63bd386119..f131c023ab 100644
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -1461,16 +1461,7 @@ public:
             umat_blobs.resize(n);
             for (size_t i = 0; i < n; i++)
             {
-                if (use_half)
-                {
-                    Mat matFP32;
-                    convertFp16(inputs[i + 1], matFP32);
-                    matFP32.copyTo(umat_blobs[i]);
-                }
-                else
-                {
-                    inputs[i + 1].copyTo(umat_blobs[i]);
-                }
+                inputs[i + 1].copyTo(umat_blobs[i]);
             }
             inputs.resize(1);
         }
@@ -1481,7 +1472,10 @@
             umat_blobs.resize(n);
             for (size_t i = 0; i < n; i++)
             {
-                blobs[i].copyTo(umat_blobs[i]);
+                if (use_half)
+                    convertFp16(blobs[i], umat_blobs[i]);
+                else
+                    blobs[i].copyTo(umat_blobs[i]);
             }
         }
 
@@ -1537,14 +1531,20 @@
 
         if (fusedWeights)
         {
-            weightsMat.copyTo(umat_blobs[0]);
+            if (use_half)
+                convertFp16(weightsMat, umat_blobs[0]);
+            else
+                weightsMat.copyTo(umat_blobs[0]);
             fusedWeights = false;
         }
         if (fusedBias)
         {
             if ( umat_blobs.size() < 2 )
                 umat_blobs.resize(2);
-            umat_blobs[1] = UMat(biasvec, true);
+            if (use_half)
+                convertFp16(Mat(biasvec, true), umat_blobs[1]);
+            else
+                Mat(biasvec, true).copyTo(umat_blobs[1]);
             convolutionOp->setBias(true);
             fusedBias = false;
         }
diff --git a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
index 8de7ba26e2..7bb277d102 100644
--- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
+++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
@@ -274,8 +274,6 @@ class OCL4DNNConvSpatial
     int32_t group_;
     bool bias_term_;
     UMat swizzled_weights_umat;
-    UMat weights_half;
-    UMat bias_half;
     UMat bottom_data2_;
 
     int32_t bottom_index_;
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
index 3707a31846..fd98919343 100644
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
@@ -588,16 +588,16 @@ bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
         fused_eltwise_ = false;
     }
 
-    if (use_half_ && bias_half.empty() && !bias.empty())
-        convertFp16(bias, bias_half);
+    if (use_half_ && !bias.empty())
+        CV_CheckTypeEQ(bias.type(), CV_16SC1, "");
 
-    if (use_half_ && weights_half.empty())
-        convertFp16(weight, weights_half);
+    if (use_half_)
+        CV_CheckTypeEQ(weight.type(), CV_16SC1, "");
 
-    prepareKernel(bottom, top, weight, (use_half_) ? bias_half : bias, numImages);
+    prepareKernel(bottom, top, weight, bias, numImages);
     if (bestKernelConfig.empty())
         return false;
-    return convolve(bottom, top, weight, (use_half_) ? bias_half : bias, numImages, bestKernelConfig);
+    return convolve(bottom, top, weight, bias, numImages, bestKernelConfig);
 }
 
 template<typename Dtype>
@@ -744,29 +744,26 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
                                      kernel_h_ * (int)alignSize(kernel_w_, 2),
                                      (use_half_) ? CV_16SC1 : CV_32FC1);
 
-    UMat swizzled_weights_tmp;
-    if (use_half_)
-        swizzled_weights_tmp.create(shape(swizzled_weights_umat), CV_32F);
-
     if (!interleave) {
-        cl_uint argIdx = 0;
         int32_t channels = channels_ / group_;
-        ocl::Kernel oclk_copy_weight(CL_KERNEL_SELECT("copyWeightsSwizzled"),
-                                     cv::ocl::dnn::conv_spatial_helper_oclsrc);
+        ocl::Kernel oclk_copy_weight(
+            use_half_ ? "copyWeightsSwizzled_half" : "copyWeightsSwizzled_float",
+            cv::ocl::dnn::conv_spatial_helper_oclsrc,
+            use_half_ ? "-DHALF_SUPPORT=1 -DDtype=half" : "-DDtype=float"
"-DHALF_SUPPORT=1 -DDtype=half" : "-DDtype=float" + ); if (oclk_copy_weight.empty()) return false; - oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); - if (use_half_) - oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_tmp)); - else - oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat)); - oclk_copy_weight.set(argIdx++, kernel_w_); - oclk_copy_weight.set(argIdx++, kernel_h_); - oclk_copy_weight.set(argIdx++, channels); - oclk_copy_weight.set(argIdx++, num_output_); - oclk_copy_weight.set(argIdx++, swizzled_factor); + oclk_copy_weight.args( + ocl::KernelArg::PtrReadOnly(weight), + ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat), + kernel_w_, + kernel_h_, + channels, + num_output_, + swizzled_factor + ); size_t global_work_size_copy[3] = { (size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 }; @@ -778,13 +775,24 @@ bool OCL4DNNConvSpatial::swizzleWeight(const UMat &weight, } } else { // assumption: kernel dimension is 2 - Mat weightMat = weight.getMat(ACCESS_READ); - Dtype* cpu_weight = (Dtype *)weightMat.ptr(); + Mat weightMat; Mat swizzledWeightMat; + UMat weight_tmp; // FP32 in half mode, TODO implement FP16 repack if (use_half_) - swizzledWeightMat = swizzled_weights_tmp.getMat(ACCESS_WRITE); + { + CV_CheckTypeEQ(weight.type(), CV_16SC1, ""); + convertFp16(weight, weight_tmp); + weightMat = weight_tmp.getMat(ACCESS_READ); + swizzledWeightMat.create(shape(swizzled_weights_umat), CV_32F); + } else + { + weightMat = weight.getMat(ACCESS_READ); swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE); + } + + CV_CheckTypeEQ(weightMat.type(), CV_32FC1, ""); + Dtype* cpu_weight = (Dtype *)weightMat.ptr(); Dtype* cpu_swizzled_weight = (Dtype *)swizzledWeightMat.ptr(); int interleavedRows = (kernel_w_ / 2) * 2; @@ -792,26 +800,28 @@ bool OCL4DNNConvSpatial::swizzleWeight(const UMat &weight, int blockWidth = swizzled_factor; // should equal to simd size. 
         int rowAlignment = 32;
         size_t interleaved_filter_size = M_ * kernel_w_ * kernel_h_ * channels_ * sizeof(Dtype);
-        Dtype * tmpSwizzledWeight = reinterpret_cast<Dtype*>(malloc(interleaved_filter_size));
-        CHECK_EQ(tmpSwizzledWeight != NULL, true) << "Failed to allocate temporary swizzled weight";
+        cv::AutoBuffer<Dtype> tmpSwizzledWeight(interleaved_filter_size);
 
         for (int od = 0; od < M_; od++)
             for (int id = 0; id < channels_; id++)
                 for (int r = 0; r < kernel_h_; r++)
                     for (int c = 0; c < kernel_w_; c++)
                         tmpSwizzledWeight[((id * kernel_h_ + r)* kernel_w_ + c) * M_ + od] = cpu_weight[((od * channels_ + id) * kernel_h_ + r)*kernel_w_+c];
+
         interleaveMatrix(cpu_swizzled_weight,
-                         tmpSwizzledWeight,
+                         tmpSwizzledWeight.data(),
                          kernel_w_ * kernel_h_ * channels_, M_,
                          interleavedRows,
                          nonInterleavedRows,
                          blockWidth,
                          rowAlignment);
-        free(tmpSwizzledWeight);
-    }
-
-    if (use_half_)
-        convertFp16(swizzled_weights_tmp, swizzled_weights_umat);
+
+        // unmap OpenCL buffers
+        weightMat.release();
+
+        if (use_half_)
+            convertFp16(swizzledWeightMat, swizzled_weights_umat);
+    }
 
     return true;
 }
@@ -1104,10 +1114,7 @@ bool OCL4DNNConvSpatial<Dtype>::convolve(const UMat &bottom, UMat &top,
             cl_uint argIdx = 0;
             setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
             kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
-            if (use_half_)
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half));
-            else
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
+            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
             if (bias_term_)
                 kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
             kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
@@ -1148,10 +1155,7 @@ bool OCL4DNNConvSpatial<Dtype>::convolve(const UMat &bottom, UMat &top,
             setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
             kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
             kernel.set(argIdx++, image_offset);
-            if (use_half_)
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half));
-            else
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
+            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
             kernel.set(argIdx++, kernel_offset);
             if (bias_term_)
                 kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
@@ -1956,7 +1960,7 @@ void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
 
     UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16SC1 : CV_32FC1);
 
-    calculateBenchmark(bottom, benchData, (use_half_) ? weights_half : weight, bias, numImages);
+    calculateBenchmark(bottom, benchData, weight, bias, numImages);
 
     if (run_auto_tuning_ || force_auto_tuning_)
     {
diff --git a/modules/dnn/src/opencl/conv_spatial_helper.cl b/modules/dnn/src/opencl/conv_spatial_helper.cl
index 9d5a89f7b1..33d9db57c8 100644
--- a/modules/dnn/src/opencl/conv_spatial_helper.cl
+++ b/modules/dnn/src/opencl/conv_spatial_helper.cl
@@ -39,9 +39,14 @@
 //
 //M*/
 
+#ifdef HALF_SUPPORT
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16:enable
+#endif
+#endif
+
 #define CONCAT(A,B) A##_##B
 #define TEMPLATE(name,type) CONCAT(name,type)
-#define Dtype float
 
 __kernel void TEMPLATE(copyWeightsSwizzled, Dtype)
     (__global Dtype* weightIn,
--
GitLab
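
For context (not part of the patch): the change moves all FP16 conversion to the layer setup path, so OCL4DNNConvSpatial::Forward now only asserts that the weight/bias UMats already arrive as CV_16SC1 when use_half_ is set, instead of converting them on the fly. The sketch below illustrates that caller-side convention, assuming an OpenCL-enabled OpenCV build; the helper name prepareBlob and the variable blobFP32 are illustrative only, not OpenCV API.

#include <opencv2/core.hpp>

// Sketch of the blob preparation the convolution layer now performs up front
// (see the convolution_layer.cpp hunks above).
static cv::UMat prepareBlob(const cv::Mat& blobFP32, bool use_half)
{
    cv::UMat ublob;
    if (use_half)
        cv::convertFp16(blobFP32, ublob);  // CV_32F -> FP16 stored as CV_16S, i.e. what CV_CheckTypeEQ(..., CV_16SC1, "") expects
    else
        blobFP32.copyTo(ublob);            // plain FP32 path
    return ublob;
}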