Merge pull request #22479 from scottchou007:master

Fix issues in opencv_test_dnn from conv48 kernels without bias

Merge pull request #22479 from scottchou007:master
Fix issues in opencv_test_dnn from conv48 kernels without bias
e1e92614 · Alexander Smorkalov · GitHub · 819e5b19 · a3cb2020 · e1e92614
4 changed file
--- a/modules/dnn/src/vkcom/shader/conv48_nobias.comp
+++ b/modules/dnn/src/vkcom/shader/conv48_nobias.comp
+#version 450
+
+layout (constant_id = 0) const int LOCAL_SZ_X = 0;
+layout (constant_id = 1) const int LOCAL_SZ_Y = 0;
+layout (constant_id = 2) const int LOCAL_SZ_Z = 0;
+layout (constant_id = 3) const int IN_H = 0;
+layout (constant_id = 4) const int IN_W = 0;
+layout (constant_id = 5) const int OUT_W = 0;
+layout (constant_id = 6) const int STRIDE_H = 0;
+layout (constant_id = 7) const int STRIDE_W = 0;
+layout (constant_id = 8) const int PAD_H = 0;
+layout (constant_id = 9) const int PAD_W = 0;
+layout (constant_id = 10) const int FILTER_H = 0;
+layout (constant_id = 11) const int FILTER_W = 0;
+layout (constant_id = 12) const int CHANNELS = 0;
+layout (constant_id = 13) const int BATCH = 0;
+layout (constant_id = 14) const int M = 0;
+layout (constant_id = 15) const int K = 0;
+layout (constant_id = 16) const int N = 0;
+layout (constant_id = 17) const int TAIL_M = 0;
+layout (constant_id = 18) const int DILATION_H = 0;
+layout (constant_id = 19) const int DILATION_W = 0;
+
+#if defined(ACTIVATION_RELU)
+#define ACTIVATION_FUNCTION(x)  clamp(x, vec4(0.0), vec4(999999999.0))
+#elif defined(ACTIVATION_RELU1)
+#define ACTIVATION_FUNCTION(x)  clamp(x, vec4(-1.0), vec4(1.0))
+#elif defined(ACTIVATION_RELU6)
+#define ACTIVATION_FUNCTION(x)  clamp(x, vec4(0.0), vec4(6.0))
+#else
+#define ACTIVATION_FUNCTION(x)  (x)
+#endif
+
+layout(binding = 0) readonly buffer Input0{
+    float data[];
+} src0;
+layout(binding = 1) readonly buffer Input1 {
+    vec4 data[];
+} bias;
+layout(binding = 2) readonly buffer Input3{
+    vec4 data[];
+} src1;
+layout(binding = 3) writeonly buffer Output{
+    vec4 data[];
+} out0;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+#define VEC_SIZE 4
+#define BLOCK_H 4
+#define BLOCK_W 8
+#define FILTER_AREA (FILTER_H * FILTER_W)
+#define LOAD_A(elm_idx, a_component) \
+            src0_x = org_x + ((i * VEC_SIZE + elm_idx) % FILTER_W) * DILATION_W; \
+            src0_y = org_y + (((i * VEC_SIZE + elm_idx) % FILTER_AREA) / FILTER_W) * DILATION_H; \
+            src0_z = (i * VEC_SIZE + elm_idx) / FILTER_AREA; \
+            if(src0_y >= 0 && src0_y < IN_H && src0_x >= 0 && src0_x < IN_W) \
+            { \
+                a_component = src0.data[input_batch_offset + src0_z * (IN_H * IN_W) + src0_y * IN_W + src0_x]; \
+            }
+
+#define A_MULTIPLY_BTILE(a, sliver_num, comp) \
+            dst_x = (out_y + sliver_num) % OUT_W; \
+            dst_y = (out_y + sliver_num) / OUT_W; \
+            org_y = dst_y * STRIDE_H - PAD_H; \
+            org_x = dst_x * STRIDE_W - PAD_W; \
+            LOAD_A(0, a.x); \
+            LOAD_A(1, a.y); \
+            LOAD_A(2, a.z); \
+            LOAD_A(3, a.w); \
+            dot0.comp += dot(brow0, a); \
+            dot1.comp += dot(brow1, a); \
+            dot2.comp += dot(brow2, a); \
+            dot3.comp += dot(brow3, a); \
+            dot4.comp += dot(brow4, a); \
+            dot5.comp += dot(brow5, a); \
+            dot6.comp += dot(brow6, a); \
+            dot7.comp += dot(brow7, a);
+
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+    int out_x = BLOCK_W * gx;
+    int out_y = BLOCK_H * gy;
+    int input_batch_offset  = gz * IN_H * IN_W * CHANNELS;
+    int output_batch_offset = gz * M * N / VEC_SIZE;
+    if (out_x < N && gy < M / BLOCK_H)
+    {
+        int width0 = K / VEC_SIZE;
+        int width1 = N / VEC_SIZE;
+        int src1_read0_offset = out_x * width0;
+        vec4 dot0 = vec4(0.f);
+        vec4 dot1 = vec4(0.f);
+        vec4 dot2 = vec4(0.f);
+        vec4 dot3 = vec4(0.f);
+        vec4 dot4 = vec4(0.f);
+        vec4 dot5 = vec4(0.f);
+        vec4 dot6 = vec4(0.f);
+        vec4 dot7 = vec4(0.f);
+        int i = 0;
+        do
+        {
+            int dst_x, dst_y, org_x, org_y, src0_x, src0_y, src0_z;
+            vec4 a0 = vec4(0.f), a1 = vec4(0.f), a2 = vec4(0.f), a3 = vec4(0.f);
+            vec4 brow0 = src1.data[src1_read0_offset]; src1_read0_offset += width0;
+            vec4 brow1 = src1.data[src1_read0_offset]; src1_read0_offset += width0;
+            vec4 brow2 = src1.data[src1_read0_offset]; src1_read0_offset += width0;
+            vec4 brow3 = src1.data[src1_read0_offset]; src1_read0_offset += width0;
+            vec4 brow4 = src1.data[src1_read0_offset]; src1_read0_offset += width0;
+            vec4 brow5 = src1.data[src1_read0_offset]; src1_read0_offset += width0;
+            vec4 brow6 = src1.data[src1_read0_offset]; src1_read0_offset += width0;
+            vec4 brow7 = src1.data[src1_read0_offset]; src1_read0_offset += width0;
+            src1_read0_offset += 1 - BLOCK_W * width0;
+
+            A_MULTIPLY_BTILE(a0, 0, x);
+            A_MULTIPLY_BTILE(a1, 1, y);
+            A_MULTIPLY_BTILE(a2, 2, z);
+            A_MULTIPLY_BTILE(a3, 3, w);
+            i++;
+        }
+        while( i < width0 );
+
+        out0.data[output_batch_offset + (out_x + 0) * M / VEC_SIZE + gy] = ACTIVATION_FUNCTION(dot0);
+        out0.data[output_batch_offset + (out_x + 1) * M / VEC_SIZE + gy] = ACTIVATION_FUNCTION(dot1);
+        out0.data[output_batch_offset + (out_x + 2) * M / VEC_SIZE + gy] = ACTIVATION_FUNCTION(dot2);
+        out0.data[output_batch_offset + (out_x + 3) * M / VEC_SIZE + gy] = ACTIVATION_FUNCTION(dot3);
+        out0.data[output_batch_offset + (out_x + 4) * M / VEC_SIZE + gy] = ACTIVATION_FUNCTION(dot4);
+        out0.data[output_batch_offset + (out_x + 5) * M / VEC_SIZE + gy] = ACTIVATION_FUNCTION(dot5);
+        out0.data[output_batch_offset + (out_x + 6) * M / VEC_SIZE + gy] = ACTIVATION_FUNCTION(dot6);
+        out0.data[output_batch_offset + (out_x + 7) * M / VEC_SIZE + gy] = ACTIVATION_FUNCTION(dot7);
+    }
+}
--- a/modules/dnn/src/vkcom/shader/conv48_nobias_spv.cpp
+++ b/modules/dnn/src/vkcom/shader/conv48_nobias_spv.cpp
--- a/modules/dnn/src/vkcom/shader/spv_shader.hpp
+++ b/modules/dnn/src/vkcom/shader/spv_shader.hpp
@@ -14,6 +14,7 @@ namespace cv { namespace dnn { namespace vkcom {
 extern const unsigned int dw_conv_spv[1760];
 extern const unsigned int permute_spv[765];
 extern const unsigned int conv48_spv[7458];
+extern const unsigned int conv48_nobias_spv[7182];
 extern const unsigned int lrn_spv[1845];
 extern const unsigned int concat_spv[541];
 extern const unsigned int avg_pool_spv[1538];

--- a/modules/dnn/src/vkcom/src/op_conv.cpp
+++ b/modules/dnn/src/vkcom/src/op_conv.cpp
@@ -167,7 +167,8 @@ bool OpConv::forward(Tensor& in, Tensor& filter_weights, Tensor& bias, Tensor& o
            config_.local_size_z = 1;
            config_.block_height = 4;
            config_.block_width  = 8;
-            createShaderModule(conv48_spv, sizeof(conv48_spv));
+            has_bias_ ? createShaderModule(conv48_spv, sizeof(conv48_spv))
+                : createShaderModule(conv48_nobias_spv, sizeof(conv48_nobias_spv));
            // specialization constants
            VkSpecializationInfo spec_info;
            ShaderConstant shader_constant;