From 91c3d5fe152f2535296c4cc413168f0892b38992 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team <megengine@megvii.com>
Date: Fri, 24 Apr 2020 17:09:45 +0800
Subject: [PATCH] fix(dnn/cuda): add sharemem to prefer and refactor code

GitOrigin-RevId: 0dba39f56ba0bf61ecb591f866641e7f51937ab9
---
 dnn/src/cuda/group_local/cuda_interface.cu    | 145 -----------------
 dnn/src/cuda/group_local/forward/kern.cu      | 146 ++++++++++++++++++
 .../{cuda_interface.h => forward/kern.cuh}    |   7 +-
 .../{fwd.cpp => forward/opr_impl.cpp}         |  21 +--
 4 files changed, 164 insertions(+), 155 deletions(-)
 delete mode 100644 dnn/src/cuda/group_local/cuda_interface.cu
 create mode 100644 dnn/src/cuda/group_local/forward/kern.cu
 rename dnn/src/cuda/group_local/{cuda_interface.h => forward/kern.cuh} (83%)
 rename dnn/src/cuda/group_local/{fwd.cpp => forward/opr_impl.cpp} (89%)

diff --git a/dnn/src/cuda/group_local/cuda_interface.cu b/dnn/src/cuda/group_local/cuda_interface.cu
deleted file mode 100644
index 301b78a39..000000000
--- a/dnn/src/cuda/group_local/cuda_interface.cu
+++ /dev/null
@@ -1,145 +0,0 @@
-/**
- * \file dnn/src/cuda/group_local/cuda_interface.cu
- * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
- *
- * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- */
-#include "./cuda_interface.h"
-
-#include "src/cuda/utils.cuh"
-
-namespace megdnn {
-namespace cuda {
-
-// src layout is (N, G, IC, IH, IW)
-// filter layout is (G, OH, OW, IC, FH, FW, OC)
-// dst layout is (N, G, OC, OH, OW)
-// NR_THREADS is 256
-// gridDim.z is G
-// gridDim.y is OC*OH*OW/NR_THREADS
-// gridDim.x is N/NB
-// blockDim.x is NR_THREADS
-
-// INs and ONs are the stride on the src/dst batch size dim
-// IC and OC are nr. channels per group
-
-// Each thread tackles with NB (actually NB_cur if non-multiple-of-NB N is considered).
-// Let oid = blockIdx.y*NR_THREADS + threadIdx.x (global thread ID along block
-// axis y), and we flatten (OC, OH, OW) into one dimension, then each thread
-// calculates the answer at dst position (n, blockIdx.z, oid), where n ranges
-// from blockDim.x*NB + 0 to blockDim.x*NB + (NB-1).
-// IC is processed at stride of ICB. On entrance of each iteration of the loop,
-// NB * ICB spatial src planes are loaded into shared memory (presumably src
-// spatial size is small).
-template <uint32_t NB, uint32_t ICB, bool is_xcorr>
-__global__ void forward_kernel(const float * __restrict__ src,
-        const float * __restrict__ filter,
-        float * __restrict__ dst,
-        uint32_t N,
-        uint32_t IC, uint32_t IH, uint32_t IW,
-        uint32_t OC, uint32_t OH, uint32_t OW,
-        uint32_t FH, uint32_t FW,
-        uint32_t G,
-        uint32_t INs, uint32_t ONs,
-        uint32_t PH, uint32_t PW,
-        uint32_t SH, uint32_t SW)
-{
-    // NB * ICB * sizeof(float) * IH * IW
-    extern __shared__ float shared_mem[];
-    float *src_cache = shared_mem;
-    uint32_t tid = threadIdx.x;
-    uint32_t tstride = blockDim.x;
-    uint32_t oid = tid + blockIdx.y * tstride;
-    src += blockIdx.x*NB * INs + blockIdx.z*IC*IH*IW;
-    dst += blockIdx.x*NB * ONs + blockIdx.z*OC*OH*OW;
-    filter += blockIdx.z*OH*OW*IC*FH*FW*OC;
-    uint32_t op = oid / OC;
-    uint32_t oc = oid % OC;
-    uint32_t oh = op / OW;
-    uint32_t ow = op % OW;
-    float dst_reg[NB];
-    for (uint32_t nb = 0; nb < NB; ++nb) dst_reg[nb] = 0.0f;
-    uint32_t NB_cur = min(N-blockIdx.x*NB, NB);
-    for (uint32_t ic = 0; ic < IC; ic += ICB) {
-        // read ICB-channel src
-        // (NB, ICB, IHs, IWs)
-        uint32_t ICB_cur = min(ICB, IC-ic);
-        for (uint32_t i = tid; i < NB_cur*ICB*IH*IW; i += tstride) {
-            uint32_t ip = i % (IH*IW);
-            uint32_t icb = i / (IH*IW) % ICB;
-            uint32_t nb = i / (IH*IW) / ICB;
-            src_cache[i] =
-                (icb < ICB_cur) * src[nb*INs + min(IC-1, (ic+icb))*IH*IW + ip];
-        }
-        __syncthreads();
-        if (oid < OC*OH*OW)
-        for (uint32_t fh = 0; fh < FH; ++fh)
-        {
-        uint32_t ih;
-        if (is_xcorr) ih = oh*SH + fh - PH; else ih = oh*SH + (FH-fh-1) - PH;
-        if (ih < IH)
-        for (uint32_t fw = 0; fw < FW; ++fw)
-        {
-            uint32_t iw;
-            if (is_xcorr) iw = ow*SW + fw - PW; else iw = ow*SW + (FW-fw-1) - PW;
-            if (iw < IW)
-            for (uint32_t icb = 0; icb < ICB_cur; ++icb) {
-                uint32_t fid = op*IC*FH*FW*OC + (ic+icb)*FH*FW*OC +
-                    fh*FW*OC + fw*OC + oc;
-                float fval = filter[fid];
-                float src_reg[NB];
-#pragma unroll
-                for (uint32_t nb = 0; nb < NB; ++nb) {
-                    src_reg[nb] = src_cache[nb*ICB*IH*IW + icb*IH*IW + ih*IW + iw];
-                }
-#pragma unroll
-                for (uint32_t nb = 0; nb < NB; ++nb) {
-                    dst_reg[nb] += src_reg[nb]*fval;
-                }
-            }
-        }
-        }
-        __syncthreads();
-    }
-    if (oid < OC*OH*OW) {
-        for (uint32_t nb = 0; nb < NB_cur; ++nb) {
-            dst[nb*ONs + oc*OH*OW + op] = dst_reg[nb];
-        }
-    }
-}
-
-void run_inference_kernel(const float *src, const float *filter, float *dst,
-        float *wptr,
-        uint32_t N, uint32_t IC, uint32_t IH, uint32_t IW,
-        uint32_t OC, uint32_t OH, uint32_t OW,
-        uint32_t FH, uint32_t FW,
-        uint32_t G,
-        uint32_t PH, uint32_t PW,
-        uint32_t SH, uint32_t SW,
-        cudaStream_t stream)
-{
-    MEGDNN_MARK_USED_VAR(wptr);
-    size_t threads = 256;
-    const size_t NB = 4, ICB = 4;
-    dim3 blocks = dim3(DIVUP(N, NB), DIVUP(OC*OH*OW, threads), G);
-    uint32_t INs = G*IC*IH*IW, ONs = G*OC*OH*OW;
-    forward_kernel<NB, ICB, true><<<blocks, threads,
-        NB*ICB*sizeof(float)*IH*IW, stream>>>(src, filter, dst,
-                N,
-                IC, IH, IW,
-                OC, OH, OW,
-                FH, FW,
-                G,
-                INs, ONs,
-                PH, PW,
-                SH, SW);
-    after_kernel_launch();
-}
-
-} // namespace cuda
-} // namespace megdnn
-// vim: syntax=cpp.doxygen
diff --git a/dnn/src/cuda/group_local/forward/kern.cu b/dnn/src/cuda/group_local/forward/kern.cu
new file mode 100644
index 000000000..e8850f450
--- /dev/null
+++ b/dnn/src/cuda/group_local/forward/kern.cu
@@ -0,0 +1,146 @@
+/**
+ * \file dnn/src/cuda/group_local/forward/kern.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */
+#include "src/cuda/group_local/forward/kern.cuh"
+
+#include "src/cuda/utils.cuh"
+
+using namespace megdnn;
+using namespace cuda;
+
+namespace {
+
+constexpr size_t NB = 4, ICB = 4;
+
+// src layout is (N, G, IC, IH, IW)
+// filter layout is (G, OH, OW, IC, FH, FW, OC)
+// dst layout is (N, G, OC, OH, OW)
+// NR_THREADS is 256
+// gridDim.z is G
+// gridDim.y is OC*OH*OW/NR_THREADS
+// gridDim.x is N/NB
+// blockDim.x is NR_THREADS
+
+// INs and ONs are the stride on the src/dst batch size dim
+// IC and OC are nr. channels per group
+
+// Each thread tackles with NB (actually NB_cur if non-multiple-of-NB N is
+// considered). Let oid = blockIdx.y*NR_THREADS + threadIdx.x (global thread ID
+// along block axis y), and we flatten (OC, OH, OW) into one dimension, then
+// each thread calculates the answer at dst position (n, blockIdx.z, oid), where
+// n ranges from blockDim.x*NB + 0 to blockDim.x*NB + (NB-1). IC is processed at
+// stride of ICB. On entrance of each iteration of the loop, NB * ICB spatial
+// src planes are loaded into shared memory (presumably src spatial size is
+// small).
+template <uint32_t NB, uint32_t ICB, bool is_xcorr>
+__global__ void forward_kernel(const float* __restrict__ src,
+                               const float* __restrict__ filter,
+                               float* __restrict__ dst, uint32_t N, uint32_t IC,
+                               uint32_t IH, uint32_t IW, uint32_t OC,
+                               uint32_t OH, uint32_t OW, uint32_t FH,
+                               uint32_t FW, uint32_t INs, uint32_t ONs,
+                               uint32_t PH, uint32_t PW, uint32_t SH,
+                               uint32_t SW) {
+    // NB * ICB * sizeof(float) * IH * IW
+    extern __shared__ float shared_mem[];
+    float* src_cache = shared_mem;
+    uint32_t tid = threadIdx.x;
+    uint32_t tstride = blockDim.x;
+    uint32_t oid = tid + blockIdx.y * tstride;
+    src += blockIdx.x * NB * INs + blockIdx.z * IC * IH * IW;
+    dst += blockIdx.x * NB * ONs + blockIdx.z * OC * OH * OW;
+    filter += blockIdx.z * OH * OW * IC * FH * FW * OC;
+    uint32_t op = oid / OC;
+    uint32_t oc = oid % OC;
+    uint32_t oh = op / OW;
+    uint32_t ow = op % OW;
+    float dst_reg[NB];
+    for (uint32_t nb = 0; nb < NB; ++nb)
+        dst_reg[nb] = 0.0f;
+    uint32_t NB_cur = min(N - blockIdx.x * NB, NB);
+    for (uint32_t ic = 0; ic < IC; ic += ICB) {
+        // read ICB-channel src
+        // (NB, ICB, IHs, IWs)
+        uint32_t ICB_cur = min(ICB, IC - ic);
+        for (uint32_t i = tid; i < NB_cur * ICB * IH * IW; i += tstride) {
+            uint32_t ip = i % (IH * IW);
+            uint32_t icb = i / (IH * IW) % ICB;
+            uint32_t nb = i / (IH * IW) / ICB;
+            src_cache[i] =
+                    (icb < ICB_cur) *
+                    src[nb * INs + min(IC - 1, (ic + icb)) * IH * IW + ip];
+        }
+        __syncthreads();
+        if (oid < OC * OH * OW)
+            for (uint32_t fh = 0; fh < FH; ++fh) {
+                uint32_t ih;
+                if (is_xcorr)
+                    ih = oh * SH + fh - PH;
+                else
+                    ih = oh * SH + (FH - fh - 1) - PH;
+                if (ih < IH)
+                    for (uint32_t fw = 0; fw < FW; ++fw) {
+                        uint32_t iw;
+                        if (is_xcorr)
+                            iw = ow * SW + fw - PW;
+                        else
+                            iw = ow * SW + (FW - fw - 1) - PW;
+                        if (iw < IW)
+                            for (uint32_t icb = 0; icb < ICB_cur; ++icb) {
+                                uint32_t fid = op * IC * FH * FW * OC +
+                                               (ic + icb) * FH * FW * OC +
+                                               fh * FW * OC + fw * OC + oc;
+                                float fval = filter[fid];
+                                float src_reg[NB];
+#pragma unroll
+                                for (uint32_t nb = 0; nb < NB; ++nb) {
+                                    src_reg[nb] = src_cache[nb * ICB * IH * IW +
+                                                            icb * IH * IW +
+                                                            ih * IW + iw];
+                                }
+#pragma unroll
+                                for (uint32_t nb = 0; nb < NB; ++nb) {
+                                    dst_reg[nb] += src_reg[nb] * fval;
+                                }
+                            }
+                    }
+            }
+        __syncthreads();
+    }
+    if (oid < OC * OH * OW) {
+        for (uint32_t nb = 0; nb < NB_cur; ++nb) {
+            dst[nb * ONs + oc * OH * OW + op] = dst_reg[nb];
+        }
+    }
+}
+
+}
+
+void group_local::exec(const float* src, const float* filter, float* dst,
+                       float* wptr, uint32_t N, uint32_t IC, uint32_t IH,
+                       uint32_t IW, uint32_t OC, uint32_t OH, uint32_t OW,
+                       uint32_t FH, uint32_t FW, uint32_t G, uint32_t PH,
+                       uint32_t PW, uint32_t SH, uint32_t SW,
+                       cudaStream_t stream) {
+    MEGDNN_MARK_USED_VAR(wptr);
+    size_t threads = 256;
+    dim3 blocks = dim3(DIVUP(N, NB), DIVUP(OC * OH * OW, threads), G);
+    uint32_t INs = G * IC * IH * IW, ONs = G * OC * OH * OW;
+    forward_kernel<NB, ICB, true>
+            <<<blocks, threads, NB * ICB * sizeof(float) * IH * IW, stream>>>(
+                    src, filter, dst, N, IC, IH, IW, OC, OH, OW, FH, FW, INs,
+                    ONs, PH, PW, SH, SW);
+    after_kernel_launch();
+}
+
+size_t group_local::get_share_mem_in_bytes(uint32_t IH, uint32_t IW) {
+    return NB * ICB * sizeof(float) * IH * IW;
+}
diff --git a/dnn/src/cuda/group_local/cuda_interface.h b/dnn/src/cuda/group_local/forward/kern.cuh
similarity index 83%
rename from dnn/src/cuda/group_local/cuda_interface.h
rename to dnn/src/cuda/group_local/forward/kern.cuh
index bcd0c5f28..f23c576f4 100644
--- a/dnn/src/cuda/group_local/cuda_interface.h
+++ b/dnn/src/cuda/group_local/forward/kern.cuh
@@ -14,8 +14,9 @@
 
 namespace megdnn {
 namespace cuda {
+namespace group_local {
 
-void run_inference_kernel(const float *src, const float *filter, float *dst,
+void exec(const float *src, const float *filter, float *dst,
         float *wptr,
         uint32_t N, uint32_t IC, uint32_t IH, uint32_t IW,
         uint32_t OC, uint32_t OH, uint32_t OW,
@@ -25,6 +26,10 @@ void run_inference_kernel(const float *src, const float *filter, float *dst,
         uint32_t SH, uint32_t SW,
         cudaStream_t stream);
 
+size_t get_share_mem_in_bytes(uint32_t IH, uint32_t IW);
+
+} // namespace group_local
+
 } // namespace cuda
 } // namespace megdnn
 
diff --git a/dnn/src/cuda/group_local/fwd.cpp b/dnn/src/cuda/group_local/forward/opr_impl.cpp
similarity index 89%
rename from dnn/src/cuda/group_local/fwd.cpp
rename to dnn/src/cuda/group_local/forward/opr_impl.cpp
index f418be9a6..9bde4f76d 100644
--- a/dnn/src/cuda/group_local/fwd.cpp
+++ b/dnn/src/cuda/group_local/forward/opr_impl.cpp
@@ -1,5 +1,5 @@
 /**
- * \file dnn/src/cuda/group_local/fwd.cpp
+ * \file dnn/src/cuda/group_local/forward/opr_impl.cpp
  * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  *
  * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
@@ -14,7 +14,7 @@
 #include "src/cuda/local/local.cuh"
 #include "src/cuda/utils.h"
 
-#include "./cuda_interface.h"
+#include "src/cuda/group_local/forward/kern.cuh"
 
 namespace megdnn {
 namespace cuda {
@@ -46,7 +46,7 @@ void GroupLocalForwardImpl::exec(_megdnn_tensor_in src,
     auto one = handle->one_device();
     auto zero = handle->zero_device();
     if (prefer_inference_kernel(src.layout, filter.layout, dst.layout)) {
-        run_inference_kernel(sptr, fptr, dptr, wptr,
+        group_local::exec(sptr, fptr, dptr, wptr,
                 N, IC, IH, IW,
                 OC, OH, OW,
                 FH, FW,
@@ -141,11 +141,14 @@ bool GroupLocalForwardImpl::prefer_inference_kernel(const TensorLayout &src,
         const TensorLayout &filter,
         const TensorLayout &dst)
 {
-    megdnn_ignore(filter);
-    megdnn_ignore(dst);
-    return src.shape[0] <= 8;
+    MEGDNN_MARK_USED_VAR(filter);
+    MEGDNN_MARK_USED_VAR(dst);
+    auto handle = concrete_handle(this->handle());
+    size_t N = src.shape[0], IH = src.shape[2], IW = src.shape[3];
+    return N <= 8 && handle->device_prop().sharedMemPerBlock >=
+                             group_local::get_share_mem_in_bytes(IH, IW);
 }
 
-} // namespace cuda
-} // namespace megdnn
-// vim: syntax=cpp.doxygen
+}  // namespace cuda
+}  // namespace megdnn
+   // vim: syntax=cpp.doxygen
-- 
GitLab