From c3a4b2225d21b40e16be3f77638a38d8f8ae11e4 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Tue, 24 Nov 2020 16:59:59 +0800 Subject: [PATCH] feat(dnn/cuda): add cutlass impls for fused convolution reformat operation GitOrigin-RevId: 02ef559c3f7367a3ee40d9f5017dcf3ece72ac0f --- .../conv_bias/cutlass_convolution_wrapper.cu | 255 ++++++++++++++++++ .../conv_bias/cutlass_convolution_wrapper.cuh | 18 ++ .../implicit_gemm_int8_nchw32_imma.cpp | 149 +++++++--- .../implicit_gemm_int8_nchw4_dp4a.cpp | 36 +++ ..._ncdiv32hw32_128x128x32_64x32x32_hswish.cu | Bin 0 -> 1916 bytes ...4hw4_ncdiv32hw32_128x128x32_64x32x32_id.cu | Bin 0 -> 1910 bytes ...w4_ncdiv32hw32_128x128x32_64x32x32_relu.cu | Bin 0 -> 1914 bytes ...4_ncdiv32hw32_128x32x32_64x32x32_hswish.cu | Bin 0 -> 1915 bytes ...v4hw4_ncdiv32hw32_128x32x32_64x32x32_id.cu | Bin 0 -> 1909 bytes ...hw4_ncdiv32hw32_128x32x32_64x32x32_relu.cu | Bin 0 -> 1913 bytes ...4_ncdiv32hw32_128x64x32_64x32x32_hswish.cu | Bin 0 -> 1915 bytes ...v4hw4_ncdiv32hw32_128x64x32_64x32x32_id.cu | Bin 0 -> 1909 bytes ...hw4_ncdiv32hw32_128x64x32_64x32x32_relu.cu | Bin 0 -> 1913 bytes ...iv32hw32_1x1_128x128x32_64x32x32_hswish.cu | Bin 0 -> 1917 bytes ..._ncdiv32hw32_1x1_128x128x32_64x32x32_id.cu | Bin 0 -> 1911 bytes ...cdiv32hw32_1x1_128x128x32_64x32x32_relu.cu | Bin 0 -> 1915 bytes ...div32hw32_1x1_128x32x32_64x32x32_hswish.cu | Bin 0 -> 1916 bytes ...4_ncdiv32hw32_1x1_128x32x32_64x32x32_id.cu | Bin 0 -> 1910 bytes ...ncdiv32hw32_1x1_128x32x32_64x32x32_relu.cu | Bin 0 -> 1914 bytes ...div32hw32_1x1_128x64x32_64x32x32_hswish.cu | Bin 0 -> 1916 bytes ...4_ncdiv32hw32_1x1_128x64x32_64x32x32_id.cu | Bin 0 -> 1910 bytes ...ncdiv32hw32_1x1_128x64x32_64x32x32_relu.cu | Bin 0 -> 1914 bytes ...div32hw32_1x1_32x128x32_32x64x32_hswish.cu | Bin 0 -> 1916 bytes ...4_ncdiv32hw32_1x1_32x128x32_32x64x32_id.cu | Bin 0 -> 1910 bytes ...ncdiv32hw32_1x1_32x128x32_32x64x32_relu.cu | Bin 0 -> 1914 bytes ...cdiv32hw32_1x1_32x32x32_32x32x32_hswish.cu | Bin 0 -> 1915 bytes ...w4_ncdiv32hw32_1x1_32x32x32_32x32x32_id.cu | Bin 0 -> 1909 bytes ..._ncdiv32hw32_1x1_32x32x32_32x32x32_relu.cu | Bin 0 -> 1913 bytes ...cdiv32hw32_1x1_32x64x32_32x64x32_hswish.cu | Bin 0 -> 1915 bytes ...w4_ncdiv32hw32_1x1_32x64x32_32x64x32_id.cu | Bin 0 -> 1909 bytes ..._ncdiv32hw32_1x1_32x64x32_32x64x32_relu.cu | Bin 0 -> 1913 bytes ...div32hw32_1x1_64x128x32_64x32x32_hswish.cu | Bin 0 -> 1916 bytes ...4_ncdiv32hw32_1x1_64x128x32_64x32x32_id.cu | Bin 0 -> 1910 bytes ...ncdiv32hw32_1x1_64x128x32_64x32x32_relu.cu | Bin 0 -> 1914 bytes ...cdiv32hw32_1x1_64x32x32_64x32x32_hswish.cu | Bin 0 -> 1915 bytes ...w4_ncdiv32hw32_1x1_64x32x32_64x32x32_id.cu | Bin 0 -> 1909 bytes ..._ncdiv32hw32_1x1_64x32x32_64x32x32_relu.cu | Bin 0 -> 1913 bytes ...cdiv32hw32_1x1_64x64x32_64x32x32_hswish.cu | Bin 0 -> 1915 bytes ...w4_ncdiv32hw32_1x1_64x64x32_64x32x32_id.cu | Bin 0 -> 1909 bytes ..._ncdiv32hw32_1x1_64x64x32_64x32x32_relu.cu | Bin 0 -> 1913 bytes ...4_ncdiv32hw32_32x128x32_32x64x32_hswish.cu | Bin 0 -> 1915 bytes ...v4hw4_ncdiv32hw32_32x128x32_32x64x32_id.cu | Bin 0 -> 1909 bytes ...hw4_ncdiv32hw32_32x128x32_32x64x32_relu.cu | Bin 0 -> 1913 bytes ...w4_ncdiv32hw32_32x32x32_32x32x32_hswish.cu | Bin 0 -> 1914 bytes ...iv4hw4_ncdiv32hw32_32x32x32_32x32x32_id.cu | Bin 0 -> 1908 bytes ...4hw4_ncdiv32hw32_32x32x32_32x32x32_relu.cu | Bin 0 -> 1912 bytes ...w4_ncdiv32hw32_32x64x32_32x64x32_hswish.cu | Bin 0 -> 1914 bytes ...iv4hw4_ncdiv32hw32_32x64x32_32x64x32_id.cu | Bin 0 -> 1908 bytes ...4hw4_ncdiv32hw32_32x64x32_32x64x32_relu.cu | Bin 0 -> 1912 bytes ...4_ncdiv32hw32_64x128x32_64x32x32_hswish.cu | Bin 0 -> 1915 bytes ...v4hw4_ncdiv32hw32_64x128x32_64x32x32_id.cu | Bin 0 -> 1909 bytes ...hw4_ncdiv32hw32_64x128x32_64x32x32_relu.cu | Bin 0 -> 1913 bytes ...w4_ncdiv32hw32_64x32x32_64x32x32_hswish.cu | Bin 0 -> 1914 bytes ...iv4hw4_ncdiv32hw32_64x32x32_64x32x32_id.cu | Bin 0 -> 1908 bytes ...4hw4_ncdiv32hw32_64x32x32_64x32x32_relu.cu | Bin 0 -> 1912 bytes ...w4_ncdiv32hw32_64x64x32_64x32x32_hswish.cu | Bin 0 -> 1914 bytes ...iv4hw4_ncdiv32hw32_64x64x32_64x32x32_id.cu | Bin 0 -> 1908 bytes ...4hw4_ncdiv32hw32_64x64x32_64x32x32_relu.cu | Bin 0 -> 1912 bytes ...32_ncdiv4hw4_128x128x64_64x64x64_hswish.cu | Bin 0 -> 1923 bytes ...32hw32_ncdiv4hw4_128x128x64_64x64x64_id.cu | Bin 0 -> 1917 bytes ...hw32_ncdiv4hw4_128x128x64_64x64x64_relu.cu | Bin 0 -> 1921 bytes ...32_ncdiv4hw4_128x256x64_64x64x64_hswish.cu | Bin 0 -> 1923 bytes ...32hw32_ncdiv4hw4_128x256x64_64x64x64_id.cu | Bin 0 -> 1917 bytes ...hw32_ncdiv4hw4_128x256x64_64x64x64_relu.cu | Bin 0 -> 1921 bytes ...w32_ncdiv4hw4_128x64x64_64x32x64_hswish.cu | Bin 0 -> 1922 bytes ...v32hw32_ncdiv4hw4_128x64x64_64x32x64_id.cu | Bin 0 -> 1916 bytes ...2hw32_ncdiv4hw4_128x64x64_64x32x64_relu.cu | Bin 0 -> 1920 bytes ...cdiv4hw4_1x1_128x128x64_64x64x64_hswish.cu | Bin 0 -> 1924 bytes ...32_ncdiv4hw4_1x1_128x128x64_64x64x64_id.cu | Bin 0 -> 1918 bytes ..._ncdiv4hw4_1x1_128x128x64_64x64x64_relu.cu | Bin 0 -> 1922 bytes ...cdiv4hw4_1x1_128x256x64_64x64x64_hswish.cu | Bin 0 -> 1924 bytes ...32_ncdiv4hw4_1x1_128x256x64_64x64x64_id.cu | Bin 0 -> 1918 bytes ..._ncdiv4hw4_1x1_128x256x64_64x64x64_relu.cu | Bin 0 -> 1922 bytes ...ncdiv4hw4_1x1_128x64x64_64x32x64_hswish.cu | Bin 0 -> 1923 bytes ...w32_ncdiv4hw4_1x1_128x64x64_64x32x64_id.cu | Bin 0 -> 1917 bytes ...2_ncdiv4hw4_1x1_128x64x64_64x32x64_relu.cu | Bin 0 -> 1921 bytes ...cdiv4hw4_1x1_256x128x64_64x64x64_hswish.cu | Bin 0 -> 1924 bytes ...32_ncdiv4hw4_1x1_256x128x64_64x64x64_id.cu | Bin 0 -> 1918 bytes ..._ncdiv4hw4_1x1_256x128x64_64x64x64_relu.cu | Bin 0 -> 1922 bytes ..._ncdiv4hw4_1x1_32x64x64_16x32x64_hswish.cu | Bin 0 -> 1922 bytes ...hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_id.cu | Bin 0 -> 1916 bytes ...32_ncdiv4hw4_1x1_32x64x64_16x32x64_relu.cu | Bin 0 -> 1920 bytes ...ncdiv4hw4_1x1_64x128x64_32x64x64_hswish.cu | Bin 0 -> 1923 bytes ...w32_ncdiv4hw4_1x1_64x128x64_32x64x64_id.cu | Bin 0 -> 1917 bytes ...2_ncdiv4hw4_1x1_64x128x64_32x64x64_relu.cu | Bin 0 -> 1921 bytes ..._ncdiv4hw4_1x1_64x64x64_32x32x64_hswish.cu | Bin 0 -> 1922 bytes ...hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_id.cu | Bin 0 -> 1916 bytes ...32_ncdiv4hw4_1x1_64x64x64_32x32x64_relu.cu | Bin 0 -> 1920 bytes ...32_ncdiv4hw4_256x128x64_64x64x64_hswish.cu | Bin 0 -> 1923 bytes ...32hw32_ncdiv4hw4_256x128x64_64x64x64_id.cu | Bin 0 -> 1917 bytes ...hw32_ncdiv4hw4_256x128x64_64x64x64_relu.cu | Bin 0 -> 1921 bytes ...hw32_ncdiv4hw4_32x64x64_16x32x64_hswish.cu | Bin 0 -> 1921 bytes ...iv32hw32_ncdiv4hw4_32x64x64_16x32x64_id.cu | Bin 0 -> 1915 bytes ...32hw32_ncdiv4hw4_32x64x64_16x32x64_relu.cu | Bin 0 -> 1919 bytes ...w32_ncdiv4hw4_64x128x64_32x64x64_hswish.cu | Bin 0 -> 1922 bytes ...v32hw32_ncdiv4hw4_64x128x64_32x64x64_id.cu | Bin 0 -> 1916 bytes ...2hw32_ncdiv4hw4_64x128x64_32x64x64_relu.cu | Bin 0 -> 1920 bytes ...hw32_ncdiv4hw4_64x64x64_32x32x64_hswish.cu | Bin 0 -> 1921 bytes ...iv32hw32_ncdiv4hw4_64x64x64_32x32x64_id.cu | Bin 0 -> 1915 bytes ...32hw32_ncdiv4hw4_64x64x64_32x32x64_relu.cu | Bin 0 -> 1919 bytes dnn/test/cuda/conv_bias_int8.cpp | 67 +++++ 101 files changed, 493 insertions(+), 32 deletions(-) create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_relu.cu diff --git a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu index f9dd4c45a..e9e056e2c 100644 --- a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu +++ b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu @@ -25,6 +25,8 @@ using namespace megdnn; using namespace cuda; using namespace cutlass_wrapper; +/* ================= cutlass kernel wrapper for nchw32 layout ================ + */ #if MEGDNN_TEGRA_X1 template void megdnn::cuda::cutlass_wrapper:: @@ -148,6 +150,131 @@ INST(true); INST(false); #undef INST +/* ==== cutlass kernel wrapper for nchw32 layout and nchw4 output ===== */ +#if MEGDNN_TEGRA_X1 +template +void megdnn::cuda::cutlass_wrapper:: + do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4( + const int8_t* /* d_src */, const int8_t* /* d_filter */, + const int32_t* /* d_bias */, const int8_t* /* d_z */, + int8_t* /* d_dst */, int* /* workspace */, + const convolution::ConvParam& /* param */, + uint32_t /* nonlinear_mode */, float /* alpha */, + float /* beta */, float /* gamma */, float /* scale */, + const GemmCoord& /* threadblock_shape */, + const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {} +#else +template +void megdnn::cuda::cutlass_wrapper:: + do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4( + const int8_t* d_src, const int8_t* d_filter, + const int32_t* d_bias, const int8_t* d_z, int8_t* d_dst, + int* workspace, const convolution::ConvParam& param, + uint32_t nonlinear_mode, float alpha, float beta, float gamma, + float scale, const GemmCoord& threadblock_shape, + const GemmCoord& warp_shape, cudaStream_t stream) { +#define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \ + threadblock_k_, warp_m_, warp_n_, \ + warp_k_) \ + if (threadblock_shape.m() == threadblock_m_ && \ + threadblock_shape.n() == threadblock_n_ && \ + threadblock_shape.k() == threadblock_k_ && \ + warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \ + warp_shape.k() == warp_k_) { \ + using ThreadBlockShape = \ + cutlass::gemm::GemmShape; \ + using WarpShape = cutlass::gemm::GemmShape; \ + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; \ + using Convolution = cutlass::convolution::device::Convolution< \ + int8_t, cutlass::layout::TensorNCxHWx<32>, int8_t, \ + cutlass::layout::TensorCxRSKx<32>, ElementOutput, \ + cutlass::layout::TensorNCxHWx<4>, int32_t, \ + cutlass::layout::TensorNCxHWx<4>, int32_t, \ + cutlass::convolution::ConvType::kConvolution, \ + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, \ + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ + cutlass::convolution::threadblock:: \ + ConvolutionNCxHWxThreadblockSwizzle< \ + cutlass::convolution::ConvType::kConvolution>, \ + 2, 16, 16, NeedLoadFromConstMem>; \ + typename Convolution::ConvolutionParameter conv_param{ \ + param.n, param.ci, param.co, param.hi, param.wi, \ + param.fh, param.fw, param.ho, param.wo, param.sh, \ + param.sw, param.ph, param.pw, 1, 1}; \ + return cutlass_convolution_wrapper( \ + d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ + epilogue, stream); \ + } +#define DISPATCH_KERNEL \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(256, 128, 64, 64, 64, 64); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 256, 64, 64, 64, 64); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 64, 64, 64, 64); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 64, 32, 64, 64); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 64, 64, 32, 64); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 64, 32, 32, 64); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 64, 16, 32, 64); \ + megdnn_assert(false, \ + "unsupported threadblock shape (%dx%dx%d) and warp shape " \ + "(%dx%dx%d)", \ + threadblock_shape.m(), threadblock_shape.n(), \ + threadblock_shape.k(), warp_shape.m(), warp_shape.n(), \ + warp_shape.k()); + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementBias = int32_t; + using ElementCompute = float; + using NonlineMode = megdnn::param_enumv::ConvBias::NonlineMode; + switch (nonlinear_mode) { + case NonlineMode::IDENTITY: { + using EpilogueOp = + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + ElementOutput, 4, ElementAccumulator, ElementBias, + ElementCompute>; + typename EpilogueOp::Params epilogue{alpha, beta, gamma}; + DISPATCH_KERNEL; + } + case NonlineMode::RELU: { + using EpilogueOp = cutlass::epilogue::thread:: + BiasAddLinearCombinationReluClamp< + ElementOutput, 4, ElementAccumulator, ElementBias, + ElementCompute>; + typename EpilogueOp::Params epilogue{alpha, beta, gamma, 0}; + DISPATCH_KERNEL; + } + case NonlineMode::H_SWISH: { + using EpilogueOp = cutlass::epilogue::thread:: + BiasAddLinearCombinationHSwishClamp< + ElementOutput, 4, ElementAccumulator, ElementBias, + ElementCompute>; + typename EpilogueOp::Params epilogue{alpha, beta, gamma, scale}; + DISPATCH_KERNEL; + } + default: + megdnn_assert(false, + "unsupported nonlinear mode for conv bias operator"); + } +#undef DISPATCH_KERNEL_WITH_TILE_SHAPE +#undef DISPATCH_KERNEL +} +#endif + +#define INST(need_load_from_const_mem) \ + template void megdnn::cuda::cutlass_wrapper:: \ + do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4< \ + need_load_from_const_mem>( \ + const int8_t* d_src, const int8_t* d_filter, \ + const int32_t* d_bias, const int8_t* d_z, int8_t* d_dst, \ + int* workspace, const convolution::ConvParam& param, \ + uint32_t nonlinear_mode, float alpha, float beta, \ + float gamma, float scale, \ + const GemmCoord& threadblock_shape, \ + const GemmCoord& warp_shape, cudaStream_t stream); +INST(true); +INST(false); +#undef INST + +/* ================ cutlass kernel wrapper for nchw4 layout ================= */ #if MEGDNN_TEGRA_X1 template void megdnn::cuda::cutlass_wrapper:: @@ -275,6 +402,7 @@ INST(true); INST(false); #undef INST +/* ===== cutlass kernel wrapper for nchw4 layout and nchw output ===== */ #if MEGDNN_TEGRA_X1 template void megdnn::cuda::cutlass_wrapper:: @@ -401,4 +529,131 @@ void megdnn::cuda::cutlass_wrapper:: INST(true); INST(false); #undef INST + +/* ====== cutlass kernel wrapper for nchw4 layout and nchw32 output ====== */ +#if MEGDNN_TEGRA_X1 +template +void megdnn::cuda::cutlass_wrapper:: + do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32( + const int8_t* /* d_src */, const int8_t* /* d_filter */, + const int32_t* /* d_bias */, const int8_t* /* d_z */, + int8_t* /* d_dst */, int* /* workspace */, + const convolution::ConvParam& /* param */, + uint32_t /* nonlinear_mode */, float /* alpha */, + float /* beta */, float /* gamma */, float /* scale */, + const GemmCoord& /* threadblock_shape */, + const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {} +#else +template +void megdnn::cuda::cutlass_wrapper:: + do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32( + const int8_t* d_src, const int8_t* d_filter, + const int32_t* d_bias, const int8_t* d_z, int8_t* d_dst, + int* workspace, const convolution::ConvParam& param, + uint32_t nonlinear_mode, float alpha, float beta, float gamma, + float scale, const GemmCoord& threadblock_shape, + const GemmCoord& warp_shape, cudaStream_t stream) { +#define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \ + threadblock_k_, warp_m_, warp_n_, \ + warp_k_, aligned_) \ + if (threadblock_shape.m() == threadblock_m_ && \ + threadblock_shape.n() == threadblock_n_ && \ + threadblock_shape.k() == threadblock_k_ && \ + warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \ + warp_shape.k() == warp_k_) { \ + using ThreadBlockShape = \ + cutlass::gemm::GemmShape; \ + using WarpShape = cutlass::gemm::GemmShape; \ + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ + using Convolution = cutlass::convolution::device::Convolution< \ + int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ + cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ + cutlass::layout::TensorNCxHWx<32>, int32_t, \ + cutlass::layout::TensorNCxHWx<32>, int32_t, \ + cutlass::convolution::ConvType::kConvolution, \ + cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ + cutlass::convolution::threadblock:: \ + ConvolutionNCxHWxThreadblockSwizzle< \ + cutlass::convolution::ConvType::kConvolution>, \ + 2, 4, aligned_, NeedLoadFromConstMem>; \ + typename Convolution::ConvolutionParameter conv_param{ \ + param.n, param.ci, param.co, param.hi, param.wi, \ + param.fh, param.fw, param.ho, param.wo, param.sh, \ + param.sw, param.ph, param.pw, 1, 1}; \ + return cutlass_convolution_wrapper( \ + d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ + epilogue, stream); \ + } +#define DISPATCH_KERNEL \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 16); \ + megdnn_assert(false, \ + "unsupported threadblock shape (%dx%dx%d) and warp shape " \ + "(%dx%dx%d)", \ + threadblock_shape.m(), threadblock_shape.n(), \ + threadblock_shape.k(), warp_shape.m(), warp_shape.n(), \ + warp_shape.k()); + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementBias = int32_t; + using ElementCompute = float; + using NonlineMode = megdnn::param_enumv::ConvBias::NonlineMode; + switch (nonlinear_mode) { + case NonlineMode::IDENTITY: { + using EpilogueOp = + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + ElementOutput, 4, ElementAccumulator, ElementBias, + ElementCompute>; + typename EpilogueOp::Params epilogue{alpha, beta, gamma}; + DISPATCH_KERNEL; + } + case NonlineMode::RELU: { + using EpilogueOp = cutlass::epilogue::thread:: + BiasAddLinearCombinationReluClamp< + ElementOutput, 4, ElementAccumulator, ElementBias, + ElementCompute>; + typename EpilogueOp::Params epilogue{alpha, beta, gamma, 0}; + DISPATCH_KERNEL; + } + case NonlineMode::H_SWISH: { + using EpilogueOp = cutlass::epilogue::thread:: + BiasAddLinearCombinationHSwishClamp< + ElementOutput, 4, ElementAccumulator, ElementBias, + ElementCompute>; + typename EpilogueOp::Params epilogue{alpha, beta, gamma, scale}; + DISPATCH_KERNEL; + } + default: + megdnn_assert(false, + "unsupported nonlinear mode for conv bias operator"); + } +#undef DISPATCH_KERNEL_WITH_TILE_SHAPE +#undef DISPATCH_KERNEL +} +#endif + +#define INST(need_load_from_const_mem) \ + template void megdnn::cuda::cutlass_wrapper:: \ + do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32< \ + need_load_from_const_mem>( \ + const int8_t* d_src, const int8_t* d_filter, \ + const int32_t* d_bias, const int8_t* d_z, int8_t* d_dst, \ + int* workspace, const convolution::ConvParam& param, \ + uint32_t nonlinear_mode, float alpha, float beta, \ + float gamma, float scale, \ + const GemmCoord& threadblock_shape, \ + const GemmCoord& warp_shape, cudaStream_t stream); +INST(true); +INST(false); +#undef INST + // vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh index 2d78e8c37..85fdd29e6 100644 --- a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh +++ b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh @@ -40,6 +40,15 @@ void do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32( const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, cudaStream_t stream); +template +void do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4( + const int8_t* d_src, const int8_t* d_filter, const int32_t* d_bias, + const int8_t* d_z, int8_t* d_dst, int* workspace, + const convolution::ConvParam& param, uint32_t nonlinear_mode, + float alpha, float beta, float gamma, float scale, + const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, + cudaStream_t stream); + template void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4( const int8_t* d_src, const int8_t* d_filter, const int32_t* d_bias, @@ -58,6 +67,15 @@ void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw( const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, cudaStream_t stream); +template +void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32( + const int8_t* d_src, const int8_t* d_filter, const int32_t* d_bias, + const int8_t* d_z, int8_t* d_dst, int* workspace, + const convolution::ConvParam& param, uint32_t nonlinear_mode, + float alpha, float beta, float gamma, float scale, + const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, + cudaStream_t stream); + } // namespace cutlass_wrapper } // namespace cuda } // namespace megdnn diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp index 80fd2d35f..b02e9027a 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp @@ -35,10 +35,23 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::is_available( if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout), param.format)) return false; - if (param.format != Format::NCHW32) + if (param.format != Format::NCHW32 && param.format != Format::NCHW32_NCHW4) return false; - UNPACK_CONV_BIAS_NCHW32_PARAM(*(args.src_layout), fm, *(args.dst_layout), - param); + size_t n = args.src_layout->operator[](0), + ci = args.src_layout->operator[](1) * 32, + hi = args.src_layout->operator[](2), + wi = args.src_layout->operator[](3); + size_t ho = args.dst_layout->operator[](2), + wo = args.dst_layout->operator[](3); + size_t co; + if (param.format == Format::NCHW32) { + co = args.dst_layout->operator[](1) * 32; + } else { + megdnn_assert(param.format == Format::NCHW32_NCHW4); + co = args.dst_layout->operator[](1) * 4; + } + UNPACK_CONV_PARAMETER(fm, param); + MARK_USED_VAR // TODO support group conv available &= param.sparse == Sparse::DENSE; // mode must be cross correlation @@ -84,8 +97,21 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec( using Format = Param::Format; auto&& param = args.opr->param(); auto&& fm = args.filter_meta; - UNPACK_CONV_BIAS_NCHW32_PARAM(*(args.src_layout), fm, *(args.dst_layout), - param); + size_t n = args.src_layout->operator[](0), + ci = args.src_layout->operator[](1) * 32, + hi = args.src_layout->operator[](2), + wi = args.src_layout->operator[](3); + size_t ho = args.dst_layout->operator[](2), + wo = args.dst_layout->operator[](3); + size_t co; + if (param.format == Format::NCHW32) { + co = args.dst_layout->operator[](1) * 32; + } else { + megdnn_assert(param.format == Format::NCHW32_NCHW4); + co = args.dst_layout->operator[](1) * 4; + } + UNPACK_CONV_PARAMETER(fm, param); + MARK_USED_VAR auto&& stream = cuda_stream(args.opr->handle()); int8_t* filter_ptr = nullptr; @@ -137,33 +163,79 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec( } uint32_t nonlinear_mode = static_cast(param.nonlineMode); if (fh == 1 && fw == 1) { - cutlass_wrapper::do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32< - false>(args.src_tensor->compatible_ptr(), filter_ptr, - args.bias_tensor->compatible_ptr(), z_dev_ptr, - args.dst_tensor->compatible_ptr(), nullptr, - kern_param, nonlinear_mode, alpha, beta, gamma, - dst_scale, - cutlass_wrapper::GemmCoord{m_algo_param.threadblock_m, - m_algo_param.threadblock_n, - m_algo_param.threadblock_k}, - cutlass_wrapper::GemmCoord{m_algo_param.warp_m, - m_algo_param.warp_n, - m_algo_param.warp_k}, - stream); + if (param.format == Format::NCHW32) { + cutlass_wrapper::do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32< + false>( + args.src_tensor->compatible_ptr(), filter_ptr, + args.bias_tensor->compatible_ptr(), z_dev_ptr, + args.dst_tensor->compatible_ptr(), nullptr, + kern_param, nonlinear_mode, alpha, beta, gamma, dst_scale, + cutlass_wrapper::GemmCoord{m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k}, + cutlass_wrapper::GemmCoord{m_algo_param.warp_m, + m_algo_param.warp_n, + m_algo_param.warp_k}, + stream); + } else { + megdnn_assert(param.format == Format::NCHW32_NCHW4); + cutlass_wrapper:: + do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4< + false>( + args.src_tensor->compatible_ptr(), + filter_ptr, + args.bias_tensor->compatible_ptr(), + z_dev_ptr, + args.dst_tensor->compatible_ptr(), nullptr, + kern_param, nonlinear_mode, alpha, beta, gamma, + dst_scale, + cutlass_wrapper::GemmCoord{ + m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k}, + cutlass_wrapper::GemmCoord{m_algo_param.warp_m, + m_algo_param.warp_n, + m_algo_param.warp_k}, + stream); + } } else { - cutlass_wrapper::do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32( - args.src_tensor->compatible_ptr(), filter_ptr, - args.bias_tensor->compatible_ptr(), z_dev_ptr, - args.dst_tensor->compatible_ptr(), nullptr, kern_param, - nonlinear_mode, alpha, beta, gamma, dst_scale, - cutlass_wrapper::GemmCoord{m_algo_param.threadblock_m, - m_algo_param.threadblock_n, - m_algo_param.threadblock_k}, - cutlass_wrapper::GemmCoord{m_algo_param.warp_m, - m_algo_param.warp_n, - m_algo_param.warp_k}, - stream); + if (param.format == Format::NCHW32) { + cutlass_wrapper::do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32< + true>( + args.src_tensor->compatible_ptr(), filter_ptr, + args.bias_tensor->compatible_ptr(), z_dev_ptr, + args.dst_tensor->compatible_ptr(), nullptr, + kern_param, nonlinear_mode, alpha, beta, gamma, dst_scale, + cutlass_wrapper::GemmCoord{m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k}, + cutlass_wrapper::GemmCoord{m_algo_param.warp_m, + m_algo_param.warp_n, + m_algo_param.warp_k}, + stream); + } else { + megdnn_assert(param.format == Format::NCHW32_NCHW4); + cutlass_wrapper:: + do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4< + true>( + args.src_tensor->compatible_ptr(), + filter_ptr, + args.bias_tensor->compatible_ptr(), + z_dev_ptr, + args.dst_tensor->compatible_ptr(), nullptr, + kern_param, nonlinear_mode, alpha, beta, gamma, + dst_scale, + cutlass_wrapper::GemmCoord{ + m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k}, + cutlass_wrapper::GemmCoord{m_algo_param.warp_m, + m_algo_param.warp_n, + m_algo_param.warp_k}, + stream); + } } + after_kernel_launch(); } std::string ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::to_string( @@ -189,8 +261,21 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec_preprocess( using Format = Param::Format; auto&& param = args.opr->param(); auto&& fm = args.filter_meta; - UNPACK_CONV_BIAS_NCHW32_PARAM(*(args.src_layout), fm, *(args.dst_layout), - param); + size_t n = args.src_layout->operator[](0), + ci = args.src_layout->operator[](1) * 32, + hi = args.src_layout->operator[](2), + wi = args.src_layout->operator[](3); + size_t ho = args.dst_layout->operator[](2), + wo = args.dst_layout->operator[](3); + size_t co; + if (param.format == Format::NCHW32) { + co = args.dst_layout->operator[](1) * 32; + } else { + megdnn_assert(param.format == Format::NCHW32_NCHW4); + co = args.dst_layout->operator[](1) * 4; + } + UNPACK_CONV_PARAMETER(fm, param); + MARK_USED_VAR TensorLayout src{{co, ci / 32, fh, fw, 32}, dtype::Int8()}; src.init_contiguous_stride(); TensorLayout dst = src; diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp index 3eb7bb9fc..58c631444 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp @@ -208,6 +208,24 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( stream); } else { megdnn_assert(param.format == Format::NCHW4_NCHW32); + cutlass_wrapper:: + do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32< + false>( + args.src_tensor->compatible_ptr(), + filter_ptr, + args.bias_tensor->compatible_ptr(), + args.z_tensor->compatible_ptr(), + args.dst_tensor->compatible_ptr(), nullptr, + kern_param, nonlinear_mode, alpha, beta, gamma, + dst_scale, + cutlass_wrapper::GemmCoord{ + m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k}, + cutlass_wrapper::GemmCoord{m_algo_param.warp_m, + m_algo_param.warp_n, + m_algo_param.warp_k}, + stream); } } else { if (param.format == Format::NCHW4) { @@ -246,6 +264,24 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( } else { megdnn_assert(param.format == Format::NCHW4_NCHW32); + cutlass_wrapper:: + do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32< + true>( + args.src_tensor->compatible_ptr(), + filter_ptr, + args.bias_tensor->compatible_ptr(), + args.z_tensor->compatible_ptr(), + args.dst_tensor->compatible_ptr(), nullptr, + kern_param, nonlinear_mode, alpha, beta, gamma, + dst_scale, + cutlass_wrapper::GemmCoord{ + m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k}, + cutlass_wrapper::GemmCoord{m_algo_param.warp_m, + m_algo_param.warp_n, + m_algo_param.warp_k}, + stream); } } after_kernel_launch(); diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..b2c9c4628f4f1084d10517b099b89824b4beb5c9 GIT binary patch literal 1916 zcmb7FZBN@U5dPj@;T37xP@tk9giMD7r7LK>Or#R~7P+?DT6G-DFD>QQ@0_=$tQ}CP ziF5XIchB9E%bu8^FZYAt^}|Cl8VsKm)SP+}c zxlxHowX}zN6#!zYGQ-i57$wvcWfMfblf>HAo;GBf5;W}h5fcQ?Stlsc-p+RI70wuH zA3S^IEoTRs7?N_wP5T2F>x`i8fJn#`k^zWNRC>ly+Zx&lR5~@4j!^Dw{g{NuoSC4) zB~zZJz&&S@EDh0`n?ve_lI>Q@2k+>PtTOLnL(z|lGsZGEVm#u?X6B*)a`XJsJGnUD z!2A>v8107jU!LM$|H7`V+l?th9e3+kqnY7^UCE5j;~CMsi4F8H9>O)>>K%2@B9vbm z4xWk8?;=l6B6Qph|2pukf<}1iM5f+_A3@j%tqU~JLS|FXA9XWbURp7BIVp^1XL!C=vqR#7kjkeMwT?YoRT_yv+_mMhKS2-2~S4Vj?r*Y94)1eI|Wy+e{`X zs5E2D_(D*QapPZRdMY#s_7dfW%HC1d`-$ld4K_@1+x3u0T;SaXt({P=Vtx z=U|BznP4d8Q>GLY8>UFOZj6Stp~1778@)y!F22>Ogp=?>J7|@H5r-OwFdj&rautTl zK4J+BuQKx>=nvouZ3XBgA2pvthj%Mr!@&O>5`-IouMO0H`IYBc(0g4McyspAGBb0l z39VWdh;Mh*sN3)AV^P%TYq%|v{1ejfzuir(T<^4Zj~e6Lo2^_`BdFmhC?R$*WYfeU V7o4wIkQe{} literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..e758d0da7b78dc2c888d8b1bcc587eddc394e681 GIT binary patch literal 1910 zcmb7FZBN@U5dPj@;T37xP@tk9giMD7rR%2gGLcH`TjbhqYt?Zq$LV;=+pgRc>C~>j0VG}>*UW#tJ6VKuDBtVGc;cNhlDyt5}K(+G8QCH z=G>@6q*~_3dhG#XsxrgTk{Bh_6lD`cos}fdTL;>ZX-d$r-$zUkIG0;Nk#_lP-(KO2 zq4v?UQ!eN1NE1U+ZnJ`T61$uolvsfYB}#6-H~Q7ba_Cj11#Mrc!@ffh2GI{v7e>GINwu??g!zJV5B zGj=BwC#IjJW1)x-*q0>Ly%yT>3k|jqEJCn6?Iy6U7ZaHgTQlf`-ZSaE!)6*@JEa+8 z#utKej2mZ*xK*55A0fGdEgUX|iSbrOK9kW}Lr(KXQqe}7 z#LyYWj~a@U=dnm3R_&?Si_>#xM~LxSJ`t+*5TVyCj?kuYCBF8&m1EJru@~i5=?eK4 zF~e@W6fZB57bV^8Yd7DkGCpYX48(OFArK>fom5qddnc{Xat&IFiFJPTfeI{#2?tBG z$OJ*c-=l+JYPfGZ{@t-m7|jluu` literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..f707002d6b15af94eb308bad30475306f8ae1e49 GIT binary patch literal 1914 zcmb7FZBN@U5dPj@;T37xP@tk9giMD7rR%2gGLcH`TjbhqYt?Zq$LV;=+pgRc>C~>j0VG}>*UW#tJ6VKuDBtVGc;cNhlDyt5}K(+G8QCH z=G>@6q*~_3dhG#XsxrgTk{Bh_6lD`cos}fdTL;>ZX-d$r-$zUkIG0;Nk#_lP-(KO2 zq4v?UQ!eN1NE1U+ZnJ`T61$uolvsfYB}#6-H~Q7ba_Cj11#Mrc!@ffh2GI{v7e>GINwu??g!zJV5B zGj=BwC#IjJW1)x-_{62_OOond3+?!YCR+#?Ay}Sv6Ij=aiOh(tS@c2knRMQ1GZn9& z(u^_V3qd)?jk87EDo(AD5az;%D@=t1Hr9}Bu?1~zB(=*H4wvG@cq=2H$!M)1r+Fi( zXd_T!=nUgW4MocHSfmiE_EhY}={d9{#CR>C2-TX1(CaowXw$qBUwhulvFPF03v;V< zg@lWkVK-iimlw&4lJ54ko9|T_A9Q&J0y~co2$H`}sw&03lU8WC20g{ZIzRqE1(w5< zgC$yIf}xa8nNm<}m?8eUH5fLA2G440^csD*`ckVBPQvSKp;Zb79BLfGcp!Pol^-pK zh$S$(%FMl>-+?Q%<)M=>Xg-GaZf2E)h8~HJnYr&j-#k6{PA<+j zFh7L^M!RAC=g0W>TiCUAyD?>`<8B>mG&7vAE1A)GJR_Pnv4I}OL%8N!y`%0~gpRx6 zSL2_E(eDCJPu}2uDd>ZzPGss`_)&!5yROeb3zCSN?4VT&CLC%U!gwHg%2k*x`-mkl zyUNUipg({sv=yL}eA0Xl9iFX#4fFnUNDyuSzBEw(3ETK@nIu8-mX literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..5158b527d76fc935e73de1de686870785afc514e GIT binary patch literal 1909 zcmb7FZBHUG5dO}u=wWiXGpj^2##GKkk;BXRa&bw#Z=2Ef2E)h8~HJnYr&j-#k6{PA<+j zFh7L^M!RAC=g0W>TiCUAyD?>`<8B>mG&7vAE1A)GJR_Pnv4I}OL%8N!y`%0~gpRx6 zSL2_E(eDCJPu}2uDd>ZzPGss`_)&!5yROeb3zN44q;8 zprJ_C#v+ASwWne)PEVm7d5qWUiBPSF2)%A`gw~BK@wMlT9E<*qgC;jhSE#p$neWC+ z@$w>hQPSYsI3Fe-sK9X;aIi#+OfZ!4 zDN_oH4dWwRH)g`x(BN4Oj9#M;7vE}C!by0N9kfcpY(tGh7!M>*xe8NdAF%|cR+)Ja z^apT-wgPmL&zR4l!&4QoVbXsN3BnD)mj>#;{L1qz=)JBBJURPlnVGrOgjOvJ#J9U@ z)a_68mMCiUHT*4-{N2&;zuir(T<^4Z4;tg#o2^_`!>8dWC?R$*UDL!N7mNg%jf?;Q literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..21c01d3675fc3eeec3f93b2bcb86fe02c95c25be GIT binary patch literal 1913 zcmb7FZBHUG5dO}u=wWiXGpj^2##GKkk;BXRa&bw#Z=2Ef2E)h8~HJnYr&j-#k6{PA<+j zFh7L^M!RAC=g0W>TiCUAyD?>`<8B>mG&7vAE1A)GJR_Pnv4I}OL%8N!y`%0~gpRx6 zSL2_E(eDCJPu}2uDd>ZzPGss`_)&!5yROeb3z4Vc|DnUM_8Dqv5 zf^v)-XN$O1oLVAzn6nVBFck_|Tk~v-E$DJ1scp9WaH&m*kgC+Ve(^MGwb8mm8%k6kNp2 zcjKjad6B#*>2_b+`CgRqLC+^3u+s>EAo=U0s#4r}>4cUm&{IsD57Q4+;5ZC9SfWKH z7)tq+DFwxb5fZK&b75_0@T`VLuhEB#Z?!7nBs|RyTBTsVp~fMM2a=~;g~_sySOSx) z%sdGC1Gqw40XoU&%;(VI$qLvo?LUVE;RfJK1NC2i<#`tLUe^VloPD&+%-m{1tCj`g z+g&y4_NRJN6gB!9{uW98{%H8$?xt3*ciOuLjdAYHR<5c6)NmA(5IdN#Y2pwJJXm}` Q-}R2tnQrZI#l)ob51z7*Bme*a literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..b28f68247219e07bca34c94fcb166aacd470fab4 GIT binary patch literal 1915 zcmb7FYj4^x6#dSx@TfFxRaykCs>)@RTE^(KKBfk#`xcqlfQ92&en9E3-?j4~bg;Gv zCC;_aeVvn=13pEc?uMi9_xH(oIC{KF{&ZWN4w^B^G_j1L$=W|8siP!GbGb|=oS0<6 zv`l!WggI7g58yMIYlc=tOD<<9pQ6-RL5yh~C{1P=L8HL{QBL5Tu{=e}nfboG!Wl*F zqbDbwVf085O)_SgZodO#tmdgbA_6jnWB|ftDja2~ZFJi4rF802I$YZG&12#pLuEYW zHkq+3gW5wS$x0KYm_AM&m!jQjIqx0al6CHEtW)%@su^Puj2MrZG`YSXJl{M$_fIb` zwlF`qfQt6R2G5W2uYY0RjoptaL!I>MSmU{7gkFn0UBq*uSQG2Z!*~SOu+{JO&Lebo z8leXMiD>mEu)u8LeoElMrIzRNO?Wp#@Za=jsJO^yjy zOxcpB4CBVxB5oC@7D)(m?!y(PLIWFX$hO!5o*PN+vW3H?Ix*hL$Y(NME68clNGjSW zl<0Jh@q>aU858piV%46Cy*N7qQbLT^8j4V@iU_?XIzk)zN__2kE60MxiC5-U>k17Q zF~csf`SK!I(bC<%cGr7Z#_t8DyJrysNwU{TRjs&l!g3|nps1Kw=jR`&z%m$futdw8 zQ^15!Z>&{m8)t5H5_>*#1>|3mRJNM4;Ei9 Q_I;&Pu3867Qa)|{1t_nN=Kufz literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..f106af213e1889d6744bb87bfc84f268b48ee5bf GIT binary patch literal 1909 zcmb7FYj4^x6#dSx@TfFxRaykCs>)@RTFTh8KBfk#`xY77fQ92&jt!;1e%H=}(81aw zlsMNu_jOKg4#X6Fx*Lve?(dWFaP)YU{OPtj9W>*L8)7*_leK?HsAD9dnOY_jLGon5 zjY>qSWqz#J9w25aGaRjmQ9{j7HbvA~N%FjPpbeR&1dRp*!~}tJxfK*?m(Ta@70wuH zA3Ztca?Xx4F(l=doAx^}xix~?BO)PFNCqH6QRx^*?cC6gFQrqL(h7%CG~ z*ks1j6lxEZBr8L-=H{3>p=7(&a^5?-CF{)D*iiJXsu^P$j2Mr($}@96c>eM9+&{g% z*us1l5-Qpc8$3V8zy5{Y2i5MF^HBy#&_vVk$FY>&^{8?wNGnVKWUcozjdk<4Zv~ z#*MQ@+$v5jj}YeEhbv44`x|S>w%7uO8%gc5g~O#TG2Y6^XEI)E$Z63?D%xn17&^!J zK|_=BJQgX$sy!8Zae4-Hgcz^Y6QNoU5qeE=gf_&L_}cSUjs<@cugR^}73wWwhTV82 zUS1?GTDsfU?s_lF_`Se%_bfsnM*cdfsug!mTA}3{v=kHT{Nw`_SPlaYmS~v?hEhIb zNeE93uOxPG2byfqT-{9f$ORY*c32(9msuavN)HsCkQ1X;3KUEG9OJHi1nR`LM z16OFvL#JWJd<^a1Dv$M({$ofGt_Qv}Q2(v3JkPw|>%PF7bBI=%S>&3~iZWmLZdZ-I z{jRd2ecvaY M>DB>POiWvU0bYQOkpKVy literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..7f45ef6209a6aa7d3a35034a79017ed3a4df73c7 GIT binary patch literal 1913 zcmb7FZExBz5dO}u@TfFxRaykCs>)@RTFTh8zDx~L_boE90Sm{m`~s!FerM-}(81aw zlsIQUclX>qxg78*`gAuO-Q3?ND*4lGbvkIqB-6w)iY9CSkfe^1B+cbAnQ&r~ z1=BL&nG)t$tv!IxWUd)n5iPl#p?r!`X9Y2)b)YnvWdw}|14KE2bH?%%DQD*U_6lbd zwU3^hbcWF*MKsBnWxD+ijIo-h_J|0`6p{f5m#J`+p|;U!$5ZLlR61PR^UY)89|JR< za+}OpmI3#`Bw1;q6w}A4<5IL+E$6+XTe8lbjdhB?Rh%&v!HDsgNt5gQ!Sj!&=l<#C z#TMo}7rGivt^|fV;eYOd<`AGqV$$a zM)V-hCR`F9@R12O5F}Im7TWO(J+=@qLa;pPC9tj+Q;`!}S8o84&xP|&o2hvDlxB=6 zTk@1)+&Ejrt>V-Y31QBCxWZH@U}Fv07F*EeMpC?D->MB z47>4)zq|-mlytYR-Fz?0_`T5S?pcIDknD9*RVnVAuw02X=qV=F`RNBLundMAEYULO z6lH8irG#R`2=Ujgxv()bcveHB-{`~Tms*u@65eJDt&%X`P~#BBL%}j8{bV^rG=a%g zX6^<34qTxv51ocN^D(r4vpm*M`;Q?(xE}b@K>e3rd7gT`*L8t6=Mb%Oy)cTTRm(i_ z-L4vS`(1r0iW+_Pw`CH(9}WMz-PFo;Ryq5iFb>{K<*FJ$4M$!Hv4shnB^JTJgT>d2 QecvdRtJVRNluuiK0lBGOr#R~7P+?DT6G-DFD>QQ@0_=$tQ}CP ziF5XIchB9E%bu8^FZYAt^}|Cl8VsKm)SP+}c zxlxHowX}zN6#!zYGQ-i57$wvcWfMfblf>HAo;GBf5;W}h5fcQ?Stlsc-p+RI70wuH zA3S^IEoTRs7?N_wP5T2F>x`i8fJn#`k^zWNRC>ly+Zx&lR5~@4j!^Dw{g{NuoSC4) zB~zZJz&&S@EDh0`n?ve_lI>Q@2k+>PtTOLnL(z|lGsZGEVm#u?X6B*)a`XJsJGnUD z!2A>v8107jU!LM$|H7`V+l?th9e3+kqnY7^UCE5j;~CMsi4F8H9>O)>>K%2@B9vbm z4xWk8?;=l6B6Qph|2pukf<}1iM5f+_A3@j%tqU~JLS|FXA9XWbURp7BIVp^1XL!C=vqR#7kjkeMwT?YoRT_yv+_mMhKS2-2~S4Vj?r*Y94)1eI|Wy+e{`X zs5E2D_(D*QapPZRdMY#s_7dfW%HCG$GRRw^>!SxcAZtEmt6^m^dFMAjrUR z7<8~ii%c+-@+ngastsc#TsLOJ+Rz|c4US%;5f|TTS;9$pqaE~0!Hh$VLl_SvPq_-y zWgoEwrdOGHQ1l0Ig}4HAlFypYp~K4+uwmkV4hhN)z}E)qzx>McEC{}?3%ok}XqlP0 z)r3|p3&gj(YSis_^|dH!^flZTN&XFK_}}iPmacc&yGM<2?#))Nsu|R96qFDpQ*Ly|g5k~Ei#WXy?4 z=1j|kXG)l3we|o$mAPhUNwnm0it-6cofX8G)`8Mwnh`YY_Yvg;&Kb*7q@0=U+bf(= z)INH4(iujN6wxGOmg)98Fve=0+9M($Q%D9NT&BWNhT2A_9Z#iGQ|WMNFE)>fe+eQpGD}j>;HA&pAs73QpcI`*iV>9W#_u??g!zJV5B zQ+mfGBf6hwV=jpg*cT*Iy%yT>3k|jqEJCn6?Iy6U7ZZ^aTQlf`-gDu+!)6*@JEa+8 z$`(9j7&p!qajQ7BK0=raAFePJ?cZ2Kw#62-xRKN@TR2<_6XUInd?ur{f}G}!q@s;D ziB4x2KPo7aF)_~|R_&?Si?efRM~LxSJ`t+*5TVyCj?kuYCBF8&m1EJru@~i5=?eK4 zF~e@W{<<|4Hiib#YGm{pjkx+!%Mwn)yKJFX5~dq!9Kv`YSjMEED~E_C zFt^Igy`tZNE5zlYlQ3mIhW4+O$NE|SF(fG0178}b|MDx(Q!n_sF7WCcqGhhD!JDaERnw>8$SWbXFkiF8BA9lt S_0% literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..0a7d3c990753413a214bc256be4e40b13aebc819 GIT binary patch literal 1915 zcmb7FZBN@U5dPj@;T37xP@tk9giMD7rR%2gGLcH`Tja)WYt?Zqzx0J)zq9kwl(hpY zHF3^pQ*Ly|g5k~Ei#WXy?4 z=1j|kXG)l3we|o$mAPhUNwnm0it-6cofX8G)`8Mwnh`YY_Yvg;&Kb*7q@0=U+bf(= z)INH4(iujN6wxGOmg)98Fve=0+9M($Q%D9NT&BWNhT2A_9Z#iGQ|WMNFE)>fe+eQpGD}j>;HA&pAs73QpcI`*iV>9W#_u??g!zJV5B zQ+mfGBf6hwV=jpg_{43!XBJ8)u8SRh(KQAC2-TX1(CaowXw$qBUwhulvFPF03v;V< zg@lWkVK-j#mlwf`lJ54ko9|T_A9Q&J0y~e;gb2f4XI0hW-U-WFVLT8lW75x- zLqrppU1jE8(eJ<&;_}c*m^2?l`&Y|j{k;De5|rzKFAdax`IYCX7kph8cy$iZGS_pX zNLsbb6W{HsQMcdKx1y-gXMbBH;S196zuQeMU1ycEj|$`9%~Yf2E)h8~HJnYr&j-#k6{PA<+j zFh7L^M!RAC=g0W>TiCUAyD?>`<8B>mG&7vAE1A)GJR_Pnv4I}OL%8N!y`%0~gpRx6 zSL2_E(eDCJPu}2uDd>ZzPGss`_)&!5yROeb3zmhYk-{z=nbUIV31I0ACuY|MDx(vmp4oF7W8=qh)61Rufva zED+!Bs!_K;)ytx&(bw>|Nb*-m!~b?SwRFAH-aTlHb8ohCRn4G=qo9P?!H`W8hhXNx S;`{lo*Obn5YmX}?Car&ev5+kQ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..35b97bcda599aed2b2404d4968942c8f36d5f7b4 GIT binary patch literal 1910 zcmb7FZBHUG5dO}u=wWiXGpj^2##GKkk;BXRa&bw#Z=2Ef2E)h8~HJnYr&j-#k6{PA<+j zFh7L^M!RAC=g0W>TiCUAyD?>`<8B>mG&7vAE1A)GJR_Pnv4I}OL%8N!y`%0~gpRx6 zSL2_E(eDCJPu}2uDd>ZzPGss`_)&!5yROeb3zN44q;8 zprJ_C#v+ASwWne)PEVm7d5qWUiBPSF2)%A`gw~BK@wMlT9E<*qgC;jhSE#p$neWC+ z@$w>hQPS#V9;+s+4`kptOgLDgMJ5It{X#PZDLHvEa4)OX zHR|@KdQB8H`WpTgN&fO^_}}iPmacc&y9bSN?#))Ns_D~k6qFD<7_VvK5KKE*d_Ujy Og3_68?QzA#r1cN8N{!b5 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..8ea93a1c4d9f0621c4d5fcb17b5e95defec5b051 GIT binary patch literal 1914 zcmb7FZBHUG5dO}u=wWiXGpj^2##GKkk;BXRa&bw#Z=2Ef2E)h8~HJnYr&j-#k6{PA<+j zFh7L^M!RAC=g0W>TiCUAyD?>`<8B>mG&7vAE1A)GJR_Pnv4I}OL%8N!y`%0~gpRx6 zSL2_E(eDCJPu}2uDd>ZzPGss`_)&!5yROeb3z4Vc|DnUM_8Dqv5 zf^v)-XN$O1oLVAzn6nVBFck_|Tk~v-E$DJ1scp9WaH&m*kgC+Ve(^MGwb8mm8%k6kNp2 zcjKjad6B#*>2_b+`CgRqLC+^3u+s=lh_w85R#h$TymUg#6$mOO&WG^_GH@KG94yfy z6AYz%%9Mg?!wd=6jlr-sG>BGHqt|G}#kX3Pa1tJ82fb1-;85cb#skSyuEJ>9M=XKS zRc0O({Q+Ddt^l3ngXVMS@Mr~W826t;f^q}!rGfe{zw$f_g0JfWkIp_?W@c_Rp;gNQ z@$Ie}b^BAjDvBC?4S$Oye}OdoZ+BBm*E{XqgT^@bW-C|K1Zp@6N{Ahd*fennCLS!l RpYM7}=}fowxME_``UemOk8c0~ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..3681959837ff5df0c0583c4b36c43592099f1aab GIT binary patch literal 1916 zcmb7FYj4^x6#dSx@TfFxRaykCs>)@RTE^(KKBfk#`xcqlfQ92&en9E3-?j4~bg;Gv zCC;_aeVvn=13pEc?uMi9_xH(oIC{KF{&ZWN4w^B^G_j1L$=W|8siP!GbGb|=oS0<6 zv`l!WggI7g58yMIYlc=tOD<<9pQ6-RL5yh~C{1P=L8HL{QBL5Tu{=e}nfboG!Wl*F zqbDbwVf085O)_SgZodO#tmdgbA_6jnWB|ftDja2~ZFJi4rF802I$YZG&12#pLuEYW zHkq+3gW5wS$x0KYm_AM&m!jQjIqx0al6CHEtW)%@su^Puj2MrZG`YSXJl{M$_fIb` zwlF`qfQt6R2G5W2uYY0RjoptaL!I>MSmU{7gkFn0UBq*uSQG2Z!*~SOu+{JO&Lebo z8leXMiD>mEu)u8LeoElMrIzRNO?Wp#@Za=jsJO^yjy zOxcpB4CBVxB5oC@7D)(m?!y(PLIWFX$hO!5o*PN+vW3H?Ix*hL$Y(NME68clNGjSW zl<0Jh@q>aU858piV%46Cy*N7qQbLT^8j4V@iU_?XIzk)zN__2kE60MxiC5-U>k17Q zF~csf`SK!I(bC<%cGr7Z#_t8DyJr!a5@Fcutg2qzIbpdHYmih-tn&j9WMCQ0Ias1) z&MC^+j7kaJhAHB&TccrPXcSt_jediO%P+Mo;Uv7!7Pyiy;!xud#zVm}CjD?ZL^Ofn zRc7uL{SI6qE)SiCQS&jhf4Mx?5B!fILAf6I(m?&UzVbZvg0K4mug)P_<$7ThNh`{H z;k#Wm`u4l}SQI__>~G5?d_o%jce|;j>#TD2L17%cnaWi)gBp&!5@HKOHcKpmnFoun R7yCX_Dp#!oCMln`{sNk}kTL)O literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..ebeb82911b4e51fbc55541f23a10a9ab2fb6a9e5 GIT binary patch literal 1910 zcmb7FYj4^x6#dSx@TfFxRaykCs>)@RTFTh8KBfk#`xcqlfQ92&en9E3-?j4~bg;Gv zCC;_aeVvn=13pEc?uMhA`}<@(96eqof4Z$s2hEsdnpj5BWbGf4)KQY8xm+d_PE4|3 zS|&VG!W^r$2k@E9HA5?+C6_alPf_ZuAjY%~lqR!`pwVD}C?{~vSe_!~%zWQo;f$j8 z(UX(TFnXkjCK+tcmsIVLXCs*y?wC=Mg$P zjZg#sM6`MnSYWnrKPB+sQpznb>{{k_gpydu$hLJPHD!NvL#O$ z#*MQ@+$v5jj}YeEhbv44`x|S>w%7uO8%gc5g~O#TG2Y6^XEI(Z$Z63?D%xn1=yZ

HF$AZ6!*W^~~3iTE-!*0Cd zFE4@>E#2*FcfFTo{9a(XdlsQ75r)0as_Mm^6P7En1~J9NIzReA2A08ugC$z#oT7}) zsFcucm>>SSH54|6MxoWj=r@SC{8GyjPQt5ffh!5a4K)s7JQOTr(vOuxL=zZWW#(Sd z@4yw}^3Z7*G9N?x*UDr4sQ(xeln4b*?@E6-Ce__{Cf>Kvj~t`|m;w4%%xzS~u! zZ@;V0MA4(q{)@RTFTh8zDx~L_boE90Sm{m`~s!FerM-}(81aw zlsIQUclX>qxg78*`gAuO-Q3?ND*4lGbvkIqB-6w)iY9CSkfe^1B+cbAnQ&r~ z1=BL&nG)t$tv!IxWUd)n5iPl#p?r!`X9Y2)b)YnvWdw}|14KE2bH?%%DQD*U_6lbd zwU3^hbcWF*MKsBnWxD+ijIo-h_J|0`6p{f5m#J`+p|;U!$5ZLlR61PR^UY)89|JR< za+}OpmI3#`Bw1;q6w}A4<5IL+E$6+XTe8lbjdhB?Rh%&v!HDsgNt5gQ!Sj!&=l<#C z#TMo}7rGivt^|fV;eYOd<`AGqV$$a zM)V-hCR`F9@R12O5F}Im7TWO(J+=@qLa;pPC9tj+Q;`!}S8o84&xP|&o2hvDlxB=6 zTk@1)+&Ejrt>V-Y31QBCxWZH@U}Fv07F*EeMpC?D->MB z47>4)zq|-mlytYR-Fz?0_`T5S?pcJUL>TrutEv`vPFSwQ8Uz&+>-_iw8CV8W4wh({ zbBZ!Hqf$b(VTSnY)?nBe8bqt9(Qh>3@=GmCI0>(_gi z_-I;)(0P#6bqrgBwHpoSx_gxJD}%@T`X;=$tU Q#lDY}%2n%tNy?|KzbgWdbN~PV literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..0cd9b194f6a68d2ccd1b833fc7b3925d87b2161a GIT binary patch literal 1916 zcmb7F*-jfV6n)QExIwB~G!PLGLMBCmkcO%(6%&cRjU3xatav<@mn?jJuV>4ICN1P) z;<@&@>+zWbF+ram2E*IO$7D1ZK3^xlPg&z9gycwa|`VK4k|XBLvISZUXCiF_9T@b^rQ6`b_%ZwwX*IP-(`P z@r9rqGdEk9hc6XUInLMEfNhMeY&q@oQ& ziJ>!$pENYd+E}CztM*jv#pyW=C6DozLlLT35uvw(j?iX&CBF8&m18l*abV_F>k1AR zG4oxR=Ig8EMN4=4+FkEe86RkU1{6Dw(1b|K-)2?y;=xNNv|IzFV&Z%dK%jx+pmVT9 zi%c+-@+ngax(zB4o?FteF*FLTbfec8#MPIYmT(d-w1ZhGkT}#hgz-S~l&c^vhlnL0 zUS;Ni(eJ<&{1ejfzuQesUGKDaPa5Ogo2^_`8q{zUm=HS<*)(woG!GVE RFZX?>bf#MeTrn|e{Q-Y=kSzcJ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..4a89381a8c71bc9e076d8028880be7f2f0788428 GIT binary patch literal 1910 zcmb7FTW{Jh7=7ngcvPCUDlICls>)@RTE^J4UZw`Ady9;1f`#K)zCh`(-{;&29c(QR z!TId-U5`%=#0-778;!r--zSsN`0+aVebVXmP|6iI#BqkE>+q6L&qzXZwM?dh*kr+t zN<^lmJ=W_05UI)yM=N5KP$|l1hKRpl5t}eGQ z--LvU_QQtHkMYmHVPCD=kEueP_M2Ffx#5J}$eb?XInlh04dr1xhG(%gIPFL1q<;~i z7XFDC{U+e~*%t1Hf;o8VM6Uh|KY_3j+Vp3nh0IgWA9OogURg1AMNJsrz=W?EyA_HP zGt9H8P$UE#N|Nb82krP3^K}p`La;pTC$Mf7Gno_DbZ!WA&!rCz+i3*qRA!7BUkb`G zZk;XT)^Qqo6k#qxxYATke`77OEw^C8t)zC@io+EzG2Y53WHMQ6$Z63^D%x<97&^!J zK|_yXfAJD*YP&inkWiA-X zc*>N5Zi9Y==awjJ46Q|j<3#0|9$VLXyN<0{C?Az}&0R+V{R z^c!#`xdL=nh|K%Y;aUZ3koxx_fw=+r(n9^SzUn*+gm3x+SLYC|a)@RTE^J4UZw`Ady9;1f`#K)zCh`(-{;&29c(QR z!TId-U5`%=#0-778;!r--zSsN`0+aVebVXmP|6iI#BqkE>+q6L&qzXZwM?dh*kr+t zN<^lmJ=W_05UI)yM=N5KP$|l1hKRpl5t}eGQ z--LvU_QQtHkMYmHVPCD=kEueP_M2Ffx#5J}$eb?XInlh04dr1xhG(%gIPFL1q<;~i z7XFDC{U+e~*%t1Hf;o8VM6Uh|KY_3j+Vp3nh0IgWA9OogURg1AMNJsrz=W?EyA_HP zGt9H8P$UF=2uTQKETQoC%$;fk9WZ)FrRnXEPBv}h$2ZTLwH zon!oXy7=g94yf? z7Yt=QWlBM}K|{iGOBgnWR-v_O3|fP@`qa=8PQrC|Fe?QDhgyd)9!Z{Y6{O`5u>_>6 z$~-Xo4Y-nA0Xi#$=6&dJwE{NC{rix>+yH!Oq5fH4b)E&nH+_MtbBI>CSy)YIJ+e^v zZda|o{i;6|Wsg3FzhzQ3I$W%BDwr`JO%SKJWC8JeuZO+q~*3C-0anFwN& zIX5a1nU?lYuL3}%DmNT0iBUqOD4!zgog~(F_Ou~sM$l+5Kui!gXPux(dpp~;S2|;; zd+_9yx11elVo1gvH{Ca2tTTeT10o?)NCqH6QRx{+U2AABQ0X;PdP2Fg^=%Sv3ub}} zm!v$)fP29tSsJ1>H;2>@ivhN7=EXN+ZG#CXh=&CUJb`R3`le{ylY zf%zsRFxm|pJU_-i{)Amyw;NN1IzDP*jc0}vb|rHlHa5`1cm&sCtABhHq4L+j zpNP@_gZr+a4W2rYtAE13L!?(p*S&vJevqb zLcp7NDeP<@Nv8W9wB=JY*+Iw%!SeVhfpxQ(%AB}{{{S?fOCQ{}lL`8%%osDi5R_xw zI$OrA<1`v6!kmS0rKyy_+FE2=Zb6${No}(ghbwVnypd7JWW3Ul)4Y{bv=%5abcXSR zh9X%Tiwt5lp31#AJB6kcF2_b+`Ce4z(%QL1SEav$d;w2DKaoCBzP9Y?e3#GY=M@&v(70 NbgnylTrn~2{04-|kMaNj literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..4bee34e863810b9e00e6e2eabbe6f433f11396a8 GIT binary patch literal 1909 zcmbVNZExBz5dO}u@TfFxRa(>+Rh7#swUn`GeVH1h?ptJR6D%CZ@(Yyy`knJe=wNGy zAUJ0~clX>qx$KE4`fxWK-Q3?ND*1KP>Ge>`6*t6jh9;|UlTgn{LUXl9CW6>x z&W%b$rlmd9s{jzG$_+Stlsc-p+RImChLI z9z1#FEoTRs7?N?vP4^8L>x`i8fJn#`k^zWNRC>ly*BaUjRC*1So>1<5eVc^af|;Pg zB`MD`;9f9EmWF7}%^~$d$!@3PgLiaGR=Ibvq3BD^8Dm)(F&=Yeb8|m<{`U0TKfS!z zzQlw;gFTgI*9 zH0mhAoQH6wscin*T4Y;pL4#XKZL<}JD_LT^kx|HGywZ@e6mL@TDhVg@jB3T=Y z3}Q8&%Dp%{gLV`#UW+F}jUFQOy2TM%H?G1ro;PwV`!@-a+$dcs-ZEyf8!yGni{xcV zxBJ@8_p*u)dOihlokeI$q~))(s%vrQr4w4NKuj@lK1@E4f#WdXV2KvFU?}4$QwpjL z<0D)*X2ROgB3ciOeyb6epBh=hNqCYS^h&{OL#;y?4<*mI3R7htu>_`8m3dI~8*rt# z0(4r;nD?Q>V->Jr(!UQ0$_>Dm7V5A3s`D%ezNrg5I{Rpuo4M75)-4Ofx4UZ9?PvX# zC~NdF{4J8=-O=*D-AyB1@3eOh8soy7tzFgAr{yRpA$Blbv&12ocCh$-vFrV$bKTkF Iiiv6GH@U@)oB#j- literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..69ab4e102db4ca1372b6cc177fc6efc6d0107e78 GIT binary patch literal 1913 zcmbVNZExBz5dO}u@TfFxRa(>+Rh7#swUn`GeVH1h?ptJR6D%CZ@(Yyy`knJe=wNGy zAUJ0~clX>qx$KE4`fxWK-Q3?ND*1KP>Ge>`6*t6jh9;|UlTgn{LUXl9CW6>x z&W%b$rlmd9s{jzG$_+Stlsc-p+RImChLI z9z1#FEoTRs7?N?vP4^8L>x`i8fJn#`k^zWNRC>ly*BaUjRC*1So>1<5eVc^af|;Pg zB`MD`;9f9EmWF7}%^~$d$!@3PgLiaGR=Ibvq3BD^8Dm)(F&=Yeb8|m<{`U0TKfS!z zzF>akL z7A~Yq^^4D3_wYc-r2`yJ3sF*k(rXR?_aTs#2M2lQ7l<|}) z1=WTT60RF_VQpv;t%pXx)riYajV$3LJk1VzrC`3H)*+0Cl4o3n$+C}F0+XxCJSh4N zxKdmJIxXhR`_SRh3fM61--iU{2H;Bz^;dq?c@_lU)CC@$eYDKY+-gGWmIdP5UA5}= zvwl;QHToF-7D@5`X!+mnrjf39+PephapBF@uIdTYauk#hJD9Lp;t)(cSbV3I$W%BDwr`JO%SKJWC8JeuZLqa_x3C-0anFwN& zIX5a1nU?lYuL3}%DmNT0iBUqOD4!zgog~(F_Ou~sM$l+5Kui!gXPux(dpp~;S2|;; zd+_9yx11elVo1gvH{Ca2tTTeT10o?)NCqH6QRx{+U2AABQ0X;PdP2Fg^H;2>@ivhN7=EXN+ZG#CXh=&CUJb`R3`le{ylY zf%zsRFxm|pJU_-i{)Amyw;NN1IzDP*jc0}vb|rHlHa5`1cm&sCtABhHq0^HH zwct<0=>NfeSI`DeoygTc;a?&I|8;qWTF5;0{6V*q<&_mqHn)=6W-AU?;>36(qmapXr6H$zE2(HLP-5r| z;|C2zvNjeO#A-a1dvSIOO(|l$mQaKmRYd4@qa(C#UxjZxZ{%2ZaT0{NQMyvXWz1q1 zy7}@Vd0Eo!zP9tdsN#dFPe5X)5tk!66$uq9PY}rRFf!S4M z9u)lsTq&*qofebkedzFV1#Fo2??Zxe1MsDV`YXTcJPU$v>H@FMK3e8xZZ)BG%L4K3 zu3B~bS$`|a8hs46MN)i0TK>1YX{76&_U=JrTzIp!t9k~t90euB4rXkYI0Q2f7N5^| PeWi4+J9}I)G41>YxxSD5 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..576e237832cdb914ddea1b42c64d27ac69be9e68 GIT binary patch literal 1909 zcmbVNZExBz5dO}u@TfFxRa(>+Rh7#swUn`GeVH1h?ptJR6D%CZ@(Yyy`knJe=wNGy zP~x2Z+}(5cJ`baQ{7jEAGgtK`>Fr`JO%SKJWC8JeuZLqa_x3C-0anFwN& zIX5a1nU?lYuL3}%DmNT0iBUqOD4!zgog~(F_Ou~sM$l+5Kui!gXPux(dpp~;S2|;; zd+_9yx11elVo1gvH{Ca2tTTeT10o?)NCqH6QRx{+U2AABQ0X;PdP2GL^H;2>@ivhN3SuXN+ZG#CXh=&CUJb`P{{k@9?yv8ZET>2@d&QPR{!KULT9HD zYQdj~(f@<{uAmK`I+3e?!jB>Z|8;qWTF5;0{6V*q<&_mcgey&D^VilQ+j0vU+)8SjtvFoC661}GLMG#thMeZDq@uMniJ>!$A2bxn z+E`=|tMOFs#n~COqlocZJP~U25TVyCj?lVs6~6Jjkz?7vNs#15=}PgIF^k=JDPCSA zFH5@J*LJ>_ReaF%DTwPVLQ^6wf1Oobi#so!&~gQ0iiz`K@_`H-hXDsmw8#ZR8Bdu~ zP;D3=;kq#s)`k|*dSLWhjkx^O$P!M%o9v)h3T7K>9m04hdB#2yUEtN(N6XyIttPZ?Ss=dMRjY14 z>#syvqmSXXNQ!Sq%l~#ajdZ=!-aTlH3vae|RZpLmqo9P?!F0_MhhW;l;`7C>?~~4T KXOAl;rk&sJeT}97 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..32917d0042db7617b6fffa77bd88ab2f754230d7 GIT binary patch literal 1913 zcmbVNZExBz5dO}u@TfFxRa(>+Rh7#swUn`GeVH1h?ptJR6D%CZ@(Yyy`knJe=wNGy zP~x2Z+}(5cJ`baQ{7jEAGgtK`>Fr`JO%SKJWC8JeuZLqa_x3C-0anFwN& zIX5a1nU?lYuL3}%DmNT0iBUqOD4!zgog~(F_Ou~sM$l+5Kui!gXPux(dpp~;S2|;; zd+_9yx11elVo1gvH{Ca2tTTeT10o?)NCqH6QRx{+U2AABQ0X;PdP2GL^H;2>@ivhN3SuXN+ZG#CXh=&CUJb`P{{k@9?yv8ZET>2@d&QPR{!KULT9HD zYQdj~(f@<{uAmK`I+3e?!jB>Z|8;qWTF5;0{6V*q<&_muedfj?*Zl2y-67m8MbvYip5hxdlybCAG~~9Im8^@kT}=lkrMJPV-h$(ORCw&>6-L z8j56XEHa4Icq;ee>B2sN6B(CaowXx+RD-+120vFza_$a14}rGU$r#csS5 zFE5gpCEf08JKxJHK4|(B1a=moDUp`H&Z@4(otI8%xdK7O#Q8A&Kn9M(kb@;!F!5mV`C`{M NO6R(>#}yOP&TmR}k3IkZ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..367f6012d731606ffcd95b2905785dbdf4b7c483 GIT binary patch literal 1916 zcmb7F*-jfV6n)QExIwB~G!PLGLMBCmkcO%(6%&cRjU3xatav<@mn?jJuV>4ICN1P) z;<@&@>+zWbF+ram2E*IO$7D1ZK3^xlPg0U&r zfqx-JzYBQU-NOA)Fb7Ya$kcz~ClEG5oBj;6klEDpC*91JmsX5jUK7SQFyU*)?uFvS z^s{s<6bS+E;-#>&z9gycwa|`VK4k|XBLvISZUXCiF_9T@b^rQ6`b_%ZwwX*IP-(`P z@r9rqGdEk9hc6XUInLMEfNhMeY&q@oQ& ziJ>!$pENYd+E}CztM*jv#pyW=C6DozLlLT35uvw(j?iX&CBF8&m18l*abV_F>k1AR zG4oxR=Ig8EMN4=4+FkEe86TK=1{6Dw(1b|K-)2?y;=xNNv|IzFV&Z%dK%jx+pmVT9 zi%c+-@+ngax(zB4o?FteF*FLTbfec8#MPIYmT(d-w1ZhGkT}#hgz-S~l&c^vhlnL0 zUS;Ni(eJ<&{1ejfzuQesUGKDaPa5Ogo2^_`8q{zUm=HS<*)(woG!GVE RFZX?>bf#MeTrn|e{Q;V(kTL)O literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..e5ba977b18faf2a326b89835afe139284cb16cd9 GIT binary patch literal 1910 zcmb7FTW{Jh7=7ngcvPCUDlICls>)@RTE^J4UZw`Ady9;1f`#K)zCh`(-{;&29c(QR z!TId-U5`%=#0-778;!r--zSsN`0+aVebVXmP|6iI#BqkE>+q6L&qzXZwM?dh*kr+t zN<^lmJ=W_05UI)yM=N5KP$|l1hKRpl5t}eGQ z--LvU_QQtHkMYmHVPCD=kEueP_M2Ffx#5J}$eb?XInlh04dr1xhG(%gI6sTfN&g~3 zE&LNP`c1&o{ub_sf;o8VM6Uh|KY_3j+Vp3nh0IgWA9OogURg1AMNJsrz=W?EyA_HP zGt9H8P$UE#N|Nb82krP3^K}p`La;pTC$Mf7Gno_DbZ!WA&!rCz+i3*qRA!7BUkb`G zZk;XT)^Qqo6k#qxxYATke`77OEw^C8t)zC@io+EzG2Y53WHMQ6$Z63^D%x<97&^!J zK|_yXfAJD*YP&inkWiA-X zc*>N5Zi9Y==awjJ46Q|j<3#0|9$VLXyN<0{C?Az}&0R+V{R z^c!#`xdL=nh|K%Y;aUZ3koxx_fw=+r(n9^SzUn*+gm3x+SLYC|a)@RTE^J4UZw`Ady9;1f`#K)zCh`(-{;&29c(QR z!TId-U5`%=#0-778;!r--zSsN`0+aVebVXmP|6iI#BqkE>+q6L&qzXZwM?dh*kr+t zN<^lmJ=W_05UI)yM=N5KP$|l1hKRpl5t}eGQ z--LvU_QQtHkMYmHVPCD=kEueP_M2Ffx#5J}$eb?XInlh04dr1xhG(%gI6sTfN&g~3 zE&LNP`c1&o{ub_sf;o8VM6Uh|KY_3j+Vp3nh0IgWA9OogURg1AMNJsrz=W?EyA_HP zGt9H8P$UF=2uTQKETQoC%$;fk9WZ)FrRnXEPBv}h$2ZTLwH zon!oXy7=g94yf? z7Yt=QWlBM}K|{iGOBgnWR-v_O3|fP@`qa=8PQrC|Fe?QDhgyd)9!Z{Y6{O`5u>_>6 z$~-Xo4Y-nA0Xi#$=6&dJwE{NC{rix>+yH!Oq5fH4b)E&nH+_MtbBI>CSy)YIJ+e^v zZda|o{i;6|Wsg3FzhzQ3I$W%BDwr`JO%SKJWC8JeuZO+q~*3C-0anFwN& zIX5a1nU?lYuL3}%DmNT0iBUqOD4!zgog~(F_Ou~sM$l+5Kui!gXPux(dpp~;S2|;; zd+_9yx11elVo1gvH{Ca2tTTeT10o?)NCqH6QRx{+U2AABQ0X;PdP2Fg^=%Sv3ub}} zm!v$)fP29tSsJ1>H;2>@ivhN7=EXN+ZG#CXh=&CUJb`R3`le{ylY zf%zsRFxm|pJU_-i{)Amyw;NN1IzDP*jc0}vb|rHlHa5`1cm&sCtABbDq2r_Q zYv51B=>NfeSI`DeoygTc;a?&I-*tJ0TF5;0{6V*q<&_m zF>akLg*vMslu&8?)i*^0xJI5FPHC}c8TX~=2bN-A0llo&e0 z_(4OFtc^tmu^Lb1UYwmmQ;Ha`B^04X6%l&f=m@RbSK%Aa8#$I;oCINRl&+L;8MD}h zZoa%oUY2ybukCyra9&w}8ay1=8ekCwTaTTN))vOs*h zt5)59*6)h4Mjyl9A}QV=E&toyG}84>d-tF*F1*>=RXu}Rj)D?m2QxNH9D+Rh7#swUn`GeVH1h?ptJR6D%CZ@(Yyy`knJe=wNGy zAUJ0~clX>qx$KE4`fxWK-Q3?ND*1KP>Ge>`6*t6jh9;|UlTgn{LUXl9CW6>x z&W%b$rlmd9s{jzG$_+Stlsc-p+RImChLI z9z1#FEoTRs7?N?vP4^8L>x`i8fJn#`k^zWNRC>ly*BaUjRC*1So>1<5eVc^af|;Pg zB`MD`;9f9EmWF7}%^~$d$!@3PgLiaGR=Ibvq3BD^8Dm)(F&=Yeb8|m<{`U0TKfS!z zzA5qjO?2(24e;Tz8zIhOsK1W9g`t`u(>v)GN7;^js1 zvZUL6ZRdMg#Rn~)g1F8iG$qpV*ICuIxbxBpEmt6>m^dFMAIQLQ7;vyei(D|2@sueA z)rRpAt{XF9ZD)56DEbY! zQd|K#EoRL7(BZKP*f8nehXmyY;7bejSANxb76jka1s)OXwd(e> zeoK@!`WXHeN%8Jz`QPrQk*;^zy9bSN;my{r>gm&R6qFD~Y1!wDTM8rH!Tl literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..27309b701e1460f4a00f2f1dd9233a24b05aa254 GIT binary patch literal 1913 zcmbVNZExBz5dO}u@TfFxRa(>+Rh7#swUn`GeVH1h?ptJR6D%CZ@(Yyy`knJe=wNGy zAUJ0~clX>qx$KE4`fxWK-Q3?ND*1KP>Ge>`6*t6jh9;|UlTgn{LUXl9CW6>x z&W%b$rlmd9s{jzG$_+Stlsc-p+RImChLI z9z1#FEoTRs7?N?vP4^8L>x`i8fJn#`k^zWNRC>ly*BaUjRC*1So>1<5eVc^af|;Pg zB`MD`;9f9EmWF7}%^~$d$!@3PgLiaGR=Ibvq3BD^8Dm)(F&=Yeb8|m<{`U0TKfS!z zzsA!onic- zp-9%oB7<0sr*bdO&Y&eljMoB+P@{4@a*3GN%jpvOV%N|aGEH_G53b>3}?8Zy+ z@*;Uz((S&s^S!L%gPu=8U}q7U5^4GCtm<0adFh0fD-cvnoDb6vWZ*aqIas1aE*Q#q z%9Mg?!w3o2jk&Nkw20P2qu*-8<)=oLa1x$o2fb1--%#ri#zV<7uEJ#5M=XKKRb?I& z{RUhqt^l1DbLM^M@Mr~WnD*~Of^q}!rG@${zv?^-f^X^qkIp_?=4Ng+p>@jw@$If! zb^BSrDasms41bHHcz?9~Z+Fv3*E{XqgT}b0EdAxME`3`3*{)k3IkZ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..026080c88204d95408fa592b3091cc96dc6ab669 GIT binary patch literal 1915 zcmb7FZExBz5dO}u@TfFxRa(>+Rh7#swUp6meVH1h?ptJR6D%CZ@(Yyy`knJe=wNFR z1n2DM?w-3Rmpw5>pYDgFn}>&FJRCh;CBKh4y&g)r;)XcR&}0>E66zUAXs#B?L=cjXvG+u5$Y(iubD zgJ-Y2AnMFoe|U>5DA$=G5`^ZO3ygzT0?t*O0S{P6Uv>hZiMH?C(A1<#;%}*@ijE~im^MPI5C4f zn+Qchz}t8!>}()Oru!YVylyY#FzX(`cj!a~{H#rcwfHYmsfa1#NC6war!>uEdG)Mn)l%@k&EZ^Hx&PTA;+x z8ODzqiezmpGKkf9D)-{-44P8Jcq^d@HL8fv+eSxd-M$Lnc;3jd?BXN{bE9;ngv*%4 zE_CztRr0c=+kI{4ds)Q?WuAh><7=((<=i)wQ_y(g`hBAgP!*ALbv(z;PIJutbYo zFqH9>DFxMrArh_|lVNRW5v|8YztxD#FO4kWBs|X!dZl2(q1GXchmvPph1s%?SOT-F z$~-9g9k^0l0Xi)v&BxH;;R@I=?>~kF-&D?52>y`!L z+g-Kl_N#tZlr{Ps{uW8`25I@h1PJXn0a Q*!7mux$f+7#l*Dp2h{A40RR91 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..4deac474aac6e0e6de47e28f64325e71250240c0 GIT binary patch literal 1909 zcmb7FZExBz5dO}u@TfFxRa(>+Rh7#swUp6meVH1h?ptJR6D%CZ@(Yyy`knJe=wNFR z1n2DM?w-3Rmpw5>pYDgFn}>&FJRCh;CBKh4y&g)r;)XcR&}0>E66zUAXs#B?L=cjXvG+u5$Y(iubD zgJ-Y2AnMFoe|U>5DA$=G5`^ZO3ygzT0?t*O0S{P6Uv>hZiMH?C(A1<#;%}*@ijE~im^MPI5C4f zn+Qchz=0&0?sw3ZPtjfn!6F39lj8)|&0;EZ;+nz@K<&Bo!C^a%pq$E#G2;tCImWHC zW!ySWqmClXc?egU%I2@FMYiP@G`N-2Hd}GHk|o9)8HG&7D-Ai#TS-N0X%a(c7(Z$# zlC`nOAXek4+>5g_Xh#v_t#~5T=pjOHTO6Tv<0^dPc_YWNf0H1|jnb9kEn^nD@lw3L zN?w+9yRYqhFRS>V$x{&5S%juUTK+bxx)%3dI-%tX#1s?f!{h@QI1U32mS~X+hBBTq zrJ&j{KEiclCaeuDqV>S&w;FNzrI96^geTcSuN2HS)H;OmQ1XncFje*uOJHhMnFmF` z16PVGK&Qow`4~DpRskC({l}1?+yH!Sq5jFQI?saOo4UZGvyYa!nOjY0-LgP@yQ@~+ ze${V@vPPf7-y$jA9WDRc-89nmPJ8#LF)qB>+EqP$T8@GeVh7VTOB{k}2aB&4yWUSa M*PT7Cn3#6{012Fps{jB1 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..b0a42c147bbc697cb0d15d746c2a6c3b0b71fcf7 GIT binary patch literal 1913 zcmb7FZExBz5dO}u@TfFxRa(>+Rh7#swUp6meVH1h?ptJR6D%CZ@(Yyy`knJe=wNFR z1n2DM?w-3Rmpw5>pYDgFn}>&FJRCh;CBKh4y&g)r;)XcR&}0>E66zUAXs#B?L=cjXvG+u5$Y(iubD zgJ-Y2AnMFoe|U>5DA$=G5`^ZO3ygzT0?t*O0S{P6Uv>hZiMH?C(A1<#;%}*@ijE~im^MPI5C4f zn+Qchz$Y&KK$1-NJ7~+NXt9HU5rXB(aRTdRF_k%SP4Nbx_+0wnw4F*&Pi4lK@r9rq z!$ zA2k%o+E`=|tMOFs#n~COq=@lWKoM#*5uvwjj?lV!6~6Jjkz?7zNs#46=}G~YF^k=J zDPCVCFH5@J*LJ>_ReaFpDG2N=LQ^6wf16cZi+eAf&~gQWiiz`K`hg4_ham?`w8#ZR z8Bdu~P;D3?;kq#w)`k|*dT8`pjkx^M$P!M%)9j#E3g#PX9m04hdB#_Y=fDP0BV@ObL0KT?R|KwMlXF>2yUEtB#N6XyIttPZ?Ss=dM zRjY2l>NiDMqtD@QkreNbmjCT;8tHncy?fLc7v5~`s-8eCM?nd(g9)1@4#C8O#n+2n P?4ICN1P) z;<@&@>+zWbF+ram2E*IO$7D1ZK3^xlPg&z9gycwa|`VK4k|XBLvISZUXCiF_9T@b^rQ6`b_%ZwwX*IP-(`P z@r9rqGdEk9hc6XUInLMEfNhMeY&q@oQ& ziJ>!$pENYd+E}CztM*jv#pyW=C6DozLlLT35uvw(j?iX&CBF8&m18l*abV_F>k1AR zG4oxR=Ig8EMN4=4+FkEe86RkU1{6Dw5Kxl8O{!|egO^TdxdukX#QC6qzyilX=3t2y znP4d8Q>GL&8$={Lx0GRHXw+HBMz1l4t1mSx;UwH=2cuG;aHw$z)@RTE^J4UZw`Ady9;1f`#K)zCh`(-{;&29c(QR z!TId-U5`%=#0-778;!r--zSsN`0+aVebVXmP|6iI#BqkE>+q6L&qzXZwM?dh*kr+t zN<^lmJ=W_05UI)yM=N5KP$|l1hKRpl5t}eGQ z--LvU_QQtHkMYmHVPCD=kEueP_M2Ffx#5J}$eb?XInlh04dr1xhG(%gIPFL1q<;~i z7XFDC{U+e~*%t1Hf;o8VM6Uh|KY_3j+Vp3nh0IgWA9OogURg1AMNJsrz=W?EyA_HP zGt9H8P$UE#N|Nb82krP3^K}p`La;pTC$Mf7Gno_DbZ!WA&!rCz+i3*qRA!7BUkb`G zZk;XT)^Qqo6k#qxxYATke`77OEw^C8t)zC@io+EzG2Y53WHMQ6$Z63^D%x<97&^!J zK|_TmzS4;(Sm(V1eTxaIi$nTriaJ zlqm(x2KfljElt=MT6NZfF=)-<>Qlo?I0-k|!Kf5y8)_ZGcqDnoRZx{f#1c@gD)T_- zH{eQc1?a5MnD?Q>tqRzn^zTCgaRczBh5BcG)p-{9-t+}-&LLXmW??m<^~gfuyIr;V z_N)F%ls))@RTE^J4UZw`Ady9;1f`#K)zCh`(-{;&29c(QR z!TId-U5`%=#0-778;!r--zSsN`0+aVebVXmP|6iI#BqkE>+q6L&qzXZwM?dh*kr+t zN<^lmJ=W_05UI)yM=N5KP$|l1hKRpl5t}eGQ z--LvU_QQtHkMYmHVPCD=kEueP_M2Ffx#5J}$eb?XInlh04dr1xhG(%gIPFL1q<;~i z7XFDC{U+e~*%t1Hf;o8VM6Uh|KY_3j+Vp3nh0IgWA9OogURg1AMNJsrz=W?EyA_HP zGt9H8P$UF=2uTQKETQoC%$;fk9WZ)FrRnXEPBv}h$2ZTLwH zon!oU zc_8!~a3!|_bXMrh`_SQL1#D3J_aT9}0r=8F{j3I$W%BDwr`JO%SKJWC8JeuZO+q~*3C-0anFwN& zIX5a1nU?lYuL3}%DmNT0iBUqOD4!zgog~(F_Ou~sM$l+5Kui!gXPux(dpp~;S2|;; zd+_9yx11elVo1gvH{Ca2tTTeT10o?)NCqH6QRx{+U2AABQ0X;PdP2Fg^=%Sv3ub}} zm!v$)fP29tSsJ1>H;2>@ivhN7=EXN+ZG#CXh=&CUJb`R3`le{ylY zf%zsRFxm|pJU_-i{)Amyw;NN1IzDP*jc0}vb|rHlHa5`1cm&sCtABhHq4L+j zpNP@_gZr+a4W2rYtAE13L!?(p*S&vJevqb zLcp7NDeP<@Nv8W9wB=JY*+Iw%!SeVhfpxQ(%AB}{{{S?fOCQ{}lL`8%%osDi5R_xw zI$OrA<1`v6!kmS0rKyy_+FE2=Zb6${No}(ghbwVnypd7JWW3Ul)4Y{bv=%5abcXSR zh9X%Tiwt5lp31#AJB6kcF2_b+`Ce4C&4>QdZ!>4cUmP*hBu591G1;5bYHgWgZ0m z23)DF0G$?t=6&e!Yz1r>_wPf3a0BqAh59SM>O2d2Z|VY1&OTb^W^Og1b;|JMGr@h#idBEO7`%9xOhe?|Mn; MTzB@kVq)6)4HE*7rvLx| literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..aeee09ac7e4bf9e66e7f8319788848958d48dd86 GIT binary patch literal 1908 zcmbVNZExBz5dO}u@TfFxRa(>+Rh7#swUn`GeVH1h?ptJR6D%CZ@(Yyy`knJe=wNGy zAUJ0~clX>qx$KE4`fxWK-Q3?ND*1KP>Ge>`6*t6jh9;|UlTgn{LUXl9CW6>x z&W%b$rlmd9s{jzG$_+Stlsc-p+RImChLI z9z1#FEoTRs7?N?vP4^8L>x`i8fJn#`k^zWNRC>ly*BaUjRC*1So>1<5eVc^af|;Pg zB`MD`;9f9EmWF7}%^~$d$!@3PgLiaGR=Ibvq3BD^8Dm)(F&=Yeb8|m<{`U0TKfS!z zzQlw;gFTgI*9 zH0mhAoQH6wscin*T4Y;pL4#XKZL<}JD_LT^kx|HGywZ@e6mL@TDhVg@jB3T=Y z3}Q8&%Dp%{gLV`#UW+F}jUFQOy2TM%H?G1ro;PwV`!@-a+$dcs-ZEyf8!yGni{xcV zxBJ@8_p*u)dOihloka-5$X_Q_m*UP#C$wCFmSW<37<`}t$6>y~5-oDUP{vcH6cih# zN4Rc`gtei?vz{0IRv#`uHL8S@@E|*Am4eZRT8A(mN}h2QhRQx-2@I_&^C0Lq;7V-; z=(HFy??Z>DDqzE)e;*Qr8-On@)L;2k=ULEuQx|x0_R%sobE^rhTNa3Kch#!f&-x`% z*63sSTO`G+qve0Qn?||bY4091#)UUqyQ+sz%TZ85>|nTNi9;~#VDb54*Xv2=y0ga> H6VuLbN>hzd literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..d601e01df0257b3fb27ae757184487f0edb5abf3 GIT binary patch literal 1912 zcmbVNZExBz5dO}u@TfFxRa(>+Rh7#swUn`GeVH1h?ptJR6D%CZ@(Yyy`knJe=wNGy zAUJ0~clX>qx$KE4`fxWK-Q3?ND*1KP>Ge>`6*t6jh9;|UlTgn{LUXl9CW6>x z&W%b$rlmd9s{jzG$_+Stlsc-p+RImChLI z9z1#FEoTRs7?N?vP4^8L>x`i8fJn#`k^zWNRC>ly*BaUjRC*1So>1<5eVc^af|;Pg zB`MD`;9f9EmWF7}%^~$d$!@3PgLiaGR=Ibvq3BD^8Dm)(F&=Yeb8|m<{`U0TKfS!z zzF>akL z7A_Riuual}vap$EITCPA(F>yW&KTv_=Fyml}7P(+3<0(@L ziVYJaTsOwT+R)-z&y0Sn50{@BRl-Slm>slA!FWThLl_Sw&$tSMWgoEw23M7N5cC^x zrM3ceT8x?Zp~I6EuwmH04++8zz?T;4ul%a>Ea<(d3p_ddXqlV2)r8h93&gj(YSry$ z{h}yq^fCM`lH&Ez^1t0pqg?N_cMlrl!kevK)dQ&IC@3LzFkrL9AsBeD_3I$W%BDwr`JO%SKJWC8JeuZLqa_x3C-0anFwN& zIX5a1nU?lYuL3}%DmNT0iBUqOD4!zgog~(F_Ou~sM$l+5Kui!gXPux(dpp~;S2|;; zd+_9yx11elVo1gvH{Ca2tTTeT10o?)NCqH6QRx{+U2AABQ0X;PdP2Fg^H;2>@ivhN7=EXN+ZG#CXh=&CUJb`R3`le{ylY zf%zsRFxm|pJU_-i{)Amyw;NN1IzDP*jc0}vb|rHlHa5`1cm&sCtABhHq0^HH zwct<0=>NfeSI`DeoygTc;a?&I|8;qWTF5;0{6V*q<&_mqHn)=6W-AU?;>36(qmapXr6H$zE2(HLP-5r| z;|C2zvNjeO#A-a1dvSIOO(|l$mQaKmRYd4@qa(C#UxjZxZ{%2ZaT0{NQMyvXWz1q1 zy7}@Vd0Eo!zP9tdsN#dFPe5X)5dul_*Gbi-xbxBpEmxqZm^dHCAE>}_m~ya0i(D|2 z@sueA#fBLYt{a14ZD{eVr$)ckhl@{*D&Zu&&JJ3oV8Ef)A&iHTXIzERvX58-qpQk1 z2>K1UQdT%}w9L)iYC`Ll1>)OX zwd(e>{#2AT`WSADr1*fe{BL*DDAzmf-Gj!s@MddQ^$2P?3QC9_jMyx32u2<(KA-RU ONa+Rh7#swUn`GeVH1h?ptJR6D%CZ@(Yyy`knJe=wNGy zP~x2Z+}(5cJ`baQ{7jEAGgtK`>Fr`JO%SKJWC8JeuZLqa_x3C-0anFwN& zIX5a1nU?lYuL3}%DmNT0iBUqOD4!zgog~(F_Ou~sM$l+5Kui!gXPux(dpp~;S2|;; zd+_9yx11elVo1gvH{Ca2tTTeT10o?)NCqH6QRx{+U2AABQ0X;PdP2GL^H;2>@ivhN3SuXN+ZG#CXh=&CUJb`P{{k@9?yv8ZET>2@d&QPR{!KULT9HD zYQdj~(f@<{uAmK`I+3e?!jB>Z|8;qWTF5;0{6V*q<&_mcgey&D^VilQ+j0vU+)8SjtvFoC661}GLMG#thMeZDq@uMniJ>!$A2bxn z+E`=|tMOFs#n~COqlocZJP~U25TVyCj?lVs6~6Jjkz?7vNs#15=}PgIF^k=JDPCSA zFH5@J*LJ>_ReaF%DTwPVLLf%|I;px8cV0T7x~4K1Gayy&<3aQUfGC7gs8*+Humj5gFdgz-@FjH@tI_7O{9XjPd9LB9c4 zYAZme#fW(yI=oc@8wUORkRaRud}*Qn%C9=lg5I0Dz?-v=mbsZ*O=#V+KzzHaR^5Kq zABnO?AH!{t6rYZk|LtxX<$9;Rd(apc-fZov9zHEcK?$*g;hH55!LWnH=ZjsRC!Oog K9#>3EJHG*c0gYP# literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..352535b74739e956adb292b2200735c356386e8c GIT binary patch literal 1912 zcmbVNZExBz5dO}u@TfFxRa(>+Rh7#swUn`GeVH1h?ptJR6D%CZ@(Yyy`knJe=wNGy zP~x2Z+}(5cJ`baQ{7jEAGgtK`>Fr`JO%SKJWC8JeuZLqa_x3C-0anFwN& zIX5a1nU?lYuL3}%DmNT0iBUqOD4!zgog~(F_Ou~sM$l+5Kui!gXPux(dpp~;S2|;; zd+_9yx11elVo1gvH{Ca2tTTeT10o?)NCqH6QRx{+U2AABQ0X;PdP2GL^H;2>@ivhN3SuXN+ZG#CXh=&CUJb`P{{k@9?yv8ZET>2@d&QPR{!KULT9HD zYQdj~(f@<{uAmK`I+3e?!jB>Z|8;qWTF5;0{6V*q<&_muedfj?*Zl2y-67m8MbvYip5hxdlybCAG~~9Im8^@kT}=lkrMJPV-h$(ORCw&>6-L z8j56XEHa4Icq;ee>B2sN6B(CaowXx+RD-+120vFza_$a14}rGU$r#csS5 zFE5gpCEf08JKxJHK4|(B1a=l75F~${R9%WYFP+eG1$v5!^I`ab3LJ+S2TQcb1w$E6 znNm<}m>}W0F&5T_7SDQS^jm$n{M4utPQuIVpj8US8)_ZGcqnev~F1-zTH); zZa?b}MOmYd;kHPM&qvGub~lZ3z0=-3Xp9SQwsutypq8VcgxJA=%@T)T;KAba#ja13 M&UI&xD<-C$-`P@*^8f$< literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..ccaca6e8ea2fc21d02487839ff0076b2b603ea2c GIT binary patch literal 1915 zcmb7F*-jfV6n)QExIwB~G!PLGLMBCmkcO%(6%&cRjU3xatav<@mn?jJuV>4ICN1P) z;<@&@>+zWbF+ram2E*IO$7D1ZK3^xlPg0U&r zfqx-JzYBQU-NOA)Fb7Ya$kcz~ClEG5oBj;6klEDpC*91JmsX5jUK7SQFyU*)?uFvS z^s{s<6bS+E;-#>&z9gycwa|`VK4k|XBLvISZUXCiF_9T@b^rQ6`b_%ZwwX*IP-(`P z@r9rqGdEk9hc6XUInLMEfNhMeY&q@oQ& ziJ>!$pENYd+E}CztM*jv#pyW=C6DozLlLT35uvw(j?iX&CBF8&m18l*abV_F>k1AR zG4oxR=Ig8EMN4=4+FkEe86TK=1{6Dw5Kxl8O{!|egO^TdxdukX#QC6qzyilX=3t2y znP4d8Q>GL&8$={Lx0GRHXw+HBMz1l4t1mSx;UwH=2cuG;aHw$z)@RTE^J4UZw`Ady9;1f`#K)zCh`(-{;&29c(QR z!TId-U5`%=#0-778;!r--zSsN`0+aVebVXmP|6iI#BqkE>+q6L&qzXZwM?dh*kr+t zN<^lmJ=W_05UI)yM=N5KP$|l1hKRpl5t}eGQ z--LvU_QQtHkMYmHVPCD=kEueP_M2Ffx#5J}$eb?XInlh04dr1xhG(%gI6sTfN&g~3 zE&LNP`c1&o{ub_sf;o8VM6Uh|KY_3j+Vp3nh0IgWA9OogURg1AMNJsrz=W?EyA_HP zGt9H8P$UE#N|Nb82krP3^K}p`La;pTC$Mf7Gno_DbZ!WA&!rCz+i3*qRA!7BUkb`G zZk;XT)^Qqo6k#qxxYATke`77OEw^C8t)zC@io+EzG2Y53WHMQ6$Z63^D%x<97&^!J zK|_TmzS4;(Sm(V1eTxaIi$nTriaJ zlqm(x2KfljElt=MT6NZfF=)-<>Qlo?I0-k|!Kf5y8)_ZGcqDnoRZx{f#1c@gD)T_- zH{eQc1?a5MnD?Q>tqRzn^zTCgaRczBh5BcG)p-{9-t+}-&LLXmW??m<^~gfuyIr;V z_N)F%ls))@RTE^J4UZw`Ady9;1f`#K)zCh`(-{;&29c(QR z!TId-U5`%=#0-778;!r--zSsN`0+aVebVXmP|6iI#BqkE>+q6L&qzXZwM?dh*kr+t zN<^lmJ=W_05UI)yM=N5KP$|l1hKRpl5t}eGQ z--LvU_QQtHkMYmHVPCD=kEueP_M2Ffx#5J}$eb?XInlh04dr1xhG(%gI6sTfN&g~3 zE&LNP`c1&o{ub_sf;o8VM6Uh|KY_3j+Vp3nh0IgWA9OogURg1AMNJsrz=W?EyA_HP zGt9H8P$UF=2uTQKETQoC%$;fk9WZ)FrRnXEPBv}h$2ZTLwH zon!oU zc_8!~a3!|_bXMrh`_SQL1#D3J_aT9}0r=8F{j3I$W%BDwr`JO%SKJWC8JeuZO+q~*3C-0anFwN& zIX5a1nU?lYuL3}%DmNT0iBUqOD4!zgog~(F_Ou~sM$l+5Kui!gXPux(dpp~;S2|;; zd+_9yx11elVo1gvH{Ca2tTTeT10o?)NCqH6QRx{+U2AABQ0X;PdP2Fg^=%Sv3ub}} zm!v$)fP29tSsJ1>H;2>@ivhN7=EXN+ZG#CXh=&CUJb`R3`le{ylY zf%zsRFxm|pJU_-i{)Amyw;NN1IzDP*jc0}vb|rHlHa5`1cm&sCtABbDq2r_Q zYv51B=>NfeSI`DeoygTc;a?&I-*tJ0TF5;0{6V*q<&_m zF>akLg*vMslu&8?)i*^0xJI5FPHC}c8TX~=2bN-A0llo&e0 z_(4OFtc^tmu^Lb1UYwmmQ;Ha`B^04X6%l&f=m@RbSK%Aa8#$I;oCINRl&+L;8MD}h zZoa%oUY2ybukCy&9SM8(KW;snKus;o?)HN;nCRvx8PC7;va{2;-sT8CPMn>?4-I=&CXg zf_?+8)K-8_i$U`~ba=J`HjMlCAwjqS_|iiCm0xw91-&+Rh7#swUn`GeVH1h?ptJR6D%CZ@(Yyy`knJe=wNGy zAUJ0~clX>qx$KE4`fxWK-Q3?ND*1KP>Ge>`6*t6jh9;|UlTgn{LUXl9CW6>x z&W%b$rlmd9s{jzG$_+Stlsc-p+RImChLI z9z1#FEoTRs7?N?vP4^8L>x`i8fJn#`k^zWNRC>ly*BaUjRC*1So>1<5eVc^af|;Pg zB`MD`;9f9EmWF7}%^~$d$!@3PgLiaGR=Ibvq3BD^8Dm)(F&=Yeb8|m<{`U0TKfS!z zzA5qjO?2(24e;Tz8zIhOsK1W9g`t`u(>v)GN7;^js1 zvZUL6ZRdMg#Rn~)g1F8i1Y+c`ld4N`=cN-`u0TsMaXt(_P=Vtx-(ZOrxnL;cDN_oH z4bvlBH%7wR(BfIoi+-yQm!BF{!by0L9kfcpXhW?-7!M`SxC%pMAF%|6R+V`W^c!%c zwgPlojF|VK!&4QoVbH%13BnD)mlo=;{HpUT=)I{6JURPlnVY%Qgw`z!#J9U@)$M2f zk|=BRG5jr(;?>dezuirvT<^4Z4;tgbo2^~d!>8paC?R$*T(iU>7TjavW! literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..1b0ba002d56de04a7f64cc51c3be396caccb687a GIT binary patch literal 1912 zcmbVNZExBz5dO}u@TfFxRa(>+Rh7#swUn`GeVH1h?ptJR6D%CZ@(Yyy`knJe=wNGy zAUJ0~clX>qx$KE4`fxWK-Q3?ND*1KP>Ge>`6*t6jh9;|UlTgn{LUXl9CW6>x z&W%b$rlmd9s{jzG$_+Stlsc-p+RImChLI z9z1#FEoTRs7?N?vP4^8L>x`i8fJn#`k^zWNRC>ly*BaUjRC*1So>1<5eVc^af|;Pg zB`MD`;9f9EmWF7}%^~$d$!@3PgLiaGR=Ibvq3BD^8Dm)(F&=Yeb8|m<{`U0TKfS!z zzsA!onic- zp-9%oB7<0sr*bdO&Y&eljMoB+P@{4@a*3GN%jpvOV%N|aGEH_G53b>3}?8Zy+ z@*;Uz((S&s^S!L%gPu=8U}q5mLGss0)up)e(g`hBpr@EPABG>Oz;T#yutbYoFqH9> zDFwxb2@y`!L+g-Kl z_OpIblr{Po{uW8``e^yz?xs<$ciOuLjd9`4)~@OS)N&M*5IY#KS>g~3JXn0b*!7Ci Mx$f+7#l*Dp8`*k}^8f$< literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..cabcf1ae65705db3bbe6701f06a589efed9ddbba GIT binary patch literal 1914 zcmb7FZExBz5dO}u@TfFxRa(>+Rh7#swUp6meVH1h?ptJR6D%CZ@(Yyy`knJe=wNFR z1n2DM?w-3Rmpw5>pYDgFn}>&FJRCh;CBKh4y&g)r;)XcR&}0>E66zUAXs#B?L=cjXvG+u5$Y(iubD zgJ-Y2AnMFoe|U>5DA$=G5`^ZO3ygzT0?t*O0S{P6Uv>hZiMH?C(A1<#;%}*@ijE~im^MPI5C4f zn+Qchz}t8!>}()Oru!YVylyY#FzX(`cj!a~{H#rcwfHYmsfa1#NC6war!>uEdG)Mn)l%@k&EZ^Hx&PTA;+x z8ODzqiezmpGKkf9D)-{-44P8Jcq^d@HL8fv+eSxd-M$Lnc;3jd?BXN{bE9;ngv*%4 zE_CztRr0c=+kI{4ds)Q?WuAh>&LRYoYx0o^DOASsS7+g`)HY)xz&W$EepiA zyK2?#SN*CeYxFt%Et28|((=FEO`}}zw0Dmhm{Xg-Pz-ciD~B#V-=6K literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..2f111fd96779010b8e6353ed1d8c33679a68cb46 GIT binary patch literal 1908 zcmb7FZExBz5dO}u@TfFxRa(>+Rh7#swUp6meVH1h?ptJR6D%CZ@(Yyy`knJe=wNFR z1n2DM?w-3Rmpw5>pYDgFn}>&FJRCh;CBKh4y&g)r;)XcR&}0>E66zUAXs#B?L=cjXvG+u5$Y(iubD zgJ-Y2AnMFoe|U>5DA$=G5`^ZO3ygzT0?t*O0S{P6Uv>hZiMH?C(A1<#;%}*@ijE~im^MPI5C4f zn+Qchz=0&0?sw3ZPtjfn!6F39lj8)|&0;EZ;+nz@K<&Bo!C^a%pq$E#G2;tCImWHC zW!ySWqmClXc?egU%I2@FMYiP@G`N-2Hd}GHk|o9)8HG&7D-Ai#TS-N0X%a(c7(Z$# zlC`nOAXek4+>5g_Xh#v_t#~5T=pjOHTO6Tv<0^dPc_YWNf0H1|jnb9kEn^nD@lw3L zN?w+9yRYqhFRS>V$x{&5S%g50{B2ToDek>=Ldz9sDJIT`!3QdE9OfG=(IOWNWjtj{ zL9tv_>{_2KeMqe?gl53+++DHv_2bqM331YX_V`o_U=(*TzIp!t9tmf90euB4u)%%I0VBE7GE!Ry`FTg LJ9}I)G41>TnhcF# literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..b49faba5573568eac9a748fce52fbe5f28522d9a GIT binary patch literal 1912 zcmb7FZExBz5dO}u@TfFxRa(>+Rh7#swUp6meVH1h?ptJR6D%CZ@(Yyy`knJe=wNFR z1n2DM?w-3Rmpw5>pYDgFn}>&FJRCh;CBKh4y&g)r;)XcR&}0>E66zUAXs#B?L=cjXvG+u5$Y(iubD zgJ-Y2AnMFoe|U>5DA$=G5`^ZO3ygzT0?t*O0S{P6Uv>hZiMH?C(A1<#;%}*@ijE~im^MPI5C4f zn+Qchz$Y&KK$1-NJ7~+NXt9HU5rXB(aRTdRF_k%SP4Nbx_+0wnw4F*&Pi4lK@r9rq z!$ zA2k%o+E`=|tMOFs#n~COq=@lWKoM#*5uvwjj?lV!6~6Jjkz?7zNs#46=}G~YF^k=J zDPCVCFH5@J*LJ>_ReaFpDG2N=LLf-~HmSN4_g*@o~74K1Ga%;>lJaQUTCC7gtZ*+Humj5pLegz-@FjH@tM_7O{9a8;QH zLB9i6YAZme#hCdRIy_kc8;1SIkRaRud~Kor$*(%kg5I0Dz>~9&mbsZ*O=#V+KzzHa zR^5KpFN(58pTplGDPA8f|J&U(%Joir_oy*0yxH1SJ%Cz{f)Zi}12#(>f`JE%uNS*s OQ99S1J+7FTcK!hLagO)^ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..e11f92ff8df3965d38121d3f17051e112d851d40 GIT binary patch literal 1923 zcmb7FZBN@U5dPj@;T37xP@tkPhD?V99V^p-PP7vH7P+?DT6G-DFD>QQ@0_=$EFDlO zsdM&o_uSo+%bu8`4|mst@Avod_Lq5L3@cOjEUtCxX~` z!HtSVqNP35>i`fll^Tv##3-R=D4ineog~(__Ov0hgrGsckC-5E&N@Mn_IAE&FL%aJ z`{2AOjGgsPv4Zwl%a9Qt4EwbcAweo5wgjW|;{p zTs-4R0=Z|I#4AIz=H`%kp=7(&^1(a0CF|6?k)h~YnKQ;RGh#gE%BJSN|8(>C)I088 zoNs}C2ni|e#`T{bM!yusXDuSIUhU5w^L^Ex!-hw%XJ*;enUdlsSW zcDix!NQ{0Jd3q9|{Hg-~RL}`eok-QIa0nYBpH|>&Eo3_Le5mVud11xaWm#c-1x>zW z>{cjFOg~K~LXi;hX0#G^-j^iNy%uWtWsPO{-+k6WeT_e?G%MKR;#dwQRU@~57;Iycb ziZ&u8hR!h_YABMmxl)JvpZ!tt^-F}ZI4_kHp-LJNdfDv=ZCY61n|bD)oCJMtl`j`^ z9y8m8jy^w2o|m=RS98u6Mf`41Sbl*p`Ab%1J?^}8Ld!MCYDAn5a}cE9IE*`3qGc)= zO8AT^1r>*367E|QVq>T^q#Pf;S~D&_Rnml$@K!tMmx4)$YKJhsmOSAq%$R+|VwhnC z^PuWC;BtWl=rkKR??Z)PM6U&a)u;sxI*Q?4wm`7FH8lwk#yx z?5b9`-{m(WuhGZww~VtdNzH$=n@YOgY43&_}<_PP7vH7P*ewT6G-DFD>QQ@9exZW$A!Q zNu9HwyXWqnT=w`BeYm?BT;JcvkE`zYZA&Kb)oQqIhG?d8rA z)INB0(ixTn|BW@U&9|V%HBwEsaJY~r##@X$lkr*sr$voaw2>## zbdK>*L6MBfH9A!P?2po~Um_IZywp&HDp5q}WuGIoXthh~M?v$}f;4d&#P-$DI?FE3pPajfi!A0)iAQgFy#Nw9I&dQZ`GZ zgo?u$@%OFSurbscQVxz@tr-`eDrv$=c%LowOTvspwL=)+2$nMGr^`M{Vwhe9bFb<* z;BtX^=roL)_o4l(<*|O^zYhuO^}y#E>c9CF=MyjbsxI*Q?4wnt7e*0Uw#+Bq?5b9` z-{sdLuhB>Uw~WI#q~^cbO(k7tm9s;IaqwnJSLFn%Ir2&vSs1cuY!Qq+SbRR;^_^0g LYV9$Z@M-HWnIDiC literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..5ef27dfd260d4f04676311b442ca52b132f3a29d GIT binary patch literal 1921 zcmb7FZBN@U5dPj@;T37xP@tkPhD?V9g>}<_PP7vH7P*ewT6G-DFD>QQ@9exZW$A!Q zNu9HwyXWqnT=w`BeYm?BT;JcvkE`zYZA&Kb)oQqIhG?d8rA z)INB0(ixTf~2a~LJhyrW*Z@34a?(h4C`t!6&bM=lRl_E6V5xWQ}GHa z%orzZ$tlCQb~cY&#;H^i!kqbVxhdDc#v0h>ThQhjsU}-ET*wpSEk>Tnc&&ibqDCs( zNR((g$9SlqNXFzk9jbr!N9oru5eji$>L@~$Fe3D_*Adz@u)sI-%=f9<{F5^6A z*d49-^Rr-iS(|+|=X_Db?|OaZ7s!&mWL4JV&I!wvSc9lW#5z9(K?;_^u!AL9W;{VD zngOhXnO{;ByW2-~5X6i5GoU7kGX4(JIpmqX;cq<`Zvr zRjb?Y@{5tz=%fEz#^F0s^WW^IlCHDL*`dNXcr&G|athTPc_oZ24B9ld2*w^PKA-RU OR;f(2_LxlgwDlMN2$8S= literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..f46311f22deda0d9f8f4c555751febbcaaa9a562 GIT binary patch literal 1923 zcmb7FZExBz5dO}u@TfFxRa#WqvdU$ZTFU6OwoC=7`xY6;fQ92&e&MCRe&@Uqy0Eqg zf^+tB_uSo+%K@LFPY?aU?c-xO?hl@>!#^jDRtwFTWSUqOp~=SIgw#bOq={UH6HZLH zU|NPeR>B;sjR)|VOf*9)q9vCzluQwIRuE$v2TGG!Owgd$LlGx%&R9;7a%R47FLM^5 z=Fzj0&ami65lv!dnQp!VW31-X9uWadfeb*nq{2lEHI1e%pGvDtrNyPa*xrWzHqDGv zZo?UiW5_+tBwT5t6w}AlaVeUOhV$OhJ=rAAjxnOVCk`diY;t7|;N4y)YxS96^iB-3OYCdVBZGeC?EKl1ZtjonzB*d0ndZ7D6IPbPf z#%m}yV;r$1rwrrD*(`1mr_@Or=E8@|OqmL{)|73w1&ywdscP7NvP-#dpKDw1=Tzx5}2`AyHw$LvLlMYo5VcZuiX421?LllKD z!*b?c)$hP%0`t&mI&eOQ_79lH`nmrxB&gQ|Un;2o=9iz3yy(liz~gg>R*7C1MQG77 zpLo5iO5J`J?~JTQpZ(u5Oy49G|MhN4={l>N9V(1dZ>Dfn%%O@SuY{3>Q5%OA!Qg|% S*UNoxE0w6m0h1A*HvR&B1(I+8 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..f9b97cd47327d336721a342f29e89a98b8869f4d GIT binary patch literal 1917 zcmb7FZExBz5dO}u@TfFxRa#WqvdU$ZTE^J4woDCD_boDx0Sm{m`~s!Fe&@Uqy0Eqg zf^+tB_uSo+%K@LE4|l`SxBL5OG8{czM}JOQoer8a$uzMnM$?VIiKvT7L{qtnrkt2) z$+V1kqJ%kC8xP=fnQDgCL`yE`D4ikdtRTj;4wNSIgrL!2fMQPIoUxoD<;-H=UhXVL z?W0F0oni5jBAO)3GTnXy##qg%Jt6{_0vUjCNrj6UY8y>EK9x?DN{35(vAvD_ZI&6Q z+(vVjB#?WSNwn5PDW;F9<5IL+E$6+XTe3-=4K+ny%bYP5nGxd&lP1;ogQp*lPyN&G z)#VQ8I~S1Be%#>cA^g>V`)2HZP!a5`iZxkiM&cWh(q*_Hiq)Y$Ka59k&9?d{-HQNq z&wBytZ5=!ktzJd;&OBxZ_ftYA9JM@^uficLd{256>!IQzojW#Gb-t{yVr;XlFus8% zU&rw+myGB^noPMQKH@--MD<&!;ghx41_;=~^0XVlx?0RcN^Heq0E$nA^A77Yym|^V z#xYxQ$}p~-&EuAFDurZWE_}G$l=pvY&DiE!(Bc}YCR=v6kS4}Ej69RcMggZ~ja0Ce zC((3)@mN8TjL9`RR{!jW$(Qp0WpQ3=C_t4c0`#)a0opdNz&G>EdpPyV+$mo!;XG!x z8?O2DvtW5yn|(Fsd{xBndTr%DkR*G_s;tMI6P7En0YQbtIzItH3YNj3gC$y}JVptd z$5KMYVT}0e&TQBkY7Hp|N59sLt520Q;UqlI7WyS&#-Z9FjE90HO#10^h~fyQSHaw? z`VF{TU>-WlM$P-s{?YPSKk?s(1oe90a}D+1{EG9j7kyP1czh1gI@L>~2rXOY6K{4^ ztK0ALZIRdLqyJk)**m1>zu8SCU1ycEV})_%&6KXn2~>0Bl@MAOvPonSj67I;zTEeo NQmJYkFd6e%>o245kQx90 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..3ac0bd364f159da54e205f74bd38156d57adc41e GIT binary patch literal 1921 zcmb7FZExBz5dO}u@TfFxRa#WqvdU$ZTE^J4woDCD_boDx0Sm{m`~s!Fe&@Uqy0Eqg zf^+tB_uSo+%K@LE4|l`SxBL5OG8{czM}JOQoer8a$uzMnM$?VIiKvT7L{qtnrkt2) z$+V1kqJ%kC8xP=fnQDgCL`yE`D4ikdtRTj;4wNSIgrL!2fMQPIoUxoD<;-H=UhXVL z?W0F0oni5jBAO)3GTnXy##qg%Jt6{_0vUjCNrj6UY8y>EK9x?DN{35(vAvD_ZI&6Q z+(vVjB#?WSNwn5PDW;F9<5IL+E$6+XTe3-=4K+ny%bYP5nGxd&lP1;ogQp*lPyN&G z)#VQ8I~S1Be%#>cA^g>V`)2HZP!a5`iZxkiM&cWh(q*_Hiq)Y$Ka59k&9?d{-HQNq z&wBytZ5=!ktzJd;&OBxZ_ftYA9JM@^uficLd{256>!IQzojW#Gb-t{yVr;XlFus8% zU&rw+myGB^noPMQKH>utZXiga`YqJ($=YlK1dL&M+KpgcEoLGmwqh~>)u+OFr*$e` zL4_IPn5{Tv7}w6`amzTBO0qB)K3s0fHL$g2Z1XK>bB$D!EjwJu6XP95p2=jRfYY)@ zD%eVtXu807te{B7=6z`YczLX!`tL)6dOh&DhWc-Q#rfEazN!m6K8I+X>ZMVHmM!y% zH@m9U?RWXc$ZPb`|E;3zJyP@E?52{gv&z}A!Z`D0N>}9+syXsX2rUfSB(ey`9xOgz Q?t53MRJ9J6jQOnf7YXo@vH$=8 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..c426dddf71e88169516ba2ae61ee62748fab4b3a GIT binary patch literal 1922 zcmb7FZExBz5dO}u@TfFx6)l2xUFEV$En{?ATc(24eT$53z`}7Xzd-4)-#KrDF03tr z;GF&3J$Luyav)~t!`)!`{r*0h42BO^(VtGE-9~e+xFL=+G~I-ohU#Bj7GMhP`X$qZ5NB(b(}pbeSF1P%Lr!~}tJ)(MKVw~KvynKOo( zM~_~4%h{18hQ!=)(|iNQIwPn%A`+Ma8Gr~yrDq&9t)cCZO1n&@EtEUo-bUdz%}h|? zqB)Oa$UV&@S{tG@H^ zRwzzPKZ&P8kr44_ycTxRmn7D`2CDg_eReGbykU9LjbL3aW-=kJ;-y1;m(p-9$dLLKRU_Q&zpFD;bDc`2k8Dy7jvFT34B+ZN{ddY)M)r$L@O<;!%O z#Y}ghqtDNhXJxJTRiE=^9v|e{Im><^O#YHpQI9(>ozQXvvKkZT!x#i9I1bYemS~j- zhGITvNs?jq_PcmxWHtI2{#H@?BB}VVcT-B&JMG;_W1M=kg{xu?RU8E+j2%qcIC2OEA1pp! Q?0Z@1L^lq&Vq(_#3z7|zH~;_u literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..9fe447f3bc5cdc66ae5068f5948b67f71386064b GIT binary patch literal 1916 zcmb7FZExBz5dO}u@TfFx6)l2xUFEV$EoE$4Tc(24eT$53z`}7Xzd-4)-#KrDF03tr z;GF&3J$Luyav)~t!`)zbbAKOA2E&J|=ufB7ZlgI@+z`hZnr^~PL_H%BP1Guy3Sy%r zH!2dbmiAb00zk}FVmMk8qlB8HWQM4Bl33d~(1y%of`jXvG+r_@U%o#(? zqerj25eZCz3_ygU(ld^l*3foHrCp}d7RsG(Z=-OVW+td` z(VWLIXY!dIrhN5pp&KS$oi1CCgo0$9l(~rle-bwfJ zVh8kHNJwcvuK)Bf{#Ap!v2H&o4|ZC{nk)<_>{=#tIbIOWtI&`i#zVNKTfI*AyoJtA z!>`0Y5~E)Q2Fyu!2lrD!BRq8?QLn-wDttR!7Ie5ms?L@k&XtMr4x_+iveCe4Ss~Ths*@PH zz<8vgNY-W&9qE7e$MM%MEtJN2DWMiBMbSbp``kj?=H>W$o>>p4L6|$`%M_f&On1j? z@%$`#R@Qo7^*LYW@j;lKv+M_w1dpJzezWnJL$IYjHkEUhNAXjw?S-c_Y; zzl)beR-=#MZxyAlkc$6$H>GsF)836V#;G@3xGE-4#Zgeg*ujjABZpw*!Q%79zSopa LbmM?4CT5Mlxk!)V literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..198293e2e5f9c383d75bd9efb258a50edf7d25ae GIT binary patch literal 1920 zcmb7FZExBz5dO}u@TfFx6)l2xUFEV$EoE$4Tc(24eT$53z`}7Xzd-4)-#KrDF03tr z;GF&3J$Luyav)~t!`)zbbAKOA2E&J|=ufB7ZlgI@+z`hZnr^~PL_H%BP1Guy3Sy%r zH!2dbmiAb00zk}FVmMk8qlB8HWQM4Bl33d~(1y%of`jXvG+r_@U%o#(? zqerj25eZCz3_ygU(ld^l*3foHrCp}d7RsG(Z=-OVW+td` z(VWLIXY!dIrhN5pp&KS$oi1CCgo0$9l(~rle-bwfJ zVh8kHNJwcvuK)Bf{#Ap!v2H&o4|ZC{nk)<_>{=#tIbIOWtI&`i#zVNKTfI*AyoJtA z!>`0Y5~E)Q2Fyu!2lrD!BRq8?QLn-wDttRs|xZe9|tv76R6=Jn2TTE*CSI5LYtkgXk0KgVQRNAfVig zG2<&iImVTcn`5QD8FJXyCN0kZNre zN(^0KJkn4kYcrXS^gsLK`0JMzO5?ngQ45vAXrY(AZlP@hb9_C|tdG+m&YkjQD$Zi2 zyW_QZewI8dYrU`foGQQ@0_=$EFDlO zsdM&o_uSo+%bu8`4|mst@Avod_Lq5L3@cOjEUtCxX~` z!HtSVqNP35>i`fll^Tv##3-R=D4ineog~(__Ov0hgrGsckC-5E&N@Mn_IAE&FL%aJ z`{2AOjGgsPv4Zwl%a9Qt4EwbcAweo5wgjW|;{p zTs-4R0=Z|I#4AIz=H`%kp=7(&^1(a0CF|6?k)h~YnKQ;RGh#gE%BJSN|8(>C)I088 zoNs}C2ni|e#`T{bM!yusXDuSIUhU5w^L^Ex!-hw%XJ*;enUdlsSW zcDix!NQ{0Jd3q9|{Hg-~RL}`eok-QIa0nYBpH|>&Eo3_Le5mVud11xaWm#c-1x>zW z>{cjFOg~K~LXi;hX0#G^-j^iNy%uWtWsPO{-+k6WeT_e?G%MKR;#dwQRU@~57;Iycb ziZ&u8hR!h_YABMmxl)JvpZ!tt^-F}ZI4_kHp-LJNdfDv=ZCY61n|bD)oCJMtl`j`^ z9y8m8jy^w2o|m=RS98u6Mf`41Sbj~3wEQKlvLJU}I-%tnbTuN*hd~Hha2)0xEYUI* z3?+QVl!B7OGzs^u5wS7UI#SM$UacD!pDJy_NqDUtG)%#$L$yN~UrU~F6^6_{VlfP{ zf_aej8*sV80(6>9ocE!_8y2u(@V^fU@(sY}8tT9K73Wz{eN`8DfA-NTH4Cc=En5~6 zZ+2Cy+wbzDk=N*B_*=%=r=;e;*-fQg@3ePAjdAA9mafV{RC5%RFmf}<_PP7vH7P*ewT6G-DFD>QQ@9exZW$A!Q zNu9HwyXWqnT=w`BeYm?BT;JcvkE`zYZA&Kb)oQqIhG?d8rA z)INB0(ixTn|BW@U&9|V%HBwEsaJY~r##@X$lkr*sr$voaw2>## zbdK>*L6MBfH9A!P?2po~Um_IZywp&HDp5q}WuGIoXthh~M?v%C9LAhP|X!7Ua$e%avGzqDI6zKLSAumcgWhC0b@YK`EOh zQbNgLj`;i5aM&1X9VsVAuhxx=Pn9;|B)rfT8YW@Lq1qvgZv;!3^y6h8B{7Vzg1MLV z8*sV8Jaign&HK>)-SSvJ^52I9`Fh}U4fWsrit~wAeN`8DfA-NT(+i^rEnDUjZ+2Cy z+wbyok=N*>|69i4BU1C&{rK@rT)f{;xj4VvqG`0w49xOhe@A^=w MOttoyO!&0*7Yk>QV*mgE literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..80a947c22da4cd66991b3ece5122ecb7077089a4 GIT binary patch literal 1922 zcmb7FZBN@U5dPj@;T37xP@tkPhD?V9g>}<_PP7vH7P*ewT6G-DFD>QQ@9exZW$A!Q zNu9HwyXWqnT=w`BeYm?BT;JcvkE`zYZA&Kb)oQqIhG?d8rA z)INB0(ixTf~2a~LJhyrW*Z@34a?(h4C`t!6&bM=lRl_E6V5xWQ}GHa z%orzZ$tlCQb~cY&#;H^i!kqbVxhdDc#v0h>ThQhjsU}-ET*wpSEk>Tnc&&ibqDCs( zNR((g$9SlqNXFzk9jbr!N9oru5eji$>L@~$Fe3D_*Adz@u)sI-%=f9<{F5^6A z*d49-^Rr-iS(|+|=X_Db?|OaZ*OUmuUeYQHa_5BQN~}RuBVwH&gP;Y=VA{bFEi;~= zl+6+;q2w@2{C#UcYz(!Il+&YE>&C^WN}F&JUTO;slQ8H|?GVN{f~8FQ5wnky7)DsZ z+{^k6xLjc#It}yYeQ5uFd8{A%??ZxoJ@C1P`fq;4`NXTfstde7`)HNvg;9i-E%S*t zyQPTLg0t7N5^| PeXLZbT6;_;eA@a8a_Eu! literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..5fe0f19c1f92f7e50c037772dcd2332a25e71d31 GIT binary patch literal 1924 zcmb7FZExBz5dO}u@TfFxRa#WqvdU$ZTFU6OwoC=7`xY6;fQ92&e&MCRe&@Uqy0Eqg zf^+tB_uSo+%K@LFPY?aU?c-xO?hl@>!#^jDRtwFTWSUqOp~=SIgw#bOq={UH6HZLH zU|NPeR>B;sjR)|VOf*9)q9vCzluQwIRuE$v2TGG!Owgd$LlGx%&R9;7a%R47FLM^5 z=Fzj0&ami65lv!dnQp!VW31-X9uWadfeb*nq{2lEHI1e%pGvDtrNyPa*xrWzHqDGv zZo?UiW5_+tBwT5t6w}AlaVeUOhV$OhJ=rAAjxnOVCk`diY;t7|;N4y)YxS96^iB-3OYCdVBZGeC?EKl1ZtjonzB*d0ndZ7D6IPbPf z#%m}yV;r$1rwrrD*(`1mr_@Or=E8@|OqmL{)|73w1&ywds=CrH?F>v+JuwvSX*eAgi(hohcNC77BlIG%pr T;_KzU*Of|CEK9x?DN{35(vAvD_ZI&6Q z+(vVjB#?WSNwn5PDW;F9<5IL+E$6+XTe3-=4K+ny%bYP5nGxd&lP1;ogQp*lPyN&G z)#VQ8I~S1Be%#>cA^g>V`)2HZP!a5`iZxkiM&cWh(q*_Hiq)Y$Ka59k&9?d{-HQNq z&wBytZ5=!ktzJd;&OBxZ_ftYA9JM@^uficLd{256>!IQzojW#Gb-t{yVr;XlFus8% zU&rw+myGB^noPMQKH@--MD<&!;ghx41_;=~^0XVlx?0RcN^Heq0E$nA^A77Yym|^V z#xYxQ$}p~-&EuAFDurZWE_}G$l=pvY&DiE!(Bc}YCR=v6kS4}Ej69RcMggZ~ja0Ce zC((3)@mN8TjL9`RR{!jW$(Qp0WpQ3=C_t4c0`#)a0opdNz&G>EdpPyV+$mo!;XG!x z8?O2DvtW5yn|(Fsd{xBndTr&u84-rPq*WH=&I!wv*npx!Vx1p>pasid(!mm~QXZp( z&0{H{EK9x?DN{35(vAvD_ZI&6Q z+(vVjB#?WSNwn5PDW;F9<5IL+E$6+XTe3-=4K+ny%bYP5nGxd&lP1;ogQp*lPyN&G z)#VQ8I~S1Be%#>cA^g>V`)2HZP!a5`iZxkiM&cWh(q*_Hiq)Y$Ka59k&9?d{-HQNq z&wBytZ5=!ktzJd;&OBxZ_ftYA9JM@^uficLd{256>!IQzojW#Gb-t{yVr;XlFus8% zU&rw+myGB^noPMQKH>utZXiga`YqJ($=YlK1dL&M+KpgcEoLGmwqh~>)u+OFr*$e` zL4_IPn5{Tv7}w6`amzTBO0qB)K3s0fHL$g2Z1XK>bB$D!EjwJu6XP95p2=jRfYY)@ zD%eVtXu807te{B7QkjnI0+B6g@#EObf|U+h`;QW#l#b=>Jwx_9ChIZ+25@*IDK4SYe!bGo`C?4AmTYC4?3xZ4y}oa}O4u RFZaEyRH|ABOvZfH`U`!-k^KMw literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..0b0da908e235ad22e5185606a2b5ca3c8eb6d73f GIT binary patch literal 1923 zcmb7FZExBz5dO}u@TfFx6)l2xUFEV$En{?ATc(24eT$53z`}7Xzd-4)-#KrDF03tr z;GF&3J$Luyav)~t!`)!`{r*0h42BO^(VtGE-9~e+xFL=+G~I-ohU#Bj7GMhP`X$qZ5NB(b(}pbeSF1P%Lr!~}tJ)(MKVw~KvynKOo( zM~_~4%h{18hQ!=)(|iNQIwPn%A`+Ma8Gr~yrDq&9t)cCZO1n&@EtEUo-bUdz%}h|? zqB)Oa$UV&@S{tG@H^ zRwzzPKZ&P8kr44_ycTxRmn7D`2CDg_eReGbykU9LjbL3aW-=kJ;-y1;m(p-9$dLLKRU_Q&zpFD;bDc`2k8Dy7jvFT34B+ZN{ddY)M)r$L@O<;!%O z#Y}ghqtDNhXJxJTRiE=^9v|e{Im>=CA}xPOt0>5wmriK80bPxW^I;Bx7959h2TQa{ z1Vb^OGo_&9FigUAXF_ZZm5vnSqgUz1<)>1ca1x$s2Mtp&=}_em#skS?uELBtL@a_C zmNO5segiI3Sb)ybf%86ec)|iU%>DNvLB0X_TtWRezx+H4sxRvT&(9%RCuV6ip+(C= z;`Od7b^BesGqM_e41cRAeUnuD*SjgT>z(#)q%lst*}_#Zh$@bP62=ZjZ5%lSlMfc3 RFZR8ybfOyvTrn|g`~?Pel7j#M literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..89608b5bb926679024612b54664113a32342319d GIT binary patch literal 1917 zcmb7FZExBz5dO}u@TfFx6)l2xUFEV$EoE$4Tc(24eT$53z`}7Xzd-4)-#KrDF03tr z;GF&3J$Luyav)~t!`)zbbAKOA2E&J|=ufB7ZlgI@+z`hZnr^~PL_H%BP1Guy3Sy%r zH!2dbmiAb00zk}FVmMk8qlB8HWQM4Bl33d~(1y%of`jXvG+r_@U%o#(? zqerj25eZCz3_ygU(ld^l*3foHrCp}d7RsG(Z=-OVW+td` z(VWLIXY!dIrhN5pp&KS$oi1CCgo0$9l(~rle-bwfJ zVh8kHNJwcvuK)Bf{#Ap!v2H&o4|ZC{nk)<_>{=#tIbIOWtI&`i#zVNKTfI*AyoJtA z!>`0Y5~E)Q2Fyu!2lrD!BRq8?QLn-wDttR!7Ie5ms?L@k&XtMr4x_+iveCe4Ss~Ths*@PH zz<8vgNY-W&9qE7e$MM%MEtJN2DWMiBMbSbp``kj?=H>W$o>>p4L6|$`%M_f&On1j? z@%$`#R@Qo7^*LYW@j;lKv+Oq`((;$Iih|sE>4cUWP}G<>A0{AZ!EqRLutcjwFckAS zQwmBBV zx8KFvBCFBI@VAQ6cSyy5y_-_I-f8bf8spTPEnF2NsNyInVeDYY#*srX^I-A$V&8j8 MC%SRK6%(_@Un^RWEdT%j literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..80aac1fcaccd6edd6950694a98200d426f993b2b GIT binary patch literal 1921 zcmb7FZExBz5dO}u@TfFx6)l2xUFEV$EoE$4Tc(24eT$53z`}7Xzd-4)-#KrDF03tr z;GF&3J$Luyav)~t!`)zbbAKOA2E&J|=ufB7ZlgI@+z`hZnr^~PL_H%BP1Guy3Sy%r zH!2dbmiAb00zk}FVmMk8qlB8HWQM4Bl33d~(1y%of`jXvG+r_@U%o#(? zqerj25eZCz3_ygU(ld^l*3foHrCp}d7RsG(Z=-OVW+td` z(VWLIXY!dIrhN5pp&KS$oi1CCgo0$9l(~rle-bwfJ zVh8kHNJwcvuK)Bf{#Ap!v2H&o4|ZC{nk)<_>{=#tIbIOWtI&`i#zVNKTfI*AyoJtA z!>`0Y5~E)Q2Fyu!2lrD!BRq8?QLn-wDttRs|xZe9|tv76R6=Jn2TTE*CSI5LYtkgXk0KgVQRNAfVig zG2<&iImVTcn`5QD8FJXyCN0kZNre zN(^0KJkn4kYcrXS^gsLK`0JMzO5?ngQ45vAXrY(AZlP@hb9_C|tdG+m&YkjQD$Zi2 zyW_QZewI8dYrU`foGjqE#Xo zius%=1to`360SS*VQZ*#q!=E(N;fV)mD+@p@Ju^sn1VTnDu*y0NFH+)Cd?sX5lpb0 zd64xRaGAmabe4{r_o2h{6|iCIzYhuW4Z!CL>c9Er=UGsFSr>SI4$(R>OREVjS{4$o zcU7s|@8XS-)#zjRTSe)6q~gEcO{rb)w09$oaq7($u8J{KaTJs=b}(q;$RU_}u=sqj P?_H%6-8kTiiCN<>k^_;& literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..338226f8164bf67916a24a709ffaaaf55f377e42 GIT binary patch literal 1924 zcmb7FZExBz5dO}u@TfFxRa#WqvdU$ZTFU6OwoC=7`xY6;fQ92&e&MCRe&@Uqy0Eqg zf^+tB_uSo+%K@LFPY?aU?c-xO?hl@>!#^jDRtwFTWSUqOp~=SIgw#bOq={UH6HZLH zU|NPeR>B;sjR)|VOf*9)q9vCzluQwIRuE$v2TGG!Owgd$LlGx%&R9;7a%R47FLM^5 z=Fzj0&ami65lv!dnQp!VW31-X9uWadfeb*nq{2lEHI1e%pGvDtrNyPa*xrWzHqDGv zZo?UiW5_+tBwT5t6w}AlaVeUOhV$OhJ=rAAjxnOVCk`diY;t7|;N4y)YxS96^iB-3OYCdVBZGeC?EKl1ZtjonzB*d0ndZ7D6IPbPf z#%m}yV;r$1rwrrD*(`1mr_@Or=E8@|OqmL{)|73w1&ywds=CrH?F>v+JuwvSX*eAgi(hohcNC77BlIG%pr T;_KzU*Of|CEK9x?DN{35(vAvD_ZI&6Q z+(vVjB#?WSNwn5PDW;F9<5IL+E$6+XTe3-=4K+ny%bYP5nGxd&lP1;ogQp*lPyN&G z)#VQ8I~S1Be%#>cA^g>V`)2HZP!a5`iZxkiM&cWh(q*_Hiq)Y$Ka59k&9?g8^Im{X zx)%ZJZ5=!ktzJd;&OBxZ_ftYA9JM@^uficBd{256>!IQzojW#Gb-t{yVr;XlFus8% zU&rw+myGB^noPMQKH@--MD<&!;ghx41_;=~^0XVlx?0RcN^Heq0E$nA^A77Yym|^V z#xYxQ$}p~-&EuAFDurZWE_}G$l=pvY&DiE!(Bc}YCR=v6kS4}Ej69RcMggZ~ja0Ce zC((3)@mN8TjL9`RR{!jW$(Qp0WpQ3=C_t4c0`#)a0opdNz&G>EdpPyV+$mo!;XG!x z8?O2DvtW5yn|(Fsd{xBndTr&u84-rPq*WH=&I!wv*npx!Vx1p>pasid(!mm~QXZp( z&0{H{EK9x?DN{35(vAvD_ZI&6Q z+(vVjB#?WSNwn5PDW;F9<5IL+E$6+XTe3-=4K+ny%bYP5nGxd&lP1;ogQp*lPyN&G z)#VQ8I~S1Be%#>cA^g>V`)2HZP!a5`iZxkiM&cWh(q*_Hiq)Y$Ka59k&9?g8^Im{X zx)%ZJZ5=!ktzJd;&OBxZ_ftYA9JM@^uficBd{256>!IQzojW#Gb-t{yVr;XlFus8% zU&rw+myGB^noPMQKH>utZXiga`YqJ($=YlK1dL&M+KpgcEoLGmwqh~>)u+OFr*$e` zL4_IPn5{Tv7}w6`amzTBO0qB)K3s0fHL$g2Z1XK>bB$D!EjwJu6XP95p2=jRfYY)@ zD%eVtXu807te{B7QkjnI0+B6g@#EObf|U+h`;QW#l#b=>Jwx_9ChIZ+25@*IDK4SYe!bGo`C?4AmTYC4?3xZ4y}oa}O4u RFZaEyRH|ABOvZfH`U`!{k^KMw literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..b49cdc6be5ee137b764147452395fc5246d08226 GIT binary patch literal 1922 zcmb7FZBN@U5dPj@;T37xP@v*v3>k(5rIcwvCt8Vpi(K1ntvZh7mzMJDcg|Z=mJX=e z)H(aPd+zSZWk*cVr~6+2`r#oS_4<#O@t=cMr-P!(g~v8Nn&klM;kIt2PzZ)e-~a%T*+ z_ny4+ma{!g3`w}-ru`0#bw*IPMG5`^ZO3yfITSGe`l}?pPM<{o?zKz3emYJZ! z#Z#Unkb9O%yfj2>ZuY4cO14`qAH1VGvP!)h8j60DIb$p{BgP}HY-%35&$my{=SPPZ zXB(g!At9yhxbE}g@K*!w+PdwaBG_>iYcw;Quq&C;`EW)wuR}wA8290tZ2`pyog9Z> z1%Dz&zll3IiBOKMa6c9F!BZzv^(Oo@LhwB}slYugWIFYHpzCaTVa3>GIbnPSExu&z zPAE=HH%-Pukr45AxD<^Rg-y)R7d99-eRnmyi>uyJA-NFLj%ro!gIH+@@e7TJC znAt9L^yNkJysXW>nsdG=;)Aq@ukuxoP>wkLBkXbI#fG^aZmDut1x1A5sP7j z70iRI-+{{&7NC=C-h2!lp0I!oWB+4FkZ%CK)KLG;uQ<Z`iI^RtVVshL|%XxXxm zc(bco-F}y^jJ!sl!`~v#UL-aD&2B2~dZ)b`XpA#&wschvqMDP=e*qM0l1cyo literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..6957270fbc22a31e00543ef6ed17e920eff1b6a8 GIT binary patch literal 1916 zcmb7F+fExX5Pi>AxIwB~v=DJAMTR0lNYko7DiVpltsL7)ta!baFS+pb9q%n0x+xGP z8}Ha>=FH65**!5qAMSho+lPmE)ayT9$A6Amoer9E#SL+sq47H0#MCnq(^M_uu^={{ zbE9IBXlW1iIsn8}rG}#=F-oW@N+*bVCyBMKJ#EM|A*kQ&A|?o&vrbT?y`Ama%bhXQ zK6vuVTh0zNF(l!RoAw(p))_(F0g=EI$N)qrDm~+&R612E9iiO$<~9zuS!RL? z7f*STK<-&4@zM~jxjCd>DA{hceDIF$$U60IXejzt=8Un-j2Mr&vZ;CKu6{hNE>DiH zF1A45g@lxLQ(r8gy4I0R)Kq3$aL!YK-byw!iurWa>DoqT71pe zolu;ZZkmjRA|YZ|l0;v&P{Su{u!|663Cok?7}nKdB2(fj7G2PLDt&NRrxCPMm@#I2 zAt=YVb~cY&#;Np?g*gx5a#P;@jWuJNZ$X1=q?&Bm;X;@gZ!ro?Mr#e6<~35$Mx4aZ z8O8$*MY1+m=s^FoKTN)UiBJ~jrGg?<=vDh!u>#9|m;1@j>5 zH{f!G1?VK3H19))XDeXCz<(bSGQ{OqG;YUWlGTDB}C-t4MY zx8LQ^I-A$V%KX* Mr@FPr6%&)zUpl6b^Z)<= literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..28960bc48cdb995ceaa4be6248359addb367e25d GIT binary patch literal 1920 zcmb7F+fExX5Pi>AxIwB~v=DJAMTR0lNYko7DiVpltsL7)ta!baFS+pb9q%n0x+xGP z8}Ha>=FH65**!5qAMSho+lPmE)ayT9$A6Amoer9E#SL+sq47H0#MCnq(^M_uu^={{ zbE9IBXlW1iIsn8}rG}#=F-oW@N+*bVCyBMKJ#EM|A*kQ&A|?o&vrbT?y`Ama%bhXQ zK6vuVTh0zNF(l!RoAw(p))_(F0g=EI$N)qrDm~+&R612E9iiO$<~9zuS!RL? z7f*STK<-&4@zM~jxjCd>DA{hceDIF$$U60IXejzt=8Un-j2Mr&vZ;CKu6{hNE>DiH zF1A45g@lxLQ(r8gy4I0R)Kq3$aL!YK-byw!iurWa>DoqT71pe zolu;ZZkmjRA|c`uNEPDRC8(E@(cLJ~*vY3Hm9_ z7&E>Qlw({wo5wBVR2s>`oQH6^DObS8nz7Bdpvg5-O}6ZCAx@097zHMywFXY}8mVX_ zP-5r|lm5uum8j?kuo1-_YQ-p6s!=2rP~5$7?p z-QiL^KTDpMwb@s5&R0cz5Oxr0_)Unk{3WfjAopH6q2(G>H6+f5AqZM<9A+IX(IOQL zC49=1f|A1|3D>Rhurbs+QqGRcS~spfRoaA;@JKson1V5fYKJiHNuF>O2FyNUF$}PR zd64xRaJj+)bdpV*_o2h{6|iCGzYhuW4Z!Cb>c9CF=UGsFRTp@E_R%smbE^q0TNV;; zc2%p}@A8F_*XU#TTg2IGq~^cbO{HD$w08rIapujIuF5e~a}<;?bTDU=*ddsEu=sqj P>s6&w-P+@diAn1(plgwg literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..99ff27cd0c4e2602dee58c53d6a8ec8ce0139248 GIT binary patch literal 1923 zcmb7FZExBz5dO}u@TfFxRayk?Smm-xEv0l?Tc(24eT$4^z`}7Xzd-4)-#KrDF03tr z;GF&3J$Luya=>Tk)BRw0{qPV@2E)h8@K3wZYN0ukOcTo@G~M`{kh+M3G?A-t%83b= zOv{kRN|t|t35{^n@+5nFM}Fs_`<;udjAnWSOPe7MY%iC}9@*=AeN=L)GhTY5OxC&oLBJd?>r0jFh! zRIt@3(R6|FNI{W|$%Hym|Ll+B@81HH#(6EI0F}}R(CcmoXxqXZU(Yk^>A?9I+CO0)>*xN*kRV?Ve5s)Rn_qrD@~SWE0?*GOS|@sG6rn}S zeB$-4Ds}r^yfd;IefEE=FnyC${MWlFwd<^McBC*)y_v#QF^DRTyb{J1Mr|Bg1d|UI SU(ffwtyH2K2TVqM*7yqsQ<8%K literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..c913c159c6ef972b80ade9bf46476ec6028c08b0 GIT binary patch literal 1917 zcmb7FZExBz5dO}u@TfFxRayjeta4eUmNGW2EmJ}2zD33{VBt8HU!e5Y@0>S67uFU* zaL#`2p1XT;IpTBl;chs(xxWvm!_mW4_@~`ywa|h|rio<{nr;0}NL@rin#grH>^q%>K?1dRp*6mbIQjO7$5XO@TdGG`HL zo;*6~42w<_(IjS;>E;_S#%fOO2@${)$N+>(DqO@+(`efAskF*eT3p)m-EHV^)66*K zHe9echTPLk!i^?MF?~uMm!jEdIPV?Zl5OJbL{s#w$Qfgi8Zn+SX%c-uc>3}9)bDmK zFZMv+xqy@o;|5O;lV3Hs8)FZH@?d9Wtm#rS5?zagt|m*OSQYB?!*~SObgSPx3sAdr z9-!V1|46iY71-@~%pUHighn`Oc_LqhLqzztdnM_i;v!i%Hda--EVp88)1)xIh8|x< z(JhyZ=s^r@!cGw08*ut{g31M9><{}}sWHA82C&GD$RT^GAxf$b# ztvO{FSI%Z}i#Vk~(lF;fTxQDJzq6)nvn}Xwg;bp_J)A2O<2^>6$#koL)2c!$*r}6f zy2N;_ph(7K5*@35_9yYzF9Ay9yp&LYN>K#pWuF7IYhI48=b80z=7qUezD&Vc%yf6M z;m^;4Wo51RRiE=^9v{Tn?q$C@5r(~_RTSjT3Coq(f}$qGIzItH3zos4gC$xgJVG&B zL{dV@VT}0e-fY+zDjg{XN59gI%TJ{?;UqlI78)jD#-Yj~jE927O#10^jG_>xSI*qa z`VF{DVIDe9N6q`t{@LS67uFU* zaL#`2p1XT;IpTBl;chs(xxWvm!_mW4_@~`ywa|h|rio<{nr;0}NL@rin#grH>^q%>K?1dRp*6mbIQjO7$5XO@TdGG`HL zo;*6~42w<_(IjS;>E;_S#%fOO2@${)$N+>(DqO@+(`efAskF*eT3p)m-EHV^)66*K zHe9echTPLk!i^?MF?~uMm!jEdIPV?Zl5OJbL{s#w$Qfgi8Zn+SX%c-uc>3}9)bDmK zFZMv+xqy@o;|5O;lV3Hs8)FZH@?d9Wtm#rS5?zagt|m*OSQYB?!*~SObgSPx3sAdr z9-!V1|46iY71-@~%pUHighn`Oc_LqhLqzztdnM_i;v!i%Hda--EVp88)1)xIh8|x< z(JhyZ=s^dkwRUsAZ z6iPH*Vmww*Bx5p}j@3W=llbeG0HtwW$|yjkFaq?l*8$o!Fvr*P%=$R<;@m4=rs6DS zx;xqM=V!sPvex^m&-pTs4+3rXvfrEt!(P%V3UcR!r&6165}s)b4U;hEP~{NDL&0Ju{e(G2Q3w+( zXYOVF23)2v51psu=6z`Ye0i*&`tL)6d_C~Fg8FZM`T5AJzN`y8KgVd3=#^1~7A^CM z*So6J?RW9U$ZGV_|ER literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..ef7e3b9de85be365046298f8a05750a26e325e72 GIT binary patch literal 1922 zcmb7FZExBz5dO}u@TfFxRa(@SvC3tYTGr8NZJ7#E_boEE0Sm{m`~s!Fe&@Uqy0Es0 zfOGb9_uSo+%bu8`4|mst@Avod_K96I0JfOjEUtCxX~` z!HtSVqNP35>i`fll^Tv##3-R=D4ineog~(__Ov0hgrGsckC-5E&N@Mn_IAE&FL%aJ z`{2AOjGgsPv4Zwl%a9Qt4EwbcAweo7*_tW|;{p zTs-4R0=Z|I#4AIz=H`%kp=7(&^1(a0CF|6?k)h~YnKQ;RGh#gE%BJSN|8(>C)H^x8 zINt*O5E4?_jq5)>jD9uXzFW5&R0KP%VvXm96Luw2x){xg=5=Vu590y+W?Q}PX@v5t zf7||Ll*FuU{gR#d)ct2vyRE(93Q|Xw$+1-^?@bM4zncuZViZyq1KUddh}}DxcF3Q6HdZI?Vw=_1|6y$!uVS9gsU)O_7RI= zgcZz#tlxmk6&9e=Y~H*N9iFg&4P*a(NRV#;KG#tH&96Amg6gZf!1J?@R;gK7O=#J& zka)AJTHSt^uZ+A#AH&}=&R!%n|IKbH?RuxZ8)}R*Z?<$*4x*Z)poEcwNt?tD!Q_L* S=kr}JE1l}r9#>3ETYmv1Gm=dJ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..adb02359854a66feaeeda63ae1dfe13c4081b394 GIT binary patch literal 1916 zcmb7FZExBz5dO}u@TfFxRa(@SvC3tYTH3K`ZJ7#E_boEE0Sm{m`~s!Fe&@Uqy0Es0 zfOGb9_uSo+%bu8`4|g|%>-+n7d^32sjQly+Zx&lsdTDTIzqX#&21cRv&;k) zE}rouf!wo9;*}v`QRPhl6C6c$WZjH%o$^u88IGnWm9wCfBNzG)H^x8 zINt(&7ZOt1jq5)>jD9uXuC3b*DuSI>vBq=53A>UhU5w^L^Ex!-hw%V@v#nnDG(!1R z!5@jyui{RQBNVRgChSiIeel$YRJ{s6ix7N|x>e8{Eo3_Le5mVed11xaWjSGd1ued0 z>{cjFOg~K~LXi-$FG-?%E!6PI8tftjnZokqIEHn#n97v6ibWsvo=P7a)@cOo6lRPW zUkb`GuAR-}mT@Y5WMR%ixZIR?e`C$q=3CI<8mT5*cDN8G##@X6lkr*ur$voav=JvU zbdK>*Ly@e_6*|=a?2nSKUm}#nd8wcXRicQ{%RWbF)4T%T%ro!dB=jb;-|VK+u6NqIp~g7#W=mJ)2&y>>N*Fnqu}SO@%sg0pKHv44 N(y4CkamB>6^%qNAkM{ro literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..b3df1baf675f1eb03a7f5de7fa28df496279c315 GIT binary patch literal 1920 zcmb7FZExBz5dO}u@TfFxRa(@SvC3tYTH3K`ZJ7#E_boEE0Sm{m`~s!Fe&@Uqy0Es0 zfOGb9_uSo+%bu8`4|g|%>-+n7d^32sjQly+Zx&lsdTDTIzqX#&21cRv&;k) zE}rouf!wo9;*}v`QRPhl6C6c$WZjH%o$^u88IGnWm9wCfBNzG)H^x8 zINt(&7ZOt1jq5)>jD9uXuC3b*DuSI>vBq=53A>UhU5w^L^Ex!-hw%V@v#nnDG(!1R z!5@jyui{RQBNVRgChSiIeel$YRJ{s6ix7N|x>e8{Eo3_Le5mVed11xaWjSGd1ued0 z>{cjFOg~K~LXi;hflJ?)B+>K!@62bWlCJdqz{@;r4LT)RDymA zGscWB1?3pm&gOB;IF&}SFlQlLZpsy~v1V-ZEogF$RFf?`T!<6nEk=RKc&&lcqDCs( z2$UE)$9SlrNY>^m9qNDfN6FVO5z6AcR8fQ~VMORD{jC0eF} zp@h$vQc!Z3B;j{!JZucLj+C>bSL?>br%Ib}5*}#>4O1}YQ0)-LH;=s23)SN0G(#j=6&e!dghWc-Q#d#J~U)2SkpMA7S&BAIz%a(=2 zn_boF_Pcyx!#^jDRtwFTWSUqOp~=SIgw#bOq={UH6HZLH zU|NPeR>B;sjR)|VOf*9)q9vCzluQwIRuE$v2TGG!Owgd$LlGx%&R9;7a%R47FLM^5 z=Fzj0&ami65lv!dnQp!VW31-X9uWadfeb*nq{2lEHI1e%pGvDtrNyPa*xrWzHqDGv zZo?UiW5_+tBwT5t6w}AlaVeUOhV$OhJ=rAAjxnOVCk`diY;t7|;N4y)YxS96^iB-3OYCdVBZGeC?EKl1ZtjonzB*d0ndZ7D6IPbPf z#%m}yV;r$1rwrrD*(`1mr_@Or=E8@|OqmL{)|73w1&ywdscP7NvP-#dpKDw1=Tzx5}2`AyHw$LvLlMYo5VcZuiX421?LllKD z!*b?c)$hP%0`t&mI&eOQ_79lH`nmrxB&gQ|Un;2o=9iz3yy(liz~gg>R*7C1MQG77 zpLo5iO5J`J?~JTQpZ(u5Oy49G|MhN4={l>N9V(1dZ>Dfn%%O@SuY{3>Q5%OA!Qg|% S*UNoxE0w6m0h1A*HvR&B50Y>I literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..89645723cd8f980f20255bda375f4e4e5540674c GIT binary patch literal 1917 zcmb7FZExBz5dO}u@TfFxRa#WqvdU$ZTE^J4woDCD_boDx0Sm{m`~s!Fe&@Uqy0Eqg zf^+tB_uSo+%K@LE4|l`SxBL5OG8{czM}JOQoer8a$uzMnM$?VIiKvT7L{qtnrkt2) z$+V1kqJ%kC8xP=fnQDgCL`yE`D4ikdtRTj;4wNSIgrL!2fMQPIoUxoD<;-H=UhXVL z?W0F0oni5jBAO)3GTnXy##qg%Jt6{_0vUjCNrj6UY8y>EK9x?DN{35(vAvD_ZI&6Q z+(vVjB#?WSNwn5PDW;F9<5IL+E$6+XTe3-=4K+ny%bYP5nGxd&lP1;ogQp*lPyN&G z)#VQ8I~S1Be%#>cA^g>V`)2HZP!a5`iZxkiM&cWh(q*_Hiq)Y$Ka59k&9?g8^Im{X zx)%ZJZ5=!ktzJd;&OBxZ_ftYA9JM@^uficBd{256>!IQzojW#Gb-t{yVr;XlFus8% zU&rw+myGB^noPMQKH@--MD<&!;ghx41_;=~^0XVlx?0RcN^Heq0E$nA^A77Yym|^V z#xYxQ$}p~-&EuAFDurZWE_}G$l=pvY&DiE!(Bc}YCR=v6kS4}Ej69RcMggZ~ja0Ce zC((3)@mN8TjL9`RR{!jW$(Qp0WpQ3=C_t4c0`#)a0opdNz&G>EdpPyV+$mo!;XG!x z8?O2DvtW5yn|(Fsd{xBndTr%DkR*G_s;tMI6P7En0YQbtIzItH3YNj3gC$y}JVptd z$5KMYVT}0e&TQBkY7Hp|N59sLt520Q;UqlI7WyS&#-Z9FjE90HO#10^h~fyQSHaw? z`VF{TU>-WlM$P-s{?YPSKk?s(1oe90a}D+1{EG9j7kyP1czh1gI@L>~2rXOY6K{4^ ztK0ALZIRdLqyJk)**m1>zu8SCU1ycEV})_%&6KXn2~>0Bl@MAOvPonSj67I;zTEeo NQmJYkFd6e%>o24FkQx90 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..513ff2695ac3b18f7c6cb0880eee43999971493f GIT binary patch literal 1921 zcmb7FZExBz5dO}u@TfFxRa#WqvdU$ZTE^J4woDCD_boDx0Sm{m`~s!Fe&@Uqy0Eqg zf^+tB_uSo+%K@LE4|l`SxBL5OG8{czM}JOQoer8a$uzMnM$?VIiKvT7L{qtnrkt2) z$+V1kqJ%kC8xP=fnQDgCL`yE`D4ikdtRTj;4wNSIgrL!2fMQPIoUxoD<;-H=UhXVL z?W0F0oni5jBAO)3GTnXy##qg%Jt6{_0vUjCNrj6UY8y>EK9x?DN{35(vAvD_ZI&6Q z+(vVjB#?WSNwn5PDW;F9<5IL+E$6+XTe3-=4K+ny%bYP5nGxd&lP1;ogQp*lPyN&G z)#VQ8I~S1Be%#>cA^g>V`)2HZP!a5`iZxkiM&cWh(q*_Hiq)Y$Ka59k&9?g8^Im{X zx)%ZJZ5=!ktzJd;&OBxZ_ftYA9JM@^uficBd{256>!IQzojW#Gb-t{yVr;XlFus8% zU&rw+myGB^noPMQKH>utZXiga`YqJ($=YlK1dL&M+KpgcEoLGmwqh~>)u+OFr*$e` zL4_IPn5{Tv7}w6`amzTBO0qB)K3s0fHL$g2Z1XK>bB$D!EjwJu6XP95p2=jRfYY)@ zD%eVtXu807te{B7=6z`YczLX!`tL)6dOh&DhWc-Q#rfEazN!m6K8I+X>ZMVHmM!y% zH@m9U?RWXc$ZPb`|E;3zJyP@E?52{gv&z}A!Z`D0N>}9+syXsX2rUfSB(ey`9xOgz Q?t53MRJ9J6jQOnf7YX{2vH$=8 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..c9e96d49978360eb544ea2f40ac508315800ec32 GIT binary patch literal 1921 zcmb7FZBN@U5dPj@;T37xP@v*v3>k(5rIcwvCt8Vpi(K1ntvZh7mzMJDcg|Z=mJX=e z)H(aPd+zSZWk*cVr~6+2`r#oS_4<#O@t=cMr-P!(g~v8Nn&klM;kIt2PzZ)e-~a%T*+ z_ny4+ma{!g3`w}-ru`0#bw*IPMG5`^ZO3yfITSGe`l}?pPM<{o?zKz3emYJZ! z#Z#Unkb9O%yfj2>ZuY4cO14`qAH1VGvP!)h8j60DIb$p{BgP}HY-%35&$my{=SPPZ zXB(g!At9yhxbE}g@K*!w+PdwaBG_>iYcw;Quq&C;`EW)wuR}wA8290tZ2`pyog9Z> z1%Dz&zll3IiBOKMa6c9F!BZzv^(Oo@LhwB}slYugWIFYHpzCaTVa3>GIbnPSExu&z zPAE=HH%-Pukr45AxD<^Rg-y)R7d99-eRnmyi>uyJA-NFLj%ro!gIH+@@e7TJC znAt9L^yNkJysXW>nsdG=;)Aqv8X;6I!l7Rzu=^n1Ub$$6?sP5-n1} zP{OB7DX2J%l5pLa4{JlMA?5HmuQlW1OC?P>3D2~Hekqu9sCEeBp5zHvVZ!Vp7Q+N9 zmPA@-T-{5q5hj+ah?UyS9O8MXBRC~Gq;-1vSlIh zW>>Yk{Vv}ad5u1YzeSwAM{53?-Bi-`PJ1`d7-!yW>8hMVHAg`SLkEL4i5-H$2aB&~ Q+ul_=)vX<_n3%Nw0-!vR{{R30 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..20b9cddc891a1bc261ad5cc3910615fde50ec3dd GIT binary patch literal 1915 zcmb7F+fExX5Pi>AxIwB~v=DJAMTR0lNYko7DiVpltsL7)ta!baFS+pb9q%n0x+xGP z8}Ha>=FH65**!5qAMSho+lPmE)ayT9$A6Amoer9E#SL+sq47H0#MCnq(^M_uu^={{ zbE9IBXlW1iIsn8}rG}#=F-oW@N+*bVCyBMKJ#EM|A*kQ&A|?o&vrbT?y`Ama%bhXQ zK6vuVTh0zNF(l!RoAw(p))_(F0g=EI$N)qrDm~+&R612E9iiO$<~9zuS!RL? z7f*STK<-&4@zM~jxjCd>DA{hceDIF$$U60IXejzt=8Un-j2Mr&vZ;CKu6{hNE>DiH zF1A45g@lxLQ(r8gy4I0R)Kq3$aL!YK-byw!iurWa>DoqT71pe zolu;ZZkmjRA|YZ|l0;v&P{Su{u!|663Cok?7}nKdB2(fj7G2PLDt&NRrxCPMm@#I2 zAt=YVb~cY&#;Np?g*gx5a#P;@jWuJNZ$X1=q?&Bm;X;@gZ!ro?Mr#e6<~35$Mx4aZ z8O8$*MY1+m=s^FoKTN)UiBJ~jrGg?IgqCX%)Q~tI<{wDGaTs&3M2l1~l<+B2 z3MvjmBwV*9!^Ti+NI5nxYt6X&R7n#~!qe=aUkWB1svW|(CwanEm@WH=#W1@H=0VkO zz~ur9&`CCE-iHp4R=|dN|2`zBHvpe&sQ>0yoM%DwRbAlm*+ zzsq+;UZaoUZxLs2kedHyHAxIwB~v=DJAMTR0lNYko7DiVpltsL7)ta!baFS+pb9q%n0x+xGP z8}Ha>=FH65**!5qAMSho+lPmE)ayT9$A6Amoer9E#SL+sq47H0#MCnq(^M_uu^={{ zbE9IBXlW1iIsn8}rG}#=F-oW@N+*bVCyBMKJ#EM|A*kQ&A|?o&vrbT?y`Ama%bhXQ zK6vuVTh0zNF(l!RoAw(p))_(F0g=EI$N)qrDm~+&R612E9iiO$<~9zuS!RL? z7f*STK<-&4@zM~jxjCd>DA{hceDIF$$U60IXejzt=8Un-j2Mr&vZ;CKu6{hNE>DiH zF1A45g@lxLQ(r8gy4I0R)Kq3$aL!YK-byw!iurWa>DoqT71pe zolu;ZZkmjRA|c`uNEPDRC8(E@(cLJ~*vY3Hm9_ z7&E>Qlw({wo5wBVR2s>`oQH6^DObS8nz7Bdpvg5-O}6ZCAx@097zHMywFXY}8mVX_ zP-5r|lm5uum8j?kuo1-_YQ-p6s!=2rP~5$7?p z-QiL^KTDpMwb@s5&R0cz5Oxr0_<=0>OIBq)?!9zE%Qc8?IH6^B6*u3OV#W2iNx937XnW?X%$qzNbCiFVL01yc^y4q@DrJmD(Lmwm)ym|q3+ zpz1f^a)AZtBpWvGLx;yJV8hIR9}?6XfX_A5fAcHOvmpAaF7Wv5qh)I5RufvbEF|9S zs#dq(<@+M9(Z}$&h_kmy&406-O1j=@?*Tk)BRw0{qPV@2E)h8@K3wZYN0ukOcTo@G~M`{kh+M3G?A-t%83b= zOv{kRN|t|t35{^n@+5nFM}Fs_`<;udjAnWSOPe7MY%iC}9@*=AeN=L)GhTY5OxC&oLBJd?>r0jFh! zRIt@3(R6|FNI{W|$%Hym|Ll+B@81HH#(6EI0F}}R(CcmoXxqXZU(Yk^pGbf|I&h`;MWn?w_?Eh9_`XZ_LuXj^Q*IDK4NMW3MGli>S4pkg^C5$ah+Bmcb1|KZG Rp6`2EsYEpnn2h+W@fVTnk~jbW literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..c20fe3d06c6a28e861f168a35f98de11e7f743b0 GIT binary patch literal 1916 zcmb7FZExBz5dO}u@TfFxRayjeta4eUmNGW2EmJ}2zD33{VBt8HU!e5Y@0>S67uFU* zaL#`2p1XT;IpTBl;chs(xxWvm!_mW4_@~`ywa|h|rio<{nr;0}NL@rin#grH>^q%>K?1dRp*6mbIQjO7$5XO@TdGG`HL zo;*6~42w<_(IjS;>E;_S#%fOO2@${)$N+>(DqO@+(`efAskF*eT3p)m-EHV^)66*K zHe9echTPLk!i^?MF?~uMm!jEdIPV?Zl5OJbL{s#w$Qfgi8Zn+SX%c-uc>3}9)bDmK zFZMv+xqy@o;|5O;lV3Hs8)FZH@?d9Wtm#rS5?zagt|m*OSQYB?!*~SObgSPx3sAdr z9-!V1|46iY71-@~%pUHighn`Oc_LqhLqzztdnM_i;v!i%Hda--EVp88)1)xIh8|x< z(JhyZ=s^r@!cGw08*ut{g31M9><{}}sWHA82C&GD$RT^GAxf$b# ztvO{FSI%Z}i#Vk~(lF;fTxQDJzq6)nvn}Xwg;bp_J)A2O<2^>6$#koL)2c!$*r}6f zy2N;_ph(7K5*@35_9yYzF9Ay9yp&LYN>K#pWuF7IYhI48=b80z=7qUezD&Vc%yf6M z;m^;4Wo51RRiE=^9v{Tn?qxraBzwuKsK=cXmMgIZK~0EtegJ|LEQ2`*OSDdSgkrXc zq=bsY6!F)+(XcaA8dA)Sex(_gpGs-MNqC$s^h?5sLzP1q4+V>v^uy&CMIj8YoVi!^ z8*rJxJanE;n)jjoqvf%F;J*(E>h-|q3hKZ4<>w zx8KFfBCF9y|F;g)S4hQwy_-_H&MIfe3ggt9DO?p3sN%>gVPav%#-T+p@?i1#;?QeK MC8}}6WW?uS67uFU* zaL#`2p1XT;IpTBl;chs(xxWvm!_mW4_@~`ywa|h|rio<{nr;0}NL@rin#grH>^q%>K?1dRp*6mbIQjO7$5XO@TdGG`HL zo;*6~42w<_(IjS;>E;_S#%fOO2@${)$N+>(DqO@+(`efAskF*eT3p)m-EHV^)66*K zHe9echTPLk!i^?MF?~uMm!jEdIPV?Zl5OJbL{s#w$Qfgi8Zn+SX%c-uc>3}9)bDmK zFZMv+xqy@o;|5O;lV3Hs8)FZH@?d9Wtm#rS5?zagt|m*OSQYB?!*~SObgSPx3sAdr z9-!V1|46iY71-@~%pUHighn`Oc_LqhLqzztdnM_i;v!i%Hda--EVp88)1)xIh8|x< z(JhyZ=s^dkwRUsAZ z6iPH*Vmww*Bx5p}j@3W=llbeG0HtwW$|yjkFaq?l*8$o!Fvr*P%=$R<;@m4=rs6DS zx;xqM=V!sPvex^m&-pTs4+3rXvLDEjy<}C?b0 zFoCg--iVCdf;;f_22yR^N|;QSr>SGj?pI3E29W4TILh4 zcU7s|@8X4#)##)DTZiduq~gEcO(|Vxm9t}oaq7(!u8JvCapaXSu`p-j&>|Rnu=sp& P=vAc>)i`1@;`7E|9!8OR literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..6028fba165cc043a786b248a69ad88ba773c9ddd GIT binary patch literal 1921 zcmb7FZExBz5dO}u@TfFxRa(@SvC3tYTGr8NZJ7#E_boEE0Sm{m`~s!Fe&@Uqy0Es0 zfOGb9_uSo+%bu8`4|mst@Avod_K96I0JfOjEUtCxX~` z!HtSVqNP35>i`fll^Tv##3-R=D4ineog~(__Ov0hgrGsckC-5E&N@Mn_IAE&FL%aJ z`{2AOjGgsPv4Zwl%a9Qt4EwbcAweo7*_tW|;{p zTs-4R0=Z|I#4AIz=H`%kp=7(&^1(a0CF|6?k)h~YnKQ;RGh#gE%BJSN|8(>C)H^x8 zINt*O5E4?_jq5)>jD9uXzFW5&R0KP%VvXm96Luw2x){xg=5=Vu590y+W?Q}PX@v5t zf7||Ll*FuU{gR#d)ct2vyRE(93Q|Xw$+1-^?@b;Po845>^-g;?)EH;pZ0V|;Lp4W12_pxCHi;dA!3T@a R=eyojI@PT`u9%p%{sOT9k^%q# literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..0ea06e92e5d98051ef2238bbb2b03113a92216cf GIT binary patch literal 1915 zcmb7FZExBz5dO}u@TfFxRa(@SvC3tYTH3K`ZJ7#E_boEE0Sm{m`~s!Fe&@Uqy0Es0 zfOGb9_uSo+%bu8`4|g|%>-+n7d^32sjQly+Zx&lsdTDTIzqX#&21cRv&;k) zE}rouf!wo9;*}v`QRPhl6C6c$WZjH%o$^u88IGnWm9wCfBNzG)H^x8 zINt(&7ZOt1jq5)>jD9uXuC3b*DuSI>vBq=53A>UhU5w^L^Ex!-hw%V@v#nnDG(!1R z!5@jyui{RQBNVRgChSiIeel$YRJ{s6ix7N|x>e8{Eo3_Le5mVed11xaWjSGd1ued0 z>{cjFOg~K~LXi-$FG-?%E!6PI8tftjnZokqIEHn#n97v6ibWsvo=P7a)@cOo6lRPW zUkb`GuAR-}mT@Y5WMR%ixZIR?e`C$q=3CI<8mT5*cDN8G##@X6lkr*ur$voav=JvU zbdK>*Ly@e_6*|=a?2nSKUm}#nd8wcXRicQ{%RWbF)4T%T%ro!dBmS&us}ozQX(f*KL$!~6p&I1XbDmS~v@h7vww zNNqCwa^h?2nL$yN~-$J7l>8tT9K73WzHeN`8DeD={QH4Cc=En5~6Z+2Cy z+wbySk=N*B_*=%=8>HsH*-a%~@3ePAjdAA9mafVPRC5%RFmfPtN;K2 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..688064da3aee8198af4f42d9d05da5e753446aba GIT binary patch literal 1919 zcmb7FZExBz5dO}u@TfFxRa(@SvC3tYTH3K`ZJ7#E_boEE0Sm{m`~s!Fe&@Uqy0Es0 zfOGb9_uSo+%bu8`4|g|%>-+n7d^32sjQly+Zx&lsdTDTIzqX#&21cRv&;k) zE}rouf!wo9;*}v`QRPhl6C6c$WZjH%o$^u88IGnWm9wCfBNzG)H^x8 zINt(&7ZOt1jq5)>jD9uXuC3b*DuSI>vBq=53A>UhU5w^L^Ex!-hw%V@v#nnDG(!1R z!5@jyui{RQBNVRgChSiIeel$YRJ{s6ix7N|x>e8{Eo3_Le5mVed11xaWjSGd1ued0 z>{cjFOg~K~LXi;hflJ?)B+>K!@62bWlCJdqz{@;r4LT)RDymA zGscWB1?3pm&gOB;IF&}SFlQlLZpsy~v1V-ZEogF$RFf?`T!<6nEk=RKc&&lcqDCs( z2$UE)$9SlrNY>^m9qNDfN6FVO5z6AcR8fQ~VMOR$DNl>Xt@SajfnGM27(kEhfxPhv`hs< z37;{gpyDt{!td5}*cfUJDMv@I){Ki!l{DcbJkbvNrC`dT+98Z@Bu}^s^JO2g80J^O zJgE8&xLjZXI?aa7`_SR>3fM67--iVC2H=2ASSbRR; P^`_FPZtZc!#I*GnJ;;$n literal 0 HcmV?d00001 diff --git a/dnn/test/cuda/conv_bias_int8.cpp b/dnn/test/cuda/conv_bias_int8.cpp index 41592a08b..01a3b8d31 100644 --- a/dnn/test/cuda/conv_bias_int8.cpp +++ b/dnn/test/cuda/conv_bias_int8.cpp @@ -1232,6 +1232,73 @@ TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW4_NCHW) { run({{16, 4, 46, 80, 4}, {4, 4, 3, 3, 4}, {1, 4, 1, 1}}); } +TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW4_NCHW32) { + require_compute_capability(6, 1); + using namespace conv_bias; + Checker checker(handle_cuda()); + UniformIntRNG int_rng{-3, 3}; + UniformIntRNG bias_rng{-50, 50}; + ConvBias::Param param; + param.format = ConvBias::Param::Format::NCHW4_NCHW32; + param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY; + checker.set_before_exec_callback( + conv_bias::ConvBiasAlgoChecker( + "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM")); + checker.set_dtype(0, dtype::QuantizedS8(1.9980618f)) + .set_dtype(1, dtype::QuantizedS8(1.9980927f)) + .set_dtype(2, dtype::QuantizedS32(1.9980618f * 1.9980927f)) + .set_dtype(3, dtype::QuantizedS8(1.9980618f)) + .set_dtype(4, dtype::QuantizedS8(1.9980618f)) + .set_rng(0, &int_rng) + .set_rng(1, &int_rng) + .set_rng(2, &bias_rng) + .set_rng(3, &int_rng) + .set_param(param); + auto run = [&](const TensorShapeArray& shapes) { + checker.execs({shapes[0], shapes[1], shapes[2], {}, {}}); + }; + + run({{16, 4, 23, 40, 4}, {32, 4, 3, 3, 4}, {1, 1, 1, 1, 32}}); + run({{16, 4, 92, 160, 4}, {32, 4, 3, 3, 4}, {1, 1, 1, 1, 32}}); + run({{16, 4, 46, 80, 4}, {32, 4, 3, 3, 4}, {1, 1, 1, 1, 32}}); +} + +#if CUDA_VERSION >= 10020 +TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_NCHW4) { + require_compute_capability(7, 5); + using namespace conv_bias; + Checker checker(handle_cuda()); + UniformIntRNG int_rng{-3, 3}; + UniformIntRNG bias_rng{-50, 50}; + ConvBias::Param param; + param.format = ConvBias::Param::Format::NCHW32_NCHW4; + param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY; + checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker< + ConvBiasForward>( + ConvBias::algo_name( + "INT8_NCHW32_IMMA_IMPLICIT_GEMM_256X128X64_64X64X64", + ConvBias::DirectParam{}) + .c_str())); + checker.set_dtype(0, dtype::QuantizedS8(1.9980618f)) + .set_dtype(1, dtype::QuantizedS8(1.9980927f)) + .set_dtype(2, dtype::QuantizedS32(1.9980618f * 1.9980927f)) + .set_dtype(3, dtype::QuantizedS8(1.9980618f)) + .set_dtype(4, dtype::QuantizedS8(1.9980618f)) + .set_rng(0, &int_rng) + .set_rng(1, &int_rng) + .set_rng(2, &bias_rng) + .set_rng(3, &int_rng) + .set_param(param); + auto run = [&](const TensorShapeArray& shapes) { + checker.execs({shapes[0], shapes[1], shapes[2], {}, {}}); + }; + + run({{16, 2, 23, 40, 32}, {20, 2, 3, 3, 32}, {1, 5, 1, 1, 4}}); + run({{16, 1, 92, 160, 32}, {24, 1, 3, 3, 32}, {1, 6, 1, 1, 4}}); + run({{16, 2, 46, 80, 32}, {4, 2, 3, 3, 32}, {1, 1, 1, 1, 4}}); +} +#endif + #if MEGDNN_WITH_BENCHMARK TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4) { require_compute_capability(6, 1); -- GitLab