From 5f44203d7bbaf34f2fc4a3e82b056b32431e1de7 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Tue, 17 Nov 2020 12:47:13 +0800 Subject: [PATCH] feat(dnn/cuda): add a cutlass impl for fusing convolution and dimshuffle GitOrigin-RevId: 3fc6faef01202867f54206367d32ab01659326d0 --- .../conv_bias/cutlass_convolution_wrapper.cu | 126 +++++++++++ .../conv_bias/cutlass_convolution_wrapper.cuh | 16 +- .../implicit_gemm_int8_nchw4_dp4a.cpp | 198 ++++++++++++++---- ...s_int8_implicit_gemm_cutlass_wrapper.cuinl | 15 +- ...4a_ncdiv4hw4_128x128x32_64x32x32_hswish.cu | Bin 1687 -> 1915 bytes ...m_dp4a_ncdiv4hw4_128x128x32_64x32x32_id.cu | Bin 1681 -> 1909 bytes ...dp4a_ncdiv4hw4_128x128x32_64x32x32_relu.cu | Bin 1685 -> 1913 bytes ...p4a_ncdiv4hw4_128x32x32_64x32x32_hswish.cu | Bin 1686 -> 1914 bytes ...mm_dp4a_ncdiv4hw4_128x32x32_64x32x32_id.cu | Bin 1680 -> 1908 bytes ..._dp4a_ncdiv4hw4_128x32x32_64x32x32_relu.cu | Bin 1684 -> 1912 bytes ...p4a_ncdiv4hw4_128x64x32_64x32x32_hswish.cu | Bin 1686 -> 1914 bytes ...mm_dp4a_ncdiv4hw4_128x64x32_64x32x32_id.cu | Bin 1680 -> 1908 bytes ..._dp4a_ncdiv4hw4_128x64x32_64x32x32_relu.cu | Bin 1684 -> 1912 bytes ...m_dp4a_ncdiv4hw4_16x64x8_16x64x8_hswish.cu | Bin 1682 -> 1910 bytes ..._gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu | Bin 1676 -> 1904 bytes ...emm_dp4a_ncdiv4hw4_16x64x8_16x64x8_relu.cu | Bin 1680 -> 1908 bytes ...cdiv4hw4_1x1_128x128x32_64x32x32_hswish.cu | Bin 1688 -> 1916 bytes ...4a_ncdiv4hw4_1x1_128x128x32_64x32x32_id.cu | Bin 1682 -> 1910 bytes ..._ncdiv4hw4_1x1_128x128x32_64x32x32_relu.cu | Bin 1686 -> 1914 bytes ...ncdiv4hw4_1x1_128x32x32_64x32x32_hswish.cu | Bin 1687 -> 1915 bytes ...p4a_ncdiv4hw4_1x1_128x32x32_64x32x32_id.cu | Bin 1681 -> 1909 bytes ...a_ncdiv4hw4_1x1_128x32x32_64x32x32_relu.cu | Bin 1685 -> 1913 bytes ...ncdiv4hw4_1x1_128x64x32_64x32x32_hswish.cu | Bin 1687 -> 1915 bytes ...p4a_ncdiv4hw4_1x1_128x64x32_64x32x32_id.cu | Bin 1681 -> 1909 bytes ...a_ncdiv4hw4_1x1_128x64x32_64x32x32_relu.cu | Bin 1685 -> 1913 bytes ...4a_ncdiv4hw4_1x1_16x64x8_16x64x8_hswish.cu | Bin 1683 -> 1911 bytes ...m_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_id.cu | Bin 1677 -> 1905 bytes ...dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_relu.cu | Bin 1681 -> 1909 bytes ...ncdiv4hw4_1x1_32x128x32_32x64x32_hswish.cu | Bin 1687 -> 1915 bytes ...p4a_ncdiv4hw4_1x1_32x128x32_32x64x32_id.cu | Bin 1681 -> 1909 bytes ...a_ncdiv4hw4_1x1_32x128x32_32x64x32_relu.cu | Bin 1685 -> 1913 bytes ..._ncdiv4hw4_1x1_32x32x32_32x32x32_hswish.cu | Bin 1686 -> 1914 bytes ...dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_id.cu | Bin 1680 -> 1908 bytes ...4a_ncdiv4hw4_1x1_32x32x32_32x32x32_relu.cu | Bin 1684 -> 1912 bytes ..._ncdiv4hw4_1x1_32x64x32_32x64x32_hswish.cu | Bin 1686 -> 1914 bytes ...dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_id.cu | Bin 1680 -> 1908 bytes ...4a_ncdiv4hw4_1x1_32x64x32_32x64x32_relu.cu | Bin 1684 -> 1912 bytes ...ncdiv4hw4_1x1_64x128x32_64x32x32_hswish.cu | Bin 1687 -> 1915 bytes ...p4a_ncdiv4hw4_1x1_64x128x32_64x32x32_id.cu | Bin 1681 -> 1909 bytes ...a_ncdiv4hw4_1x1_64x128x32_64x32x32_relu.cu | Bin 1685 -> 1913 bytes ..._ncdiv4hw4_1x1_64x32x32_64x32x32_hswish.cu | Bin 1686 -> 1914 bytes ...dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_id.cu | Bin 1680 -> 1908 bytes ...4a_ncdiv4hw4_1x1_64x32x32_64x32x32_relu.cu | Bin 1684 -> 1912 bytes ..._ncdiv4hw4_1x1_64x64x32_64x32x32_hswish.cu | Bin 1686 -> 1914 bytes ...dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_id.cu | Bin 1680 -> 1908 bytes ...4a_ncdiv4hw4_1x1_64x64x32_64x32x32_relu.cu | Bin 1684 -> 1912 bytes ...p4a_ncdiv4hw4_32x128x32_32x64x32_hswish.cu | Bin 1686 -> 1914 bytes ...mm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu | Bin 1680 -> 1908 bytes ..._dp4a_ncdiv4hw4_32x128x32_32x64x32_relu.cu | Bin 1684 -> 1912 bytes ...dp4a_ncdiv4hw4_32x32x32_32x32x32_hswish.cu | Bin 1685 -> 1913 bytes ...emm_dp4a_ncdiv4hw4_32x32x32_32x32x32_id.cu | Bin 1679 -> 1907 bytes ...m_dp4a_ncdiv4hw4_32x32x32_32x32x32_relu.cu | Bin 1683 -> 1911 bytes ...dp4a_ncdiv4hw4_32x64x32_32x64x32_hswish.cu | Bin 1685 -> 1913 bytes ...emm_dp4a_ncdiv4hw4_32x64x32_32x64x32_id.cu | Bin 1679 -> 1907 bytes ...m_dp4a_ncdiv4hw4_32x64x32_32x64x32_relu.cu | Bin 1683 -> 1911 bytes ...p4a_ncdiv4hw4_64x128x32_64x32x32_hswish.cu | Bin 1686 -> 1914 bytes ...mm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu | Bin 1680 -> 1908 bytes ..._dp4a_ncdiv4hw4_64x128x32_64x32x32_relu.cu | Bin 1684 -> 1912 bytes ...dp4a_ncdiv4hw4_64x32x32_64x32x32_hswish.cu | Bin 1685 -> 1913 bytes ...emm_dp4a_ncdiv4hw4_64x32x32_64x32x32_id.cu | Bin 1679 -> 1907 bytes ...m_dp4a_ncdiv4hw4_64x32x32_64x32x32_relu.cu | Bin 1683 -> 1911 bytes ...dp4a_ncdiv4hw4_64x64x32_64x32x32_hswish.cu | Bin 1685 -> 1913 bytes ...emm_dp4a_ncdiv4hw4_64x64x32_64x32x32_id.cu | Bin 1679 -> 1907 bytes ...m_dp4a_ncdiv4hw4_64x64x32_64x32x32_relu.cu | Bin 1683 -> 1911 bytes ...div4hw4_nchw_128x128x32_64x32x32_hswish.cu | Bin 0 -> 1891 bytes ...a_ncdiv4hw4_nchw_128x128x32_64x32x32_id.cu | Bin 0 -> 1885 bytes ...ncdiv4hw4_nchw_128x128x32_64x32x32_relu.cu | Bin 0 -> 1889 bytes ...cdiv4hw4_nchw_128x32x32_64x32x32_hswish.cu | Bin 0 -> 1890 bytes ...4a_ncdiv4hw4_nchw_128x32x32_64x32x32_id.cu | Bin 0 -> 1884 bytes ..._ncdiv4hw4_nchw_128x32x32_64x32x32_relu.cu | Bin 0 -> 1888 bytes ...cdiv4hw4_nchw_128x64x32_64x32x32_hswish.cu | Bin 0 -> 1890 bytes ...4a_ncdiv4hw4_nchw_128x64x32_64x32x32_id.cu | Bin 0 -> 1884 bytes ..._ncdiv4hw4_nchw_128x64x32_64x32x32_relu.cu | Bin 0 -> 1888 bytes ...a_ncdiv4hw4_nchw_16x64x8_16x64x8_hswish.cu | Bin 0 -> 1886 bytes ..._dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_id.cu | Bin 0 -> 1880 bytes ...p4a_ncdiv4hw4_nchw_16x64x8_16x64x8_relu.cu | Bin 0 -> 1884 bytes ...hw4_nchw_1x1_128x128x32_64x32x32_hswish.cu | Bin 0 -> 1892 bytes ...div4hw4_nchw_1x1_128x128x32_64x32x32_id.cu | Bin 0 -> 1886 bytes ...v4hw4_nchw_1x1_128x128x32_64x32x32_relu.cu | Bin 0 -> 1890 bytes ...4hw4_nchw_1x1_128x32x32_64x32x32_hswish.cu | Bin 0 -> 1891 bytes ...cdiv4hw4_nchw_1x1_128x32x32_64x32x32_id.cu | Bin 0 -> 1885 bytes ...iv4hw4_nchw_1x1_128x32x32_64x32x32_relu.cu | Bin 0 -> 1889 bytes ...4hw4_nchw_1x1_128x64x32_64x32x32_hswish.cu | Bin 0 -> 1891 bytes ...cdiv4hw4_nchw_1x1_128x64x32_64x32x32_id.cu | Bin 0 -> 1885 bytes ...iv4hw4_nchw_1x1_128x64x32_64x32x32_relu.cu | Bin 0 -> 1889 bytes ...div4hw4_nchw_1x1_16x64x8_16x64x8_hswish.cu | Bin 0 -> 1887 bytes ...a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_id.cu | Bin 0 -> 1881 bytes ...ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_relu.cu | Bin 0 -> 1885 bytes ...4hw4_nchw_1x1_32x128x32_32x64x32_hswish.cu | Bin 0 -> 1891 bytes ...cdiv4hw4_nchw_1x1_32x128x32_32x64x32_id.cu | Bin 0 -> 1885 bytes ...iv4hw4_nchw_1x1_32x128x32_32x64x32_relu.cu | Bin 0 -> 1889 bytes ...v4hw4_nchw_1x1_32x32x32_32x32x32_hswish.cu | Bin 0 -> 1890 bytes ...ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_id.cu | Bin 0 -> 1884 bytes ...div4hw4_nchw_1x1_32x32x32_32x32x32_relu.cu | Bin 0 -> 1888 bytes ...v4hw4_nchw_1x1_32x64x32_32x64x32_hswish.cu | Bin 0 -> 1890 bytes ...ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_id.cu | Bin 0 -> 1884 bytes ...div4hw4_nchw_1x1_32x64x32_32x64x32_relu.cu | Bin 0 -> 1888 bytes ...4hw4_nchw_1x1_64x128x32_64x32x32_hswish.cu | Bin 0 -> 1891 bytes ...cdiv4hw4_nchw_1x1_64x128x32_64x32x32_id.cu | Bin 0 -> 1885 bytes ...iv4hw4_nchw_1x1_64x128x32_64x32x32_relu.cu | Bin 0 -> 1889 bytes ...v4hw4_nchw_1x1_64x32x32_64x32x32_hswish.cu | Bin 0 -> 1890 bytes ...ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_id.cu | Bin 0 -> 1884 bytes ...div4hw4_nchw_1x1_64x32x32_64x32x32_relu.cu | Bin 0 -> 1888 bytes ...v4hw4_nchw_1x1_64x64x32_64x32x32_hswish.cu | Bin 0 -> 1890 bytes ...ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_id.cu | Bin 0 -> 1884 bytes ...div4hw4_nchw_1x1_64x64x32_64x32x32_relu.cu | Bin 0 -> 1888 bytes ...cdiv4hw4_nchw_32x128x32_32x64x32_hswish.cu | Bin 0 -> 1890 bytes ...4a_ncdiv4hw4_nchw_32x128x32_32x64x32_id.cu | Bin 0 -> 1884 bytes ..._ncdiv4hw4_nchw_32x128x32_32x64x32_relu.cu | Bin 0 -> 1888 bytes ...ncdiv4hw4_nchw_32x32x32_32x32x32_hswish.cu | Bin 0 -> 1889 bytes ...p4a_ncdiv4hw4_nchw_32x32x32_32x32x32_id.cu | Bin 0 -> 1883 bytes ...a_ncdiv4hw4_nchw_32x32x32_32x32x32_relu.cu | Bin 0 -> 1887 bytes ...ncdiv4hw4_nchw_32x64x32_32x64x32_hswish.cu | Bin 0 -> 1889 bytes ...p4a_ncdiv4hw4_nchw_32x64x32_32x64x32_id.cu | Bin 0 -> 1883 bytes ...a_ncdiv4hw4_nchw_32x64x32_32x64x32_relu.cu | Bin 0 -> 1887 bytes ...cdiv4hw4_nchw_64x128x32_64x32x32_hswish.cu | Bin 0 -> 1890 bytes ...4a_ncdiv4hw4_nchw_64x128x32_64x32x32_id.cu | Bin 0 -> 1884 bytes ..._ncdiv4hw4_nchw_64x128x32_64x32x32_relu.cu | Bin 0 -> 1888 bytes ...ncdiv4hw4_nchw_64x32x32_64x32x32_hswish.cu | Bin 0 -> 1889 bytes ...p4a_ncdiv4hw4_nchw_64x32x32_64x32x32_id.cu | Bin 0 -> 1883 bytes ...a_ncdiv4hw4_nchw_64x32x32_64x32x32_relu.cu | Bin 0 -> 1887 bytes ...ncdiv4hw4_nchw_64x64x32_64x32x32_hswish.cu | Bin 0 -> 1889 bytes ...p4a_ncdiv4hw4_nchw_64x64x32_64x32x32_id.cu | Bin 0 -> 1883 bytes ...a_ncdiv4hw4_nchw_64x64x32_64x32x32_relu.cu | Bin 0 -> 1887 bytes ..._ncdiv32hw32_128x128x64_64x64x64_hswish.cu | Bin 1695 -> 1924 bytes ...imma_ncdiv32hw32_128x128x64_64x64x64_id.cu | Bin 1689 -> 1918 bytes ...ma_ncdiv32hw32_128x128x64_64x64x64_relu.cu | Bin 1693 -> 1922 bytes ..._ncdiv32hw32_128x256x64_64x64x64_hswish.cu | Bin 1695 -> 1924 bytes ...imma_ncdiv32hw32_128x256x64_64x64x64_id.cu | Bin 1689 -> 1918 bytes ...ma_ncdiv32hw32_128x256x64_64x64x64_relu.cu | Bin 1693 -> 1922 bytes ...a_ncdiv32hw32_128x64x64_64x32x64_hswish.cu | Bin 1694 -> 1923 bytes ..._imma_ncdiv32hw32_128x64x64_64x32x64_id.cu | Bin 1688 -> 1917 bytes ...mma_ncdiv32hw32_128x64x64_64x32x64_relu.cu | Bin 1692 -> 1921 bytes ...iv32hw32_1x1_128x128x64_64x64x64_hswish.cu | Bin 1696 -> 1925 bytes ..._ncdiv32hw32_1x1_128x128x64_64x64x64_id.cu | Bin 1690 -> 1919 bytes ...cdiv32hw32_1x1_128x128x64_64x64x64_relu.cu | Bin 1694 -> 1923 bytes ...iv32hw32_1x1_128x256x64_64x64x64_hswish.cu | Bin 1696 -> 1925 bytes ..._ncdiv32hw32_1x1_128x256x64_64x64x64_id.cu | Bin 1690 -> 1919 bytes ...cdiv32hw32_1x1_128x256x64_64x64x64_relu.cu | Bin 1694 -> 1923 bytes ...div32hw32_1x1_128x64x64_64x32x64_hswish.cu | Bin 1695 -> 1924 bytes ...a_ncdiv32hw32_1x1_128x64x64_64x32x64_id.cu | Bin 1689 -> 1918 bytes ...ncdiv32hw32_1x1_128x64x64_64x32x64_relu.cu | Bin 1693 -> 1922 bytes ...iv32hw32_1x1_256x128x64_64x64x64_hswish.cu | Bin 1696 -> 1925 bytes ..._ncdiv32hw32_1x1_256x128x64_64x64x64_id.cu | Bin 1690 -> 1919 bytes ...cdiv32hw32_1x1_256x128x64_64x64x64_relu.cu | Bin 1694 -> 1923 bytes ...cdiv32hw32_1x1_32x64x64_32x16x64_hswish.cu | Bin 1694 -> 1923 bytes ...ma_ncdiv32hw32_1x1_32x64x64_32x16x64_id.cu | Bin 1688 -> 1917 bytes ..._ncdiv32hw32_1x1_32x64x64_32x16x64_relu.cu | Bin 1692 -> 1921 bytes ...div32hw32_1x1_64x128x64_32x64x64_hswish.cu | Bin 1695 -> 1924 bytes ...a_ncdiv32hw32_1x1_64x128x64_32x64x64_id.cu | Bin 1689 -> 1918 bytes ...ncdiv32hw32_1x1_64x128x64_32x64x64_relu.cu | Bin 1693 -> 1922 bytes ...cdiv32hw32_1x1_64x64x64_32x32x64_hswish.cu | Bin 1694 -> 1923 bytes ...ma_ncdiv32hw32_1x1_64x64x64_32x32x64_id.cu | Bin 1688 -> 1917 bytes ..._ncdiv32hw32_1x1_64x64x64_32x32x64_relu.cu | Bin 1692 -> 1921 bytes ..._ncdiv32hw32_256x128x64_64x64x64_hswish.cu | Bin 1695 -> 1924 bytes ...imma_ncdiv32hw32_256x128x64_64x64x64_id.cu | Bin 1689 -> 1918 bytes ...ma_ncdiv32hw32_256x128x64_64x64x64_relu.cu | Bin 1693 -> 1922 bytes ...ma_ncdiv32hw32_32x64x64_32x16x64_hswish.cu | Bin 1693 -> 1922 bytes ...m_imma_ncdiv32hw32_32x64x64_32x16x64_id.cu | Bin 1687 -> 1916 bytes ...imma_ncdiv32hw32_32x64x64_32x16x64_relu.cu | Bin 1691 -> 1920 bytes ...a_ncdiv32hw32_64x128x64_32x64x64_hswish.cu | Bin 1694 -> 1923 bytes ..._imma_ncdiv32hw32_64x128x64_32x64x64_id.cu | Bin 1688 -> 1917 bytes ...mma_ncdiv32hw32_64x128x64_32x64x64_relu.cu | Bin 1692 -> 1921 bytes ...ma_ncdiv32hw32_64x64x64_32x32x64_hswish.cu | Bin 1693 -> 1922 bytes ...m_imma_ncdiv32hw32_64x64x64_32x32x64_id.cu | Bin 1687 -> 1916 bytes ...imma_ncdiv32hw32_64x64x64_32x32x64_relu.cu | Bin 1691 -> 1920 bytes dnn/test/cuda/conv_bias_int8.cpp | 41 ++++ 167 files changed, 343 insertions(+), 53 deletions(-) create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_relu.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_hswish.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_id.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_relu.cu diff --git a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu index fd840927..f9dd4c45 100644 --- a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu +++ b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu @@ -275,4 +275,130 @@ INST(true); INST(false); #undef INST +#if MEGDNN_TEGRA_X1 +template +void megdnn::cuda::cutlass_wrapper:: + do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw( + const int8_t* /* d_src */, const int8_t* /* d_filter */, + const float* /* d_bias */, const float* /* d_z */, + float* /* d_dst */, int* /* workspace */, + const convolution::ConvParam& /* param */, + uint32_t /* nonlinear_mode */, float /* alpha */, + float /* beta */, float /* gamma */, float /* scale */, + const GemmCoord& /* threadblock_shape */, + const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {} +#else +template +void megdnn::cuda::cutlass_wrapper:: + do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw( + const int8_t* d_src, const int8_t* d_filter, + const float* d_bias, const float* d_z, float* d_dst, + int* workspace, const convolution::ConvParam& param, + uint32_t nonlinear_mode, float alpha, float beta, float gamma, + float scale, const GemmCoord& threadblock_shape, + const GemmCoord& warp_shape, cudaStream_t stream) { +#define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \ + threadblock_k_, warp_m_, warp_n_, \ + warp_k_, aligned_) \ + if (threadblock_shape.m() == threadblock_m_ && \ + threadblock_shape.n() == threadblock_n_ && \ + threadblock_shape.k() == threadblock_k_ && \ + warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \ + warp_shape.k() == warp_k_) { \ + using ThreadBlockShape = \ + cutlass::gemm::GemmShape; \ + using WarpShape = cutlass::gemm::GemmShape; \ + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ + using Convolution = cutlass::convolution::device::Convolution< \ + int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ + cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ + cutlass::layout::TensorNCHW, float, \ + cutlass::layout::TensorNCHW, int32_t, \ + cutlass::convolution::ConvType::kConvolution, \ + cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ + cutlass::convolution::threadblock:: \ + ConvolutionNCxHWxThreadblockSwizzle< \ + cutlass::convolution::ConvType::kConvolution>, \ + 2, 4, aligned_, NeedLoadFromConstMem, \ + cutlass::arch::OpMultiplyAdd>; \ + typename Convolution::ConvolutionParameter conv_param{ \ + param.n, param.ci, param.co, param.hi, param.wi, \ + param.fh, param.fw, param.ho, param.wo, param.sh, \ + param.sw, param.ph, param.pw, 1, 1}; \ + return cutlass_convolution_wrapper( \ + d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ + epilogue, stream); \ + } +#define DISPATCH_KERNEL \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 64, 8, 16, 64, 8, 4); \ + megdnn_assert(false, \ + "unsupported threadblock shape (%dx%dx%d) and warp shape " \ + "(%dx%dx%d)", \ + threadblock_shape.m(), threadblock_shape.n(), \ + threadblock_shape.k(), warp_shape.m(), warp_shape.n(), \ + warp_shape.k()); + using ElementOutput = float; + using ElementAccumulator = int32_t; + using ElementBias = float; + using ElementCompute = float; + using NonlineMode = megdnn::param_enumv::ConvBias::NonlineMode; + switch (nonlinear_mode) { + case NonlineMode::IDENTITY: { + using EpilogueOp = + cutlass::epilogue::thread::BiasAddLinearCombination< + ElementOutput, 1, ElementAccumulator, ElementBias, + ElementCompute>; + typename EpilogueOp::Params epilogue{alpha, beta, gamma}; + DISPATCH_KERNEL; + } + case NonlineMode::RELU: { + using EpilogueOp = + cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + ElementOutput, 1, ElementAccumulator, ElementBias, + ElementCompute>; + typename EpilogueOp::Params epilogue{alpha, beta, gamma, 0}; + DISPATCH_KERNEL; + } + case NonlineMode::H_SWISH: { + using EpilogueOp = + cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + ElementOutput, 1, ElementAccumulator, ElementBias, + ElementCompute>; + typename EpilogueOp::Params epilogue{alpha, beta, gamma, scale}; + DISPATCH_KERNEL; + } + default: + megdnn_assert(false, + "unsupported nonlinear mode for conv bias operator"); + } +#undef DISPATCH_KERNEL_WITH_TILE_SHAPE +#undef DISPATCH_KERNEL +} +#endif + +#define INST(need_load_from_const_mem) \ + template void megdnn::cuda::cutlass_wrapper:: \ + do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw< \ + need_load_from_const_mem>( \ + const int8_t* d_src, const int8_t* d_filter, \ + const float* d_bias, const float* d_z, float* d_dst, \ + int* workspace, const convolution::ConvParam& param, \ + uint32_t nonlinear_mode, float alpha, float beta, \ + float gamma, float scale, \ + const GemmCoord& threadblock_shape, \ + const GemmCoord& warp_shape, cudaStream_t stream); +INST(true); +INST(false); +#undef INST // vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh index 172ed5d7..2d78e8c3 100644 --- a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh +++ b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh @@ -22,8 +22,11 @@ using GemmCoord = cutlass::gemm::GemmCoord; template void cutlass_convolution_wrapper( - const int8_t* d_src, const int8_t* d_filter, const int32_t* d_bias, - const int8_t* d_z, int8_t* d_dst, int* workspace, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, cudaStream_t stream); @@ -46,6 +49,15 @@ void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4( const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, cudaStream_t stream); +template +void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw( + const int8_t* d_src, const int8_t* d_filter, const float* d_bias, + const float* d_z, float* d_dst, int* workspace, + const convolution::ConvParam& param, uint32_t nonlinear_mode, + float alpha, float beta, float gamma, float scale, + const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, + cudaStream_t stream); + } // namespace cutlass_wrapper } // namespace cuda } // namespace megdnn diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp index 22451bf3..3eb7bb9f 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp @@ -32,10 +32,26 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available( if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout), param.format)) return false; - if (param.format != Format::NCHW4) + if (param.format != Format::NCHW4 && param.format != Format::NCHW4_NCHW && + param.format != Format::NCHW4_NCHW32) return false; - UNPACK_CONV_BIAS_NCHW4_PARAM(*(args.src_layout), fm, *(args.dst_layout), - param); + size_t n = args.src_layout->operator[](0), + ci = args.src_layout->operator[](1) * 4, + hi = args.src_layout->operator[](2), + wi = args.src_layout->operator[](3); + size_t ho = args.dst_layout->operator[](2), + wo = args.dst_layout->operator[](3); + size_t co; + if (param.format == Format::NCHW4) { + co = args.dst_layout->operator[](1) * 4; + } else if (param.format == Format::NCHW4_NCHW) { + co = args.dst_layout->operator[](1); + } else { + megdnn_assert(param.format == Format::NCHW4_NCHW32); + co = args.dst_layout->operator[](1) * 32; + } + UNPACK_CONV_PARAMETER(fm, param); + MARK_USED_VAR // TODO support group conv available &= param.sparse == Sparse::DENSE; // mode must be cross correlation @@ -46,9 +62,11 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available( bias_dtype = args.bias_layout->dtype, dst_dtype = args.dst_layout->dtype; available &= (src_dtype.enumv() == DTypeEnum::QuantizedS8 && - filter_dtype.enumv() == DTypeEnum::QuantizedS8 && - bias_dtype.enumv() == DTypeEnum::QuantizedS32 && - dst_dtype.enumv() == DTypeEnum::QuantizedS8); + filter_dtype.enumv() == DTypeEnum::QuantizedS8); + available &= (bias_dtype.enumv() == DTypeEnum::QuantizedS32 && + dst_dtype.enumv() == DTypeEnum::QuantizedS8) || + (bias_dtype.enumv() == DTypeEnum::Float32 && + dst_dtype.enumv() == DTypeEnum::Float32); // TODO: support dialtion available &= dh == 1 && dw == 1; // only support sm_61 or later, platform should have fast native int8 @@ -81,8 +99,23 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( using Format = Param::Format; auto&& param = args.opr->param(); auto&& fm = args.filter_meta; - UNPACK_CONV_BIAS_NCHW4_PARAM(*(args.src_layout), fm, *(args.dst_layout), - param); + size_t n = args.src_layout->operator[](0), + ci = args.src_layout->operator[](1) * 4, + hi = args.src_layout->operator[](2), + wi = args.src_layout->operator[](3); + size_t ho = args.dst_layout->operator[](2), + wo = args.dst_layout->operator[](3); + size_t co; + if (param.format == Format::NCHW4) { + co = args.dst_layout->operator[](1) * 4; + } else if (param.format == Format::NCHW4_NCHW) { + co = args.dst_layout->operator[](1); + } else { + megdnn_assert(param.format == Format::NCHW4_NCHW32); + co = args.dst_layout->operator[](1) * 32; + } + UNPACK_CONV_PARAMETER(fm, param); + MARK_USED_VAR auto&& stream = cuda_stream(args.opr->handle()); int8_t* filter_ptr = nullptr; @@ -115,47 +148,107 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( float src_scale = args.src_layout->dtype.param().scale, filter_scale = - args.filter_layout->dtype.param().scale, - bias_scale = - args.bias_layout->dtype.param().scale, - dst_scale = args.dst_layout->dtype.param().scale; - float alpha = src_scale * filter_scale / dst_scale, - beta = bias_scale / dst_scale; - int8_t* z_dev_ptr = nullptr; - float gamma = 0.0; + args.filter_layout->dtype.param().scale; + float alpha = src_scale * filter_scale; + float beta = 1.f; + float dst_scale = 1.f; + if (args.bias_layout->dtype.enumv() == DTypeEnum::QuantizedS32) { + megdnn_assert(args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8); + float bias_scale = args.bias_layout->dtype.param() + .scale, + dst_scale = + args.dst_layout->dtype.param().scale; + alpha /= dst_scale, beta = bias_scale / dst_scale; + } + float gamma = 0.f; if (args.z_layout->ndim > 0) { - z_dev_ptr = args.z_tensor->compatible_ptr(); - float z_scale = args.z_layout->dtype.param().scale; - gamma = z_scale / dst_scale; + gamma = 1.f; + if (args.z_layout->dtype.enumv() == DTypeEnum::QuantizedS8) { + megdnn_assert(args.dst_layout->dtype.enumv() == + DTypeEnum::QuantizedS8); + float z_scale = args.z_layout->dtype.param() + .scale; + gamma = z_scale / dst_scale; + } } uint32_t nonlinear_mode = static_cast(param.nonlineMode); if (fh == 1 && fw == 1) { - cutlass_wrapper::do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4( - args.src_tensor->compatible_ptr(), filter_ptr, - args.bias_tensor->compatible_ptr(), z_dev_ptr, - args.dst_tensor->compatible_ptr(), nullptr, kern_param, - nonlinear_mode, alpha, beta, gamma, dst_scale, - cutlass_wrapper::GemmCoord{m_algo_param.threadblock_m, - m_algo_param.threadblock_n, - m_algo_param.threadblock_k}, - cutlass_wrapper::GemmCoord{m_algo_param.warp_m, - m_algo_param.warp_n, - m_algo_param.warp_k}, - stream); + if (param.format == Format::NCHW4) { + cutlass_wrapper::do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4< + false>( + args.src_tensor->compatible_ptr(), filter_ptr, + args.bias_tensor->compatible_ptr(), + args.z_tensor->compatible_ptr(), + args.dst_tensor->compatible_ptr(), nullptr, + kern_param, nonlinear_mode, alpha, beta, gamma, dst_scale, + cutlass_wrapper::GemmCoord{m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k}, + cutlass_wrapper::GemmCoord{m_algo_param.warp_m, + m_algo_param.warp_n, + m_algo_param.warp_k}, + stream); + } else if (param.format == Format::NCHW4_NCHW) { + cutlass_wrapper:: + do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw( + args.src_tensor->compatible_ptr(), + filter_ptr, + args.bias_tensor->compatible_ptr(), + args.z_tensor->compatible_ptr(), + args.dst_tensor->compatible_ptr(), nullptr, + kern_param, nonlinear_mode, alpha, beta, gamma, + dst_scale, + cutlass_wrapper::GemmCoord{ + m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k}, + cutlass_wrapper::GemmCoord{m_algo_param.warp_m, + m_algo_param.warp_n, + m_algo_param.warp_k}, + stream); + } else { + megdnn_assert(param.format == Format::NCHW4_NCHW32); + } } else { - cutlass_wrapper::do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4( - args.src_tensor->compatible_ptr(), filter_ptr, - args.bias_tensor->compatible_ptr(), z_dev_ptr, - args.dst_tensor->compatible_ptr(), nullptr, kern_param, - nonlinear_mode, alpha, beta, gamma, dst_scale, - cutlass_wrapper::GemmCoord{m_algo_param.threadblock_m, - m_algo_param.threadblock_n, - m_algo_param.threadblock_k}, - cutlass_wrapper::GemmCoord{m_algo_param.warp_m, - m_algo_param.warp_n, - m_algo_param.warp_k}, - stream); + if (param.format == Format::NCHW4) { + cutlass_wrapper::do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4< + true>( + args.src_tensor->compatible_ptr(), filter_ptr, + args.bias_tensor->compatible_ptr(), + args.z_tensor->compatible_ptr(), + args.dst_tensor->compatible_ptr(), nullptr, + kern_param, nonlinear_mode, alpha, beta, gamma, dst_scale, + cutlass_wrapper::GemmCoord{m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k}, + cutlass_wrapper::GemmCoord{m_algo_param.warp_m, + m_algo_param.warp_n, + m_algo_param.warp_k}, + stream); + } else if (param.format == Format::NCHW4_NCHW) { + cutlass_wrapper:: + do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw( + args.src_tensor->compatible_ptr(), + filter_ptr, + args.bias_tensor->compatible_ptr(), + args.z_tensor->compatible_ptr(), + args.dst_tensor->compatible_ptr(), nullptr, + kern_param, nonlinear_mode, alpha, beta, gamma, + dst_scale, + cutlass_wrapper::GemmCoord{ + m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k}, + cutlass_wrapper::GemmCoord{m_algo_param.warp_m, + m_algo_param.warp_n, + m_algo_param.warp_k}, + stream); + + } else { + megdnn_assert(param.format == Format::NCHW4_NCHW32); + } } + after_kernel_launch(); } size_t ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm:: @@ -174,8 +267,23 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec_preprocess( using Format = Param::Format; auto&& param = args.opr->param(); auto&& fm = args.filter_meta; - UNPACK_CONV_BIAS_NCHW4_PARAM(*(args.src_layout), fm, *(args.dst_layout), - param); + size_t n = args.src_layout->operator[](0), + ci = args.src_layout->operator[](1) * 4, + hi = args.src_layout->operator[](2), + wi = args.src_layout->operator[](3); + size_t ho = args.dst_layout->operator[](2), + wo = args.dst_layout->operator[](3); + size_t co; + if (param.format == Format::NCHW4) { + co = args.dst_layout->operator[](1) * 4; + } else if (param.format == Format::NCHW4_NCHW) { + co = args.dst_layout->operator[](1); + } else { + megdnn_assert(param.format == Format::NCHW4_NCHW32); + co = args.dst_layout->operator[](1) * 32; + } + UNPACK_CONV_PARAMETER(fm, param); + MARK_USED_VAR TensorLayout src{{co, ci / 4 * fh * fw}, dtype::Int32()}; src.init_contiguous_stride(); TensorLayout dst = src; diff --git a/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl b/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl index 785b7978..256742a1 100644 --- a/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl +++ b/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl @@ -19,25 +19,28 @@ using namespace cutlass_wrapper; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, const int8_t* d_filter, const int32_t* d_bias, - const int8_t* d_z, int8_t* d_dst, int* workspace, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, cudaStream_t stream) { typename Convolution::TensorRefSrc tensor_src{ - const_cast(d_src), + const_cast(d_src), Convolution::LayoutSrc::packed({conv_param.n(), conv_param.hi(), conv_param.wi(), conv_param.ci()})}; typename Convolution::TensorRefFilter tensor_filter{ - const_cast(d_filter), + const_cast(d_filter), Convolution::LayoutFilter::packed({conv_param.co(), conv_param.fh(), conv_param.fw(), conv_param.ci()})}; typename Convolution::TensorRefBias tensor_bias{ - const_cast(d_bias), + const_cast(d_bias), Convolution::LayoutBias::packed({1, 1, 1, conv_param.co()})}; typename Convolution::TensorRefDst tensor_z{ - const_cast(d_z), + const_cast(d_z), Convolution::LayoutDst::packed({conv_param.n(), conv_param.ho(), conv_param.wo(), conv_param.co()})}; typename Convolution::TensorRefDst tensor_dst{ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_hswish.cu index 61802649b04294c09573db86f44c319effde6760..4f0eebb74b659460ee9929c44833da05abb82de9 100644 GIT binary patch delta 200 zcmbQv`P#LE2A5|`qV$v%wQo7XY!ViX5*bQCi4N{o%-feg5U&GpPqj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpzP088fN^Q>ZAt~sf> zsd**AMah%(StUS1yig&x%$$yb97P#LE2A5|`qV$v%wQo7XXJVH5{)bQCi4N{o%-feg5U&6Uhfj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(7fa@30XA`koYdUZ zyprIeWG#i1_~N2us1aae^7DW;^Rr2)K@_=V=9Husfz+jeS=f{+L6kaWCKiJfCIM+| a%J?A4fSv)Vs+zoyRgu#bCN_C4>lXmFcvBVt delta 150 zcmey$H<5S4I>yb97`HHr2NxylC}if97#qcx=qUIkR_2$M0L3=1Wp-kmEXA6k0+h3e zFVRv+i7y5z AhyVZp diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_relu.cu index 9f0eb40e639f376090a66f8e34a8e92cf5a1a819..e47ab05203d3163c749f99423c61ed013d76343e 100644 GIT binary patch delta 272 zcmbQr`;%|OIz~>P#LE2A5|`qV$v%wQo7XY!U=#;(bQCi4N{o%-feg5U&9%%b>6nrdoLEv?lvpx(A4{e}No7H5USe*l zf^&XeS$yb97kw diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_hswish.cu index a3a08d3cda3fb9e9d270ff4172218d9edc209b27..30eec1369454a5e7c09e70ece0ab6a1801f8019d 100644 GIT binary patch delta 200 zcmbQn`-^YGIz~>P#LE2A5|`qV$v%wQo7XY!WE2N-bQCi4N{o%-feg5U&2`L9j9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(KTGE1bF5-qt~sf> zsd**AMah%(SS3J0yig&x%$$;M1& delta 165 zcmeyxH;s40I>yb97P#LE2A5|`qV$v%wQo7XXJW)ufb>6nrdoLEv?lvpx(CrjpJel~H1oYdUZ zyprIeWG#i1_~N2us1aae^7DW;^RY>&K@_=V=9Husfz+jeS=f{+L6kaWCKiJfCIM+| a%J?A4fSv)Vs+zo)Rgu#bCN_Bv>lXm9ic<>! delta 131 zcmeyuH-UGyb97&kMD2NxylC}if97#qcx=qUIkR_2$M0L3=1VRmAiEXkU|29&di zFPXfMRRYGm!zyhJ)uE-35}%ZrSgfPKr2qxV`FX`9AQOSQK+;u6a$wQP4QyWkYN#p5 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_relu.cu index f16fe480f220ff32f9c3baf60f11b799a5f83efb..1e095d923d257e8afea08c5da62d667ca76ce6f7 100644 GIT binary patch delta 273 zcmbQj`-5-8Iz~>P#LE2A5|`qV$v%wQo7XXJXA}o=bQCi4N{o%-feg5U%{9zUj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(FH5FENo7H5USe*l zf^&XeS$yb97`HQu2NxylC}if97#qcx=qUIkR_2$M0L3P#LE2A5|`qV$v%wQo7XY!WE2N-bQCi4N{o%-feg5U&2`L9j9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(KTGE1bF5-qt~sf> zsd**AMah%(SS3J0yig&x%$$;M1& delta 165 zcmeyxH;s40I>yb97P#LE2A5|`qV$v%wQo7XXJW)ufb>6nrdoLEv?lvpx(CrjpJel~H1oYdUZ zyprIeWG#i1_~N2us1aae^7DW;^RY>&K@_=V=9Husfz+jeS=f{+L6kaWCKiJfCIM+| a%J?A4fSv)Vs+zo)Rgu#bCN_Bv>lXm9ic<>! delta 131 zcmeyuH-UGyb97&kMD2NxylC}if97#qcx=qUIkR_2$M0L3=1VRmAiEXkU|29&di zFPXfMRRYGm!zyhJ)uE-35}%ZrSgfPKr2qxV`FX`9AQOSQK+;u6a$wQP4QyWkYN#p5 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_relu.cu index 06e8522df2e7c79356f501395b107ecfa2c49be8..65a5fcc71dae4599e09c4acae3af3f8ff49a642e 100644 GIT binary patch delta 273 zcmbQj`-5-8Iz~>P#LE2A5|`qV$v%wQo7XXJXA}o=bQCi4N{o%-feg5U%{9zUj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(FH5FENo7H5USe*l zf^&XeS$yb97`HQu2NxylC}if97#qcx=qUIkR_2$M0L3P#LE2A5|`qV$v%wQo7XXJV-yE+bQCi4N{o%-feg5U&DG3~j9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(H%sPZK{j!PoYdUZ zyprIeWG#i1_~N2us1aae^7DW;3$RJ3K@_=V=9Husfz+jeS=f{+L6kaWCKiJfCIM+| a%J?A4fSv)Vs+zo?Rgu#bCN_B<>lXmMQd1%T delta 150 zcmeyyH;H${I>yb97`HKs2NxylC}if97#qcx=qUIkR_2$M0L3=1XLe+qEX|ss0+h3e zFVRv+i7y5z71I3QkdDV{uo^$P$G{BtD$ delta 153 zcmeys*TcJE9pmOlj2jrmgNu@N6f*NljE&+;bQF9NEAvZBfMT0hGCML(7G=#~1Ik&% zmrS0^DiH%=YAK||r)B1pq!#HYa4A4Ra(-TM3CtXjilofMVr=q2vp~{SNOE9fC)coj F0RR~zFKGY( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_relu.cu index e00211922f633f348dce0908c1166091065d8097..0aaa3f7a6f536dcf73c202bb3883f0fb90978701 100644 GIT binary patch delta 258 zcmbQh`-N}AIz~>P#LE2A5|`qV$v%wQo7XXJVH5{)bQCi4N{o%-feg5U&6UiKj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(CrjpJel~H1oYdUZ zyprIeWG#i1_~N2us1aae^7DW;^RY>&K@_=V=9Husfz+jeS=f{+L6kaWCKiJfCIM+| a%J?A4fSv)Vs+zo)Rgu#bCN_Bv>lXmAc2f)h delta 131 zcmeyuH-UGyb97`HHr2NxylC}if97#qcx=qUIkR_2$M0L3=1Wp-qoEXkU|29&di zFPXfMRRYGm!zyhJ)uE-35}%ZrSgfPKr2qxV`FX`9AQOSQK+;u6a$wQP4QyWkYa=Pi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_hswish.cu index 34a0ceb0b4362ef6b1192b70ec6185dc7d327d1a..dc198d3496c7eba44e21221812b70c01a615cc50 100644 GIT binary patch delta 200 zcmbQi`-gACIz~>P#LE2A5|`qV$v%wQo7XY!ViX5*bQCi4N{o%-feg5U&GpRAj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpzPAWP=t3#?*Xt~sf> zsd**AMah#5SS3J0yig&x%$$I>yb97P#LE2A5|`qV$v%wQo7XXJVH5{)bQCi4N{o%-feg5U&6Ui~j9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(H%sPZK{j!PoYdUZ zyprIeWG#i1_~N2us1aae^7DW;3$RJ3K@_=V=9Husfz+jeS=f{+L6kaWCKiJfCIM+| a%J?A4fSv)Vs+zo?Rgu#bCN_B<>lXmKdQ%_( delta 150 zcmeyyH;H${I>yb97`HHr2NxylC}if97#qcx=qUIkR_2$M0L3=1Wp-wqEX|ss0+h3e zFVRv+i7y5zP#LE2A5|`qV$v%wQo7XY!U=#;(bQCi4N{o%-feg5U&9%(Vj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(KTGE1bF5-qt~sf> zsd**AMah%(SS3J0yig&x%$$i_@% delta 152 zcmeyxH;s40I>yb97P#LE2A5|`qV$v%wQo7XY!WE2N-bQCi4N{o%-feg5U&2`Mqj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpzP088fN^Q>ZAt~sf> zsd**AMah%(StUS1yig&x%$$yb97P#LE2A5|`qV$v%wQo7XXJW)ufb>6nrdoLEv?lvpx(7fa@30XA`koYdUZ zyprIeWG#i1_~N2us1aae^7DW;^Rr2)K@_=V=9Husfz+jeS=f{+L6kaWCKiJfCIM+| a%J?A4fSv)Vs+zoyRgu#bCN_C4>lXmEj8hc= delta 150 zcmey$H<5S4I>yb97&kMD2NxylC}if97#qcx=qUIkR_2$M0L3=1VRmMmEXA6k0+h3e zFVRv+i7y5zP#LE2A5|`qV$v%wQo7XXJXA}o=bQCi4N{o%-feg5U%{9!b>6nrdoLEv?lvpx(A4{e}No7H5USe*l zf^&XeS$yb97`HQu2NxylC}if97#qcx=qUIkR_2$M0L3P#LE2A5|`qV$v%wQo7XY!WE2N-bQCi4N{o%-feg5U&2`Mqj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpzP088fN^Q>ZAt~sf> zsd**AMah%(StUS1yig&x%$$yb97P#LE2A5|`qV$v%wQo7XXJW)ufb>6nrdoLEv?lvpx(7fa@30XA`koYdUZ zyprIeWG#i1_~N2us1aae^7DW;^Rr2)K@_=V=9Husfz+jeS=f{+L6kaWCKiJfCIM+| a%J?A4fSv)Vs+zoyRgu#bCN_C4>lXmEj8hc= delta 150 zcmey$H<5S4I>yb97&kMD2NxylC}if97#qcx=qUIkR_2$M0L3=1VRmMmEXA6k0+h3e zFVRv+i7y5zP#LE2A5|`qV$v%wQo7XXJXA}o=bQCi4N{o%-feg5U%{9!b>6nrdoLEv?lvpx(A4{e}No7H5USe*l zf^&XeS$yb97`HQu2NxylC}if97#qcx=qUIkR_2$M0L3P#LE2A5|`qV$v%wQo7XXJV-yE+bQCi4N{o%-feg5U&DG3Kj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(4@>4`AvSS^oYdUZ zyprIeWG#i1_~N2us1aae^7DW;3$jV@Koq%U=9HusP4;A!2I=I22s&jZ7EjJ*l>`a# YL4<%#1Q}K}`2edTrz=cs@_g1W0Fxp|*#H0l delta 130 zcmey)H<@?CI>yb97`HKs2NxylC}if97#qcx=qUIkR_2$M0L3=1XLe$oEW?_i0+h3e xFVRv+i7y5z{m8dr9V4etVr715iA!U!W delta 131 zcmey!*UP(M9pmOlj2jrmgNu@N6f*NljE&+;bQF9NEAvZBfMT0hGCMI&7Gup|1Ik&% zmrS0=Dgk5OWRd;b1iBHN*EY?xrQhP#LE2A5|`qV$v%wQo7XXJVH5{)bQCi4N{o%-feg5U&6Uhfj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(7fa@30XA`koYdUZ zyprIeWG#i1_~N2us1aae^7DW;^Rr2)K@_=V=9Husfz+jeS=f{+L6kaWCKiJfCIM+| a%J?A4fSv)Vs+zoyRgu#bCN_C4>lXmFcvBVt delta 150 zcmey$H<5S4I>yb97`HHr2NxylC}if97#qcx=qUIkR_2$M0L3=1Wp-kmEXA6k0+h3e zFVRv+i7y5z AhyVZp diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_hswish.cu index ecbb616ea85e06e737b328b4ff790a391e4be00d..ab97934953fb585e1ced968b17c9fb94f7e96daa 100644 GIT binary patch delta 200 zcmbQv`P#LE2A5|`qV$v%wQo7XY!WE2N-bQCi4N{o%-feg5U&2`Mqj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpzP088fN^Q>ZAt~sf> zsd**AMah%(StUS1yig&x%$$yb97P#LE2A5|`qV$v%wQo7XXJW)ufb>6nrdoLEv?lvpx(7fa@30XA`koYdUZ zyprIeWG#i1_~N2us1aae^7DW;^Rr2)K@_=V=9Husfz+jeS=f{+L6kaWCKiJfCIM+| a%J?A4fSv)Vs+zoyRgu#bCN_C4>lXmEj8hc= delta 150 zcmey$H<5S4I>yb97&kMD2NxylC}if97#qcx=qUIkR_2$M0L3=1VRmMmEXA6k0+h3e zFVRv+i7y5zP#LE2A5|`qV$v%wQo7XXJXA}o=bQCi4N{o%-feg5U%{9!b>6nrdoLEv?lvpx(A4{e}No7H5USe*l zf^&XeS$yb97`HQu2NxylC}if97#qcx=qUIkR_2$M0L3P#LE2A5|`qV$v%wQo7XY!U=#;(bQCi4N{o%-feg5U&9%(Vj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(KTGE1bF5-qt~sf> zsd**AMah%(SS3J0yig&x%$$i_@% delta 152 zcmeyxH;s40I>yb97P#LE2A5|`qV$v%wQo7XXJViX5*bQCi4N{o%-feg5U&E?F_j9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(CrjpJel~H1oYdUZ zyprIeWG#i1_~N2us1aae^7DW;^RY>&K@_=V=9Husfz+jeS=f{+L6kaWCKiJfCIM+| a%J?A4fSv)Vs+zo)Rgu#bCN_Bv>lXm8o>K|{ delta 131 zcmeyuH-UGyb97&kGB2NxylC}if97#qcx=qUIkR_2$M0L3=1W_D(rEXkU|29&di zFPXfMRRYGm!zyhJ)uE-35}%ZrSgfPKr2qxV`FX`9AQOSQK+;u6a$wQP4QyWkYAq?p diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_relu.cu index d36b4d313593f357f72901b87a5aa3eb1170f439..b91efded3aa1b20c3ee78cedc181d72516c11c22 100644 GIT binary patch delta 273 zcmbQj`-5-8Iz~>P#LE2A5|`qV$v%wQo7XXJV-yE+bQCi4N{o%-feg5U&DG4#j9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(FH5FENo7H5USe*l zf^&XeS$yb97`HKs2NxylC}if97#qcx=qUIkR_2$M0L3=1XLe?sEX$gq0+h3e xFVRv+i7y5zP#LE2A5|`qV$v%wQo7XY!U=#;(bQCi4N{o%-feg5U&9%(Vj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(KTGE1bF5-qt~sf> zsd**AMah%(SS3J0yig&x%$$i_@% delta 152 zcmeyxH;s40I>yb97P#LE2A5|`qV$v%wQo7XXJViX5*bQCi4N{o%-feg5U&E?F_j9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(CrjpJel~H1oYdUZ zyprIeWG#i1_~N2us1aae^7DW;^RY>&K@_=V=9Husfz+jeS=f{+L6kaWCKiJfCIM+| a%J?A4fSv)Vs+zo)Rgu#bCN_Bv>lXm8o>K|{ delta 131 zcmeyuH-UGyb97&kGB2NxylC}if97#qcx=qUIkR_2$M0L3=1W_D(rEXkU|29&di zFPXfMRRYGm!zyhJ)uE-35}%ZrSgfPKr2qxV`FX`9AQOSQK+;u6a$wQP4QyWkYAq?p diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_relu.cu index 65c596004b4c5803a63c978b0a7a0405347a34df..19f337a174bbaca1445e471cce1360e7255d40f9 100644 GIT binary patch delta 273 zcmbQj`-5-8Iz~>P#LE2A5|`qV$v%wQo7XXJV-yE+bQCi4N{o%-feg5U&DG4#j9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(FH5FENo7H5USe*l zf^&XeS$yb97`HKs2NxylC}if97#qcx=qUIkR_2$M0L3=1XLe?sEX$gq0+h3e xFVRv+i7y5zP#LE2A5|`qV$v%wQo7XY!WE2N-bQCi4N{o%-feg5U&2`Mqj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpzP088fN^Q>ZAt~sf> zsd**AMah%(StUS1yig&x%$$yb97P#LE2A5|`qV$v%wQo7XXJW)ufb>6nrdoLEv?lvpx(7fa@30XA`koYdUZ zyprIeWG#i1_~N2us1aae^7DW;^Rr2)K@_=V=9Husfz+jeS=f{+L6kaWCKiJfCIM+| a%J?A4fSv)Vs+zoyRgu#bCN_C4>lXmEj8hc= delta 150 zcmey$H<5S4I>yb97&kMD2NxylC}if97#qcx=qUIkR_2$M0L3=1VRmMmEXA6k0+h3e zFVRv+i7y5zP#LE2A5|`qV$v%wQo7XXJXA}o=bQCi4N{o%-feg5U%{9!b>6nrdoLEv?lvpx(A4{e}No7H5USe*l zf^&XeS$yb97`HQu2NxylC}if97#qcx=qUIkR_2$M0L3P#LE2A5|`qV$v%wQo7XY!U=#;(bQCi4N{o%-feg5U&9%(Vj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(KTGE1bF5-qt~sf> zsd**AMah%(SS3J0yig&x%$$i_@% delta 152 zcmeyxH;s40I>yb97P#LE2A5|`qV$v%wQo7XXJViX5*bQCi4N{o%-feg5U&E?F_j9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(CrjpJel~H1oYdUZ zyprIeWG#i1_~N2us1aae^7DW;^RY>&K@_=V=9Husfz+jeS=f{+L6kaWCKiJfCIM+| a%J?A4fSv)Vs+zo)Rgu#bCN_Bv>lXm8o>K|{ delta 131 zcmeyuH-UGyb97&kGB2NxylC}if97#qcx=qUIkR_2$M0L3=1W_D(rEXkU|29&di zFPXfMRRYGm!zyhJ)uE-35}%ZrSgfPKr2qxV`FX`9AQOSQK+;u6a$wQP4QyWkYAq?p diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_relu.cu index bff2c6a570cc5e2457699077f02e48d41e4115d7..a29be07e1d171b1d9084f9cfcb570612c32daf55 100644 GIT binary patch delta 273 zcmbQj`-5-8Iz~>P#LE2A5|`qV$v%wQo7XXJV-yE+bQCi4N{o%-feg5U&DG4#j9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(FH5FENo7H5USe*l zf^&XeS$yb97`HKs2NxylC}if97#qcx=qUIkR_2$M0L3=1XLe?sEX$gq0+h3e xFVRv+i7y5zP#LE2A5|`qV$v%wQo7XY!U=#;(bQCi4N{o%-feg5U&9%(Vj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(KTGE1bF5-qt~sf> zsd**AMah%(SS3J0yig&x%$$i_@% delta 152 zcmeyxH;s40I>yb97P#LE2A5|`qV$v%wQo7XXJViX5*bQCi4N{o%-feg5U&E?F_j9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(CrjpJel~H1oYdUZ zyprIeWG#i1_~N2us1aae^7DW;^RY>&K@_=V=9Husfz+jeS=f{+L6kaWCKiJfCIM+| a%J?A4fSv)Vs+zo)Rgu#bCN_Bv>lXm8o>K|{ delta 131 zcmeyuH-UGyb97&kGB2NxylC}if97#qcx=qUIkR_2$M0L3=1W_D(rEXkU|29&di zFPXfMRRYGm!zyhJ)uE-35}%ZrSgfPKr2qxV`FX`9AQOSQK+;u6a$wQP4QyWkYAq?p diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_relu.cu index c2ec29a2688a6cadfd6c8aad30a99a043c68e8cf..34896baf1e5431ae8432e785b9f2ab05a6ded26b 100644 GIT binary patch delta 273 zcmbQj`-5-8Iz~>P#LE2A5|`qV$v%wQo7XXJV-yE+bQCi4N{o%-feg5U&DG4#j9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(FH5FENo7H5USe*l zf^&XeS$yb97`HKs2NxylC}if97#qcx=qUIkR_2$M0L3=1XLe?sEX$gq0+h3e xFVRv+i7y5zP#LE2A5|`qV$v%wQo7XY!WE2N-bQCi4N{o%-feg5U&2`L9j9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(KTGE1bF5-qt~sf> zsd**AMah%(SS3J0yig&x%$$;M1& delta 165 zcmeyxH;s40I>yb97P#LE2A5|`qV$v%wQo7XXJW)ufb>6nrdoLEv?lvpx(CrjpJel~H1oYdUZ zyprIeWG#i1_~N2us1aae^7DW;^RY>&K@_=V=9Husfz+jeS=f{+L6kaWCKiJfCIM+| a%J?A4fSv)Vs+zo)Rgu#bCN_Bv>lXm9ic<>! delta 131 zcmeyuH-UGyb97&kMD2NxylC}if97#qcx=qUIkR_2$M0L3=1VRmAiEXkU|29&di zFPXfMRRYGm!zyhJ)uE-35}%ZrSgfPKr2qxV`FX`9AQOSQK+;u6a$wQP4QyWkYN#p5 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_relu.cu index edc33c2362a80a10cb1094fdbe6701b4f0d17994..0bce8bf5ed58afbb4cc6f8d79940056e08aaff3e 100644 GIT binary patch delta 273 zcmbQj`-5-8Iz~>P#LE2A5|`qV$v%wQo7XXJXA}o=bQCi4N{o%-feg5U%{9zUj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(FH5FENo7H5USe*l zf^&XeS$yb97`HQu2NxylC}if97#qcx=qUIkR_2$M0L3P#LE2A5|`qV$v%wQo7XY!U=#;(bQCi4N{o%-feg5U&9%%b>6nrdoLEv?lvpx(A4{e}No7H5USe*l zf^&XeS$yb97kw diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_id.cu index db532d619f5e5190917b8e4b29485b4a7cba57c4..bf0fdcff5ca0517944a7ea3e032b4e31b8cfce5a 100644 GIT binary patch delta 258 zcmeC@{mi#v9V4etVr715iA!taI6PrAn^$P&1oKpY* delta 131 zcmey&*U!6Q9pmOljGGw6gNu@N6f*NljE&+;bQF9NEAvZBfMT0hGdnR(mSD|b1Ik&% zmrP#3Dgk5OW|g*v>d;b1iBHN*EY?xrQhP#LE2A5|`qV$v%wQo7XXJV-yE+bQCi4N{o%-feg5U&DG3Kj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(4@>4`AvSS^oYdUZ zyprIeWG#i1_~N2us1aae^7DW;3$jV@Koq%U=9HusP4;A!2I=I22s&jZ7EjJ*l>`a# YL4<%#1Q}K}`2edTrz=cs@_g1W0Fxp|*#H0l delta 130 zcmey)H<@?CI>yb97`HKs2NxylC}if97#qcx=qUIkR_2$M0L3=1XLe$oEW?_i0+h3e xFVRv+i7y5zP#LE2A5|`qV$v%wQo7XY!U=#;(bQCi4N{o%-feg5U&9%%b>6nrdoLEv?lvpx(A4{e}No7H5USe*l zf^&XeS$yb97kw diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_id.cu index 7a554763630a041c3b986c96d4d365b149111f0a..c0da1d27367253dca4f67cc6aced74da2b8f7874 100644 GIT binary patch delta 258 zcmeC@{mi#v9V4etVr715iA!taI6PrAn^$P&1oKpY* delta 131 zcmey&*U!6Q9pmOljGGw6gNu@N6f*NljE&+;bQF9NEAvZBfMT0hGdnR(mSD|b1Ik&% zmrP#3Dgk5OW|g*v>d;b1iBHN*EY?xrQhP#LE2A5|`qV$v%wQo7XXJV-yE+bQCi4N{o%-feg5U&DG3Kj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(4@>4`AvSS^oYdUZ zyprIeWG#i1_~N2us1aae^7DW;3$jV@Koq%U=9HusP4;A!2I=I22s&jZ7EjJ*l>`a# YL4<%#1Q}K}`2edTrz=cs@_g1W0Fxp|*#H0l delta 130 zcmey)H<@?CI>yb97`HKs2NxylC}if97#qcx=qUIkR_2$M0L3=1XLe$oEW?_i0+h3e xFVRv+i7y5zP#LE2A5|`qV$v%wQo7XY!WE2N-bQCi4N{o%-feg5U&2`L9j9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(KTGE1bF5-qt~sf> zsd**AMah%(SS3J0yig&x%$$;M1& delta 165 zcmeyxH;s40I>yb97P#LE2A5|`qV$v%wQo7XXJW)ufb>6nrdoLEv?lvpx(CrjpJel~H1oYdUZ zyprIeWG#i1_~N2us1aae^7DW;^RY>&K@_=V=9Husfz+jeS=f{+L6kaWCKiJfCIM+| a%J?A4fSv)Vs+zo)Rgu#bCN_Bv>lXm9ic<>! delta 131 zcmeyuH-UGyb97&kMD2NxylC}if97#qcx=qUIkR_2$M0L3=1VRmAiEXkU|29&di zFPXfMRRYGm!zyhJ)uE-35}%ZrSgfPKr2qxV`FX`9AQOSQK+;u6a$wQP4QyWkYN#p5 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_relu.cu index 608d5af475f076b2b21bb603aad490fd46fee202..57e4e210a81491bcfb5bfa75723faa590a1b13ab 100644 GIT binary patch delta 273 zcmbQj`-5-8Iz~>P#LE2A5|`qV$v%wQo7XXJXA}o=bQCi4N{o%-feg5U%{9zUj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(FH5FENo7H5USe*l zf^&XeS$yb97`HQu2NxylC}if97#qcx=qUIkR_2$M0L3P#LE2A5|`qV$v%wQo7XY!U=#;(bQCi4N{o%-feg5U&9%%b>6nrdoLEv?lvpx(A4{e}No7H5USe*l zf^&XeS$yb97kw diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_id.cu index cfe256ca4523f993e92432107e07b920b36448b0..557610101842ffb4c909c3513141d3cef1dd125e 100644 GIT binary patch delta 258 zcmeC@{mi#v9V4etVr715iA!taI6PrAn^$P&1oKpY* delta 131 zcmey&*U!6Q9pmOljGGw6gNu@N6f*NljE&+;bQF9NEAvZBfMT0hGdnR(mSD|b1Ik&% zmrP#3Dgk5OW|g*v>d;b1iBHN*EY?xrQhP#LE2A5|`qV$v%wQo7XXJV-yE+bQCi4N{o%-feg5U&DG3Kj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(4@>4`AvSS^oYdUZ zyprIeWG#i1_~N2us1aae^7DW;3$jV@Koq%U=9HusP4;A!2I=I22s&jZ7EjJ*l>`a# YL4<%#1Q}K}`2edTrz=cs@_g1W0Fxp|*#H0l delta 130 zcmey)H<@?CI>yb97`HKs2NxylC}if97#qcx=qUIkR_2$M0L3=1XLe$oEW?_i0+h3e xFVRv+i7y5zP#LE2A5|`qV$v%wQo7XY!U=#;(bQCi4N{o%-feg5U&9%%b>6nrdoLEv?lvpx(A4{e}No7H5USe*l zf^&XeS$yb97kw diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_id.cu index 71b62fb26ab113b59c8784887a9265ee70d88045..4698d99f6739fd8927e49591ad6ef8cfa3c1d4cd 100644 GIT binary patch delta 258 zcmeC@{mi#v9V4etVr715iA!taI6PrAn^$P&1oKpY* delta 131 zcmey&*U!6Q9pmOljGGw6gNu@N6f*NljE&+;bQF9NEAvZBfMT0hGdnR(mSD|b1Ik&% zmrP#3Dgk5OW|g*v>d;b1iBHN*EY?xrQhP#LE2A5|`qV$v%wQo7XXJV-yE+bQCi4N{o%-feg5U&DG3Kj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpx(4@>4`AvSS^oYdUZ zyprIeWG#i1_~N2us1aae^7DW;3$jV@Koq%U=9HusP4;A!2I=I22s&jZ7EjJ*l>`a# YL4<%#1Q}K}`2edTrz=cs@_g1W0Fxp|*#H0l delta 130 zcmey)H<@?CI>yb97`HKs2NxylC}if97#qcx=qUIkR_2$M0L3=1XLe$oEW?_i0+h3e xFVRv+i7y5zEplzQwdy#QW4DxFzq9kwl(hpY zH7{pBchB8Dz8r`t`t;Dh{XQ5ZqyFvFb@Jz=)#;!aSKJWG8Jev9OF|tZ3C+|pnFx|6 z3vN^*QZ4giz4ibxQ<>ptMT`<^hO#N5&PtN!tpjbyEG6hRju8_C_T^Skq+LGWw^le~ zsC_hc%H^CLX<|sqEjR6VU~+2&wMRq(Qy>Enp{R6>qjqj+$EVV%Q|Sn0FE+1YO-N#sYZ;q=y)2VwFW|q2BT=B zNMh(5<3}C)Lpc;FM5;X%Ys2&$IuT;LR!)R!DMaXXcO$gvS&6SbZ*eSIH}QJhs#|e9 zikM*)TKMuJc~Q^ZzINw(RmKN7ohKceqv_D85>*xB%kRVqNd}*Nmn_sz~d6n070dXIqRc02sCbVjmPkgtl zM%{i_--Mz@pZ#r_gfB(I|86(6Vx85_K5C4Ex4d#y&6|cJuY{q6iJB%B!F+>4&xe literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..c447d319401b553ec8317177424a026ec544b9ee GIT binary patch literal 1885 zcmb7FZBN@U5dPj@;T37xP@tk9giMD7SqIagOtez>EplzQwdy#QW4DxFzq9kwl(hpY zH7{pBchB8Dz8r`t`t;Dh{XQ5ZqyFvFb@Jz=)#;!aSKJWG8Jev9OF|tZ3C+|pnFx|6 z3vN^*QZ4giz4ibxQ<>ptMT`<^hO#N5&PtN!tpjbyEG6hRju8_C_T^Skq+LGWw^le~ zsC_hc%H^CLX<|sqEjR6VU~+2&wMRq(Qy>Enp{R6>qjqj+$EVV%Q|Sn0FE+1dTFf(Ge;=UNdF+BOvpuR{l>#b)SfZEjd zM;-g)I20*Fsy!8J!}J__5MsR6OoVC?MCf&2BedyOiLX6xaV(lP@k-pPTX8Uom|=Ce z5-%^37xmojYj?g^Wqc6g8T9!)LZCqYnp9Pa2Pdu2at#84)>{sP42B(AW`d!V&zMqB zWEd0vx^?z9hDLtXVCXgKfAyu-A?$<`YoSF7W*BO0!niMa%9WoUhlnLGol3@D!tcNp zg7VO5m?a-W`y=GBexiR233Bzomj>#;`IY;bS9x6*5ceTkWoD6ULaSE!#CN-D)a`fm zH7IKI+258)_(nAR?{-rw)>-ZBqsBOR%PUvaq-i+vN*G$0rDn}gwgx literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..898466e16fced80c4d1ec1e5b5518962b4185a0c GIT binary patch literal 1889 zcmb7FZBN@U5dPj@;T37xP@tk9giMD7SqIagOtez>EplzQwdy#QW4DxFzq9kwl(hpY zH7{pBchB8Dz8r`t`t;Dh{XQ5ZqyFvFb@Jz=)#;!aSKJWG8Jev9OF|tZ3C+|pnFx|6 z3vN^*QZ4giz4ibxQ<>ptMT`<^hO#N5&PtN!tpjbyEG6hRju8_C_T^Skq+LGWw^le~ zsC_hc%H^CLX<|sqEjR6VU~+2&wMRq(Qy>Enp{R6>qjqj+$EVV%Q|Sn0FE+13S`+<2RKVu^}4t__UkAb7=~{tM3^g`k+?PD%%1@9(#1fcLC1Wq) zci;*^dFV9ElaHbOG4fbH)jx&=xq9GB1NGnh%Kgl%ysis~`w*=%v&c1}RjYjByInQv z_PhEb6gB$nZ_6ZnCmQ~DyQvlHtakQMV;sEYm8)vnG#q&)3@yylG_eS#8$5iy-1jZg MnQk3$#l*Dr7j{U7_y7O^ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..83dd420481fb91f3ea88a77d51cc16a0897cfd57 GIT binary patch literal 1890 zcmb7FZBHUG5dO}u=wWiXGpodx7*jbDT{*m*;zdZjZ=2E^j%sewsXZOSeeYx-TZw7;8)ayT9C4Y}voer9E#SO8Xq4CN;B-AmI&`d3ou^@Re z=SC$W)iOWSD-RG;l^Kqf#3-SrD4QVatR#8f+S7(iQ-b<&jF=#>FSmjs?ef{KwZa)g z?Sm($T+Z2nCWfTka?}0*CbvdVdq5;)3dsOOC@LM}sGS?y@l-lBm5xyMeEpdC$G}Wb zVUsCOQ{Wz$BuhiI=H`$(p=7(&a^5?-Bdg5Wp`qwU#TjE6j2Mr&$}=;FpKqU@yQh~I z8#cqwtOZsBR0f?CZ8N9@V_*LTg^KLnP<{@cg-ZcWJ)u}j4uS` z7&rD7L4$t4b1jPqZJLU%*P%nE#b#)1ZEjfVWW&}{j2LgECjlCQ<>Q&5_B8yfto>CkP||MFX{L)Zz&)-ZBgT^>`%PUvaylFV{N*G!gsA*ym%s2S+{bJY4NN2jW J#}yNk)<5|ChkyV8 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..5dc680a7104601b625c5b7e5d604e438a327484f GIT binary patch literal 1884 zcmb7FZBHUG5dO}u=wWiXGpodx7*jbDT{*m*;zdZjZ=2E^j%sewsXZOSeeYx-TZw7;8)ayT9C4Y}voer9E#SO8Xq4CN;B-AmI&`d3ou^@Re z=SC$W)iOWSD-RG;l^Kqf#3-SrD4QVatR#8f+S7(iQ-b<&jF=#>FSmjs?ef{KwZa)g z?Sm($T+Z2nCWfTka?}0*CbvdVdq5;)3dsOOC@LM}sGS?y@l-lBm5xyMeEpdC$G}Wb zVUsCOQ{Wz$BuhiI=H`$(p=7(&a^5?-Bdg5Wp`qwU#TjE6j2Mr&$}=;FpKqU@yQh~I z8L7DR+LO+~-!&>_=eGqkieH!O9sVQVQvj5pHpX^d7H5}G$Miq=vjhR!g4(6K*_ zLykPLZ8nf1PbJ@ld4K_@1zx4u0TN0ddp#s!K_1zOfZ!4DN_oH3{%2i zH;(?=(BN0ig>Iw%m)~j~!cI7_7FwiWgrUYJjC+!&T>0U#k5~f3sm$0*_yf2?P#!u7 zqvUgFe}p{N5A@F=L9QP7(m?%}U%8)omDhCvaqpvLX6Csjv}%^j%ri4jXZOSeeYx-TZw7;8)ayT9C4Y}voer9E#SO8Xq4CN;B-AmI&`d3ou^@Re z=SC$W)iOWSD-RG;l^Kqf#3-SrD4QVatR#8f+S7(iQ-b<&jF=#>FSmjs?ef{KwZa)g z?Sm($T+Z2nCWfTka?}0*CbvdVdq5;G1u_5;ib}^gYUhS_d@7wfm5xyMeEpdC$B>zz z!X{ImrjUEcBv~4wH8+RU2_@UDmh;}x9a&}04h=;=s+=*F!HDsQt2{G<`1$tfxqEtf zv4Qz1BxJN37C%1@f4_y@gVoeyV3^CrqChH)RRVX1p`d>)~b zWB;pJJ`tne1)iP0!TnOu0Y|OK)VuJb2*GzzlWn{0V$#n^_NFusP~UNLqj z6elLm(y>s)2Ylqxbz5l5XCgCVLo{gd$#DYzOH;VjS;rvqOgitZNx=)IG-J&8LQsxz zV{Z{Os0TdPqKMF@spxqfIxsCZLtATe!%~wCTT3Zoyg|p)7_BrAnl~6lYdI1_XBa=| z*q_LuNFh?~saPAPXV8Zb@6aL>45fU^l!79| zr0~~`} zhfcyc`5f9GBaih%{c}i=s|UU`Q2))Z+|Rtq>$-rr_t7#l^IQ{JwaO>H-BqJ*f2tQk zQKPT^wn)NjqTzqLn_98XYG)rb#=%=&xvHj3!;x3Q(84%P6N_NF!JqFJyIw^))2%(O In3%Nw0h`B$!T`;T37xP@tk9giMD7SqIagOtez>EplzQwdy#QW4DxFzia2wl(hq@ zDvxWQ`#Q%r2V#moJ@jwC4+hDofBSTu{5ffLI%vifH^g#=CTstaP{&9@Gqp@6g5=49 z8KF}lH_^oKpQek3A&AA!~}tTxfK*?m(TaD70wuH zAB~-IIcGIqx0alXd3o&`|WPsu^P$j2Mr&$}=;FpYO)cy|b&! zEzA!gp`!h;`1xu0>tERS-0sH=;=6GbWi&UOup61t#c)nEZ=!r*7~jG*EcH&h7ZEx? zi_mGe&T>qQeiK+=ws1ccFyN>anR*j`5+V3+I@8xeW;4ehbu(LDS~0evCX8=@+iS+| zh2q4-SvnDl_<(oAmB{D47TWQf%8b|$4@f@kCh%OE!mX}52AOBld3Vhuyktr<#*8lo zG(88YYho48W}|!K@vmf z7(eRRpUI&}AyVzBSR1D2z(k1gS~wA^r4XUl>_%wAS&6SbZ{=7}H}P`Z>RWL-ikM*) zD13R5yy)j{U%TtQD&u!SQNZ(ggg}SCkKR|LRMvL)Zz&)&fNe1{rE>!niMa%9S4>hlnLGqRNcDgx`TH z1m&UAFi<{*_6Nyh{aF7P66ETEFAdax>nry&ukyMtAnrr7%FH6ygjQ7f!gsrB^zC=` zNho^s+258)_)s+b?{-rw)>-ZBqsBOR%PUvaylFV{N*G!gsA*ym%r|)Wdb#goq%++* J;EIW9>n{_nhlKzD literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..7e4273a7ab9825f6075a546ad6027b83545356f7 GIT binary patch literal 1884 zcmb7FZBN@U5dPj@;T37xP@tk9giMD7SqIagOtez>EplzQwdy#QW4DxFzq9kwl(hq@ zDlcb0clX>qz8r`t`t;Dh{XQ5ZqyFvFb@Jz=)#;!aSKJWG8Jev9OF|tZ3C+|pnFx|6 z3vN^*QZ4giz4ibxQ<>ptMT`<^hO#N5&PtN!tpjbyEG6hRju8_C_T^Skq+LGWw^le~ zsC_hc%H^CLX<|sqEjR6VU~+2&wMRrkrjQIkgrd?hj@r4Q9Z#iGQ|Sn0FE+1+-k-#2t1R{J8FjaVkylSGrkm*W8BzV z1P#go&$S>Tv~4Q-U55^t7Mr1^wYg!blMP!-8DhMZj!$E>){xMmkx{gfA~AH1@uQCY zX&j0aBGsOXwPAV=JqR&gOD00K2qN^luMygGtHjryw{k3+Ht|B-s#|d`ikM+_xDqcf zk{9*d?Q1vRt1^BUv;=)Vj}R!3zfP(u#e*1Ei=JT%4bX|C^Ae5 zf89Fz8$*L%H5Yn~`d@vibqG7*z*=aLf)R!qn=tN6o^s`f$01?~45u<npuq5To^SU=D|h6K5K;7bGbUw-9&=2c$T1;l-bR+(Akn$W6Mp7?H8jk^7=J_bdN zKKt7;37?3D|J`nC#X75EplzQwdy#QW4DxFzq9kwl(hq@ zDlcb0chB8Dz8r`t`t;Dh{XQ5ZqyFvFb@Jz=)#;!aSKJWG8Jev9OF|tZ3C+|pnFx|6 z3vN^*QZ4giz4ibxQ<>ptMT`<^hO#N5&PtN!tpjbyEG6hRju8_C_T^Skq+LGWw^le~ zsC_hc%H^CLX<|sqEjR6VU~+2&wMRq(Qy>Enp{R6>qjqj+$EVV%Q|Sn0FE+1(GH|u^HN0n;VvzY}i^#5#ud7p2ldcfzYDCDB8%87&^!J zQOEv74n+!)YEQ-5Fg=Gpgcz@76QNoN5qjO*2yHr6;%m=a9E-+HycoCYR-BC@W>_7r z#LJ81MLl=>+MVxJ8NUm9flld4Mb;G`8=u0cf5ddp#!!MsDuOfZ!48B+?1 z43olNw~qhD(8#Zv4ZTMFufEhegq?6`Ewo6%7(`v9FZX?lbf#Me JTrn|i{ROsPhQ|N^ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..ec78da97a6961366625a51984ef9af3d5f844b3d GIT binary patch literal 1886 zcmbVNZBN@U5dPj@;T37xP@sY#giMD7p)1p%Otez>EplzQwdy#QW4DxFzq9kwl(ieI zs`7I7bNAfc*Uu-tJ6UZSxQhpju8_C_T^Skq+LGWw^le~ zsC_hc%H^CLX<|sqEjR5qU~+2&wMRq(Qy>Enp{R6>qjqj+$EVV%Q|Sn0FE+1(2;&=Q>@{QeLUCf^ zES(5Ne89WmO62oy3+?z#Wkzg>2Oa)$n!s~u3b#7#7&M+q=iN0)c)gTnj2T}F$}w*2 zErJH|fah8h5!yBtEw4idrp0FHYHe;cbQ*@q z`_TRzd8{Al??ZxGJ@BQ0`fGmWe&$79*9EkFh*p_dSOnt@9zI|0`w;0&w+^^sV%qu* D!vluK literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..27df4ce87072239f0b236d7559e98958152071ea GIT binary patch literal 1880 zcmbVNZBN@U5dPj@;T37xP@sY#giMD7p)1p%Otez>EplzQwdy#QW4DxFzq9kwl(ieI zs`7I7b9c|(*Uu-tJ6UZSxQhpju8_C_T^Skq+LGWw^le~ zsC_hc%H^CLX<|sqEjR5qU~+2&wMRrkrjQIkgrd?hj@r4Q9Z#iGQ|Sn0FE+11{9P9n4^OHT`#%^ZK!%~W}5#n=Wy7~eo+uNk`+iW3uO z=|m{v19n?z$8Rb#VnZb8?U&O8o=a1>)r4bEcP5>8)C}*HQkpSld?_f$xUshg8e{{W zYc)h@+f+2W4jnQrHbXyabHh?68@84j#CR(mpT=mdA)!ShqiCZ;V(1*>M;-g4I20*F zsy!8J!}J__5MsR6ON43-MCf&2BedyOiLX6xO)Y}=%c?alkjEplzQwdy#QW4DxFzq9kwl(ieI zs`7I7bNAfc*Uu-tJ6UZSxQhpju8_C_T^Skq+LGWw^le~ zsC_hc%H^CLX<|sqEjR5qU~+2&wMRq(Qy>Enp{R6>qjqj+$EVV%Q|Sn0FE+1(2;&=Q>@{QeLUCf^ zES(5Ne849zUAKjH{H8J^HbjH|emPCxxip1aopcNe&!qFtniRZRN;AfcF9qcoH})1m zgLJ@it%wM1n~H|lp##%mGxW4JH!L;Tu(i}7##?kejnP^Ip+$pHw9z6lbdK?(j{R{Q ziWDN%o{F_$dJcUEFxBpzTAn%FH6ygjTKciSKsRsN2u#V^GxSqrWYa z@QG;n-|eOrth3tLM~!jtmRGK-VbgHrl`yn0O4Gz57;f`;T37xP@tk9giMD7SqIagOtez>EplzQwdy#Q?X;9%zia2wl(hpY zHIHkb`#Q%r2V#moJ@jwC4+hDofBSTu{5ffLI%vifH^gy9w25aGaRjmQ9{j7Hbpddl33e1(1y%Xf^Op&F+pITb%G+D+xfn=!Wl#D zqj9cs%h{18hNRqa(|!lWIwPn%A`&u%WB?)*l{w?6Z4K@CQaW`h9iiOC<~8xJp)x^* zOJ+Pxq4rQovNA+#ZjNa#lx(+Jx%ZCl$vVs3&`|WPsu^P$j2Mr&vY8pg&v)bJ-r3dV z7UqYLP|_%pEF`N_4nwqH#qYO1RVceHI<;o9{L&OpoQf0#;^_BaXmwDY6Q1>BPWoBVDp%qoW@ZGK& zefwQ~6p9{w_P1pcJ{1lByWP~1&7IEOqsBORvz4oA;4~b0B@7*m)HHDj1{^$mz1;UX N(wS}@aK*&5^%p+Chyefq literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..021eee4646893c907859dd2eabcb918dd8559be6 GIT binary patch literal 1886 zcmb7FYfsxS6#d>`;T37xP@tk9giMD7SqIagOtez>EplzQwdy#QW4DxFzia2wl(hpY zHIHkb`#Q%r2V#moJ@jwC4+hDofBSTu{5ffLI%vifH^g#=CTstaP{&9@Gqp@6g5=49 z8KF}lH_^oKpQek3A&AA!~}tTxfK*?m(TaD70wuH zAB~-IIcGIqx0alXd3o&`|WPsu^P$j2Mr&$}=;FpYO)cy|b&! zEzA!gp`!h;`1xu0>tERS-0sH=;=6GbWi&UOup61t#c)nEZ=!r*7~jG*EcH&h7ZD1V z({7#Sm>B&g^8749r(OTAaX%GM;HVXudJ}#EVI#C@OUkb`G zZtN|B2I+w3S`iW2HWh@|p+ly{X5h3oH!O9sVQZ;FjJMM9X^hqy5?VAeiZ)s#hR!j5 z)UiK~Ly~9 zC=Z>6Ve&DwKSLhtNBYN*pjQukX`udFU%8)onb&;*bswTtW)`_7w4%xvzS~u!Z@;V0 zLD8en{x0_nB&T3~LHO9ePUb(79O~a8_!qCDnO%sb?w86vI%Y7dro$1yA JS4>P>e*vgbg{uGn literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..540f13ec1b2dafca49f05cd7cfe47c6eafe11e09 GIT binary patch literal 1890 zcmb7FZBN@U5dPj@;T37xP@tk9giMD7SqIagOtez>EplzQwdy#QW4DxFzq9kwl(hpY zH7{pBchB8Dz8r`t`t;Dh{XQ5ZqyFvFb@Jz=)#;!aSKJWG8Jev9OF|tZ3C+|pnFx|6 z3vN^*QZ4giz4ibxQ<>ptMT`<^hO#N5&PtN!tpjbyEG6hRju8_C_T^Skq+LGWw^le~ zsC_hc%H^CLX<|sqEjR6VU~+2&wMRq(Qy>Enp{R6>qjqj+$EVV%Q|Sn0FE+13S`+<2RKVu^}4t__UkAb7=~~9C=Z>6f$}l5KSv(x$NI;RpjQukX`ud_U%8)onb&mzbswTtW)`_7v}%=4e7CDc z-F{b}grY{D{cV|q4@JZOZa1}Loz>1hYK()oymD2In}#E=grS9jnkE*(c!P(pm-{|O NI@7HKu9%p%{sQ1qheH4W literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..d9fa27c5b76ad06afcf38290012703e3acb01e95 GIT binary patch literal 1891 zcmb7FYj4^x6#dSx@TfFx6)oyRRpqiuZOUL;OQ(XwzD341VBt8HV?*h$-?j4~w6L}a z5_0WxU+4JdKupn>hi>m~Fo;Lp-qTI|_oUHoqZwD+5X%{wto=hw9V0PK)iRz4lEn*d zR4fuLvtzyX05Mak;b=vS5^9FhDWc9wl4XqpZOAMks24?u2?F~vD=5+~o9|o8oiWrr z8aw4O&Wq1|ULF=@>`N%+R(krCpWM7Rp|39%KI)Dic)L zc*c_iY7dpfD?_yA=9oI6WV6w5-aEP{>(tqyq3B0hGsZF)F&=T1rDhO4_s7qj^XscE z%ugYqqW!Sw`Dyt3E$l9{`!R#4KQ5z;=7tk?D^t1{&WYxAlrIe99$dpx=j8OVh0ad> zugY>vjD8n*asCGPO92CpT9K-E;U_Hw-%V$_TF7+f_@l08%L^;UHq?aiEpU6o*u7Ak zm?%vqLJ=RZKU|4y-f5s6pQ%iV4e@~Fv(p&<7p8El>W)C>sdV05Jqa(F!i+KFOF=ot zwY_=JpdavD$)bg}O*!jT=#Xi?8HlaSHA|Ij*jk7Yd2qTp-3Q70!Ioa8LDl z<)PCsQ9g(EC&^>|T>l&r^y+~xHPnCWEA}%l^QteP?nAUn%_7r;mQ?w|ce|?f?N9km z$b0nF-}B;sX6jW7+RR9No)}eIQaAZYTw&Pr@C># I6%*6OKVzMT%K!iX literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..bd9b9db793a6809e3194b2d3ed8d863ab97b2580 GIT binary patch literal 1885 zcmb7FYj4^x6#dSx@TfFx6)oyRRpqiuZOZ7hmQDqUeT$53z`}7X$L7&rzia0~Xkl#; zB;?xXzRvN@fta8#kKNu~zaI~~z2}?w?@6QGMpLf1A(k^VUi*ibI!0oeszp2&B#YRmNWv{Q%@1HQYXr4NL_(&J3_ygU(lL&jnW1f8O1mnhEtI|7JjVVpR3@mf z@suYC)E+8{mxgG~%`tUC$!4SBym$0K)~T}tL(z}2W{hPpVm#z3OHDso-H%qC^XscE z%ugYqqW!RF^*s3f7Iv4}{g{4qKPsaPXND7YD^ofj%!uZ7lrIe99$dpx=j8OVh0ad> zugY>njD8n*asCGPO92CpT9K-E;U_Hw-%V$_TF7+j_>-<@%L^;UHq?aiEpU6o*n?1< zm?%xgLJ=RZ(?B~u6PXelA_2W;r!o95OyO2l9D%@7>Aa(QcrTX1j4|U2K{>{?y?M}} z9PnHTqJ_3iIp0<2kZHadD6PyjOODPJ2T7r;S?@mex1REeO4Uh~>Q8@393<#{W|oV2kQ;#S}CbCJgktAnL@d67Kt z=WbuS>%A`GgMiP0=L_g2kr{uTRoRP2C#}$O4GIG8Er&q{!wxM{!BE1dOeyFxj0u0; zI{O<#t-x|HbZP`#f2)KDJK@Ay;7Gv?L$ys9cO_4_^3&rGu^6UPk+IkC2XMKfJaiIf z$>-4i40)`d=$}J^UOn)ohWc-P#eU{xUiAgkeTbH+nP-~Nk}6;LZdbLw{VCrDd5^yO z+aeC{h?@W1ZYs$-tDSw)7zb}z>8czxHAh|v0}Hb>i7kTB27kU^?RyXDR5uQ|Vq((x E2hMVZbpQYW literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..3581ac6bc2dde32d33d4359e0802f30a2299a0de GIT binary patch literal 1889 zcmb7FZExBz5dO}u@TfFx6)ozEs>)@R+LY00Eu9Jy`xY77fQ92&j?GJd{m#w{p@p?Y zkdU*VyXWp6Uk=0seR=Hm?)v?B*zG;v#D7m3?KYZn#SO8Xq4C;3#MCho(^M_uu^?GI z=SIaM(K0*MYYz}pl^Twg#3-SrD4ihctRz|1IM9Yn6M}kCgqR?(FSCLo?XuavwcHs) z&7+Z1F5~P-6GIYixoLg?lUXCEJt7jA0vUh^MWtgLH8VroK9zQrN?Rygq=zdg28O{tR>{h09K9~{B>nNWX#yz-(rOwIeWec61 z`d`IzM2vnHcyay)_e((s9JL}<@4`=72)>)jbhVJ_)bS@>XUhvK#x~@H@h$ZBhOq~s zI5AP0jD;dT;4_!5(?B~u6PXelqCtz#PGk6An8K~fIs%cW(s^ff3SKaU8Dqv5f^v*& zd-I?{J>a<#MGI}4@}5_r1Jisnw6!wVEEU}B$sX6jW7+9F6No)~}H~91YYTvs^r@C># I6%&)jKLx6X4FCWD literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..2a13f48cec1f12ef1bd99191a3e769e8478a05eb GIT binary patch literal 1891 zcmb7FYfsxS6#d>`;T37xP@tk9giMD7SqIagOtez>EplVGwdy#QA1&qA@7j4ZW$l2f z%H!JSzRvN@0iU8z5B=NkgF!Ot-#%R@e@~^P2eAP#I6T zO=c|1p!QHnveHB;rjJv{rD(TW&U;7qWSu)Z)G7K_)r_$SMvO;Hnp_X!=ezN9@9gSw z3-f~usAxYdetsJM`WN=y*!`G6d^fJ5jOLmVdL#05F`N^{nkZiw#2|SmkaI5Q%LFTz|-d!^ZFPYMeF=b1h zGK?F0i=aV2;JKDXgtkou>viamX|WlIt<4Qfoov`ziV@?jbbK14wSt5ejf|p=Ac;=r z7(c4mpUEN5AX4qASQ}>Nz(k1gS~wA^r4XUl>_%wAS&6SbZ{=7}H}P`Z>RWL-ikM*) zD13Pltmx-%U%TtQD&u!SQNZ&#bdw0fUT0PH;=u{al~{w0fP2eeoWaOL%bZh`u^E*T zx(vg@U$-W}#?UCR8V|h&0assYA;L~Lw-z{(Fv(D36UKePGA8{DIYcyp8C7QNHT(`- zp(qcXhKceqv_DB6>*xB%kf2u&d}*NmTVJ`KdYRXK0d*guRjwCCk+h=97rxt7qi?^f zZ$i*c<$k;+x; JfJw@yt-pRmhtL23 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..33a1b07f187ab90c46cb1fb50680b123df5e7a48 GIT binary patch literal 1885 zcmb7FYfsxS6#d>`;T37xP@tk9giMD7SqIagOtez>EplzQwdy#QW4DxFzia2wl(hq@ zDvxWQ`#Q%r2V#moJ@jwC4+hDofBSTu{5ffLI%vifH^g#=CTstaP{&9@Gqp@6g5=49 z8KF}lH_^oKpQek3A&AA!~}tTxfK*?m(TaD70wuH zAB~-IIcGIqx0alXd3o&`|WPsu^P$j2Mr&$}=;FpYO)cy|b&! zEzA!gp`!h;`1xu0>tERS-0sH=;=6GbWi&UOup61t#c)nEZ=!r*7~jG*EcH&h7ZEx? zi_mGe&T>qQeiK+=ws1ccFyN>anR*j`5+V3+I@8xeW;4ehbu(LDS~0evCX8=@+iS+| zh2q4-SvnDl_<+3@+VPvpjMxwf=soQw@LZb0t*$r*foIZrN6qkFETtJ^#+QO}j2nB4 zpg}p{xfVo(woL`!b?A_3u^A|>%?(SPY}i`L5aX?Md>W&*hJ+T4jG~PciJ^0hA9d_c z<4~j!srFQ?4byYrAjEhrnF!S)h|p_ZBeY?w#Mhp;ax6%jcp+}}tvDA&%&#V9?JUD5EmTOQDaBn#bG8lGfnF)qcK4VHjmtjo! z>(<%d7#amugQ3?T;Oa{)MA!)@)&fTgW*BO0!niMa%9WoUhlnLGoyv^8hTnlJ6y>4Q zFiSp$_Gidr{Y3v567=eUFAdax>nry&FY~%DpzcGo%FH6ygjQ7f!gsrB^zC=`H7I)Y z+258)_(nAR?{-s5)>-ZBqsBOR%PUvasA)LzN*G$0rDo4g0g?a!0 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..2e86a099f9c6fb891474bacae88c7982df2aaf32 GIT binary patch literal 1889 zcmb7FYfsxS6#d>`;T37xP@tk9giMD7SqIagOtez>EplzQwdy#QW4DxFzia2wl(hq@ zDvxWQ`#Q%r2V#moJ@jwC4+hDofBSTu{5ffLI%vifH^g#=CTstaP{&9@Gqp@6g5=49 z8KF}lH_^oKpQek3A&AA!~}tTxfK*?m(TaD70wuH zAB~-IIcGIqx0alXd3o&`|WPsu^P$j2Mr&$}=;FpYO)cy|b&! zEzA!gp`!h;`1xu0>tERS-0sH=;=6GbWi&UOup61t#c)nEZ=!r*7~jG*EcH&h7ZEx? zi_mGe&T>qQeiK+=ws1ccFyN>anR*j`5+V3+I@8xeW;4ehbu(LDS~0evCX8=@+iS+| zh2q4-SvnDl_<&Dbx?T(I_)TR-Y={OFpLP>?E=}Q9R~>`MGwHmuW(r;~r5R(!mx6MP z8+(hOK|SEP7Da@%O$Fz5=#Xi#8ECD|4NIMD*jh>vvEM zb?i^%P^1v4_EfA5({tb<#CR>62-QM}&}(iZv|+5o*Pgd>EQp(UF>dv(I2%RGusU3c zmlw&4e(v_QyWXoZei!rvJfA~1iOl)ytg2o-IBA8JYfuqzZ#fJz7Psy|*a@fB0!Iqw7;0?7xG#Cim7gGoh$S$g%8b2+-+?O> z<)PCsPdR#f@Ice`rz?RWJ> zD0=kS-EplzQwdy#Q?X;9%zq9kwl(ieI zs`7I7bNAfc*Uu-tJ6UptMT`<^hO#N5xs$}&)`2!;mJ-yDW5fi3ebxzzbZ+PS)(U3~ zwU5TR$}ML{ni!ID$4&bU80(Cn?ubZW3S@{QeLUCf^ zES(5Ne89WmO4xa~g?9X=G9xa;gARW=P2jmSgwd5?%g#>c)gTnj2T}F$}w*2 zErJH|fah8h5!yBtEw4idrp0FHYHe;9N}afg`h8mkN?n$0<<>$vCVhPNrlChWY8*qi7JaihS z$@|d$AbG5x>F+~=Ts`omf%Ue^W0eTY_>Sy)YI)heI(ZdZ-E{j9zRMU6iC z+cF7XiH85(ZfeEmPUr4XV;sEM%2hRQ8jidYh7P7_nm7dW4IVyU?)wtyOt%iWVq)6* E4J_)15dZ)H literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..68a8a13bbeaf52764aff28e76e7d90f0a8d1462a GIT binary patch literal 1881 zcmbVNZBN@U5dPj@;T37xP@sY#giMD7p)1p%Otez>EplzQwdy#QW4DxFzq9kwl(ieI zs`7I7b9c|(*Uu-tJ6UZSxQhpju8_C_T^Skq+LGWw^le~ zsC_hc%H^CLX<|sqEjR5qU~+2&wMRrkrjQIkgrd?hj@r4Q9Z#iGQ|Sn0FE+11{9P9n4^OHT`#%^ZK!%~W}5#n=Wy7~eo+uNk`+iW3uO z=|m{v19n?z$8Rb#VnZb8?U&O8o=a1>)r4bEcP5>8)C}*HQkpSld?_f$xUshg8e{{W zYc)h@+f+2W4jnQrHbXyabHh?68@84j#CR(mpT=mdA)!ShqiCZ;V(1*>M;-g4I20*F zsy!8J!}J__5MsR6ON43-MCf&2BedyOiLX6xK9au{MT=+H6~45fU=l!79|i162~bH6b( z_*Fxp+o=E5r&@=w6Hcpz7AcrtsIdv-p5!T4esUZlmcV2xGxidG1FjI1hfc#Bc^}#z zA&>Rb{C!A}s|UU`P=DoD?q^=*bzMN*hiH|VMXm|0TIGrFcGalc&+1E1)aawXEtBwl zX!zglrdF)8+Sx~qaqyN`uBu7XaO9ORv@l20#3Gn%@bLL^-#18Sx^=)66Vujjoj-)T literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..8ca682f4f14d504a6604b93dc886c0146fe2f4bf GIT binary patch literal 1885 zcmbVNZBN@U5dPj@;T37xP@sY#giMD7p)1p%Otez>EplzQwdy#QW4DxFzq9kwl(ieI zs`7I7bNAfc*Uu-tJ6UZSxQhpju8_C_T^Skq+LGWw^le~ zsC_hc%H^CLX<|sqEjR5qU~+2&wMRq(Qy>Enp{R6>qjqj+$EVV%Q|Sn0FE+1(2;&=Q>@{QeLUCf^ zES(5Ne849zUAKjH{H8J^HbjH|emPCxxip1aopcNe&!qFtniRZRN;AfcF9qcoH})1m zgLJ@it%wM1n~H|lp##%mGxW4JH!L;Tu(i}7##?kejnP^Ip+$pHw9z6lbdK?(j{R{Q ziWDN%o{F_$dJcUEF zZw!t6s=?50)c@*JtwY!eC)PrX6wENx*o1LU@{}t-Jq{5|U^+u)u`Lg>T6Kc=%c?a zlkkmb_}}fOR;;ty*+-3W@RnDus%g`3J)veYQ|UuBgP{pO|A#=%l-JJcXoBT zh560}RJ0!!zdR3r{tde|c0Xni-;b*(qq$~;-iSP14Ch3#CdwCv@g3a5Qtz}Ip_A@K zgigD4mSdvTyMX6sTeu$*7;vfOx%@Bu1j0sW)0w{FBA+?-q?*~X(u%PSHDP=M++I`q zz$GI(&a(-Z#0R_|uDF@^T4={L6*;jX9*}(6P2jyWg-2a?3^LD!^X{5Sc*&Gzj4504 zlwsW1TLcaI0nfE8BD8HPSg%8eOpDDxY;A5>>SV*#Qj8dHrQ_2WtraA+Xk-*^1W9x{ z$M{Lb{!9*e29at{#o91C2PQ&{x59}~ErkfZWj8__&Psgkc`L_)x`~(LR^N)#QN#?Z zK;i4FU`0Q7``TUaRT&?&d}^(6FCLw+T!}U42)MTl#uLe z6^ioEX_zP}U`s^ReB>X8F{&%~nCF`tm_DNwJyqU^XHEw8i K4w$5T+WG@yVu#BB literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..75acf999d7c056d31356a9ad5ba7bb686efb3163 GIT binary patch literal 1885 zcmb7FTTk0C6n^io@QO5TC{R%lLZ(B4(7`k)6Rp&}MXv3(RvpK3?3VKDckEo6vUY*L8)7*_lePaysAD9dnOY_jLGon5 zjY>qSWqz#J9w25aGaRjmQ9{j7HbvA~N%FjPpbeR&1l`3kVuHZF+zN`c%jf&n3TF(p zkH${9oUh1x?U$;uF|xjCjzDA{hcocE3%$U1X&Xejzt)r_$WMvO;X<(V18FZbh@-r3dV z7UsK_%pEF`N_4nWJOTE)>gig8_ z5jySGS&oU(?*g8mZQ*_>V8Br;GWB2h351Q%rZaslWHxjBNjJ0Qr4?fvYQp#ixV>iV zK`2g4oTU??h!5Cnp&i#$X2ga_K<{Zcf%noB9(Bbr2t1R{J8FjaVkylSGrkm*W8BzV z1P#go&$S>Tv~4Q*u0w}Ri_Ji3ZEjfVWW&}{h8S<9O1!>G zUi5Rfuif=tmGMEvXTbA0bd$)Ozs;)Z#iNr}Xt@Rj0r!@}AcJ9tmYHBE*L8)7*_lePaysAD9dnOY_jLGon5 zjY>qSWqz#J9w25aGaRjmQ9{j7HbvA~N%FjPpbeR&1l`3kVuHZF+zN`c%jf&n3TF(p zkH${9oUh1x?U$;uF|xjCjzDA{hcocE3%$U1X&Xejzt)r_$WMvO;X<(V18FZbh@-r3dV z7UsK_%pEF`N_4nWJOTE)>gig8_ z5jySGS&oU(?*g8mZQ*_>V8Br;GWB2h351Q%rZaslWHxjBNjJ0Qr4?fvYQp#ixV>iV zK`2g4oTU??h!6P8rR%lOj%zA2VnZ~b__UkAdua-fy6PB2o=NAOHB<0{Da{x&z7&*W z+}K+L4e9~UwJ0LAZ7MjgLx)U@%|L5yZdmGM!`4!Y7;mNH(-^HaB(!K`6m8^444q^A zq+@>~ha!bYwWnfjn4SXxds&h_m;yjgMo*ZnP4d8Go}=D z8AgS_Z=L^*p;2Hp9C{4`uD;Ylgq?6|EpVh@j-kdTjQf(OT=@xdh*$y>s?6AH_#L=H zQ64%C^WL9ZV8+CcrYzH&eFGOzmr>OMrP%q((EXhoGTe7CDc-+onp zgrY~E{bQMg|3t(8Za1}Loz>1hX^exnymD2In}#E=grS9bnkE*(c!QU(m;3%jI@7HK Ju9%p%{s0AUhYbJ# literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..b8c1e2789777ffabaeb72505fb9e4e42a3a68ea1 GIT binary patch literal 1890 zcmbVNYj4^x6#dSx@TfFxRa(@Cs>x-Q+LY00Eu9Jy`xY77fQ92&en9E3-*sMu4z_j( z5_0WxU+4JdKupnxhhG17Fi1wd{?m2x>!j7`pcz-(5XTvstiwY>JtGOt)H0a}Vv_|o zDiNuc_E@h2K+IHTI9d^-gqopjil}#zSlc?#hRjle`f-eyAh6FmL6P=$zHhB?#!&lc z?3K5i9cf}n${jcDH(;zYg1RFjAyY^OAVN{;8AokvXeX4?sY~ey<-Tkllkk{VCa7@9 zjHfBoo>!8r4AGjKW9o&H?N-YN@93VaGw+6mqOVoW7|Yy<@rWy%nL+$~H-7G(UtMlt zz6l8x?T5wBPs1O7!fvhGj~T>w<0{H%Za85#GNX&(oM_%eg~BlI!!=*(o}EUh_|-{{ ziP8Uq`>uchPo2orKj9}4g72m=JuPH5^ZZdaljWrqW0#kN@eNRW&DgzAoR~OECqj`B z@NT#gcHV8F9iOSph|A*v$7iPr{4Y)Ut*$!;m1oiicg-Y%VoEc{j4uV{7&rD7L38`17BSvRC!{f2Ye;C($SB(Akr+D1_)*8a~h?%bfg)c9X7yaDr zYj?d@WqgqGIq-Y|-6YcT*I8A)c<|B*E!UtU;NEeVW-#&4G7}7?e8!Z5F2k${*R27t zF*FLSrbD+uz}2T(h_Dlmtpkn}3^LT%gmF*ul&dg84iQUWM3or_4Zi_bC@Mgw`9OIe zI-I0{4P*U%NYHBlzBEvOt*_kAg3Rl_fVvOSDl-eK39YCKh3|IN=-bcgl~DBPW4JAo z{6*35zuQeMS?{!Wj~e6Lo2^_`1E=9AC}HSepr(mKFyP?N=gWOBBc18i0ar{+TfYHr C=7*2~ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..393b5a6cb0b4fd2e8043b3e0db6b54c218a37ada GIT binary patch literal 1884 zcmbVNYj4^x6#dSx@TfFxRa(@Cs>x-Q+LY00Eu9Jy`xY77fQ92&jt!;1e%H=}(81OY zK|-#5?&}=i9Ed6U@X+hu4hG4n*MGWBex0;B9W>*L8)7*_leK?HsAD9dnOY_jLGon5 zjY>qSWqz#J9w25aGaRjmQ9{j7HbvA~N%FjPpbeR&1oh(>F+pHoZUsf!<@0@Og)@fQ zM`Nd4&e@SBhNRqb(|!Xcw?h1x?U$;uF|xjCjzDA{hcocE6I$vSg(Xejzx)r_$WMvO;X<(V18&v)bJ?)lZ_ z7Ur9fP|_%pEF`N_4n+H1z{h2q4-SvnDl z_<-FO+VPpnjMxwf*gZQ<;D2cfx4PmO^qooP9W}#yt(0br8D9#@F>dTFf(GG$=UNXD z+BOw**P%nE#b#i%Ha9GFvSDkfLX5Z4@o9|K8WLJGGKw}zB!6_#?UCR znhV_q0au@DA;L~LuogH{Fv3t{6UIHsQ?C5*I7BRg;Z$bqHT(u#p(qcXhEei9v_C^0 z>j(P#kf2u&d}*NmT3@-Jd70OJ0d*guRc02sCbXi;7rxt7qi;W}mqF2^kN&ny!YiWT yf47@jvd(H}A2r6oTVA=UMoq(!SHjT3C`}WKV6?%X&zJjNLpsx~1Fo2uwtfTK9fdjo literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..c1ae6410e2759c092448213b16dac1026b5e29e1 GIT binary patch literal 1888 zcmbVNZExBz5dO}u@TfFxRa(>+Rg=pqwJD?1S~?XZ_AN5D0Sm{m92-i1{m#w{p@Xd* zf`pv?+&y>q_;Mho=)*&=e>)f?qh9~%I{9_d>U7YID{hG83{BSlA)$_ugl1})Oa#f3 z1ve@Ysh0V%UVDI;smyS+B1Q={L)jEjXC=w=)`2!;mJ-yDW5fi3eYq7BX_wFUtrgA~ zY9EcAaye&5ni!ID%T4#_({vX_T1r2c2icI|zei9-0ZVJ=WLS{3^A9a%~FRd8ckP^l>(AsOp?uFvS#92BK ziuiy}T)J)x?f6V(Mr?=%9X>lv;D2cfw>s+>G@eQ4oi!dTFf(G$` z=UNjH+BOv}uR{l>#b)SgZEjdV(1*>M;-eEITR^G zsy!8J!}J3B5MsPmO@wM8MCf&IBedyQiLX6xaV#1)@mk!fTX8gsm|=Ce5-%^37xmoj zYj?g^Wqc6wIrRAgs!3$dU$d%e@!+HtTCPDw(0j{amchJ3%S6` z#?VNxnho7X1Fk;RLWG@gXf5Rgj4~EzKx7+z{2CPyg!j7`pcz-(5XTvstiww}JtGOt)H0a}Vv_|o zDiNuc_E@h2K+IHTI9d^-gqopjil}#zSlc?#hRjle`f-eyAh6FmL6P=$zHhB?#!&lc z?3K5i9cf}n${jcDH(;zYg1RFjAyY^OAVN{;8AokvXeX4?sY~ey<-TlQlkl2XCa7@9 zjHfBoo>!8r4AGjKW9o&H?N-YN@93VaGw+6mqOVoW7|Yy<@rWy%nL+$~H-7G(UtMlt zz6l8x?T5wBPs1O7!fvhGj~T>w<0{H%Za85#GNX&(oM_%eg~BlI!!=*(o}EVM;yglU zr*)EJV)XyuzAGTWQztU@PxwiM;J;~1PYapNJb%>9WO-@D*ySZ*d;`>8Gj=Z&CnnC) ziBKd2yc@2Bop)Pk$8Rb#;_`UF@!4qt&!s89)pf_9@=W^Ru9-woOliiL@ui>~Sb)L>}X{ZX#4mAwsX&jnIa(5?_1X%CVqs64bcWx8iUVG4oZR@a0AF zqMy5c?XLH#j1N*i2c9pWn?zdvI;*M|4_-Q<#u^_BZska^t~Q1>BPWoBVDp%qo3@ZGK&efwE`5{e#u z47X*He<&LMce|-2>z(%QQDdBYvz4oA;4~ZsB@7)5)HHDj1{^$mzTEdQ(wS}@aK*&5 F^&6uchnfHY literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..33332c776f52be397055bfa28a50fff5bcbb093a GIT binary patch literal 1884 zcmbVNYj4^x6#dSx@TfFxRa(@Cs>x-Q+LY00Eu9Jy`xY77fQ92&jt!;1e%H=}(81OY zp**gA?&}=i9Ed6U@X+hu4hG4n*MGWBex0;B9W>*L8)7*_leK?IsAD9dnOY_jLGon5 zjY>qSWqz#J9w25aGaRjmQ9{j7HbvA~N%FjPpbeR&1oh(>F+pHoZUsf!<@0@Og)@fQ zM`Nd4&e@SBhNRqb(|!Xcw?h1x?U$;uF|xjCjzDA{hcocE6I$vSg(Xejzx)r_$WMvO;X<(V18&v)bJ?)lZ_ z7Ur9fP|_%pEF`N_4nQ~!jYLZ3F8}}_L{MKp*S&d zmQI8sK47$S} zT~;aVS!V zRC_AchUo=x5MsPmOoVC?MCdiI5!$d-;%m=aIToZ%ybibeRve2WW>_7r#LJ81ML&1@ z+FkEe86QM^4m@8#H;K&o>#V9?JUD5EmTOQDaBn%xF_?8|nF)qcK4VHjmtjiy>(r!nh}S%9S4;hlnLGoXU*7hTniI6y>4QFiPHs z_Gidr{Xl;o67=eUFAdaR>nry&FY~%DpzcGo%FH6ygjQ7f!gsrB^zCQ$F(`WU(chLy z_(U}P?{-s5)>-ZBqsBOR%PUvasA)LzN*G!grDo*T3 Bg+u@V literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..37369d3e85fd0620e08286d455a84fd97ac53878 GIT binary patch literal 1888 zcmbVNYj4^x6#dSx@TfFxRa(@Cs>x-Q+LY00Eu9Jy`xY77fQ92&jt!;1e%H=}(81OY zp**gA?&}=i9Ed6U@X+hu4hG4n*MGWBex0;B9W>*L8)7*_leK?IsAD9dnOY_jLGon5 zjY>qSWqz#J9w25aGaRjmQ9{j7HbvA~N%FjPpbeR&1oh(>F+pHoZUsf!<@0@Og)@fQ zM`Nd4&e@SBhNRqb(|!Xcw?h1x?U$;uF|xjCjzDA{hcocE6I$vSg(Xejzx)r_$WMvO;X<(V18&v)bJ?)lZ_ z7Ur9fP|_%pEF`N_4nQ~!jYLZ3F8}}_L{MKp*S&d zmQI8sKHw9VuG>O8ep8td8=?WjXQv4~m!@#5tByhAnRMP+GX<}g(u^_VOF=otjlD(C zARh2sYa&A1rh@W1bjY;W46N4XhNVt6Y%P_D@m4xMjnP^|LW@R5(MFBL&^gABI`#*0 zC{l=2dn(q3=>_l*V!T#OglZu~=ry+y+Avn)YtLIb7Q{`w7PtCV9E~DoSRJm!%Zubi zKX?1uUGG&HAB21kJYPUJiOl)ytg2o-IBA8JYfuqzZ#m2|n0IKI35HTWV@g4nVN&?( z*74sM8UQgO5*a?T$0!IqQ7;0?7xF>nal^-C7h$S$f%8b2+-+(I=<)PCs zPTq(1=g4FIP=6m1^y+~x4b)%jEB7-m^SUpf?nAW7%p%u>R#f@Ice`rz?Pv8tD0=kK z-xOc2i5%S?%nj#yEJ(D_7OHX*lvq7+M&oX<`wKH+cAbx$je?Gu=Aiiiv6K EH${tv+yDRo literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..5ffd599ed326bee539aacadfd5cc040f63ade96f GIT binary patch literal 1891 zcmb7FTTk0C6n^io@QO5TC{R%lLZ(B4(7`k)6Rp&}MQ-f2RvpLkrKSA(9Xpq%tlc0F zxg7gDx9{`G0iU8zkNvyb!5|s+@1C!d-zTk32hEsdnpj5BWbHqa)KQY8xm+d_PE4|3 zS|&VG!W^r$2k@E9HA5?+C6_alPf_ZuAjY%~lqR!`pu0FmloQxzEKiYgX1;H&a7I!4 zXzZjjj2J)veYQ|UuBgP{pO|A#=%l-JJcXoBT zh560}RJ0!!zdR3r{tde|c0Xni-;b*(qq$~;-iSP14Ch3#CdwCv@g3a5Qt$jMLMPpe z2%UE8EXPEvcL7hkTeu$*7;vfOx%@Bu1j0sW)0w{FBA+?-q?*~X(u%PSHDP=M++I`q zz$GI(&a(-Z#0R_|uDF@^T4={L6*;jX9*}(6P2jyWg-2a?3^LD!^X{5Sc*&Gzj4504 zlwsW1TLcaI0nfE8BD8HPSg%8eOpDDxY;A5>>SV*#Qj8dHrQ_2WtraA+Xk-*^1W9x{ z$M{Lb{!9*e29at{#o91C2PQ&{x59}~ErkfZWj8__&Psgkc`L_)x`~(LR^N)#QN#?Z zK;i4FU`0Q7``TUaRT&@Tcm_P5LpO;q>}^(6FCLw+T!}U42)MTl#uLe z6^ioEX_zP}U`s^ReB>X8F{&%~nCF`tm_DNwJyqU^XHEw8i K4w$5T+WG^2Hiysv literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..2fea2ac0f2524087e331043efe55da14da82ef1f GIT binary patch literal 1885 zcmb7FTTk0C6n^io@QO5TC{R%lLZ(B4(7`k)6Rp&}MXv3(RvpK3?3VKDckEo6vUY*L8)7*_lePaysAD9dnOY_jLGon5 zjY>qSWqz#J9w25aGaRjmQ9{j7HbvA~N%FjPpbeR&1l`3kVuHZF+zN`c%jf&n3TF(p zkH${9oUh1x?U$;uF|xjCjzDA{hcocE3%$U1X&Xejzt)r_$WMvO;X<(V18FZbh@-r3dV z7UsK_%pEF`N_4nWJOTF{62%U5< zB6QlVvm6tn-vvDFZsC3?V8Br;GWB2h351Q%rZaslWHxjBNjJ0Qr4?fvYQp#ixV>iV zK`2g4oTU??h!5Cnp&i#$X2ga_K<{Zcf%noB9(Bbr2t1R{J8FjaVkylSGrkm*W8BzV z1P#go&$S>Tv~4Q*u0w}Ri_Ji3ZEjfVWW&}{h8S<9O1!>G zUi5Rfuif=tmGMD{XTbA0bd$)Ozs;)Z#iNr}Xt@Rj0r!@}AcJ9tmYHBE*L8)7*_lePaysAD9dnOY_jLGon5 zjY>qSWqz#J9w25aGaRjmQ9{j7HbvA~N%FjPpbeR&1l`3kVuHZF+zN`c%jf&n3TF(p zkH${9oUh1x?U$;uF|xjCjzDA{hcocE3%$U1X&Xejzt)r_$WMvO;X<(V18FZbh@-r3dV z7UsK_%pEF`N_4nWJOTF{62%U5< zB6QlVvm6tn-vvDFZsC3?V8Br;GWB2h351Q%rZaslWHxjBNjJ0Qr4?fvYQp#ixV>iV zK`2g4oTU??h!6P8rR%lOj%zA2VnZ~b__UkAdua-fy6PB2o=NAOHB<0{Da{x&z7&*W z+}K+L4e9~UwJ0LAZ7MjgLx)U@%|L5yZdmGM!`4!Y7;mNH(-^HaB(!K`6m8^444q^A zq+@>~ha!bYwWnfjn4SXxds&h_m;yjgMo*ZnP4d8Go}=D z8AgS_Z=L^*p;2Hp9C{4`uD;Ylgq?6|EpVh@j-kdTjQf(OT=@xdh*$y>s?6AH_#L=H zQ64%C^WL9ZV8+CcrYzH&eFGOzmr>OMrP%q((EXhoGTe7CDc-+onp zgrY~E{bQMg|3t(8Za1}Loz>1hX^exnymD2In}#E=grS9bnkE*(c!QU(m;3%jI@7HK Ju9%p%{s163hY|n) literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..3cfe915fd6b12860cd3002d8de5258d8e01d87dd GIT binary patch literal 1890 zcmbVNYj4^x6#dSx@TfFxRa(@Cs>x-Q+LY00Eu9Jy`xY77fQ92&wgaWVe%H=}(81OY zK|-#5?&}=i9Ed6U@X+hu4hG4n*MGWBex0;B9W>*L8{#-aleK?HXwFDNGqp@6g4kri zjY>qSr9IYb4-hkz8ID%OD4}L3nCua(1MNAt`s$=ezN9_x$Q| z3-e7#sAxYdetsJM_!D+(-G0mse5rAp|ex} ztCJiPqyGo@T>$|(bs|&$gr7tRzMIDMw2;{>=a0IXEHAAXyHFCwH$d$*WA{RFV&W{F z2t|CryWvXMdAEgje5NuZF2nUkb`GZtN|B z2Kj*JS``u6HWj4Tp+ly{X5h6pH!O9sVQZ;HjJMM9X^hqy5?VAeiZ*&AhR!j5)UiL3 zLy$JMh8mkN?n$0<GGnjdH{c3IdFV6@ zl=q?iN%B}f*58K&y?WqF1NGPX%KgmCyzUFA`w*=%v#^@biYj0DZdZ-I{j6RIMUOuE z+cF6+iiZE)ZfeQqPUr4XV;sEM%2hRR8jidYh7Ja5nm7al4*q<;-1jolnQk3$#l*Dr E8>2*rng9R* literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..6b711dc10e92661add3e0da67329bc5b73a544bb GIT binary patch literal 1884 zcmbVNYj4^x6#dSx@TfFxRa(@Cs>x-Q+LY00Eu9Jy`xY77fQ92&jt!;1e%H=}(81OY zK|-#5?&}=i9Ed6U@X+hu4hG4n*MGWBex0;B9W>*L8)7*_leK?HsAD9dnOY_jLGon5 zjY>qSWqz#J9w25aGaRjmQ9{j7HbvA~N%FjPpbeR&1oh(>F+pHoZUsf!<@0@Og)@fQ zM`Nd4&e@SBhNRqb(|!Xcw?h1x?U$;uF|xjCjzDA{hcocE6I$vSg(Xejzx)r_$WMvO;X<(V18&v)bJ?)lZ_ z7Ur9fP|_%pEF`N_4nO~{e4K#s|UU`P=Bqj+|Rts>%M@x578DB>POiWw90S``v AL;wH) literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..c63406461b48c156fdcde77b06f66692ddcf2f2a GIT binary patch literal 1888 zcmbVNZExBz5dO}u@TfFxRa(>+Rg=pqwJD?1S~?XZ_AN5D0Sm{m92-i1{m#w{p@Xd* zf`pv?+&y>q_;Mho=)*&=e>)f?qh9~%I{9_d>U7YID{hG83{BSlA)$_ugl1})Oa#f3 z1ve@Ysh0V%UVDI;smyS+B1Q={L)jEjXC=w=)`2!;mJ-yDW5fi3eYq7BX_wFUtrgA~ zY9EcAaye&5ni!ID%T41iRend6VTNtTyZjBQ8>;~Qx0HDmWeabn^u zod`vIz$Y$Uw}p0mrZOWoM1u~WohIS0euKDUaKZTwGbloy0;P9bgabJp0_v_jhlEaZq=oI$VjD7s-oy z?)J4i->Wh{i1-}(d;!%YGUu;ZRke6<(h4otpd#qKSONp8Wb8Hk23(;i51oc_ z@;Qd2ai|j?=P)x7tOfhhB(g9WE~z->KRFCu9oRU5SuQz zQK`tZw8wfK0Ai+c!_kTuCDaV%Q$)R!#M;(@He{9&G)NM}1c80l35vA0^L=ZDGltqn zPhNS;*^wrOWZZGnegnojBd9wf5;BEk03sBXo^jN+hIT_K-MW;nQ10vIF%6HgGC_q) zXFSWG_E<@}GDK@`j;R+)wp%S9yrVm^&b=EMioR7fV=SW)<1tq@H^b!l_UXBIc6qUd z`7R_>v>%o{KaPI>4ZE>!KW3QRK2=f1bHfR{mN{LF=0x)*DintC0IqSVcYfAE;dgpc zC;3E-eiQEWWDEC00Rf&mk*oj0zjP3MH;w6QA@iB%54xEwFRd86SQ5t9K0?RlT_P(g`ispd;YkahPT>@z63C3}t-Al!7k9 ztO(bw0kAPN3aq9>uR*}&r&@@x6OOF|juZ?s)Yyb^U-FErFhULyOJPKn83zr&0aqw0 zK&Nq_ybm2tQox3>{yrq=H2_~4sDIX1?q@;fbzeZ;hiH|Xh1G;sRE5HKyK409SM^FL zdh{{emT7!ZH2m*&Q%lx6?cIaMIC`^{t7_mh90ete91PSfbqEF={P}#b?`5QO-8$fk HiD~N(z3Ydc literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..8b2c0d5fdaa053272df6d35e97c63d824a30b33e GIT binary patch literal 1884 zcmb7FYj4^x6#dSx@TfFx6)oyR)#S2DZOUl1mQD>4`xY77fQ92&jt!;1e%H=}(81Os zNXWI%eVyZ*12ILP9{Ypa;V>EX2hZ2Z?~_)yi)LJLLo8=#vi1)Nb&Mo5Q_Ex`NS-XX zQHe;k%#Zcj1H?>ahNBfRN~jsirieN#NuIY3v>~&Upg|NNCJ5}yt)NJ|e7dKc#%Kj|R&ZW`0qLS{3^pL8=>URp7>p(KoNfZA)u9)#k= zL|Hl!iui!N7TWQd%8b|$3D`Y5P2hiN3b(rA2=tvv=N&b}d##jaj2T}F$}w*2ErJH& zfah8d9kgvK=&nPDOpDFHXl-s->SV*#QiT|ArQ_2Wtu-XHXk^scD3KUC$M{J{{xHTO zg-ErhVl7TDfP)a@tztT;7C{HS<+X!0Y?b)h^Hz=pX%nx*t-ck z`Wr)|z-lh^8U$Q@sf7qT;lNtpNWln0jZGN$B~Q8X!{ZRK1cp~k3Rd` zG6}DUhX37eYRNjQoqf_62XA@hsv0#7M_viBg;AO&7QtwPKVL8Ry@qtATL)Y*F>U<; DDc6Nb literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..eeb3c5c2510bae23df61ae56f3854146b548027f GIT binary patch literal 1888 zcmb7FZExBz5dO}u@TfFx6)ozEs>x-Q+LY01Eu9)9_AN5D0Sm{m92-i1{m#w{p@XeO zkdU*VyXWp6Uk=0+eR}K1mB-AmI&`d3pi6D8h z;6^1P)iOWUYYz}Jl^Kp!#3-R=D4QbctR#8fI?#s9Qi29igqR?(FSmjs?eh7)wZa)g z?W3_%F6Zn>6GKvNxoN)xlUpOGJt7jA0vUh^MWtgLwR1zeK9z2rN>?cRZS$D;$B>zz z!X`7GrjUEcBv~1vH8;o92_@UDmh;}x16gOz#)hKrRn8d8V8nRDRi2q)^l~?T>78F) zZee~12^sB&MK90s&%a@}x!sQ$Mt9>X%4lvlVK*|Pi+D~nZ=!r+7!TkYmU1|2>-P2hiN3b#7z2sEBa=bbevc)gTnj2T}F$}w*2 zErJH|fah8h9kgvKT3&|^OpDFX)!N*!)MUffQi&LE(eX4!YYl`J4Mv@f8i}EEjGuJm z4`eJ-h*Wzj*5dR6`VeBgRZR!gLg=8kz3re)$4Y$dd5dGwxQW-|R^5uDQN#?Z@k+eD zN?z1+x3As#UX}4djOWnj3#cZMIe*Kls>P#|R%p2f6+!PUhgk;m4lOgmP|9aaDX20` z3V+=?{u@Ih!D=@28V$JmQVS7w!lAX$BL!m&H8x?~mptXl50FE|5*Sb=W3SiJ@>oCAKZXRodf;mV_0Rmu{mjd}t_!I95Unz^$Tgu=t9;_ST{Y_Vt9l_6 zHTvvt%Ot!e8vb{?sU_>IcJ@hQ9K7X~t7_ad9C;*L8)7*_lePaysAD9dnOY_jLGon5 zjY>qSWqz#J9w25aGaRjmQ9{j7HbvA~N%FjPpbeR&1l`3kVuHZF+zN`c%jf&n3TF(p zkH${9oUh1x?U$;uF|xjCjzDA{hcocE3%$U1X&Xejzt)r_$WMvO;X<(V18FZbh@-r3dV z7UsK_%pEF`N_4nWJOTE)>gig8_ z5jySGS&oU(?*g8mZQ*_>V8Br;GWB2h351Q%rZaslWHxjBNjJ0Qr4?fvYQp#ixV>iV zK`2g4oTU??h!1!_T#0<%YoQ(2RA$76ctG-LH-Y!k6drZmF~~fV&bwqz8r`t`t;bpyB!RYQUC7wI{AIl>U7YID{hG83{BSlBcYCwgl1})Oa#f3 z1ve@Ysh0V%UVDI;smyS+B1Q={L)jEjXC=w=)`2!;mJ)Oq$A}36`*JHN(k`FxTPvI~ z)IJ(J<#NuBG%+OQmYeoFFu65?+9M(%Q%D9NLQ&}$NA29uj;GS8sdR+07n{$-e+FiP z3Y*M$ngaL0Bv~1vH8;o92_@UDmh;}x16gOz4h==$D$W?oV8nRDRi2qa{Bl2j>789& zZehL)35@o`;+N;)&%a@}x!sQ$#P{PW%4lvlVK*|Pi{YGT-b8s~7~jD?EcH&i5jyE! zMCi0zXE`QDzYBPNwuSqlpaYItk*WW}PatfBHkIjXA+wp|Pr8{cFRd8czzO3U=F8;ZZY=LExEm-cd8W7fWfznDM2c9OK5` zB4|(!c&-Hzp>0#q?>cnIwAc(St<4Qfoov`z$`IqNbbK14wT6Tijf|p=6p5j8jGuJu zPvcOe5UKW5tPRt1=s}3_Rx%N)MG&F4eT~qjTP42syp?0ow22quR^5toQN#?Z!u?ge8Ss{Y z=(B$;lkkgZ_}}fOR;;ty*(Z&0@RnDus!7vuU7YID{hG83{BSlBcYCwgl1})Oa#f3 z1ve@Ysh0V%UVDI;smyS+B1Q={L)jEjXC=w=)`2!;mJ)Oq$A}36`*JHN(k`FxTPvI~ z)IJ(J<#NuBG%+OQmYeoFFu65?+9M)?DUbn(P*gg`Q9C!Z<5TI>sdR+07n{$-e}>Ei z6*igiG=_X#c4#R2R^^Pb3`UGcT;-V=#4q>bm)_ac zC(8_z8rK(55ndEo3%x{7E<2^3sa24LM_I3_Oq`_?p@=Gh#zDXz^(`f%noB9(C3+h&+?dJ8M$#f+@`yGrkm* zW8BzV1P$r|&$TEbv~4PSUWX1$i_Os1+T5_zWW&}{iWqOv@iazj4TKgAM$tx&#Lzj$ zPdfG|awt-WRC_AchUq!EaCx8iIRF~jO` zC0<`8FY3A5*Y13;%J`tlW{@NN2ir Jz!ekI)*qXUhQa^< literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..22a9bfb1ff672040957668ab385c8acc9a79cb5d GIT binary patch literal 1889 zcmbVNZExBz5dO}u@TfFxRa(>+Rg=pqwJD?1S~?XZ_AN5D0Sm{m92-i1{m#w{p@Xd* zf`pv?+&y>q_;Mho=)*&=e>)f?qh9~%I{9_d>U7YID{hG83{BSlA)$_ugl1})Oa#f3 z1ve@Ysh0V%UVDI;smyS+B1Q={L)jEjXC=w=)`2!;mJ-yDW5fi3eYq7BX_wFUtrgA~ zY9EcAaye&5ni!ID%T4#_({vX_T1r2c2icI|zei9-0ZVJ=WLS{3^A9a%~FRd8ckP^l>(AsOp?uFvS#92BK ziuizc!4ZYZ_)8IMr#d(77a$xMvuhMImVAV_D6Ck zQixQ0D%OVS1#}|Bc&(cV)l!Ji>+VKq)3Xv^d*0$$v~J?nxK+2}a1=4aDzxzBMe?Ga zyM67>_o|E!Qa*=1UqlFW$X}DHO7Y;N6Jl0S3_aQ;99{AEg{WZUGKl3WD>jL6FM61j!a!qK}DxdgnSB<*;tlkJkjXwI@ zG70aAhX37eYQ;LMoqg082XA@hs+u+Rg=pqwJD?1S~?XZ_AN5D0Sm{m92-i1{m#w{p@Xd* zf`pv?+}(5c_;Mho=)*&=e>)f?qh9~%I{9_d>U7YID{hG83{BSlA)$_ugl1})Oa#f3 z1ve@Ysh0V%UVDI;smyS+B1Q={L)jEjXC=w=)`2!;mJ-yDW5fi3eYq7BX_wFUtrgA~ zY9EcAaye&5ni!ID%T4qjqj+$5ZLlR60W0FPq22KL%!k z3Y*M$ngaL0Bv~1vH8;o92_@UDmh;}xJy~bY4h=GMhR6sGG_1(u%PSlrX-5)?PDqFBB&x&eDlc z#0TuQ(2mbkX2ga_(A~4s1pb$%aH|=|pzlmN@2DBxYo#<}%=l7Jj&Wme5i|$~JlA@N z(6*^)cO5!pT5N`n*5-z#PBv^URfzFcIzEljT0=sMMn=&_iNw%3#*aGohjA!Uh*Wzj z)`sZ?^dQ7|t(XYaB8brIzD8)%trA~*-pa9P+QjQ{t8T@yC}M`y;Yz%`NM6))x3ArN zugdr!;&bTpMT9_s{B=@QDIT1(Ld!J>2wHDBj4>E>XqgFyQa)o!L6Kod`0Lim-xwPF zs+Rg=pqwJD?1S~?XZ_AN5D0Sm{m92-i1{m#w{p@Xd* zf`pv?+&y>q_;Mho=)*&=e>)f?qh9~%I{9_d>U7YID{hG83{BSlA)$_ugl1})Oa#f3 z1ve@Ysh0V%UVDI;smyS+B1Q={L)jEjXC=w=)`2!;mJ-yDW5fi3eYq7BX_wFUtrgA~ zY9EcAaye&5ni!ID%T4#_({vX_T1r2c2icI|zei9-0ZVJ=WLS{3^A9a%~FRd8ckP^l>(AsOp?uFvS#92BK ziuiy}T)J)x?f6V(Mr?=%9X>lv;D2cfw>s+>G@eQ4oi!dTFf(G$` z=UNjH+BOv}uR{l>#b)SgZEjdV(1*>M;-eEITR^G zsy!8J!}J3B5MsPmO@wM8MCf&IBedyQiLX6xaV#1)@mk!fTX8gsm|=Ce5-%^37xmoj zYj?g^Wqc6wIrRA=LZCwanp9Pa2Pdu2at$Ja)>{sv48|Q=W`d!V&zMqBWEd3wx^?A?$=RYoSF7rWk5$!nh}S%9WoVhlnLGpGwAF!f(J8g7VO5m?rN- z`(xy>ex|<<33Bzomj>#u`IY;bS9x6*5ceTkWoD6ULaSE!#CN-D)a_^WJ}7GR(chLy zcuO?=?{-rw)>-ZBqsBOR%PUvav}rix-Q+LY00Eu9Jy`xY77fQ92&jt!;1e%H=}(81OY zp**gA?&}=i9Ed6U@X+hu4hG4n*MGWBex0;B9W>*L8)7*_leK?IsAD9dnOY_jLGon5 zjY>qSWqz#J9w25aGaRjmQ9{j7HbvA~N%FjPpbeR&1oh(>F+pHoZUsf!<@0@Og)@fQ zM`Nd4&e@SBhNRqb(|!Xcw?h1x?U$;uF|xjCjzDA{hcocE6I$vSg(Xejzx)r_$WMvO;X<(V18&v)bJ?)lZ_ z7Ur9fP|_%pEF`N_4nQ~!jYLZ3F8}}_L{MKp*S&d zmQI8sKH%MOCGvTC7;a1ljgUU1Myt`%+UNNN^W5$<)a*P{$ zi=aV1;JH>sgtkou>2>IkX|Wl2t<4Qfoov`zY7yhDbbK14wT6Tijf|p=9*LoIj30IE zkK|CK5UKW5tPRr(U?RkLt(yqdQi#xNb|bXmti;!zw{k3~n|L*D^{qG@Ma-}Y6u!Jj zUi5Rfuif=tmGME!=fLwtgg}Snry&ukyMtAnrr7%FH6ygjQ7f!gsrB^zCQ$MJRgo z(chLy_)awZ?{-rw)>-ZBqsBOR%PUvaylFV{N*G$0r)gpl%r|)We7Wyiq%++*;EIW9 F>o+*Nhfe?i literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..6f7648c3f4f377b8a18b18f6d75acb77e0fc8cd4 GIT binary patch literal 1883 zcmbVN+in^$5Pjz>Tq0F%vZQRUD`$%c}u38?}SeOnpZfQ8pI;Gc#xAc=kX{(T9g#|8_7)M!o*ib@J<^)#;!aSKJWG8Jev9OF|tZ3C+|pnFx|6 z3vN^*QZ4giz4ibxQ<>ptMT`<^hO#N5&PtN!tpjbyEG4KP$A}36`*JHN(k`FxTPvI~ z)IJ(J<#NuBG%+OQmYen)Fu65?+9M(%Q%D9NLQ&}$NA29uj;GS8sdR+0UpB9ae+|q8 z6*igiGzIQ~NwP9TYi^FI6H2yQE$6+Xd$P`)9U6+hR-7@G!HDsQt2{G<`1x-9+&#a# z+`@bl5*Y1=#m`T}AAiDbbGsiii0{T#l+oO9!fs?n7sENxyovI}Fz&-OEOpOLBXn^d zp|jIE$uTkde{kOwG{8|SGWAdRNrd3PDNIibnav!3)XijhX~ozEN*LcjYp)r*7m5=T zXX!*J;sbVDXvc3VGh#y|=+-k-#=sT0nJ8FjaS}DyKGrkm*W8BzV1P#Ig z&$S*Rv~4QdU55^t7Mr1?wYg!blMP!-6=J-Vj!$E>){xMmkx{f!A~AH1@uQCYVH}DS zBGsOXwPAVzJqR&gD<(p<2qN^luMygGtHjryw{k3+Ht{;#s#|d^ikM+_xDqcfk{9*d z?Q1vRt1>=__#FCt5g||@f1OlSiU%jH&~gm|g4SCOV+=+eT4sWwl+Tz_P-GYq{kxLrd9~0Y1rrQ4HeuY8Jmt#Ijzh!}m`!EIUczs{6@v26X_zGM zL;EA-v3{Pv4+(Phz?TNTq0F%vZQRUD`$%c}u38?}SeOnpZfQ8pI;GjnFptMT`<^hO#N5&PtN!tpjbyEG4KP$A}36`*JHN(k`FxTPvI~ z)IJ(J<#NuBG%+OQmYen)Fu65?+9M)?DUbn(P*gg`Q9C!Z<5TI>sdR+0UpB9ae+`)l zDr_?2X$rZAOp=u$T61$uolvsfYB}#6-II0Z?9fp3waOV|8H^Z@xXLp#h@bDq&)xH@ z%Pq_|At9svu=x3D_~TF5ZEp8t2JzjviZYrTPS}mi=wdi0nm18CF^v0g4NKj#(+FLh zN9gRdCOIZX{}1lFf(AHhMW+4~E?u{UcKoI?BQ`{X4xgPS@LZb0t(GH|u^GBrn;VvzY}i^V5#ud7p2ldcfzYDCDB7rz7&^!JQOEv3 z4n+!)YEQ-5Fuj02gcz??6QNoN5qjO*2yHr6;%m=a9E-+HycW0WRve8YW>_7r#LJ81 zMLl=>+MVxJ86Sjv4t>6e5U7y9CRLTwJnP4d8Go}<283u*F zZk_&(p^;xT8oG`8Uwx`|2s`1-T4<4iDTW%GFz!j7a^>g8Az}&4r;@Rk@EdT2pgeRM zrpf!z{up_zpXu*If?PfDrGffue&v4VRbJNx#C?cXnOWqT(5h8F@!hT(b^BR;4~iOn z^tWXaz7h@pyWP}^byhq3s4))S^2${;Z5oce5{4G0X_{CB(+wUzU+()7=}fl{xME`3 F`VHIJhLHdO literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..497591db5b63aadf94eb1a9ac915682b9d774fb6 GIT binary patch literal 1890 zcmb7FTTk0C6n^io@QO5TC{R%lLZ(B4(7`k)6Rp&}MXv3(RvpK3?3VKDckEo6vUY*L8)7*_lePaysAD9dnOY_jLGon5 zjY>qSWqz#J9w25aGaRjmQ9{j7HbvA~N%FjPpbeR&1l`3kVuHZF+zN`c%jf&n3TF(p zkH${9oUh1x?U$;uF|xjCjzDA{hcocE3%$U1X&Xejzt)r_$WMvO;X<(V18FZbh@-r3dV z7UsK_%pEF`N_4nWJOTF{62%U5< zB6QlVvm6tn-vvDFZsC3?V8Br;GWB2h351Q%rZaslWHxjBNjJ0Qr4?fvYQp#ixV>iV zK`2g4oTU??h!1!_T#0<%YoQ(2RA$76ctG-LH-Y!k6drZmF~~fV&bwqz8r`t`t;bpyB!RYQUC7wI{AIl>U7YID{hG83{BSlBcYCwgl1})Oa#f3 z1ve@Ysh0V%UVDI;smyS+B1Q={L)jEjXC=w=)`2!;mJ)Oq$A}36`*JHN(k`FxTPvI~ z)IJ(J<#NuBG%+OQmYeoFFu65?+9M(%Q%D9NLQ&}$NA29uj;GS8sdR+07n{$-e+FiP z3Y*M$ngaL0Bv~1vH8;o92_@UDmh;}x16gOz4h==$D$W?oV8nRDRi2qa{Bl2j>789& zZehL)35@o`;+N;)&%a@}x!sQ$#P{PW%4lvlVK*|Pi{YGT-b8s~7~jD?EcMRMB6QNd zh|p=b&T>qQei!hxyM_CqpaYItk*WW}PatfBHkIjXA+wp|Pr8{cFRd8czzO3U=F8;ZZY=LExEm-cd8W7fWfznDM2c9OK5` zB4|(!c&-Hzp>0#q?>cnIwAc(St<4Qfoov`z$`IqNbbK14wT6Tijf|p=6p5j8jGuJu zPvcOe5UKW5tPRt1=s}3_Rx%N)MG&F4eT~qjTP42syp?0ow22quR^5toQN#?Z!u?ge8Ss{Y z=(B$;lkkgZ_}}fOR;;ty*(Z&0@RnDus!7vuU7YID{hG83{BSlBcYCwgl1})Oa#f3 z1ve@Ysh0V%UVDI;smyS+B1Q={L)jEjXC=w=)`2!;mJ)Oq$A}36`*JHN(k`FxTPvI~ z)IJ(J<#NuBG%+OQmYeoFFu65?+9M)?DUbn(P*gg`Q9C!Z<5TI>sdR+07n{$-e}>Ei z6*igiG=_X#c4#R2R^^Pb3`UGcT;-V=#4q>bm)_ac ztyzwV(eDDDcDHap6m-B*D>C(8_z8rK(55ndEo3%x{7E<2^3sa24LM_I3_Oq`_?p@=Gh#zDXz^(`f%noB9(C3+h&+?dJ8M$#f+@`yGrkm* zW8BzV1P$r|&$TEbv~4PSUWX1$i_Os1+T5_zWW&}{iWqOv@iazj4TKgAM$tx&#Lzj$ zPdfG|awt-WRC_AchUq!EaCx8iIRF~jO` zC0<`8FY3A5*Y13;%J?A0GwAbqgg}M-Evc#$k4{>lW{@NN2ir Jz!ekI)*rS}hQ|N^ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..87e5aae00b259c8f4d3020ceb0d58e89b638a449 GIT binary patch literal 1889 zcmbVNZExBz5dO}u@TfFxRa(>+Rg=pqwJD?1S~?XZ_AN5D0Sm{m92-i1{m#w{p@Xd* zf`pv?+&y>q_;Mho=)*&=e>)f?qh9~%I{9_d>U7YID{hG83{BSlA)$_ugl1})Oa#f3 z1ve@Ysh0V%UVDI;smyS+B1Q={L)jEjXC=w=)`2!;mJ-yDW5fi3eYq7BX_wFUtrgA~ zY9EcAaye&5ni!ID%T41iRend6VTNtTyZjBQ8>;~Qx0HDmWeabn^u zod`vIz`Nl}s|_RGvxa-8D&g#gt}@8D9#@F>dTF zf(H43=UNpJ+BOwUuR{l>#b)ShZEjdM;-ek zITR^Gsy!8J!}J0=5n{a7O@wMGMCf&QBedyRiLX6xaV%Ol@oLVBn!;CKyWjj41_0hEd_K zTjzgcXyjK7hi;?(SD$Jf!cI7~7FwiWj-kdTjC+!&T=@xdh*$y>s$}dX{03YhC=Z>6 zdGbEAKS&1hYK()oymD2|n}#E=grS9bnkE*(e1kuqFZaERbf#MeTrn|i F{RTMWhfe?i literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..0f5d5d874d9904abdb604a79578f7438f55c2c2e GIT binary patch literal 1883 zcmbVNZExBz5dO}u@TfFxRa(>+Rg=pqwJD?1S~?XZ_AN5D0Sm{m92-i1{m#w{p@Xd* zf`pv?+}(5c_;Mho=)*&=e>)f?qh9~%I{9_d>U7YID{hG83{BSlA)$_ugl1})Oa#f3 z1ve@Ysh0V%UVDI;smyS+B1Q={L)jEjXC=w=)`2!;mJ-yDW5fi3eYq7BX_wFUtrgA~ zY9EcAaye&5ni!ID%T4qjqj+$5ZLlR60W0FPq22KL%!k z3Y*M$ngaL0Bv~1vH8;o92_@UDmh;}xJy~bY4h=bcw3 zZoXG#d=T(C^!Xw}pg{gQsj3tYPFkVm8UzHbw;aY8j5@T;1VbsGF{Pl$FeLnS>*Q|? z4Sv;F=r-zq^{Lh&?1b}bp+yQN7;0?7xF>nam7g7lh$S$a%8b2)-+(Iw<)PCsN#2L{ zN62ISJbxb&Wg& A`2YX_ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..daadee35de348c3587239fbf8aa733d05a9f2275 GIT binary patch literal 1887 zcmbVNZExBz5dO}u@TfFxRa(>+Rg=pqwJD?1S~?XZ_AN5D0Sm{m92-i1{m#w{p@Xd* zf`pv?+&y>q_;Mho=)*&=e>)f?qh9~%I{9_d>U7YID{hG83{BSlA)$_ugl1})Oa#f3 z1ve@Ysh0V%UVDI;smyS+B1Q={L)jEjXC=w=)`2!;mJ-yDW5fi3eYq7BX_wFUtrgA~ zY9EcAaye&5ni!ID%T41iRend6VTNtTyZjBQ8>;~Qx0HDmWeabn^u zod`vIz$Y$Uw}p0mrZOWoM1u~WohIS0euKDUaKZTwGbloy0;P9bgabJp0_v_jhlEaZq=oI$VjD7s-oy z?)J4i->Wh{i1-}(d=Vi~A%9J(D#e47R%p2f5kc!MhfxOO4lOgmP|9aaDJU`w3V+=? z{To9gziKpe8}+~XRO=9S!kM+uA_Y?nH8x?~lRV|h&yPdI5|~dVV=v)1;0i%`=rl}| z_o4kU@>oC9--iUbdf-a~_1FB${miSpt_z6!5Unz^$Tgu=t9;_ST{Y_Vvw9yCHTvjp z%Ot!d8vb{?sTJ$2cJ@(Y9K7X~t7_Ub9C;-SElkriu?VIc{P}#j?@gpL-8$fkiD~OM D+yRD> literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_hswish.cu new file mode 100644 index 0000000000000000000000000000000000000000..6f19370ee918a046116e252562f90a5e81b9ced7 GIT binary patch literal 1889 zcmb7FZExBz5dO}u@TfFxRa(>+Rg=pqwJD?3S~?XZ_AN5D0Sm{m92-i1{m#w{p@XeO zkdU*VyXWp6Uk=0+eR}NmZwG^9)ayTAC%;cxoer9D#SO8Xp~>1mB-AmI&`d3pi6D8h z;6^1P)iOWUYYz}Jl^Kp!#3-R=D4QbctR#8fI?#s9QiA$%jF=#>FSmjs?eh7)wZa)g z?W3_%F6Zn>6GKvNxoN)xlUpOGJt7jA0vUh^MWtgLwR1x|K9x?LN=GRBZS$D;$B>zz z!X`7GrjUEcBv~1vH8;o92_@UDmh;}x16gOz4h==$tDG^G!HDsQt2{G<_~mZ=(mlVr z+`{}25;EEki(j6HKmUf^=5{}35Z{ffD5JUIgx$!DE{1cWc@yOm!?+LEu++UckC6YJ zoz^7B#OQb7&Q7;*_X--|s1=#|FZ?7z@ZA)qr-jUBjz8%pSzcN(wjm{qZ=ki;j6Dd& ziHWmxA{6lf?}jUp&$}(O<1>{Ru^}Gx`0O-+|D`G1>bzr6c_y8A*CgQ;Q<^bmd?_f$ zxUshg8sr0>YgI&O+f+2Y4jq^lo1w3@xnZfvhOMO*G2Wu%X^hqy2rU|nqKzJjp>vF% zbnK7hP^1v4_EfA5(+lWCi1Ai85vrvSp|{#O8NJ$L)so$pl{ALMureZGhg=#al9Rh8n=Nh`EmgN&f{mcuZEfrplvU?}A?rW6zz zMuop_o&SxYkzX|&x{dl@eW`T_JK@w?Xpw?Bh8mkN?n$0<K{XbTs`o$f%<2D<$mT>Ue^W0eTY_>S>&3~s#QMm-L4vS`&GRW ziW+_Pw`CIE6Ak~n-PDS8Ry+HoF%I7H%2hRQ8jidYh8E^&npgz$4gP$+-1jchnQk3$ I#l*Dr2UCTIQ~&?~ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..ca6f680bb626f62be1ba9d4d54fff100519e2d96 GIT binary patch literal 1883 zcmb7FZExBz5dO}u@TfFxRa(>+Rg=pqwJD?3S~?XZ_AN5D0Sm{m92-i1{m#w{p@XeO zkdU*VyL;{)Uk=0+eR}NmZwG^9)ayTAC%;cxoer9D#SO8Xp~>1mB-AmI&`d3pi6D8h z;6^1P)iOWUYYz}Jl^Kp!#3-R=D4QbctR#8fI?#s9QiA$%jF=#>FSmjs?eh7)wZa)g z?W3_%F6Zn>6GKvNxoN)xlUpOGJt7h^g=7FC6qSx~)Xojp=7(&a^5?7AnVN8p`qw|#TjE6j2Mr&$}=;FU+%^)-Sex< zEzA!gfzf_g{PH~f`8Vt~xBD@J_-_%pEF`N_4nRsM!ySpcDjYTSI_`Qt;p1W;U^J-@1`(4Eo3%x{7E;H<)sy48z^CX1FgMg>_I3_ zOq`_?p@<{Bm zq!6k0RIClt3+O?J@m4Vrsznf?w|$M!rduVx_Pmv2(X@%z;a1&>V^PEmtHYIeeU-eZ z=WbuS`CgUrL5Szj=Zgq|0{Ppds!}{UX@!<+5D>K9au{PU>d-P145fU=l!79|knq>7 zlfN-E_*G+}+o=E5ms*Fg6V9uJ7AcrusIdv-p5!T4es&xpmcVQ(GxidG2d)s5hfc#J z`54+CA&>R*{9{Ovs|UU|Q2*pt?q^=*bzMN*hiH|VMXm|0TIGrFcGalcuj*Y;)abLn zEtBwuX!zglrdF)8+Sw+Rg=pqwJD?3S~?XZ_AN5D0Sm{m92-i1{m#w{p@XeO zkdU*VyXWp6Uk=0+eR}NmZwG^9)ayTAC%;cxoer9D#SO8Xp~>1mB-AmI&`d3pi6D8h z;6^1P)iOWUYYz}Jl^Kp!#3-R=D4QbctR#8fI?#s9QiA$%jF=#>FSmjs?eh7)wZa)g z?W3_%F6Zn>6GKvNxoN)xlUpOGJt7jA0vUh^MWtgLwR1x|K9x?LN=GRBZS$D;$B>zz z!X`7GrjUEcBv~1vH8;o92_@UDmh;}x16gOz4h==$tDG^G!HDsQt2{G<_~mZ=(mlVr z+`{}25;EEki(j6HKmUf^=5{}35Z{ffD5JUIgx$!DE{1cWc@yOm!?+LEu++UckC6YJ zoz^7B#OQb7&Q7;*_X--|s1=#|FZ?7z@ZA)qr-jUBjz8%pSzcN(wjm{qZ=ki;j6Dd& ziHWmxA{6lfpSg717TWQd%8b|$4LW>wn!x|k6mE6aF=#xK&O2*T@OmlD7&E>Ulw;i3 zTLcZ_0nfE2BD8HPT3&|^OpDFX)!N*!)MUffQi&LE(eX4!YYl`J4Mx#Mjl|G7#!ou- z2XZJ+u)u`LA>U~hu z=(E2qlkk>k_}}fOR;;ty*(Z&0@RnDus%g`3wqgJ Grma8rnTD1C literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_hswish.cu index de60503ae217820451e4b262ad47a4b027991215..42029b8aa35ec24d06f41307a886e3615d1a3e5c 100644 GIT binary patch delta 273 zcmbQw+rqzL10$zTVr715iA!p|}>XeyS g3{sc`q_HXEgD3;~4Wz1S@)=e|PFI-Nd;b1iBHN*EY?xrQhR3SWfTW;bQCi4N{o%-feg5U%{9#Kj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpzP5KHFdi>zW?t~sf> zsd**AMah#5StUS1yig&x%$$d;b1iBHN*EY?xrQhp|}>XeyS g3{sc`q_HXEgD3;~4Wz1S@)=e|PFI-Nd;b1iBHN*EY?xrQhR3SWfTW;bQCi4N{o%-feg5U%{9#Kj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpzP5KHFdi>zW?t~sf> zsd**AMah#5StUS1yig&x%$$d;b1iBHN*EY?xrQhtaI6Pvt>^$P%l-BTq1 delta 131 zcmZqXpU1ml1LNkWjJp`cgNu@N6f*NljE&+;bQF9NEAvZBfMT1sF}pKP)?m$G1Ik&% zmrUNkDgk4@W|g*v>d;b1iBHN*EY?xrQhR3SVH5{)bQCi4N{o%-feg5U&DG5Aj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpzPAWP=t3#?*Xt~sf> zsd**AMah#5SS3J0yig&x%$$2FA@#8MiQs2NxylC}if97#qcx=qUIkR_2$M0L3`gNu@N6f*NljE&+;bQF9NEAvZBfMT1sFuOBOR%6Xz1Ik&% zmrP#ADgk4@WRd;b1iBHN*EY?xrQh z;tDybxv6<2!9~eh3Mui$MafVjz{cd~0UaX8CZPsVp|}>XeyS g3{sc`q_HXEgD3;~4Wz1S@>y0zPFI-N@~ delta 150 zcmZqWU%R3SWfTW;bQCi4N{o%-feg5U%{9y(j9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpzPFiYm-ORQpwt~sf> zsd**AMafzUDe=Wc$xt)E=H%xAt-i=A&I?uLmYGwMS~NL;RT`vM87k_OnOF?coCKt? c>E?&313Dd~tZFhJn-T}aT9@L<%UHhv00pX0M*si- delta 153 zcmey*H;Z?}2FA@#8MiWu2NxylC}if97#qcx=qUIkR_2$M0L3taI6Pvt>^$P%k@lzxK delta 131 zcmZqXpU1ml1LNkWj5`^{gNu@N6f*NljE&+;bQF9NEAvZBfMT1sGJ7yi)?m$G1Ik&% zmrUNkDgk4@W|g*v>d;b1iBHN*EY?xrQh z;tDybxv6<2!9~eh3Mui$MafVjz{cd~0UaX8CZPsVp|}>XeyS g3{sc`q_HXEgD3;~4Wz1S@>y0zPFI-N@~ delta 150 zcmZqWU%R3SWfTW;bQCi4N{o%-feg5U%{9y(j9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpzPFiYm-ORQpwt~sf> zsd**AMafzUDe=Wc$xt)E=H%xAt-i=A&I?uLmYGwMS~NL;RT`vM87k_OnOF?coCKt? c>E?&313Dd~tZFhJn-T}aT9@L<%UHhv00pX0M*si- delta 153 zcmey*H;Z?}2FA@#8MiWu2NxylC}if97#qcx=qUIkR_2$M0L3taI6Pvt>^$P%k@lzxK delta 131 zcmZqXpU1ml1LNkWj5`^{gNu@N6f*NljE&+;bQF9NEAvZBfMT1sGJ7yi)?m$G1Ik&% zmrUNkDgk4@W|g*v>d;b1iBHN*EY?xrQhtaI6Pvu6^$P%q-%~FD delta 131 zcmZqSpU=Bt1LNkWjJp`cgNu@N6f*NljE&+;bQF9NEAvZBfMT1sF?%ph)@03K1Ik&% zmrUNsDgk4@VU@Oq>d;b1iBHN*EY?xrQhR3SVH5{)bQCi4N{o%-feg5U&DG2vj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpzP5KHFdi>zW?t~sf> zsd**AMah#5StUS1yig&x%$$`gNu@N6f*NljE&+;bQF9NEAvZBfMT1sFncggR%gv%1Ik&% zmrP#IDgk4@VwJXr>d;b1iBHN*EY?xrQh z;tDybxv6<2!9~eh3Mui$MafVjz{cd~0UaX8CZPsVp|}>XeyS g3{sc`q_HXEgD3;~4Wz1S@>y0zPFI-N@~ delta 150 zcmZqWU%R3SWfTW;bQCi4N{o%-feg5U%{9y(j9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpzPFiYm-ORQpwt~sf> zsd**AMafzUDe=Wc$xt)E=H%xAt-i=A&I?uLmYGwMS~NL;RT`vM87k_OnOF?coCKt? c>E?&313Dd~tZFhJn-T}aT9@L<%UHhv00pX0M*si- delta 153 zcmey*H;Z?}2FA@#8MiWu2NxylC}if97#qcx=qUIkR_2$M0L3taI6Pvt>^$P%k@lzxK delta 131 zcmZqXpU1ml1LNkWj5`^{gNu@N6f*NljE&+;bQF9NEAvZBfMT1sGJ7yi)?m$G1Ik&% zmrUNkDgk4@W|g*v>d;b1iBHN*EY?xrQhtaI6Pvt>^$P%k@lzxK delta 131 zcmZqXpU1ml1LNkWj5`^{gNu@N6f*NljE&+;bQF9NEAvZBfMT1sGJ7yi)?m$G1Ik&% zmrUNkDgk4@W|g*v>d;b1iBHN*EY?xrQhR3SW)ufb>6nrdoLEv?lvpzPAWP=t3#?*Xt~sf> zsd**AMah#5SS3J0yig&x%$$2FA@#885^ A-v9sr diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_relu.cu index fb982a3224a780cbf816dd001ca3509242eab844..6a87dc00d8be6f116c273c56c72b224c08fc4aba 100644 GIT binary patch delta 334 zcmbQk+sMCR10$zTVr715iA!d;b1iBHN*EY?xrQhtaI6Pvu6^$P%q-%~FD delta 131 zcmZqSpU=Bt1LNkWjJp`cgNu@N6f*NljE&+;bQF9NEAvZBfMT1sF?%ph)@03K1Ik&% zmrUNsDgk4@VU@Oq>d;b1iBHN*EY?xrQhR3SVH5{)bQCi4N{o%-feg5U&DG2vj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpzP5KHFdi>zW?t~sf> zsd**AMah#5StUS1yig&x%$$`gNu@N6f*NljE&+;bQF9NEAvZBfMT1sFncggR%gv%1Ik&% zmrP#IDgk4@VwJXr>d;b1iBHN*EY?xrQhtaI6Pvt>^$P%k@lzxK delta 131 zcmZqXpU1ml1LNkWj5`^{gNu@N6f*NljE&+;bQF9NEAvZBfMT1sGJ7yi)?m$G1Ik&% zmrUNkDgk4@W|g*v>d;b1iBHN*EY?xrQhR3SW)ufb>6nrdoLEv?lvpzPAWP=t3#?*Xt~sf> zsd**AMah#5SS3J0yig&x%$$2FA@#885^ A-v9sr diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_relu.cu index ccf7790f88458ef7f73c0545afe4749af9f542f8..2392dc7bc24bdc1bf70dcbc5fa39e6a1c867d3ca 100644 GIT binary patch delta 334 zcmbQk+sMCR10$zTVr715iA!d;b1iBHN*EY?xrQhp|}>XeyS g3{sc`q_HXEgD3;~4Wz1S@)=e|PFI-Nd;b1iBHN*EY?xrQhR3SWfTW;bQCi4N{o%-feg5U%{9#Kj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpzP5KHFdi>zW?t~sf> zsd**AMah#5StUS1yig&x%$$d;b1iBHN*EY?xrQhd;b1iBHN*EY?xrQhR3SW)ufb>6nrdoLEv?lvpzP088fN^Q>ZAt~sf> zsd**AMah%(StUS1yig&x%$$lXmqOl-#h delta 153 zcmZqRpUt~r1LNkWjN2K-gNu@N6f*NljE&+;bQF9NEAvZBfMT0BGrKcRR%Oj#1Ik&% zmrP#EDiH%=YAK||r)B1pq!#HYa4A4Ra(-TM3CtXjilofMVr=q2vp~{SNOE9fC-<;@ F0RZ~#FI4~l diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_hswish.cu index ec9f4d3654990e686c69c829ed09cf2521541a49..bad9a5ce96622a41c2cf80bb9fb7619714bc78c1 100644 GIT binary patch delta 258 zcmbQo+swaV10$zTVr715iA!taI6Pvt>^$P%l-BTq1 delta 131 zcmZqXpU1ml1LNkWjJp`cgNu@N6f*NljE&+;bQF9NEAvZBfMT1sF}pKP)?m$G1Ik&% zmrUNkDgk4@W|g*v>d;b1iBHN*EY?xrQhR3SVH5{)bQCi4N{o%-feg5U&DG5Aj9NMh zTna#tTw0QoSX^vnl~|OVVP)lC;9Htgl39>b>6nrdoLEv?lvpzPAWP=t3#?*Xt~sf> zsd**AMah#5SS3J0yig&x%$$2FA@#8MiQs2NxylC}if97#qcx=qUIkR_2$M0L3`gNu@N6f*NljE&+;bQF9NEAvZBfMT1sFuOBOR%6Xz1Ik&% zmrP#ADgk4@WRd;b1iBHN*EY?xrQhd;b1iBHN*EY?xrQhR3SW)ufb>6nrdoLEv?lvpzP088fN^Q>ZAt~sf> zsd**AMah%(StUS1yig&x%$$lXmqOl-#h delta 153 zcmZqRpUt~r1LNkWjN2K-gNu@N6f*NljE&+;bQF9NEAvZBfMT0BGrKcRR%Oj#1Ik&% zmrP#EDiH%=YAK||r)B1pq!#HYa4A4Ra(-TM3CtXjilofMVr=q2vp~{SNOE9fC-<;@ F0RZ~#FI4~l diff --git a/dnn/test/cuda/conv_bias_int8.cpp b/dnn/test/cuda/conv_bias_int8.cpp index 693da6ca..41592a08 100644 --- a/dnn/test/cuda/conv_bias_int8.cpp +++ b/dnn/test/cuda/conv_bias_int8.cpp @@ -1191,6 +1191,47 @@ TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_IMMA) { } #endif +TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW4_NCHW) { + require_compute_capability(6, 1); + using namespace conv_bias; + Checker checker(handle_cuda()); + UniformIntRNG int_rng{-3, 3}; + UniformFloatRNG float_rng{-50, 50}; + ConvBias::Param param; + param.format = ConvBias::Param::Format::NCHW4_NCHW; + param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY; + checker.set_before_exec_callback( + conv_bias::ConvBiasAlgoChecker( + "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM")); + checker.set_dtype(0, dtype::QuantizedS8(1.9980618f)) + .set_dtype(1, dtype::QuantizedS8(1.9980927f)) + .set_dtype(2, dtype::Float32()) + .set_dtype(3, dtype::Float32()) + .set_dtype(4, dtype::Float32()) + .set_rng(0, &int_rng) + .set_rng(1, &int_rng) + .set_rng(2, &float_rng) + .set_rng(3, &float_rng) + .set_param(param); + + auto opr = handle_cuda()->create_operator(); + + auto run = [&](const TensorShapeArray& shapes) { + opr->param() = param; + TensorLayout dst_layout; + opr->deduce_layout({shapes[0], dtype::Float32()}, + {shapes[1], dtype::Float32()}, {}, {}, dst_layout); + checker.execs({shapes[0], shapes[1], shapes[2], dst_layout, {}}); + }; + + run({{16, 4, 23, 40, 4}, {20, 4, 3, 3, 4}, {1, 20, 1, 1}}); + run({{16, 4, 92, 160, 4}, {24, 4, 3, 3, 4}, {1, 24, 1, 1}}); + run({{16, 4, 92, 160, 4}, {20, 4, 3, 3, 4}, {1, 20, 1, 1}}); + run({{16, 4, 92, 160, 4}, {16, 4, 3, 3, 4}, {1, 16, 1, 1}}); + run({{16, 4, 92, 160, 4}, {8, 4, 3, 3, 4}, {1, 8, 1, 1}}); + run({{16, 4, 46, 80, 4}, {4, 4, 3, 3, 4}, {1, 4, 1, 1}}); +} + #if MEGDNN_WITH_BENCHMARK TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4) { require_compute_capability(6, 1); -- GitLab