From f41a8086943942a36df40dc38f8122027cf7c5ab Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Fri, 11 Jun 2021 14:48:54 +0800 Subject: [PATCH] feat(dnn/cuda): add nhwc int4 conv support GitOrigin-RevId: 5236b235d0310feecac0fbd1dc76ff0755cf9426 --- dnn/src/cuda/conv_bias/algo.cpp | 34 +++ dnn/src/cuda/conv_bias/algo.h | 131 +++++++++ .../conv_bias/cutlass_convolution_wrapper.cu | 264 +++++++++++++++++- .../conv_bias/cutlass_convolution_wrapper.cuh | 19 ++ .../implicit_gemm_int4_int4_nhwc_imma.cpp | 122 ++++++++ .../implicit_gemm_int4_nchw64_imma_base.cpp | 15 +- .../implicit_gemm_int4_nhwc_imma_base.cpp | 159 +++++++++++ .../implicit_gemm_int8_nchw32_imma.cpp | 5 +- .../implicit_gemm_int8_nchw4_dp4a.cpp | 6 +- .../implicit_gemm_uint4_int4_nhwc_imma.cpp | 186 ++++++++++++ ...h_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu | Bin 0 -> 1881 bytes ...h_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu | Bin 0 -> 1881 bytes ...ish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu | Bin 0 -> 1876 bytes ...h_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu | Bin 0 -> 1881 bytes ...h_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu | Bin 0 -> 1881 bytes ...ish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu | Bin 0 -> 1876 bytes ...y_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu | Bin 0 -> 1877 bytes ...y_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu | Bin 0 -> 1877 bytes ...ity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu | Bin 0 -> 1872 bytes ...y_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu | Bin 0 -> 1877 bytes ...y_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu | Bin 0 -> 1877 bytes ...ity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu | Bin 0 -> 1872 bytes ...u_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu | Bin 0 -> 1877 bytes ...u_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu | Bin 0 -> 1877 bytes ...elu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu | Bin 0 -> 1872 bytes ...u_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu | Bin 0 -> 1877 bytes ...u_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu | Bin 0 -> 1877 bytes ...elu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu | Bin 0 -> 1872 bytes ...ish_s4_128x128x128_64x64x128_2_nc64hw64.cu | Bin 1909 -> 0 bytes ...8x128x128_64x64x128_2_nc64hw64_c64rsk64.cu | Bin 0 -> 1907 bytes ...h_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu | Bin 0 -> 1876 bytes ...h_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu | Bin 0 -> 1876 bytes ...ish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu | Bin 0 -> 1871 bytes ...h_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu | Bin 0 -> 1876 bytes ...h_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu | Bin 0 -> 1876 bytes ...ish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu | Bin 0 -> 1871 bytes ...ish_s4_256x128x128_64x64x128_2_nc64hw64.cu | Bin 1909 -> 0 bytes ...6x128x128_64x64x128_2_nc64hw64_c64rsk64.cu | Bin 0 -> 1907 bytes ...ity_s4_128x128x128_64x64x128_2_nc64hw64.cu | Bin 1905 -> 0 bytes ...8x128x128_64x64x128_2_nc64hw64_c64rsk64.cu | Bin 0 -> 1903 bytes ...y_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu | Bin 0 -> 1872 bytes ...y_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu | Bin 0 -> 1872 bytes ...ity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu | Bin 0 -> 1867 bytes ...y_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu | Bin 0 -> 1872 bytes ...y_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu | Bin 0 -> 1872 bytes ...ity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu | Bin 0 -> 1867 bytes ...ity_s4_256x128x128_64x64x128_2_nc64hw64.cu | Bin 1905 -> 0 bytes ...6x128x128_64x64x128_2_nc64hw64_c64rsk64.cu | Bin 0 -> 1903 bytes ...elu_s4_128x128x128_64x64x128_2_nc64hw64.cu | Bin 1905 -> 0 bytes ...8x128x128_64x64x128_2_nc64hw64_c64rsk64.cu | Bin 0 -> 1903 bytes ...u_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu | Bin 0 -> 1872 bytes ...u_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu | Bin 0 -> 1872 bytes ...elu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu | Bin 0 -> 1867 bytes ...u_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu | Bin 0 -> 1872 bytes ...u_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu | Bin 0 -> 1872 bytes ...elu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu | Bin 0 -> 1867 bytes ...elu_s4_256x128x128_64x64x128_2_nc64hw64.cu | Bin 1905 -> 0 bytes ...6x128x128_64x64x128_2_nc64hw64_c64rsk64.cu | Bin 0 -> 1903 bytes ...4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu | Bin 0 -> 1883 bytes ...4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu | Bin 0 -> 1883 bytes ..._u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu | Bin 0 -> 1878 bytes ...4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu | Bin 0 -> 1883 bytes ...4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu | Bin 0 -> 1883 bytes ..._u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu | Bin 0 -> 1878 bytes ...4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu | Bin 0 -> 1883 bytes ...4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu | Bin 0 -> 1883 bytes ..._u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu | Bin 0 -> 1878 bytes ...4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu | Bin 0 -> 1883 bytes ...4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu | Bin 0 -> 1883 bytes ..._u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu | Bin 0 -> 1878 bytes ..._u4_s4_128x128x128_64x64x128_2_nc64hw64.cu | Bin 1911 -> 0 bytes ...8x128x128_64x64x128_2_nc64hw64_c64rsk64.cu | Bin 0 -> 1909 bytes ...4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu | Bin 0 -> 1878 bytes ...4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu | Bin 0 -> 1878 bytes ..._u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu | Bin 0 -> 1873 bytes ...4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu | Bin 0 -> 1878 bytes ...4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu | Bin 0 -> 1878 bytes ..._u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu | Bin 0 -> 1873 bytes ..._u4_s4_256x128x128_64x64x128_2_nc64hw64.cu | Bin 1911 -> 0 bytes ...6x128x128_64x64x128_2_nc64hw64_c64rsk64.cu | Bin 0 -> 1909 bytes ..._u4_s4_128x128x128_64x64x128_2_nc64hw64.cu | Bin 1911 -> 0 bytes ...8x128x128_64x64x128_2_nc64hw64_c64rsk64.cu | Bin 0 -> 1909 bytes ...4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu | Bin 0 -> 1878 bytes ...4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu | Bin 0 -> 1878 bytes ..._u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu | Bin 0 -> 1873 bytes ...4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu | Bin 0 -> 1878 bytes ...4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu | Bin 0 -> 1878 bytes ..._u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu | Bin 0 -> 1873 bytes ..._u4_s4_256x128x128_64x64x128_2_nc64hw64.cu | Bin 1911 -> 0 bytes ...6x128x128_64x64x128_2_nc64hw64_c64rsk64.cu | Bin 0 -> 1909 bytes ...8x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1769 -> 1822 bytes ...28x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1767 -> 1820 bytes ...28x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1767 -> 1820 bytes ...x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu} | Bin 1768 -> 1821 bytes ...8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1760 -> 1813 bytes ...2x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1767 -> 1820 bytes ...32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1765 -> 1818 bytes ...32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1765 -> 1818 bytes ...4x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1767 -> 1820 bytes ...64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1765 -> 1818 bytes ...64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1765 -> 1818 bytes ...8x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1765 -> 1818 bytes ...28x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1763 -> 1816 bytes ...28x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1763 -> 1816 bytes ...x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu} | Bin 1764 -> 1817 bytes ...8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1756 -> 1809 bytes ...2x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1763 -> 1816 bytes ...32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1761 -> 1814 bytes ...32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1761 -> 1814 bytes ...4x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1763 -> 1816 bytes ...64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1761 -> 1814 bytes ...64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1761 -> 1814 bytes ...8x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1765 -> 1818 bytes ...28x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1763 -> 1816 bytes ...28x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1763 -> 1816 bytes ...x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu} | Bin 1764 -> 1817 bytes ...8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1756 -> 1809 bytes ...2x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1763 -> 1816 bytes ...32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1761 -> 1814 bytes ...32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1761 -> 1814 bytes ...4x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1763 -> 1816 bytes ...64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1761 -> 1814 bytes ...64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1761 -> 1814 bytes ...8x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1764 -> 1817 bytes ...28x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1762 -> 1815 bytes ...28x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1762 -> 1815 bytes ...x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu} | Bin 1763 -> 1816 bytes ...8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1755 -> 1808 bytes ...2x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1762 -> 1815 bytes ...32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1760 -> 1813 bytes ...32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1760 -> 1813 bytes ...4x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1762 -> 1815 bytes ...64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1760 -> 1813 bytes ...64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1760 -> 1813 bytes ...8x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1760 -> 1813 bytes ...28x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1758 -> 1811 bytes ...28x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1758 -> 1811 bytes ...x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu} | Bin 1759 -> 1812 bytes ...8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1751 -> 1804 bytes ...2x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1758 -> 1811 bytes ...32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1756 -> 1809 bytes ...32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1756 -> 1809 bytes ...4x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1758 -> 1811 bytes ...64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1756 -> 1809 bytes ...64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1756 -> 1809 bytes ...8x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1760 -> 1813 bytes ...28x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1758 -> 1811 bytes ...28x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1758 -> 1811 bytes ...x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu} | Bin 1759 -> 1812 bytes ...8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1751 -> 1804 bytes ...2x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1758 -> 1811 bytes ...32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1756 -> 1809 bytes ...32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1756 -> 1809 bytes ...4x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1758 -> 1811 bytes ...64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1756 -> 1809 bytes ...64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} | Bin 1756 -> 1809 bytes ...8x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1794 -> 1847 bytes ...28x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1792 -> 1845 bytes ...28x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1792 -> 1845 bytes ...x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu} | Bin 1793 -> 1846 bytes ...8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1785 -> 1838 bytes ...2x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1792 -> 1845 bytes ...32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1790 -> 1843 bytes ...32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1790 -> 1843 bytes ...4x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1792 -> 1845 bytes ...64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1790 -> 1843 bytes ...64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1790 -> 1843 bytes ...8x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1790 -> 1843 bytes ...28x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1788 -> 1841 bytes ...28x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1788 -> 1841 bytes ...x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu} | Bin 1789 -> 1842 bytes ...8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1781 -> 1834 bytes ...2x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1788 -> 1841 bytes ...32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1786 -> 1839 bytes ...32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1786 -> 1839 bytes ...4x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1788 -> 1841 bytes ...64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1786 -> 1839 bytes ...64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1786 -> 1839 bytes ...8x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1790 -> 1843 bytes ...28x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1788 -> 1841 bytes ...28x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1788 -> 1841 bytes ...x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu} | Bin 1789 -> 1842 bytes ...8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1781 -> 1834 bytes ...2x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1788 -> 1841 bytes ...32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1786 -> 1839 bytes ...32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1786 -> 1839 bytes ...4x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1788 -> 1841 bytes ...64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1786 -> 1839 bytes ...64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1786 -> 1839 bytes ...s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1784 -> 1837 bytes ...8x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1795 -> 1848 bytes ..._s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1782 -> 1835 bytes ...2x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1793 -> 1846 bytes ..._s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1782 -> 1835 bytes ...4x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1793 -> 1846 bytes ...s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu} | Bin 1783 -> 1836 bytes ...ish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu} | Bin 1775 -> 1828 bytes ..._s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu} | Bin 1782 -> 1835 bytes ...8x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1793 -> 1846 bytes ...h_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1780 -> 1833 bytes ...2x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1791 -> 1844 bytes ...h_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu} | Bin 1780 -> 1833 bytes ...4x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1791 -> 1844 bytes ..._s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1782 -> 1835 bytes ...8x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1793 -> 1846 bytes ...h_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1780 -> 1833 bytes ...2x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1791 -> 1844 bytes ...h_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1780 -> 1833 bytes ...4x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1791 -> 1844 bytes ...s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1780 -> 1833 bytes ...8x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1791 -> 1844 bytes ..._s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1778 -> 1831 bytes ...2x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1789 -> 1842 bytes ..._s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1778 -> 1831 bytes ...4x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1789 -> 1842 bytes ...s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu} | Bin 1779 -> 1832 bytes ...ity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu} | Bin 1771 -> 1824 bytes ..._s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu} | Bin 1778 -> 1831 bytes ...8x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1789 -> 1842 bytes ...y_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1776 -> 1829 bytes ...2x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1787 -> 1840 bytes ...y_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu} | Bin 1776 -> 1829 bytes ...4x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1787 -> 1840 bytes ..._s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1778 -> 1831 bytes ...8x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1789 -> 1842 bytes ...y_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1776 -> 1829 bytes ...2x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1787 -> 1840 bytes ...y_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1776 -> 1829 bytes ...4x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1787 -> 1840 bytes ...s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1780 -> 1833 bytes ...8x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1791 -> 1844 bytes ..._s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1778 -> 1831 bytes ...2x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1789 -> 1842 bytes ..._s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1778 -> 1831 bytes ...4x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1789 -> 1842 bytes ...s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu} | Bin 1779 -> 1832 bytes ...elu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu} | Bin 1771 -> 1824 bytes ..._s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu} | Bin 1778 -> 1831 bytes ...8x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1789 -> 1842 bytes ...u_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1776 -> 1829 bytes ...2x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1787 -> 1840 bytes ...u_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu} | Bin 1776 -> 1829 bytes ...4x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1787 -> 1840 bytes ..._s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1778 -> 1831 bytes ...8x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1789 -> 1842 bytes ...u_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1776 -> 1829 bytes ...2x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1787 -> 1840 bytes ...u_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1776 -> 1829 bytes ...4x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1787 -> 1840 bytes ...s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1779 -> 1832 bytes ...8x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1790 -> 1843 bytes ..._s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1777 -> 1830 bytes ...2x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1788 -> 1841 bytes ..._s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1777 -> 1830 bytes ...4x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1788 -> 1841 bytes ...s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu} | Bin 1778 -> 1831 bytes ...ish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu} | Bin 1770 -> 1823 bytes ..._s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu} | Bin 1777 -> 1830 bytes ...8x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1788 -> 1841 bytes ...h_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1775 -> 1828 bytes ...2x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1786 -> 1839 bytes ...h_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu} | Bin 1775 -> 1828 bytes ...4x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1786 -> 1839 bytes ..._s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1777 -> 1830 bytes ...8x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1788 -> 1841 bytes ...h_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1775 -> 1828 bytes ...2x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1786 -> 1839 bytes ...h_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1775 -> 1828 bytes ...4x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1786 -> 1839 bytes ...s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1775 -> 1828 bytes ...8x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1786 -> 1839 bytes ..._s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1773 -> 1826 bytes ...2x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1784 -> 1837 bytes ..._s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1773 -> 1826 bytes ...4x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1784 -> 1837 bytes ...s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu} | Bin 1774 -> 1827 bytes ...ity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu} | Bin 1766 -> 1819 bytes ..._s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu} | Bin 1773 -> 1826 bytes ...8x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1784 -> 1837 bytes ...y_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1771 -> 1824 bytes ...2x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1782 -> 1835 bytes ...y_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu} | Bin 1771 -> 1824 bytes ...4x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1782 -> 1835 bytes ..._s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1773 -> 1826 bytes ...8x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1784 -> 1837 bytes ...y_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1771 -> 1824 bytes ...2x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1782 -> 1835 bytes ...y_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1771 -> 1824 bytes ...4x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1782 -> 1835 bytes ...s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1775 -> 1828 bytes ...8x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1786 -> 1839 bytes ..._s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1773 -> 1826 bytes ...2x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1784 -> 1837 bytes ..._s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1773 -> 1826 bytes ...4x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1784 -> 1837 bytes ...s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu} | Bin 1774 -> 1827 bytes ...elu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu} | Bin 1766 -> 1819 bytes ..._s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu} | Bin 1773 -> 1826 bytes ...8x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1784 -> 1837 bytes ...u_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1771 -> 1824 bytes ...2x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1782 -> 1835 bytes ...u_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu} | Bin 1771 -> 1824 bytes ...4x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1782 -> 1835 bytes ..._s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1773 -> 1826 bytes ...8x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1784 -> 1837 bytes ...u_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1771 -> 1824 bytes ...2x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1782 -> 1835 bytes ...u_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu} | Bin 1771 -> 1824 bytes ...4x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} | Bin 1782 -> 1835 bytes ...8x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1796 -> 1849 bytes ...28x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1794 -> 1847 bytes ...28x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1794 -> 1847 bytes ...x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu} | Bin 1795 -> 1848 bytes ...8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1787 -> 1840 bytes ...2x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1794 -> 1847 bytes ...32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1792 -> 1845 bytes ...32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1792 -> 1845 bytes ...4x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1794 -> 1847 bytes ...64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1792 -> 1845 bytes ...64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1792 -> 1845 bytes ...8x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1792 -> 1845 bytes ...28x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1790 -> 1843 bytes ...28x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1790 -> 1843 bytes ...x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu} | Bin 1791 -> 1844 bytes ...8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1783 -> 1836 bytes ...2x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1790 -> 1843 bytes ...32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1788 -> 1841 bytes ...32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1788 -> 1841 bytes ...4x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1790 -> 1843 bytes ...64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1788 -> 1841 bytes ...64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1788 -> 1841 bytes ...8x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1792 -> 1845 bytes ...28x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1790 -> 1843 bytes ...28x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1790 -> 1843 bytes ...x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu} | Bin 1791 -> 1844 bytes ...8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1783 -> 1836 bytes ...2x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1790 -> 1843 bytes ...32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1788 -> 1841 bytes ...32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1788 -> 1841 bytes ...4x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1790 -> 1843 bytes ...64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1788 -> 1841 bytes ...64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} | Bin 1788 -> 1841 bytes ...28x128x64_64x64x64_2_nc32hw32_c32rsk32.cu} | Bin 1812 -> 1867 bytes ...64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1817 -> 1872 bytes ...28x256x64_64x64x64_2_nc32hw32_c32rsk32.cu} | Bin 1812 -> 1867 bytes ...64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1817 -> 1872 bytes ...128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu} | Bin 1810 -> 1865 bytes ...64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1815 -> 1870 bytes ...56x128x64_64x64x64_2_nc32hw32_c32rsk32.cu} | Bin 1812 -> 1867 bytes ...64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1817 -> 1872 bytes ...64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1813 -> 1868 bytes ..._32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu} | Bin 1808 -> 1863 bytes ...64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu} | Bin 1810 -> 1865 bytes ...64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1815 -> 1870 bytes ..._64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu} | Bin 1808 -> 1863 bytes ...64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1813 -> 1868 bytes ...28x128x64_64x64x64_2_nc32hw32_c32rsk32.cu} | Bin 1808 -> 1863 bytes ...64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1813 -> 1868 bytes ...28x256x64_64x64x64_2_nc32hw32_c32rsk32.cu} | Bin 1808 -> 1863 bytes ...64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1813 -> 1868 bytes ...128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu} | Bin 1806 -> 1861 bytes ...64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1811 -> 1866 bytes ...56x128x64_64x64x64_2_nc32hw32_c32rsk32.cu} | Bin 1808 -> 1863 bytes ...64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1813 -> 1868 bytes ...64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1809 -> 1864 bytes ..._32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu} | Bin 1804 -> 1859 bytes ...64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu} | Bin 1806 -> 1861 bytes ...64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1811 -> 1866 bytes ..._64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu} | Bin 1804 -> 1859 bytes ...64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1809 -> 1864 bytes ...28x128x64_64x64x64_2_nc32hw32_c32rsk32.cu} | Bin 1808 -> 1863 bytes ...64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1813 -> 1868 bytes ...28x256x64_64x64x64_2_nc32hw32_c32rsk32.cu} | Bin 1808 -> 1863 bytes ...64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1813 -> 1868 bytes ...128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu} | Bin 1806 -> 1861 bytes ...64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1811 -> 1866 bytes ...56x128x64_64x64x64_2_nc32hw32_c32rsk32.cu} | Bin 1808 -> 1863 bytes ...64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1813 -> 1868 bytes ...64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1809 -> 1864 bytes ..._32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu} | Bin 1804 -> 1859 bytes ...64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu} | Bin 1806 -> 1861 bytes ...64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1811 -> 1866 bytes ..._64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu} | Bin 1804 -> 1859 bytes ...64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1809 -> 1864 bytes ...28x128x64_64x64x64_2_nc32hw32_c32rsk32.cu} | Bin 1807 -> 1862 bytes ...64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1812 -> 1867 bytes ...28x256x64_64x64x64_2_nc32hw32_c32rsk32.cu} | Bin 1807 -> 1862 bytes ...64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1812 -> 1867 bytes ...128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu} | Bin 1805 -> 1860 bytes ...64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1810 -> 1865 bytes ...56x128x64_64x64x64_2_nc32hw32_c32rsk32.cu} | Bin 1807 -> 1862 bytes ...64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1812 -> 1867 bytes ...64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1808 -> 1863 bytes ..._32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu} | Bin 1803 -> 1858 bytes ...64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu} | Bin 1805 -> 1860 bytes ...64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1810 -> 1865 bytes ..._64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu} | Bin 1803 -> 1858 bytes ...64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1808 -> 1863 bytes ...28x128x64_64x64x64_2_nc32hw32_c32rsk32.cu} | Bin 1803 -> 1858 bytes ...64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1808 -> 1863 bytes ...28x256x64_64x64x64_2_nc32hw32_c32rsk32.cu} | Bin 1803 -> 1858 bytes ...64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1808 -> 1863 bytes ...128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu} | Bin 1801 -> 1856 bytes ...64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1806 -> 1861 bytes ...56x128x64_64x64x64_2_nc32hw32_c32rsk32.cu} | Bin 1803 -> 1858 bytes ...64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1808 -> 1863 bytes ...64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1804 -> 1859 bytes ..._32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu} | Bin 1799 -> 1854 bytes ...64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu} | Bin 1801 -> 1856 bytes ...64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1806 -> 1861 bytes ..._64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu} | Bin 1799 -> 1854 bytes ...64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1804 -> 1859 bytes ...28x128x64_64x64x64_2_nc32hw32_c32rsk32.cu} | Bin 1803 -> 1858 bytes ...64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1808 -> 1863 bytes ...28x256x64_64x64x64_2_nc32hw32_c32rsk32.cu} | Bin 1803 -> 1858 bytes ...64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1808 -> 1863 bytes ...128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu} | Bin 1801 -> 1856 bytes ...64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1806 -> 1861 bytes ...56x128x64_64x64x64_2_nc32hw32_c32rsk32.cu} | Bin 1803 -> 1858 bytes ...64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1808 -> 1863 bytes ...64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1804 -> 1859 bytes ..._32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu} | Bin 1799 -> 1854 bytes ...64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu} | Bin 1801 -> 1856 bytes ...64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1806 -> 1861 bytes ..._64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu} | Bin 1799 -> 1854 bytes ...64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} | Bin 1804 -> 1859 bytes dnn/src/cuda/conv_bias/opr_impl.h | 3 + .../cutlass_deconvolution_wrapper.cu | 2 +- .../implicit_gemm_int8_nchw4_dp4a.cpp | 3 +- .../implicit_gemm_int8_nchw_dp4a.cpp | 2 +- ...s8_16x128x16_16x128x16_1_nc4hw4_k4rsc4.cu} | Bin 1716 -> 1769 bytes ..._s8_16x128x16_16x64x16_2_nc4hw4_k4rsc4.cu} | Bin 1714 -> 1767 bytes ...ity_s8_16x64x8_16x64x8_2_nc4hw4_k4rsc4.cu} | Bin 1708 -> 1761 bytes ..._s8_32x128x32_32x64x32_2_nc4hw4_k4rsc4.cu} | Bin 1715 -> 1768 bytes ..._s8_64x128x32_64x32x32_2_nc4hw4_k4rsc4.cu} | Bin 1715 -> 1768 bytes dnn/test/cuda/conv_test_utils.cpp | 5 + 436 files changed, 946 insertions(+), 10 deletions(-) create mode 100644 dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp create mode 100644 dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp create mode 100644 dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu delete mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu delete mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu delete mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu delete mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu delete mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu delete mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu delete mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu delete mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu delete mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu delete mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64.cu create mode 100644 dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_f32_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu => cutlass_simt_f32_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4.cu => cutlass_simt_s8_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu => cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4.cu => cutlass_simt_s8_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4.cu => cutlass_simt_s8_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (91%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4.cu => cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4.cu => cutlass_simt_s8_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu => cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8/kimpl/{cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu => cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu} (92%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (90%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (90%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (90%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_32x16x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (90%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (90%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (90%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (90%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (90%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_32x16x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (90%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (90%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_32x16x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_32x16x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_32x16x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_32x16x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32.cu => cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu} (91%) rename dnn/src/cuda/conv_bias/int8_imma/kimpl/{cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu => cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu} (91%) rename dnn/src/cuda/convolution/backward_data/int8/kimpl/{cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x128x16_1_nc4hw4.cu => cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x128x16_1_nc4hw4_k4rsc4.cu} (92%) rename dnn/src/cuda/convolution/backward_data/int8/kimpl/{cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x64x16_2_nc4hw4.cu => cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x64x16_2_nc4hw4_k4rsc4.cu} (92%) rename dnn/src/cuda/convolution/backward_data/int8/kimpl/{cutlass_simt_s8_idgrad_identity_s8_16x64x8_16x64x8_2_nc4hw4.cu => cutlass_simt_s8_idgrad_identity_s8_16x64x8_16x64x8_2_nc4hw4_k4rsc4.cu} (92%) rename dnn/src/cuda/convolution/backward_data/int8/kimpl/{cutlass_simt_s8_idgrad_identity_s8_32x128x32_32x64x32_2_nc4hw4.cu => cutlass_simt_s8_idgrad_identity_s8_32x128x32_32x64x32_2_nc4hw4_k4rsc4.cu} (92%) rename dnn/src/cuda/convolution/backward_data/int8/kimpl/{cutlass_simt_s8_idgrad_identity_s8_64x128x32_64x32x32_2_nc4hw4.cu => cutlass_simt_s8_idgrad_identity_s8_64x128x32_64x32x32_2_nc4hw4_k4rsc4.cu} (92%) diff --git a/dnn/src/cuda/conv_bias/algo.cpp b/dnn/src/cuda/conv_bias/algo.cpp index a212ee4c2..adf6a93c1 100644 --- a/dnn/src/cuda/conv_bias/algo.cpp +++ b/dnn/src/cuda/conv_bias/algo.cpp @@ -90,6 +90,12 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() { for (auto&& algo : uint4_int4_nchw64_imma) { all_algos.push_back(&algo); } + for (auto&& algo : int4_int4_nhwc_imma) { + all_algos.push_back(&algo); + } + for (auto&& algo : uint4_int4_nhwc_imma) { + all_algos.push_back(&algo); + } #endif #endif fill_dp4a_algos(); @@ -247,6 +253,34 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() { uint4_int4_nchw64_imma.emplace_back( AlgoParam{256, 128, 128, 64, 64, 128}); } + { + using AlgoParam = AlgoInt4Int4NHWCIMMAImplicitGemm::AlgoParam; + int4_int4_nhwc_imma.emplace_back( + AlgoParam{128, 32, 64, 64, 32, 64, 32}); + int4_int4_nhwc_imma.emplace_back( + AlgoParam{128, 32, 64, 64, 32, 64, 16}); + int4_int4_nhwc_imma.emplace_back(AlgoParam{128, 32, 64, 64, 32, 64, 8}); + int4_int4_nhwc_imma.emplace_back( + AlgoParam{128, 64, 64, 64, 64, 64, 32}); + int4_int4_nhwc_imma.emplace_back( + AlgoParam{128, 64, 64, 64, 64, 64, 16}); + int4_int4_nhwc_imma.emplace_back(AlgoParam{128, 64, 64, 64, 64, 64, 8}); + } + { + using AlgoParam = AlgoUInt4Int4NHWCIMMAImplicitGemm::AlgoParam; + uint4_int4_nhwc_imma.emplace_back( + AlgoParam{128, 32, 64, 64, 32, 64, 32}); + uint4_int4_nhwc_imma.emplace_back( + AlgoParam{128, 32, 64, 64, 32, 64, 16}); + uint4_int4_nhwc_imma.emplace_back( + AlgoParam{128, 32, 64, 64, 32, 64, 8}); + uint4_int4_nhwc_imma.emplace_back( + AlgoParam{128, 64, 64, 64, 64, 64, 32}); + uint4_int4_nhwc_imma.emplace_back( + AlgoParam{128, 64, 64, 64, 64, 64, 16}); + uint4_int4_nhwc_imma.emplace_back( + AlgoParam{128, 64, 64, 64, 64, 64, 8}); + } #endif } #endif diff --git a/dnn/src/cuda/conv_bias/algo.h b/dnn/src/cuda/conv_bias/algo.h index 869777b6f..691298c6d 100644 --- a/dnn/src/cuda/conv_bias/algo.h +++ b/dnn/src/cuda/conv_bias/algo.h @@ -63,6 +63,8 @@ public: CUDA_IMPLICIT_GEMM_IMMA_NCHW32_INT8, CUDA_IMPLICIT_GEMM_IMMA_NCHW64_INT4_INT4, CUDA_IMPLICIT_GEMM_IMMA_NCHW64_UINT4_INT4, + CUDA_IMPLICIT_GEMM_IMMA_NHWC_INT4_INT4, + CUDA_IMPLICIT_GEMM_IMMA_NHWC_UINT4_INT4, CUDA_BFLOAT16, CUDA_IMPLICIT_GEMM_SASS_NCHW4_DOTPROD_INT8, CUDA_IMPLICIT_GEMM_1X1_SASS_NCHW4_DOTPROD_INT8, @@ -879,6 +881,133 @@ public: MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_IMMA_NCHW64_UINT4_INT4) +private: + DTypeEnum src_dtype() const override { return DTypeEnum::Quantized4Asymm; } + + std::tuple prepare_filter_bias( + const ExecArgs& args) const override; + + std::tuple get_constants( + const ExecArgs& args) const override; + + void do_exec(const ExecArgs& args, void* filter_ptr, void* bias_ptr, + void* z_ptr, convolution::ConvParam kern_param, + uint32_t nonlinear_mode, float alpha, float beta, float gamma, + float delta, float theta, cudaStream_t stream) const override; + + void update_bias(const ExecArgs& args, void* updated_bias, + void* reduce_filter_ptr, void* reduce_workspace) const; +}; + +class ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase : public AlgoBase { +public: + struct AlgoParam { + int threadblock_m; + int threadblock_n; + int threadblock_k; + int warp_m; + int warp_n; + int warp_k; + int access_size; + }; + + AlgoInt4NHWCIMMAImplicitGemmBase(AlgoParam algo_param) + : m_algo_param(algo_param) {} + + AlgoAttribute attribute() const override { + return AlgoAttribute::REPRODUCIBLE; + } + const char* name() const override { return m_name.c_str(); } + std::string param() const override; + + bool is_available(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + std::string to_string(AlgoParam algo_param); + +protected: + virtual DTypeEnum src_dtype() const = 0; + + // return filter_ptr, bias_ptr + virtual std::tuple prepare_filter_bias( + const ExecArgs& args) const = 0; + + // return alpha, beta, gamma, delta, theta + virtual std::tuple get_constants( + const ExecArgs& args) const = 0; + + virtual void do_exec(const ExecArgs& args, void* filter_ptr, void* bias_ptr, + void* z_ptr, convolution::ConvParam kern_param, + uint32_t nonlinear_mode, float alpha, float beta, + float gamma, float delta, float theta, + cudaStream_t stream) const = 0; + + void reorder_filter(const ExecArgs& args, int interleaved, + void* reordered_filter) const; + + std::string m_name; + AlgoParam m_algo_param; +}; + +class ConvBiasForwardImpl::AlgoInt4Int4NHWCIMMAImplicitGemm final + : public AlgoInt4NHWCIMMAImplicitGemmBase { +public: + using Base = AlgoInt4NHWCIMMAImplicitGemmBase; + using AlgoParam = Base::AlgoParam; + + AlgoInt4Int4NHWCIMMAImplicitGemm(AlgoParam algo_param) : Base{algo_param} { + m_name = ConvBias::algo_name( + ssprintf("INT4_INT4_NHWC_IMMA_IMPLICIT_GEMM_%s", + to_string(m_algo_param).c_str()), + ConvBias::DirectParam{}); + } + + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + size_t get_preprocess_workspace_in_bytes( + const SizeArgs& args) const override; + SmallVector deduce_preprocessed_filter_layout( + const SizeArgs& args) const override; + void exec_preprocess(const ExecArgs& args) const override; + + MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_IMMA_NHWC_INT4_INT4) + +private: + DTypeEnum src_dtype() const override { return DTypeEnum::QuantizedS4; } + + std::tuple prepare_filter_bias( + const ExecArgs& args) const override; + + std::tuple get_constants( + const ExecArgs& args) const override; + + void do_exec(const ExecArgs& args, void* filter_ptr, void* bias_ptr, + void* z_ptr, convolution::ConvParam kern_param, + uint32_t nonlinear_mode, float alpha, float beta, float gamma, + float delta, float theta, cudaStream_t stream) const override; +}; + +class ConvBiasForwardImpl::AlgoUInt4Int4NHWCIMMAImplicitGemm final + : public AlgoInt4NHWCIMMAImplicitGemmBase { +public: + using Base = AlgoInt4NHWCIMMAImplicitGemmBase; + using AlgoParam = Base::AlgoParam; + + AlgoUInt4Int4NHWCIMMAImplicitGemm(AlgoParam algo_param) : Base{algo_param} { + m_name = ConvBias::algo_name( + ssprintf("UINT4_INT4_NHWC_IMMA_IMPLICIT_GEMM_%s", + to_string(m_algo_param).c_str()), + ConvBias::DirectParam{}); + } + + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + size_t get_preprocess_workspace_in_bytes( + const SizeArgs& args) const override; + SmallVector deduce_preprocessed_filter_layout( + const SizeArgs& args) const override; + void exec_preprocess(const ExecArgs& args) const override; + + MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_IMMA_NHWC_UINT4_INT4) + private: DTypeEnum src_dtype() const override { return DTypeEnum::Quantized4Asymm; } @@ -955,6 +1084,8 @@ public: std::vector int8_nchw32_imma; std::vector int4_int4_nchw64_imma; std::vector uint4_int4_nchw64_imma; + std::vector int4_int4_nhwc_imma; + std::vector uint4_int4_nhwc_imma; #endif std::vector> gconv_refhold; AlgoBFloat16 bfloat16; diff --git a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu index 67de77700..e77a599ab 100644 --- a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu +++ b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu @@ -321,7 +321,8 @@ void megdnn::cuda::cutlass_wrapper:: ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ cutlass::conv::threadblock:: \ ConvolutionFpropNCxHWxThreadblockSwizzle, \ - stage_, 4, aligned_, NeedLoadFromConstMem>; \ + stage_, 4, aligned_, NeedLoadFromConstMem, \ + cutlass::arch::OpMultiplyAdd>; \ typename Convolution::ConvolutionParameter conv_param( \ param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ @@ -582,7 +583,8 @@ void megdnn::cuda::cutlass_wrapper:: ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ cutlass::conv::threadblock:: \ ConvolutionFpropNCxHWxThreadblockSwizzle, \ - stages_, 4, aligned_, NeedLoadFromConstMem>; \ + stages_, 4, aligned_, NeedLoadFromConstMem, \ + cutlass::arch::OpMultiplyAdd>; \ typename Convolution::ConvolutionParameter conv_param( \ param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ @@ -1037,4 +1039,262 @@ INST(true); INST(false); #undef INST +/* ====== cutlass kernel wrapper for int4 x int4 nchw64 layout ====== */ + +#if MEGDNN_TEGRA_X1 +template +void megdnn::cuda::cutlass_wrapper:: + do_conv_bias_int4_int4_implicit_gemm_imma_nhwc( + const int8_t* /* d_src */, const int8_t* /* d_filter */, + const int32_t* /* d_bias */, const int8_t* /* d_z */, + int8_t* /* d_dst */, int* /* workspace */, + const convolution::ConvParam& /* param */, + uint32_t /* nonlinear_mode */, float /* alpha */, + float /* beta */, float /* gamma */, float /* scale */, + const GemmCoord& /* threadblock_shape */, + const GemmCoord& /* warp_shape */, + const int32_t /* access_size */, cudaStream_t /* stream */) {} +#else +template +void megdnn::cuda::cutlass_wrapper:: + do_conv_bias_int4_int4_implicit_gemm_imma_nhwc( + const int8_t* d_src, const int8_t* d_filter, + const int32_t* d_bias, const int8_t* d_z, int8_t* d_dst, + int* workspace, const convolution::ConvParam& param, + uint32_t nonlinear_mode, float alpha, float beta, float gamma, + float scale, const GemmCoord& threadblock_shape, + const GemmCoord& warp_shape, const int32_t access_size, + cudaStream_t stream) { +#define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \ + threadblock_k_, warp_m_, warp_n_, \ + warp_k_, access_size_) \ + if (threadblock_shape.m() == threadblock_m_ && \ + threadblock_shape.n() == threadblock_n_ && \ + threadblock_shape.k() == threadblock_k_ && \ + warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \ + warp_shape.k() == warp_k_ && access_size == access_size_) { \ + using ThreadBlockShape = \ + cutlass::gemm::GemmShape; \ + using WarpShape = cutlass::gemm::GemmShape; \ + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; \ + using Convolution = cutlass::conv::device::Convolution< \ + cutlass::int4b_t, cutlass::layout::TensorNHWC, \ + cutlass::int4b_t, cutlass::layout::TensorNCxHWx, \ + ElementOutput, cutlass::layout::TensorNHWC, int32_t, \ + cutlass::layout::TensorNHWC, int32_t, \ + cutlass::conv::ConvType::kConvolution, \ + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, \ + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ + cutlass::conv::threadblock:: \ + ConvolutionFpropNHWCThreadblockSwizzle, \ + 2, access_size_, access_size_, NeedLoadFromConstMem, \ + cutlass::arch::OpMultiplyAddSaturate, \ + cutlass::conv::ImplicitGemmMode::GEMM_TN>; \ + typename Convolution::ConvolutionParameter conv_param( \ + param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ + param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ + param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ + return cutlass_convolution_wrapper( \ + reinterpret_cast(d_src), \ + reinterpret_cast(d_filter), d_bias, \ + reinterpret_cast(d_z), \ + reinterpret_cast(d_dst), workspace, \ + conv_param, epilogue, stream); \ + } +#define DISPATCH_KERNEL \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 64, 64, 32, 64, 32); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 64, 64, 32, 64, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 64, 64, 32, 64, 8); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 64, 64, 64, 64, 32); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 64, 64, 64, 64, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 64, 64, 64, 64, 8); \ + megdnn_assert(false, \ + "unsupported threadblock shape (%dx%dx%d) and warp shape " \ + "(%dx%dx%d) and access_size (%d)", \ + threadblock_shape.m(), threadblock_shape.n(), \ + threadblock_shape.k(), warp_shape.m(), warp_shape.n(), \ + warp_shape.k(), access_size); + using ElementOutput = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementBias = int32_t; + using ElementCompute = float; + using NonlineMode = megdnn::param_enumv::ConvBias::NonlineMode; + switch (nonlinear_mode) { + case NonlineMode::IDENTITY: { + using EpilogueOp = + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + ElementOutput, 8, ElementAccumulator, ElementBias, + ElementCompute>; + typename EpilogueOp::Params epilogue{alpha, beta, gamma}; + DISPATCH_KERNEL; + } + case NonlineMode::RELU: { + using EpilogueOp = cutlass::epilogue::thread:: + BiasAddLinearCombinationReluClamp< + ElementOutput, 8, ElementAccumulator, ElementBias, + ElementCompute>; + typename EpilogueOp::Params epilogue{alpha, beta, gamma, 0}; + DISPATCH_KERNEL; + } + case NonlineMode::H_SWISH: { + using EpilogueOp = cutlass::epilogue::thread:: + BiasAddLinearCombinationHSwishClamp< + ElementOutput, 8, ElementAccumulator, ElementBias, + ElementCompute>; + typename EpilogueOp::Params epilogue{alpha, beta, gamma, scale}; + DISPATCH_KERNEL; + } + default: + megdnn_assert(false, + "unsupported nonlinear mode for conv bias operator"); + } +#undef DISPATCH_KERNEL_WITH_TILE_SHAPE +#undef DISPATCH_KERNEL +} +#endif + +#define INST(need_load_from_const_mem) \ + template void megdnn::cuda::cutlass_wrapper:: \ + do_conv_bias_int4_int4_implicit_gemm_imma_nhwc< \ + need_load_from_const_mem>( \ + const int8_t* d_src, const int8_t* d_filter, \ + const int32_t* d_bias, const int8_t* d_z, int8_t* d_dst, \ + int* workspace, const convolution::ConvParam& param, \ + uint32_t nonlinear_mode, float alpha, float beta, \ + float gamma, float scale, \ + const GemmCoord& threadblock_shape, \ + const GemmCoord& warp_shape, const int32_t access_size, \ + cudaStream_t stream); +INST(true); +INST(false); +#undef INST + +#if MEGDNN_TEGRA_X1 +template +void megdnn::cuda::cutlass_wrapper:: + do_conv_bias_uint4_int4_implicit_gemm_imma_nhwc( + const uint8_t* /* d_src */, const int8_t* /* d_filter */, + const int32_t* /* d_bias */, const uint8_t* /* d_z */, + uint8_t* /* d_dst */, int* /* workspace */, + const convolution::ConvParam& /* param */, + uint32_t /* nonlinear_mode */, float /* alpha */, + float /* beta */, float /* gamma */, float /* delta */, + float /* theta */, float /* scale */, + uint8_t /* src_zero_point */, + const GemmCoord& /* threadblock_shape */, + const GemmCoord& /* warp_shape */, + const int32_t /* access_size */, cudaStream_t /* stream */) {} +#else +template +void megdnn::cuda::cutlass_wrapper:: + do_conv_bias_uint4_int4_implicit_gemm_imma_nhwc( + const uint8_t* d_src, const int8_t* d_filter, + const int32_t* d_bias, const uint8_t* d_z, uint8_t* d_dst, + int* workspace, const convolution::ConvParam& param, + uint32_t nonlinear_mode, float alpha, float beta, float gamma, + float delta, float theta, float /* scale */, + uint8_t src_zero_point, const GemmCoord& threadblock_shape, + const GemmCoord& warp_shape, const int32_t access_size, + cudaStream_t stream) { +#define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \ + threadblock_k_, warp_m_, warp_n_, \ + warp_k_, access_size_) \ + if (threadblock_shape.m() == threadblock_m_ && \ + threadblock_shape.n() == threadblock_n_ && \ + threadblock_shape.k() == threadblock_k_ && \ + warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \ + warp_shape.k() == warp_k_ && access_size == access_size_) { \ + using ThreadBlockShape = \ + cutlass::gemm::GemmShape; \ + using WarpShape = cutlass::gemm::GemmShape; \ + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; \ + using Convolution = cutlass::conv::device::Convolution< \ + cutlass::uint4b_t, cutlass::layout::TensorNHWC, \ + cutlass::int4b_t, cutlass::layout::TensorNCxHWx, \ + ElementOutput, cutlass::layout::TensorNHWC, int32_t, \ + cutlass::layout::TensorNHWC, int32_t, \ + cutlass::conv::ConvType::kConvolution, \ + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, \ + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ + cutlass::conv::threadblock:: \ + ConvolutionFpropNHWCThreadblockSwizzle, \ + 2, access_size_, access_size_, NeedLoadFromConstMem, \ + cutlass::arch::OpMultiplyAddSaturate, \ + cutlass::conv::ImplicitGemmMode::GEMM_TN>; \ + typename Convolution::ConvolutionParameter conv_param( \ + param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ + param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ + param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ + return cutlass_convolution_wrapper( \ + reinterpret_cast(d_src), \ + reinterpret_cast(d_filter), d_bias, \ + reinterpret_cast(d_z), \ + reinterpret_cast(d_dst), workspace, \ + conv_param, epilogue, stream, {src_zero_point}); \ + } +#define DISPATCH_KERNEL \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 64, 64, 32, 64, 32); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 64, 64, 32, 64, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 64, 64, 32, 64, 8); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 64, 64, 64, 64, 32); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 64, 64, 64, 64, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 64, 64, 64, 64, 8); \ + megdnn_assert(false, \ + "unsupported threadblock shape (%dx%dx%d) and warp shape " \ + "(%dx%dx%d) and access_size (%d)", \ + threadblock_shape.m(), threadblock_shape.n(), \ + threadblock_shape.k(), warp_shape.m(), warp_shape.n(), \ + warp_shape.k(), access_size); + using ElementOutput = cutlass::uint4b_t; + using ElementAccumulator = int32_t; + using ElementBias = int32_t; + using ElementCompute = float; + using NonlineMode = megdnn::param_enumv::ConvBias::NonlineMode; + switch (nonlinear_mode) { + case NonlineMode::IDENTITY: { + using EpilogueOp = + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + ElementOutput, 8, ElementAccumulator, ElementBias, + ElementCompute>; + typename EpilogueOp::Params epilogue{alpha, beta, gamma, + delta + theta}; + DISPATCH_KERNEL; + } + case NonlineMode::RELU: { + using EpilogueOp = cutlass::epilogue::thread:: + BiasAddLinearCombinationReluClamp< + ElementOutput, 8, ElementAccumulator, ElementBias, + ElementCompute>; + typename EpilogueOp::Params epilogue{alpha, beta, gamma, + 0, delta, theta}; + DISPATCH_KERNEL; + } + default: + megdnn_assert(false, + "unsupported nonlinear mode for conv bias operator"); + } +#undef DISPATCH_KERNEL_WITH_TILE_SHAPE +#undef DISPATCH_KERNEL +} +#endif + +#define INST(need_load_from_const_mem) \ + template void megdnn::cuda::cutlass_wrapper:: \ + do_conv_bias_uint4_int4_implicit_gemm_imma_nhwc< \ + need_load_from_const_mem>( \ + const uint8_t* d_src, const int8_t* d_filter, \ + const int32_t* d_bias, const uint8_t* d_z, uint8_t* d_dst, \ + int* workspace, const convolution::ConvParam& param, \ + uint32_t nonlinear_mode, float alpha, float beta, \ + float gamma, float delta, float theta, float scale, \ + uint8_t src_zero_point, \ + const GemmCoord& threadblock_shape, \ + const GemmCoord& warp_shape, const int32_t access_size, \ + cudaStream_t stream); +INST(true); +INST(false); +#undef INST + // vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh index f2d7370de..0b695d33f 100644 --- a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh +++ b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh @@ -103,6 +103,25 @@ void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nhwc( float scale, const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, int stages, cudaStream_t stream); +template +void do_conv_bias_int4_int4_implicit_gemm_imma_nhwc( + const int8_t* d_src, const int8_t* d_filter, const int32_t* d_bias, + const int8_t* d_z, int8_t* d_dst, int* workspace, + const convolution::ConvParam& param, uint32_t nonlinear_mode, + float alpha, float beta, float gamma, float scale, + const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, + const int32_t access_size, cudaStream_t stream); + +template +void do_conv_bias_uint4_int4_implicit_gemm_imma_nhwc( + const uint8_t* d_src, const int8_t* d_filter, const int32_t* d_bias, + const uint8_t* d_z, uint8_t* d_dst, int* workspace, + const convolution::ConvParam& param, uint32_t nonlinear_mode, + float alpha, float beta, float gamma, float delta, float theta, + float scale, uint8_t src_zero_point, const GemmCoord& threadblock_shape, + const GemmCoord& warp_shape, const int32_t access_size, + cudaStream_t stream); + } // namespace cutlass_wrapper } // namespace cuda } // namespace megdnn diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp new file mode 100644 index 000000000..5a72afc98 --- /dev/null +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp @@ -0,0 +1,122 @@ +/** + * \file dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#include "./algo.h" +#include "src/cuda/conv_bias/cutlass_convolution_wrapper.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +#if CUDA_VERSION >= 10020 +size_t +ConvBiasForwardImpl::AlgoInt4Int4NHWCIMMAImplicitGemm::get_workspace_in_bytes( + const SizeArgs& args) const { + if (args.preprocessed_filter) { + return 0; + } else { + return args.filter_layout->span().dist_byte(); + } +} + +size_t ConvBiasForwardImpl::AlgoInt4Int4NHWCIMMAImplicitGemm:: + get_preprocess_workspace_in_bytes(const SizeArgs& args) const { + return 0; +} + +SmallVector ConvBiasForwardImpl:: + AlgoInt4Int4NHWCIMMAImplicitGemm::deduce_preprocessed_filter_layout( + const SizeArgs& args) const { + return {args.filter_layout->collapse_contiguous()}; +} + +void ConvBiasForwardImpl::AlgoInt4Int4NHWCIMMAImplicitGemm::exec_preprocess( + const ExecArgs& args) const { + megdnn_assert(args.preprocessed_filter->tensors.size() == 1); + void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr; + reorder_filter(args, m_algo_param.access_size, filter_ptr); +} + +std::tuple +ConvBiasForwardImpl::AlgoInt4Int4NHWCIMMAImplicitGemm::prepare_filter_bias( + const ExecArgs& args) const { + void* filter_ptr = nullptr; + if (args.preprocessed_filter) { + megdnn_assert(args.preprocessed_filter->tensors.size() == 1); + filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr; + } else { + filter_ptr = reinterpret_cast(args.workspace.raw_ptr); + reorder_filter(args, m_algo_param.access_size, filter_ptr); + } + void* bias_ptr = args.bias_tensor->raw_ptr; + return {filter_ptr, bias_ptr}; +} + +std::tuple +ConvBiasForwardImpl::AlgoInt4Int4NHWCIMMAImplicitGemm::get_constants( + const ExecArgs& args) const { + float src_scale = args.src_layout->dtype.param().scale, + filter_scale = + args.filter_layout->dtype.param().scale, + bias_scale = + args.bias_layout->dtype.param().scale, + dst_scale = args.dst_layout->dtype.param().scale; + + float alpha = src_scale * filter_scale / dst_scale, + beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f, theta = 0.f; + + if (args.z_layout->ndim > 0) { + float z_scale = args.z_layout->dtype.param().scale; + gamma = z_scale / dst_scale; + } + + return {alpha, beta, gamma, delta, theta}; +} + +void ConvBiasForwardImpl::AlgoInt4Int4NHWCIMMAImplicitGemm::do_exec( + const ExecArgs& args, void* filter_ptr, void* bias_ptr, void* z_ptr, + ConvParam kern_param, uint32_t nonlinear_mode, float alpha, float beta, + float gamma, float delta, float theta, cudaStream_t stream) const { + float dst_scale = args.dst_layout->dtype.param().scale; + + cutlass_wrapper::GemmCoord threadblock_shape{m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k}; + + cutlass_wrapper::GemmCoord warp_shape{ + m_algo_param.warp_m, m_algo_param.warp_n, m_algo_param.warp_k}; + + if (kern_param.fh == 1 && kern_param.fw == 1) { + cutlass_wrapper::do_conv_bias_int4_int4_implicit_gemm_imma_nhwc( + reinterpret_cast(args.src_tensor->raw_ptr), + reinterpret_cast(filter_ptr), + reinterpret_cast(bias_ptr), + reinterpret_cast(z_ptr), + reinterpret_cast(args.dst_tensor->raw_ptr), nullptr, + kern_param, nonlinear_mode, alpha, beta, gamma, dst_scale, + threadblock_shape, warp_shape, m_algo_param.access_size, + stream); + } else { + cutlass_wrapper::do_conv_bias_int4_int4_implicit_gemm_imma_nhwc( + reinterpret_cast(args.src_tensor->raw_ptr), + reinterpret_cast(filter_ptr), + reinterpret_cast(bias_ptr), + reinterpret_cast(z_ptr), + reinterpret_cast(args.dst_tensor->raw_ptr), nullptr, + kern_param, nonlinear_mode, alpha, beta, gamma, dst_scale, + threadblock_shape, warp_shape, m_algo_param.access_size, + stream); + } +} +#endif + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp index 033359674..d99bf90a8 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp @@ -60,9 +60,22 @@ bool ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::is_available( args.dst_layout->dtype.enumv() != src_dtype()) return false; + // uint4 do not support H_SWISH activition + if (src_dtype() == DTypeEnum::Quantized4Asymm && + param.nonlineMode == NonlineMode::H_SWISH) + return false; + if (!is_compute_capability_required(7, 5)) return false; + size_t fh = args.filter_layout->operator[](1), + fw = args.filter_layout->operator[](2); + + // param buffer size is 4K, use 3.4K to store precomputed offset + size_t kMaxFilterPixels = 848 / (2 * m_algo_param.warp_k / 64) - 2; + if (fh * fw > kMaxFilterPixels) + return false; + return true; } @@ -108,7 +121,7 @@ void ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::exec( std::string ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::to_string( AlgoParam algo_param) { - return ssprintf("%uX%uX%u_%uX%uX%u", algo_param.threadblock_m, + return ssprintf("%dX%dX%d_%dX%dX%d", algo_param.threadblock_m, algo_param.threadblock_n, algo_param.threadblock_k, algo_param.warp_m, algo_param.warp_n, algo_param.warp_k); } diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp new file mode 100644 index 000000000..21aca9255 --- /dev/null +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp @@ -0,0 +1,159 @@ +/** + * \file dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#include "./algo.h" +#include "src/common/conv_bias.h" +#include "src/cuda/conv_bias/cutlass_convolution_wrapper.cuh" +#include "src/cuda/conv_bias/reduce_filter.cuh" +#include "src/cuda/convolution_helper/parameter.cuh" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +#if CUDA_VERSION >= 10020 +std::string ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::param() + const { + std::string ret; + serialize_write_pod(m_algo_param, ret); + return ret; +} + +bool ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::is_available( + const SizeArgs& args) const { + if (args.bias_layout->ndim <= 0) + return false; + + using Param = param::ConvBias; + using Format = Param::Format; + using Sparse = Param::Sparse; + using Mode = Param::Mode; + using NonlineMode = megdnn::param::ConvBias::NonlineMode; + + auto&& param = args.opr->param(); + + if (!check_bias_share_in_channel(*(args.bias_layout), param.format)) + return false; + + if (param.format != Format::NHWC || param.sparse != Sparse::DENSE || + param.mode != Mode::CROSS_CORRELATION) + return false; + + if (param.nonlineMode != NonlineMode::IDENTITY && + param.nonlineMode != NonlineMode::RELU && + param.nonlineMode != NonlineMode::H_SWISH) + return false; + + if (args.src_layout->dtype.enumv() != src_dtype() || + args.filter_layout->dtype.enumv() != DTypeEnum::QuantizedS4 || + args.bias_layout->dtype.enumv() != DTypeEnum::QuantizedS32 || + args.dst_layout->dtype.enumv() != src_dtype()) + return false; + + // uint4 do not support H_SWISH activition + if (src_dtype() == DTypeEnum::Quantized4Asymm && + param.nonlineMode == NonlineMode::H_SWISH) + return false; + + if (!is_compute_capability_required(7, 5)) + return false; + + size_t co = args.filter_layout->operator[](0), + ci = args.filter_layout->operator[](3), + fh = args.filter_layout->operator[](1), + fw = args.filter_layout->operator[](2); + + // param buffer size is 4K, use 3.4K to store precomputed offset + size_t kMaxFilterPixels = + 848 / (m_algo_param.warp_k / m_algo_param.access_size) - 1; + if (fh * fw > kMaxFilterPixels) + return false; + // co should be aligned with 8, and ci should be aligned with + // algo_param.access_size + if ((co % 8 != 0) || (ci % m_algo_param.access_size != 0)) + return false; + + return true; +} + +void ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::exec( + const ExecArgs& args) const { + auto&& param = args.opr->param(); + auto&& fm = args.filter_meta; + size_t n = args.src_layout->operator[](0), + ci = args.src_layout->operator[](3), + hi = args.src_layout->operator[](1), + wi = args.src_layout->operator[](2); + size_t co = args.dst_layout->operator[](3), + ho = args.dst_layout->operator[](1), + wo = args.dst_layout->operator[](2); + UNPACK_CONV_PARAMETER(fm, param); + MARK_USED_VAR + + void* filter_ptr = nullptr; + void* bias_ptr = nullptr; + void* z_ptr = nullptr; + + std::tie(filter_ptr, bias_ptr) = prepare_filter_bias(args); + if (args.z_layout->ndim > 0) + z_ptr = args.z_tensor->raw_ptr; + + float alpha, beta, gamma, delta, theta; + std::tie(alpha, beta, gamma, delta, theta) = get_constants(args); + + ConvParam kern_param; + kern_param.n = n, kern_param.co = co, kern_param.ci = ci, + kern_param.hi = hi, kern_param.wi = wi, kern_param.ho = ho, + kern_param.wo = wo, kern_param.ph = ph, kern_param.pw = pw, + kern_param.sh = sh, kern_param.sw = sw, kern_param.fh = fh, + kern_param.fw = fw; + + uint32_t nonlinear_mode = static_cast(param.nonlineMode); + + cudaStream_t stream = cuda_stream(args.opr->handle()); + + do_exec(args, filter_ptr, bias_ptr, z_ptr, kern_param, nonlinear_mode, + alpha, beta, gamma, delta, theta, stream); +} + +std::string ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::to_string( + AlgoParam algo_param) { + return ssprintf("%dX%dX%d_%dX%dX%d_%d", algo_param.threadblock_m, + algo_param.threadblock_n, algo_param.threadblock_k, + algo_param.warp_m, algo_param.warp_n, algo_param.warp_k, + algo_param.access_size); +} + +void ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::reorder_filter( + const ExecArgs& args, const int iterleaved, + void* reordered_filter) const { + size_t co = args.filter_layout->operator[](0), + ci = args.filter_layout->operator[](3), + fh = args.filter_layout->operator[](1), + fw = args.filter_layout->operator[](2); + + // reformat grad from nhwc to ncxhwx + TensorLayout exec_src{{co, fh, fw, ci / iterleaved, (size_t)iterleaved / 2}, + dtype::Int8()}; + TensorLayout exec_dst{{co, ci / iterleaved, fh, fw, (size_t)iterleaved / 2}, + dtype::Int8()}; + + exec_src = exec_src.dimshuffle({0, 3, 1, 2, 4}); + + auto&& relayout = args.opr->handle()->create_operator(); + relayout->exec({args.filter_tensor->raw_ptr, exec_src}, + {reordered_filter, exec_dst}); +} +#endif + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp index 03cb358b0..b04b1a01f 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp @@ -75,8 +75,9 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::is_available( // only support sm_75 or later, platform should have tensorcore int8 // support available &= is_compute_capability_required(7, 5); - // FIXME: too large filter size is not supported now - available &= fh * fw <= 49; + // FIXME: too large filter size is not supported now + size_t kMaxFilterPixels = 848 / (2 * m_algo_param.warp_k / 32) - 2; + available &= fh * fw <= kMaxFilterPixels; return available; } diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp index 3287b80aa..ea40e9185 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp @@ -95,8 +95,10 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available( // only support sm_61 or later, platform should have fast native int8 // support available &= is_compute_capability_required(6, 1); - // FIXME: too large filter size is not supported now - available &= fh * fw <= 49; + // FIXME: too large filter size is not supported now + size_t kMaxFilterPixels = 848 / (2 * m_algo_param.warp_k / 4) - 2; + available &= fh * fw <= kMaxFilterPixels; + ; return available; } diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp new file mode 100644 index 000000000..ab62e1d98 --- /dev/null +++ b/dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp @@ -0,0 +1,186 @@ +/** + * \file dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#include "./algo.h" +#include "src/cuda/conv_bias/cutlass_convolution_wrapper.cuh" +#include "src/cuda/conv_bias/reduce_filter.cuh" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +#if CUDA_VERSION >= 10020 +size_t +ConvBiasForwardImpl::AlgoUInt4Int4NHWCIMMAImplicitGemm::get_workspace_in_bytes( + const SizeArgs& args) const { + if (args.preprocessed_filter) { + return 0; + } else { + size_t ws_filter = args.filter_layout->span().dist_byte(), + ws_bias = args.bias_layout->span().dist_byte(), + ws_reduce_filter = get_preprocess_workspace_in_bytes(args); + return ws_filter + ws_bias + ws_reduce_filter; + } +} + +size_t ConvBiasForwardImpl::AlgoUInt4Int4NHWCIMMAImplicitGemm:: + get_preprocess_workspace_in_bytes(const SizeArgs& args) const { + size_t co = args.filter_layout->operator[](0), + ci = args.filter_layout->operator[](3), + fh = args.filter_layout->operator[](1), + fw = args.filter_layout->operator[](2); + size_t ws_size_reduce_filter = co * sizeof(int32_t); + size_t A = co, B = ci * fh * fw / 8, C = 1; + ws_size_reduce_filter += do_dispatch_reduce_workspace_in_bytes(A, B, C); + return ws_size_reduce_filter; +} + +SmallVector ConvBiasForwardImpl:: + AlgoUInt4Int4NHWCIMMAImplicitGemm::deduce_preprocessed_filter_layout( + const SizeArgs& args) const { + return {args.filter_layout->collapse_contiguous(), + args.bias_layout->collapse_contiguous()}; +} + +void ConvBiasForwardImpl::AlgoUInt4Int4NHWCIMMAImplicitGemm::exec_preprocess( + const ExecArgs& args) const { + megdnn_assert(args.preprocessed_filter->tensors.size() == 2); + void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr; + void* bias_ptr = args.preprocessed_filter->tensors[1].raw_ptr; + void* reduce_filter_ptr = reinterpret_cast(args.workspace.raw_ptr); + void* reduce_workspace = reinterpret_cast( + args.workspace.raw_ptr + args.bias_layout->span().dist_byte()); + reorder_filter(args, m_algo_param.access_size, filter_ptr); + update_bias(args, bias_ptr, reduce_filter_ptr, reduce_workspace); +} + +std::tuple +ConvBiasForwardImpl::AlgoUInt4Int4NHWCIMMAImplicitGemm::prepare_filter_bias( + const ExecArgs& args) const { + void* filter_ptr = nullptr; + void* bias_ptr = nullptr; + if (args.preprocessed_filter) { + megdnn_assert(args.preprocessed_filter->tensors.size() == 2); + filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr; + bias_ptr = args.preprocessed_filter->tensors[1].raw_ptr; + return {filter_ptr, bias_ptr}; + } else { + filter_ptr = reinterpret_cast(args.workspace.raw_ptr); + bias_ptr = + reinterpret_cast(args.workspace.raw_ptr + + args.filter_layout->span().dist_byte()); + void* reduce_filter_ptr = + reinterpret_cast(args.workspace.raw_ptr + + args.filter_layout->span().dist_byte() + + args.bias_layout->span().dist_byte()); + void* reduce_workspace = + reinterpret_cast(args.workspace.raw_ptr + + args.filter_layout->span().dist_byte() + + args.bias_layout->span().dist_byte() + + args.bias_layout->span().dist_byte()); + reorder_filter(args, m_algo_param.access_size, filter_ptr); + update_bias(args, bias_ptr, reduce_filter_ptr, reduce_workspace); + } + return {filter_ptr, bias_ptr}; +} + +std::tuple +ConvBiasForwardImpl::AlgoUInt4Int4NHWCIMMAImplicitGemm::get_constants( + const ExecArgs& args) const { + float src_scale = + args.src_layout->dtype.param().scale, + filter_scale = + args.filter_layout->dtype.param().scale, + bias_scale = + args.bias_layout->dtype.param().scale, + dst_scale = + args.dst_layout->dtype.param().scale; + + uint8_t dst_zero = + args.dst_layout->dtype.param().zero_point; + float alpha = src_scale * filter_scale / dst_scale, + beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f, + theta = dst_zero; + + if (args.z_layout->ndim > 0) { + float z_scale = + args.z_layout->dtype.param().scale; + gamma = z_scale / dst_scale; + uint8_t z_zero = + args.z_layout->dtype.param().zero_point; + delta = -z_zero * gamma; + } + + return {alpha, beta, gamma, delta, theta}; +} + +void ConvBiasForwardImpl::AlgoUInt4Int4NHWCIMMAImplicitGemm::do_exec( + const ExecArgs& args, void* filter_ptr, void* bias_ptr, void* z_ptr, + ConvParam kern_param, uint32_t nonlinear_mode, float alpha, float beta, + float gamma, float delta, float theta, cudaStream_t stream) const { + float dst_scale = + args.dst_layout->dtype.param().scale; + uint8_t src_zero = + args.src_layout->dtype.param().zero_point; + cutlass_wrapper::GemmCoord threadblock_shape{m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k}; + + cutlass_wrapper::GemmCoord warp_shape{ + m_algo_param.warp_m, m_algo_param.warp_n, m_algo_param.warp_k}; + if (kern_param.fh == 1 && kern_param.fw == 1) { + cutlass_wrapper::do_conv_bias_uint4_int4_implicit_gemm_imma_nhwc( + reinterpret_cast(args.src_tensor->raw_ptr), + reinterpret_cast(filter_ptr), + reinterpret_cast(bias_ptr), + reinterpret_cast(z_ptr), + reinterpret_cast(args.dst_tensor->raw_ptr), nullptr, + kern_param, nonlinear_mode, alpha, beta, gamma, delta, theta, + dst_scale, src_zero, threadblock_shape, warp_shape, + m_algo_param.access_size, stream); + } else { + cutlass_wrapper::do_conv_bias_uint4_int4_implicit_gemm_imma_nhwc( + reinterpret_cast(args.src_tensor->raw_ptr), + reinterpret_cast(filter_ptr), + reinterpret_cast(bias_ptr), + reinterpret_cast(z_ptr), + reinterpret_cast(args.dst_tensor->raw_ptr), nullptr, + kern_param, nonlinear_mode, alpha, beta, gamma, delta, theta, + dst_scale, src_zero, threadblock_shape, warp_shape, + m_algo_param.access_size, stream); + } +} + +void ConvBiasForwardImpl::AlgoUInt4Int4NHWCIMMAImplicitGemm::update_bias( + const ExecArgs& args, void* updated_bias, void* reduce_filter_ptr, + void* reduce_workspace) const { + size_t co = args.filter_layout->operator[](0), + ci = args.filter_layout->operator[](3), + fh = args.filter_layout->operator[](1), + fw = args.filter_layout->operator[](2); + + auto&& stream = cuda_stream(args.opr->handle()); + + int src_zero_point = + args.src_tensor->layout.dtype.param() + .zero_point; + do_dispatch_reduce_filter_and_update_bias_4bit( + reinterpret_cast(args.filter_tensor->raw_ptr), + args.bias_tensor->compatible_ptr(), co, ci * fh * fw / 8, + reinterpret_cast(updated_bias), + reinterpret_cast(reduce_workspace), src_zero_point, + stream); +} +#endif + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu new file mode 100644 index 0000000000000000000000000000000000000000..059b328740aeed4c3f6190cd359ab62d3f59b1d4 GIT binary patch literal 1881 zcmbVN+in^$5PjdTaEVm45h>!*ZRFBODNRVF0#!ju-z?)bVBxhb+sh?i-|=1utda=z zWij?S=k$m(;oF!dSXY zDH;w2C=nEJ&I^W2=2owr6~+?OxO&V~=6G^tsHG|Q+&0eOIBx~>S5yk>z!{(j#blOn z)Nq!ySeDQh(`rNJ0BDKSNWp|BOP;3Wuun?=$<|V1xc$U3p=2Y9LbPjc6_-dT=c!^~ zw*q^voVMB!*CXQky4zWp@UOjVla<>Fw+aXC&h@U_*~L8)_sT_wsMVGc#oF;|+m0J( z$rZQM^90TJRVu4ZZKEs%h6WIXmYEk?p>HTc$nOnTkf@4*afk=wgl`1n7#|V(07uU_ zp?q&nd^1=%+Lm;$Gmr5sk8Aw!IB1>%&f^Vs50ATkJHGiZ_Tg5ztlK%u$s&wb@I_8S zE@qG~jMqnLsV9J1w!(N~20_b37AB?^MS+>7Uq06kf}$`U!gso&hWDZHP1Nl)QQWI% zq7IFF=OV#mZnbWa8zJ>FgZTW)a++ZL9prwSB)^2>)DCny7m5b$JWPW!A;Q!YsyGrr zRo}eo2yQADr>RBI3df~EXdyNAIkxUsk^RD_xzucZBFp>GFF|Ls-02(u?maz8-VDkt z3fn2%Xvbx_4vPhq4uGc`t!g;Rq!&i+AqP|Hv(R3(f)&2~EGI%J+(;*o!QtI#L}ufg zuh9OH=Wq^Ow9z6#DPJZ^L8HM`3EzC^RYT)>BUmn5wL0t{$?oO@xC|KQpL_V|7~?z1 zQ?C5fvP+Z@7-M;mXE5%Cgs~Y&i+pu1FNh_u!i+qZ^TYW-dCKvJS@cpIB{Ffv!~ykO zqOG=TXDBP`_cHy+*6->`DDw6`{A@`6P`q`2*%gz_o6JuP#@Kvj^^H_LbpYvHvm+sh?i-|=1uK}m%A zvKafE@ywZvqYJS_@5aN?kI970hoe8YM7!ozafyU-o+<`* zYq00aX{!x!T_UcpJMq$lf9+kHq;4x*8V=g=^{x}|dR@}%R*RUZbjyfhojBcg;s#oA z#Vz$bL5qDYWxc6wl!w630D{mm_d+Z54Mhm~z2ORksu>uEcrZ@*Mlg=?5up!o^n?@2 zcNfGrgO#IgN%uPU7|)BiCif46=5xS#yut4NVb|}(H~+;x+zOX<;Y7(@V zgI*Y~k7PL&fLfMfJT-$*%cTlaa}q^~nPp!-pB#jW!gvJVSxOD>L*kpL6E{(>d#c1K zwAVcs2__4xrxv*pQm=A|&!?8t1mo{e?zc(uODIn5KxYe~XsDh0S*T2iFf(OU92G#F z-y-V>ZmJf~rA5#R$EiVRDK+&4_SCOZ{nDp})NK8%Ebl_UggToSP8R@h@9|OcW~j_E zv2p1}kg~Yruvk**09b5RJ&AFh!*MRXFmewX%&5;pdz~b#^zCOg5klg|IspYo!||BR zCpTZA{Ugue9JpwsMS?QEN|b^|gQ*g}#n7vV#?wZyT(xR_*gq<}n-AbJV4Q#I;iF@W zhmvPp`I%*xC?PP$iXhKm+zAO|Gmw_W>RetZmcR-#@?6dj=L6-jz#nGOOL0`l#FY~V zs^=1IwOumfm!Brxb(Ld*;bO-ONJeIr;35y z670Eh+G;~wkBGsb+gY3NuYGKjtvd*}4F~Pc;MnaP`#sX{6^jm0+XEwt4Yr3t18unC zmURDd_WC%V~`9cc}H7IQ}IRr*@>1l~6R4&fOxEB`8cxUKEx5m+8%t zj^L(Z@jSE$`kWd^6I^uov!L|{RE8hWdiuH3Vq#V`nIo9Yr0P%42H#^br1AI=BLLxz7GL@((mkg?0h4b;vh zI%vCdhO#_=FVicwewVL79=G@5vnSab@z(uCPs~$qQa?8sXLej^r0khoGURmOp>-1C i5j-fcc+2$BTbfxzJS=lxK8Bik?fKIty5K4n>*yamq;n?# literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu new file mode 100644 index 0000000000000000000000000000000000000000..bd482580d1b679cb51750a175405eff8653acf8d GIT binary patch literal 1881 zcmbVNZExBz5dNND;ZbSYDmnyZtnyf;j!`;kz%-D$ZvsvLoLm?=eBl+jq_G8e?_IB4uSy_p_t55 zj%v=b2Fp|0U|Oxo3;+$087Y|XWWlqH9Nv?C{A6RPG2DJ)xlpo}Bq7@sw~9+7l=DZIY$i2)7Iy&DPDf(}oWzo?+1HTb$nOkSKvZdgamWYbl&=Nj7#|sQ0gj$= zLizTL_Z^$g&$WsDZ3`73^kp2Xr7FC i1h)z--g3S5mKNR+56c+K$Doz zAei{M^WAgLos%;$Meq7|z1zWnjPH8CZ^)l!(r6$tQ`&O0qE-ntLwbr>;U#r0IWv~d za*BH0F3JQ2ob!SqQ@HtSd(l~jY8Q`%Djd%)47D`pp4-|9HqKkY`~{VQItT_#gkrMD zII20z8m!1@gK4!S696sg~xbY<~4YD?A8wfr`dJa50C3kE4}_N>)okvSSvj#$e6|}_{d4% zVhns?yx61Vm;h?o9OI$s1}#SxhUO@Xh#BQyJ|AraMPb~7=V(q1??B<}=&~KY$4neU zBkB|+xP;HY+;StNo)wUvpIc5djK722Z?f#CP@LMX&L=|Apq+wl@5T18m(&BFQgYnZh(Uk^+jl}qk_fQerzW~C|qA>z+msL-zVe2^;c;B z#B==v71G`9<)&ZPjYCvnRXj4-hh7oPX}&y={!|B+t3> zBg@WFMqrGUNuFSFFC>i3U^FdPr`iRv1gkJ3Pqn#qKA1d}_}wgeDUU5Ob8+H;dd|^G z+l4cf#rnN$zUS*#btOdJ-iMzhDKCn*&fj*$u<)kvLxXYY$5l?Mu8Ao_O-CM@Cnp}k gtpba;T(7;Qr8mUGGRE>TXy&y$P@Ck8t4vIjzf34}8~^|S literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu new file mode 100644 index 0000000000000000000000000000000000000000..3f19c5841984103064f1ca745851992939592e91 GIT binary patch literal 1876 zcmbVN?QYsI6#bv4@TfFx6&-@I4tcCn$0(gNU>Zo>Kg&1yN*elMs0;A zg~a%tbA7*^oQWlRHy)0DOeSPL9R0Z=f7?l`g~Uo}%h8TnCDaP(C1Sak)Vbu$Sh~t6 z8Vv?06%=sJ3x-VY*00H;vJ^Eg?sJtpo?aMgX~sRbjT1P|TfzJVm4Z4j1}H)?nWr2z zoMkPRr?kbi+L8qTS|T%2FyYCHXBm0iCrA6q&QfEz{ls#iWFtv}w;OI1mq;k*sbZkF z1beQWw%QQaC8F26>?}?A*WS0u+UV@N_M<`6E7 zH5H8lt=rKY~X*6oVXk2Wo&W}9bKc^jG~l-ay6x&VNA4-b+zLt)0mcA^>J zcX34dT~g@)R_q|Tz`em zPdo=XaM4za6lHvsDg|8z10}qRc~{Phr;VWb*r%niUn#ol4`4E2oPTQIiWuXel^04Su)=semGi^-KzS(ekAvtXKaR-M#c>0*bB=b} zZk(Yk=I>>CMc420Hi&V1A3j@B+!1e`f9#2A?oIBe2IE4HE0vTz6J3U!&OEeEMm&NG g1r~3q-g`?6ZHR|uwB=){nb)pAZIUytQn5__0ie@!I{*Lx literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu new file mode 100644 index 0000000000000000000000000000000000000000..c66ac54efef925a95b1b7f84ea324fb779f27300 GIT binary patch literal 1877 zcmbVN+in^$5PjdTaEVm45h>zQ8o4x5N)uA4Kvj^^H_O-tEWEa5d%5K6JKkjptda=z zWij?S<2iG4av|sF-SBSkV>BYuyTPAZ^0%2Z8b~g*aRP0q(^4;xnIo13MZHfhtfPyR zqCvlpxTJvdK{8}BzkKbiFvd~s>M_%q7yQanM^h1mtDV8|!ATabsFKu!GeD7=sf-I$ z^NuxG#%Y5Yy(TjNG-PU(WHOM2NK1+ir(+JNre8=;ekH&6?firrE8b zh0wy$AUK-s%3zk8#wA&lhWa4Gs4PgM(KnPJ6n0i<2vy0zI4Z)Ji?w6|<0C>3;OH4A zb?D4Us7EVD+luZ?7BHUX%0>^5{rV~3JYIkM@VMUOG; zs6xA)bCF;&w_3F*tW;)^fxfVGf^v+%L&4v2{!3~>UEicLscEbo(3aYw9*$C9W;V|^ zf;;w$)1V?~gX5$iG*^a(99#9Mh(4$syqNstFz4BFjGNVwFC7N8nW0}l~fhJ{oGH3n6jba5WB(Ma7d=3o3GIN zNf2-jT(mZlqf{)o*3e=wQQ|irc;&!&-iVg>OL;{(V1;vtY9@wixGjd}*ZYmqIe+bP}L#QW6ln gD6n|T^fow}TSFo)3tv9QntAR0Gd8&pn#+0e4=o{bkN^Mx literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu new file mode 100644 index 0000000000000000000000000000000000000000..45590035c1b7b4d0624bfaa866bce04c9e48ff10 GIT binary patch literal 1877 zcmbVN+in^$5PjdTaEVm45h-G!G;(RAloC>@Kvj^^H_O-tEWEa5d%5K6JKkjpK}m%A zvKafE@ywZvlM6XV?}q)skI{%s`-4BX}|q;{@7Jr=?yXGe;~Bih7@1SVtEb zMT1@saY+H^gJj6&e)-y2X^f-V)kCgxFZh+Ej%FeVS385_gOe;=Q6;GdXMiF#Q#lu? z<{fLWoYMv~dQD~kXvoYe$z&i4k!9qtPxk(kjic5I_lf0Ft6Gx8Xjj5%p^(%*P|d(@ z1r9=cCJz!54XZ)t@JD>%P?NSmzqQ^ zrl1$bt0P%X1)z>CF&^7qtmaaMu|0{R#7wd;pHB{AO<_EM?_^1>=tAP_sFl`Hr+uo# zDYVl*7YQZ{tEU!)mC7t~h%YRipd91xQ1G{$|B_lz*E88nY8qb5Rkr!EsU$nkz#?fj#x8)V=g(rVLv>E6BUpE3v|+g}Vh{vXPIEO4MUbmMKk3 zH=>Be9f!r7Di6S7bKj2`SJ@lpDoCq#5dDOPJT}%zs!HE}?x#Ua+0by%ZqOeN$#it{ z6L-(el1YtHbV5!QFfSmjUC$Q}-SnW87CF zgV`~0>=JPTBdiGW491;QFft=)UaZdL#WIPkI3LgD{O}=C9t-?o3cVD^9?5+v#1Wl)Al-KWkDv5^vqV?}u?7Y#zoIzQid-5grG!)}P!*)~%`&zD3$Jb2UM~6ij(6FFtda=z zg*Em$<2iG4av_)K-FP_qF`1D0aP;Sf{B0(U29hgnoIpG3wA3qPmWX9RQSXxr>*y+_ zXfzlgE-B!AkPO+(uU|VWjB!-Ey3cgx1;4V?(NqNCYG-hKaFT^9swDN`3{a$ID&qpx zykiZPaoS);Z^;4x4VhXcnG9ql(v&>zlcWD+=cu*9ePWr^s+J^C?M65)6q4Epsu|cV z!9i$moUz1ri0t*+t)-3sn)@b^T<9RfAs#hby?wj2?{-MHb6m8DUhf#uthe6vYG@_2 za5M;x7KbvF<)(2-7KNb!h%hP((rEMzB?yIs6&eCnGBA#MFy>+_nZWpj&<8kr!bu%E z3li$l%E`8(2a^Si=ee%Q-Tk2c9B>|Qu)n+C_j}j>MSi>#?rOKr^083y3cd(Q48zQ4#mTn}srL^Q<0kW23|Zo9FK4fZ0wyJSb6*C0V4jRk#6) z^DD~lk}40deDBzb7?-&lXDUdm4xoKTLl(QLlBmM7pT|KE6E-#+A~zb2$7DXa{t8{6 z1Oey3MOz~|O2vw64gCefB>v{JuACK58`1KyL(9YdNx5Br0G9#d!&B>?9Ai9GB89Or zbL}-> g0*kjy?}MYcH6-G)@a1Eynb)2^W0MP^xm+gy09ev-r2qf` literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu new file mode 100644 index 0000000000000000000000000000000000000000..d0ead0c359be3c6e936cee3ced4cadb146897df1 GIT binary patch literal 1877 zcmbVN?QYsI6#bv4@TfFx6&-?htnyf;j!`;kw9`Q9{#nL0VBy%7?eNj3-*rq{h}sHG z3W@PO=lXs-JCQT=ZZsJFxVs~h!SK&D`Puf1s#$%PIwY@?{%x!iOw;34HZE;>Xn){JP@?k?8tZUfDQ z7LEqN(R5n{v)VK+E23+t4{jJ$1Zgz-hBAc0)(Q=wsu&nYk1*z9C7Hl@kI(}+dd5i| zE~X?jqn5pGMYpC17*9&i?rtCY%|pO(y#D6)Vbg1Oul|eubSfNn-Z{$1 z3B(KIR!ZOJrZ?j zLLGwymGGQ*>KcWW%FGM!FD#s(9OLg$@YkIGl3GyLH~Ca*8fyo9%WbNMz0_BkE%S}w zjNRfes0i9%KPd>!l%b)-)-6hLPu5J8VatPpyotRMD{NAlTLQ*w`S_?rGuC8EX(yS{ z?(P!xw3t!l0eD!QRrW@O3eu`Act570h>cZCRkH2pZW_dtjSPp_4F{tUncQ7{h1So4 z06B2c%1DlKG3Q!Ci@`*R?{eT(1LJukYTh+zCG71L+|>s#88ALPckiAU=H;`Tm%R;0Wn-a3ER5BEi|MYy*ZmwJ4qr0SRCGURj|plxyz5ZoxR ecuVyrI9h5$A}*6HA7jnDcK;choe0h4Ec*wM9&)4r literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu new file mode 100644 index 0000000000000000000000000000000000000000..6994ef1763bcf9818b1c9b6d39c8578c080da15b GIT binary patch literal 1877 zcmbVN+in^$5PjdTaEVm45h-G!Y2>DnQj(BLC8-Kh`eqs1fQ8q#Y%iC5eaCAy0h&ap z5-etY&Uog`#o3XZpm&4o{?*+b8C~~(Uy?uVtkptts*Mw9Nu8E@ip&JDGAQbOc4Qr$ z78Lb+J;WsioDY&ATl(2+d&w9_jgyB`mtOD_OC2pl5Uz0m#|I}_IH5{X56%EZYNkpq zP{TXcVkM_7X7qxL0nn0#Rg%d-rlKgw(|fYw!LdbVUlvrax+cJcb?ZfK4lVAK7`{`6Ttdk$)WD4UIe9|Op zF#^3Xo^Q#rF93CHhVi}a#cEC}+}ph<5@uL@`MkFgYYO8&Jcl!C#W^ItiB7xmyRXDP zG@%Ybf=T%NtF5q7nP~~}g_#qSWBeTo{*v>bQVZ&OrWi|2W9>j}g-!LamHIlfRlX6N zv03bjil7a)lY-Dh85%0=evC=??;V@;-%=E;pt zzDd;6VnUS%;BIx+*&CE9NUPQm{g8$-Hr8IMl5amY(;%j7U^r;kza9+8=;o4qjO3^gQ?50%7)+G-tOi~^FrHsT%bO;xH_x{U?&1Tu3>Y7tyLW3F<7*`fm>omM zju9s?!m1z-VBAOrBQugF)#^}QER)EJ^YKv5l@F2fSmB?h&`WV_kld$P1J!ekmd4G! zr7Y#|W%`z_U-fm6bbB9v7Noi)-a3EN5BFuTWw^H(S9W}Dr0$m#GURj^plu2g5ZoxR ec+2!EI9gdlA}*6JA7jnDcK;ch9SP0lB>M|wEOMFv literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu new file mode 100644 index 0000000000000000000000000000000000000000..1d9e6c79dd75f33b9cbc97350232aa6eedf1fb56 GIT binary patch literal 1872 zcmbVN+in^$5PjdTaEVm45h;Su6uC4~N(re{pejh|n`LYR7GB%3y<- zmR;jHGoCXyXJ>Mb-VOVMAEObO_6L7%$lqqxXdt=J#tF2cPD{N&W{y}96!ktkvyLuu ziUz$N;*tW+2g#5v{PHzfGR9Hu;=a&@7yQCfM{^N`tDT_ZgOe;=P$j7cXFx@2rV1`l z%{$g$1*Z*W^qR~7(2%)RlF2|8BG1X=KH2+EHjY{=+$UB@t!h~o*{+1sLLsSrpqhc* z3LJ#?#u-a|o5)V*vNgByUvt|ek_#PV*hNvZ)!AOP;UVQaE?Pt{H;ic3S#CNtv=CZ2 z8U#nPT@}h|)3~gN#83}N7*zymH2Q`zgu>1W4S}jOFphjM=3*_G!1zF+3vl#=lRC6# zB-EpogKb53rU)2MOJ1Y9`(FJy;55D7_U?Y$?Ogwt^5da!)@AD?9}|mL@F^rQ6jKNn z#;b#<9P_7+EioS3UM%Glgt0whkua0|%jY9OEGUcz@SZHG6G|2(1bb# z2`1sWY#mz^Rw}b7fL~ZTK{>|Xq1taa|0T7cu4nR@)HIe3aLa8fhl9*li7nHO;Ees^ zc~BAbz+qAlnkz#?i9Plx#XWg5Q--ac)#Gh!lvrTX(%lj;*~o_nCF-#xQ%YOO4Paaz zQGMrBd4QFB`&Pub%H6O~L0Yu~?h_h{*i}b~N}m1P4}zGmq2Um@L4PHizI5r(RixP4!*ZRFBODIug%fhr)SZ2Z3|Bx?Wr1+bW7<2)SUT{MB z&W!ka@N%#%=}u=J<7uAN=;5hXKL(s<*V{fkZM)6*=D)1>x58!Z)>%OoX}p0iI0;-# zfiH|#2eh0LKrLHhJT|?cT~RAR0a1%L^G+`>PVFLp;v;+rg_LYV6qX<&yv@JE{n3Z ziZGgSS*-nHPNf6ju|lf~4m0V6kvm{uLVXsR>!e&oY`@BB5Xv^x3Gg@Q4~Jwry7>;R zA9)VvV2jpTBq-&JL@8)9m?+_!54>t%yle!|WusPy-2>6xe1ebx;);0#>bXQ4 zZCB1vR@Cox^8;Uhs^_5K?PK^^ll+l*@BXqU#+f&n9~+EwKdy38^-Pg6)O6yZbyDIH hyeP1E&-K<@ntMY$EQ?q^2hF_o{%I3kaFvL8^bf<&a&`a! literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu new file mode 100644 index 0000000000000000000000000000000000000000..83b6a72f48d52f15199f270ed8d195dec1323672 GIT binary patch literal 1877 zcmbVNU2obj6n)RH@TfFx6&)flR(Y&aM_W2+z*LaBH_JE%EF9ai9X|T&cbyQLMs0=m z5{!M$^}XkQI64yx^kH;6ycv(l>~{EYP5!o{Rtt%x(w3twwMwWZ(hI~gFR63UnXz=4 zQZyV4P$DScoEHq4%&p!!D~u(maq*a`%<<&HP)k$pxow=lao!5%FQ^pMfipl6ipebD zsNpPYu`Ho2rqzbb0nieuk%9?NmOM?#ai1LgCtFL6;r0v5gp!RY3em2)Ra_#WoTrL` z-4g7%a@uM`T$hN;%TBy7;a_{#CYH-AY`5dfT_@i4x}?`VEMlToTSgS?#H(#5ZlEPs z+)~dIG~bs(mYdo}SqKXaAOtNlFSJ5GP=t`*8?Hd6l7VrE2IGWp1mhT22z`K~XPi*J zJ14#wtW>ro-RsO_Jj zbx%ct$=qt)A~!q#>0BrpYUcs!65>luQ4^K)m)Xs8 zj^Lid;-#<%+Mt>kgced$pJVG@71|db&824R7e#p&dL>lZEO$8vOt<3cN%Cf>%Ob6D z;YJWLzvH-AQ0V}eZyx#)<1&MzOnPDD9u$~TpM~bCrK|Am*I^ojw2gEE`VDVKBQhIb zeTUYMJco1OqKy^_O8GKT3fc@NO8DjjuN)Z98^Q9SQOm=ArRc6cfy;n#{<(uI#~9yA zo^s`O%p0hl zbF|fV?F?l_{$8h7Z2c*pgF?5D;b%kgN8-Kv4?Qu-yvh8;V4T}=rIE5{ijX0vQxC0^ k5|7|TfyH~Kciz(68scGD`0_c_%vwE6z=uE88hsn+OdO9VGoALb>`P+_KEhN@TTaNbBDxua$uMkVUq|QZW#?o~{ z(RegMv7mr+UNB@*w|VO3MM>R^CTfp`=s!n>@78h+b=8?N;aY>sNHg_xI{uZPZa~Z zCD?Q2wAF^V9ub2yJFw{z(CNWWJsIz(;uj3_qP><10B=89YD zd5o6FGK}S>wow{Hp%DnsGW9|$^aDi*`J>?qf>bgv4q7md`A#s7afL7hIC{nj<$Fuw zo54zDThgOWJ;sYn)Aa6P)O-mzjW;^nJsgIE%l{%jT?%(~J16;=t9S#Sha`kz0pY@U zTZu|tKecRw@yv`u8RtQmnVO0mGf%#KtsR77!gvh7^9?n82w88UZl{U*y?P|-(0=b! zB$&*s)-7@)q+X|>&u=WJF~&ck(y!w9w@{qgkxrIE(NH?~P?VrAF?mT;vR@`QOF4pj zip9&oBIto?Vh~zMO?`%~dzHsN_h>0K+rFsD+t4VX$QGH)8DPE_kB^c!Ls{l&?c{EN z>g^)smHa_Ny2KA!!pGL-fWslL=W&FTX?A zN1nquaM4bS7$tliD+N6U!zBD>v#y*K&l|yV(WT|#uu^iDpTK3nIRD(fm1B%=Bu}{V zbIZ7gP%Bz!{(j#blmx zRCAU!Sf0`b(`rSg0BDHJNWp|BbDm{nzfX4lleMMBaQlVjLdjZ^glL!CDlU;w&Qry} zZVC2WIc>Eeu1&<%)n#jD!oTLGNi3In*lxD2Hk~$nNbwAd7Ey~eBZ@UUi*>V8LvyaU zrJko~x-An~Zfcw4AuQB`5VXv_&Yp!3pKt zQ{wBv%E7jzTb+B1Cq-1lho@fs7;qY|w|RKlbeo;)|6=do3NO2Co#bQ;;|+YwNyxksBfPJcs!F!g88o{1e1}lcv9g;?(wZHWi8n?L2}mA->E+O&pZJtZq?r z1n=1`juVTZ4Gs%~&`fIT3vAV^nEU9_RBE<766Jm9m7uao;c@{Ouf_ATT}F(6WMqTzX;T7BVoVJ`c@RDOdFE*KQhwvJG?!`Rm^e24ph4{tm65 zcn;^lMJp{*l<|406to#kl<+JDUO6ycHiG3{qn3x=1JPZ70+#{f{7VNP9AkVZdB&9= zTXv380wb&l@&v}ckT5a>X;!RG|YC@XS` literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu new file mode 100644 index 0000000000000000000000000000000000000000..b39a3b38f7b1eeaf6c2db1c31cd51581174ff228 GIT binary patch literal 1877 zcmbVNZExBz5dNND;ZbSYDmp}9UFES#9c$^N0aHQhzFEdGVBy%7?eNlHzjI6|u(m>z zAQ=0(%iVL&os%;$Meq8zy&r=C8Q=CEZpfcz(r6$tQ`&O0qE-ntLwbr>;U#r0IWv~d za*BH0F3JQ2ob!SqQ@HtSXVF=PY8Q`%Djd%)47D`pp4-|9HqKkY`~{VQItT_#gkrMD zII20z8m!1@gK4!S696Z+AaP59SbH;Lu42;0r{YJJ&;4=JBvkrFjuF``&2ov&JH4b8aX zmU^C{$)+N*+SE2FLRzQ`DQH=Ep%wau5`_H5a0N6~78r+YFwXc=Fplw_K?mUI87Gu) zPl&GvFMHdPZgk->9+z1S?jO7LL%?Zv-Sz$By3g^?RzU_^Zpn(L@sF}9!EX%NcR*BS8FyY2VMcyRp{T0ij| z&cPNfwa8G;XPHvaW-w90vmAKUzi9ArpaIX5^{h5 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu new file mode 100644 index 0000000000000000000000000000000000000000..1a6f8b69339e750527346fd5e24262407f1d6976 GIT binary patch literal 1872 zcmbVNZExBz5dNND;ZbSYDmnyZ9r9SEj<$5tfT}Gg>MgF#vRtt%x(w3twwMwWZ(hJ0LFR63MiLrE< zQ8XM3P%0?koEHq4+^yc)i^fvaID5!d?s$4;sHGYA+%}HTao!5%&!`mCfis{Y6q9+% zQNvl*VtGniOsfr<1E3``BLx$lEP0lZCpkIzPqvmC!|fNA3nd##64-@E{4eFFL*cCR&QU%_7H{BVNJ1!P5H5_@ zm8jJDQ_EHuPs|{caSX!5)L2B!H2e0owh@X6;~~7KD{6Qjvfe}&-SDeNq7IFyV~}7H zp7T!KA~!OQMqfGPy;{5u9^a zybLUY9;hY;p@r1c7udR2G4|1;xzud^q9$)cql6-x6)qQm=~g^GO5O}*8PnQ{ZUF1z zi1NFj(gCcH9a<6NGJm67dST=q_)n?NLub{J6+QcP7z80{Bb`F@hBu=TnT;>ML)Rys z!#?PujTR}&_%c-rdJKk1co(y-oE6W4VENFcrZ(T#JGJ7pA9MQhxg7ubi^e0CifGAabd@mM#_$flp&{64~>%%kKi(a e#e1fA-qOMv;$azm`5bEIt*cL)nc$W$?G?sS)Hu2eDmb1V8ER?9J-3ZFaGbY-`6DU?b>Iw8gkmzJ z95tL}Ef!MRVp^@q3;->W87Y|XWWlqHJgk$w|72sSG2DJ(K`7Zsk{IoZTg4?3%6Y07 z*e$`HE2phC#C3@{Iq9_LCeD^_Biu4B9=A`n`!8pbhLLcDh5hs-I&WLYD zE0twQcRF~Cr+Gv-gYDJbw%_ZXHJ`%X#2##KCqMt2UH@ggHM|eU-bCf6p4B?I*KMNs z0{%59ScRw4uH(27QZE98=$Dq$6yqPD{_`~bB^0N2ptG4!Gz#i!62%HZW~PuzMHS_7 z=ORGpxK~7ppcJZy^6z{uHT5~G?pSdGg;O)B+3JZ{FJt`_3Yg}8=MeDl{$BECRBUm) z?ZS~xdw*yT18)Q1o{F|S$m<}zFmeZ3m{1>L)t#L}g?kP?0VZgzMT#=MNR@(ug1!;I zd0Q*n!s9}WV_%ZxX1^k)vrqe+K{hq?jmL_ufG{3Po^j^_K6Pc^iSdDk4$(&2l{1u;hZZ-UH&wCw zySyHX9DR&GYm#3YuY0sFjB)TL__4t_x8_PyW#Lo|MI}r;R8dAeg4+fLmEGsP9d5m) Yxn;!Tu<-8l=}Y&S(Iz?IDi!nOFW3%niU0rr diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu new file mode 100644 index 0000000000000000000000000000000000000000..ef91b0666ddf4b2ef61e29b4db9f224a96bdafd1 GIT binary patch literal 1907 zcmbVNYj4^x6#brG;ZbSYDmqliSmm)w9c}5f0aHQhzF8(QV3F9CA3XZ&cO8>9&DxGl z%F{mQ=0487iH`Ujy%}B&E=MCWy&BwJkYA@!tA+SNNX^iOYQe<<$vH}#&8RWak7!Tk&SyIJ%Q2!<>FZHa} z!QHrt{0aEKLBT2 zsZ>-^mb(xELZ{sjC4!!)7Uj?Pc_yhXSaruj1%gwvOeU)*V!if_6cjKm{4OBi;r)Ha znqIM?yzSsfr@b%Q!-Lfha8E^B7INq^%T>06EKI0%zUyjL3(kGp&kA3?p-dr%gR9|? zjHc(GVK79N!9H-&TJjX-Y>^59{Ri{LUyE^Aj+4g)Z+YLr$WYUX nh4#sbMex6Yhu2JRt)_)F#QbCM<-OO;OMgeHq9Z0!K97C_hq!w} literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu new file mode 100644 index 0000000000000000000000000000000000000000..574fbba8abe2687ac0945a97599ff74cefcfea65 GIT binary patch literal 1876 zcmbVN+in^$5PjdTaEVm45h>!*ZRFBODNRVF0#!ju-z?)bti`Tv*P2$>MJO=a&5K2AvKPYpD!JJ8GnmYou0)WiFxC1{d1U zbxP5AG(wS}fU{08q%#}8c2+oxQ0wY3lbPkwm8OQK+;P)7!^S!zn7g8hpcaAw6CqiW zMI5!PVI7u5w8NC#k|h8-BGrjt!jUykQ*zuVW&C7ksMg$kVwsRhD+qkH8*U^|kdW3< z$-r*~c3fIzlqPmS#PxN*x6=M!Y{ktoKO;dLjgkWpt%H+DhrH#E*M9AD;USPW-tUedd3Oq z220}F-b-yep$C;Yj2C%M(}%}V`xJ1V-Dv;txF2@IoBy&t-U^rXduIh%r11*A;KXyW z0KPEZ)M#lEKn;sAp6QX-alyh&H?k-&^YqK-#(`H8#$))-W2*TO3g1TkUK@pjCKF9) zI5-yxE_1IjI>T+x|u7dp{re&ho=0ukFzE`}?7J1M)0K$8EN_g8VvnXt@ z2%{U8D)o@q{Jb3 gP+;+v>%B8H_lCG%7O{NvntAQ{Q#!cdG7_ucADB~ejsO4v literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu new file mode 100644 index 0000000000000000000000000000000000000000..22ed73163790e8eaaff1d33c3e752ef3ee7a0d25 GIT binary patch literal 1876 zcmbVN+in^$5PjdTaEVm45h=<-Y2?yKDNRVF0#!ju-z?)btcBOMY%iC5eaCwtWR*my zFN?9yjAzcA8AoSgiQbKGM?WSLGQS=DxgmcqqfQ5jmC}}@EwxIh71B$@axbZK(V4My zl~FVr4pAZ~;G7o>ncStXot4fK)Vg@cRql9lVW_1U_uRHluyNiB<}aue)Il&{A{3K( z!cogv)?s-up`53R zf!`YJxpLZSLtKxDtE+CjG~r+Bw!)=h^D@5Lb>m&XNBX^L5fhbe8Bweor`vAaLMyJg zrJg5fv9D>YH?@uOkQ5q10$S!?XobF^2qC{WTmeb71;!y4j1#^QjAMLcFaS7u!U^Sj z3*y_s%h9%^d!2iX=S5DF`-frsIp8$A;qLxnH|WOK|7Cr+6)x+>Ck0ui@e01=ByceY zzA#=N(b6PhWKm*f*_Y3agP7awDW(<&d9GEvE^_-$Cj(N%Bi5PVG=<3!!Mx&iyPX6Ee(9sfr^3)b%Z@ zj^L(h@tj%&ZE#!~gqBiMUtpVYmF$->Eu?1aXR^Ety%KabFM=)r5Z>dXPFrz*X-PNd98QRZk7=*fwbpiyAZpULXpIm>1 z&W}8YbFf7lEfSRRRiYI184Q&0E#_T4FP=7n=c-fd!@-g2u0KG?fN}n*g^!LgzLh-V z%Fir2M+t!mRwQ|X#hs8aF@w>vSed-?fKItI^!x4%jh3#M{JHTioPbvj6Fl(ro0sZ~O4kX|E}c}bm%&Wxp- zl%mOajAB6n=e%IZWG;E_tS}a%*2R6MGRNZ!LoH3Y=eBhM$9XH5zo1f32hIRRC?>O* zqn5L*!?Kumm{vQo0zgNkMhYf8+3++akNc$bpX@C)hTBgp6H2zCC`7yER&j}ha-J#% zb}O*w%4w?&aeX3&!(Ml7!oS4rg-gQbWp{Y!bq|9+8T89Vm#AdVh+@NJKWw25SKLz1 zW3)O}Nvt-tjj|9D8iV^7Y{iYf-iArg#Zz7vdNTqBGCj-GHr`TmOd zcCb?0mh`AIkMS~(Xm)o$Za)W{#v32*?hm8k)qjy6FNM2$-IIJQqId;gq$H$b3F*Rk zTgyr_erj2Q@!X7qh>Il5O(Ti|vq-;uZX5(DVLXA~MM4c9LDk!+*KMOgznO_9bkIK) z2_|!^O^e(JsW%zK=M&3mjPZBS`E?xs5{gqh*6B(p8l-c#2(p9-Q&WhdR{yHJdC?Kv zR4$&U7C{fxOM}o_YU*=r)2|}?g-d*1t{N3j8^Lnfrq$u7rn##R;4)yGf9l`bF~&ENr(F4k zW#=d+Fud|0Phi{%3Bxjw*7@pGUQkM4g~@m-=ZEuw@{r>n=g>=Wl*rf>(*~q-j`rGa zouRC#-^=uxt>4u{P~`1>`0PmjJiK*(*%I^2o6OG*#Z(lL~Vs8 zK`{0?H}`Swb#x(?=-qfY`Z1Z1`Ec~-j{I#$trik1sSHP3YNU`Wq?U-~E~VB+7uwKO zM$u?6K(U~JvraIibDO+&7L3KHarKzX-17KJQ$sWExM`fhvCatQu4pQ#1!sUFBun#{ zqlPuC#qyZ8n35Z^06A|LIV8_827q;*s> zuv>y1msT02iR}_`b93EUYX2{>TVWHw+3wuzdR_RC;^`M1B9kp6lC^uuw%u!>6_?yl z$78hEmuW0Fm5Fj66dFJPD$Six5`9AvLT<0Qgd~*=jD0K^$9y9g$GAf10~|f$gmm2n zaZPWfvYpbs${oh@BBsg1<6C~sbcFBS>F3z@j9CqP8R@h@98P!O|Q&Q*iLYx z-8+=)xNs&1+^Iy%5{~oK37zgC1vBb$-(9tO1=oHahJml!SjCXQ(QrH_^U3X3=={iW zI0r7;C=sKKuVN{o&tRbVXEE=}dGWmAEgw3yJnUCgcl!Zc28?yjEnGRqc$o5xOE)v@ z62$~2SP|qIjC+y7#PpVV$Qu75L*QdMSrd(Eg3DMeqksIga-skL literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu new file mode 100644 index 0000000000000000000000000000000000000000..fc8339b417fd229fc1c598b46492b36d7ea567e5 GIT binary patch literal 1876 zcmbVNZExBz5dNND;ZbSYDmv7_y2@jfI!fuJ0nDaeqb3JQ@ydh@a~)Js6F^npvg!!l zv|Aifi=Ykar9o((De6k>ai)R&5YsGEYnJ!QPxdC0dnmSY%G01Pqn$T&YL`y`28q)DUTg8v0>nVc+SyUnWfc~ zh5Eg0Ui0;%`C(fH3b`FoypI!LaRwgPRbRZ^{xULscbj5?Q`7)w_< zMdRTRaY+H^yky7}E`4n;D&weic3-H%34Ug%rMd9JwvOO9Zzc0*G?UbUF+h=uWd#?g zYTT6`*_7f|l%34Vhyj=^cM24hto+<`< zE3g;JX{!x!JtF)4i|*2df2rF_mj>Z^x4*mW!9&V-SagX>w~Q#(Pq+ORS_vgA^@5|t zz6xR`)HW%COK1q@>#XonEA$N|2>HDc3PMydFb zO`OraE$g@wyh3 zrvKEk6yrNH3`HD+aAz7`M9eJz^0~1QN(tjJyk{vjVgOlhqsw0SH6zi4M$|D#PzldP zx2aJWndwyl{`u4j$}#>9b$-qHFR28zL!B?AqM>wdXQ3>?!`#H8sO7&(Z<%xiCmj~g zLyMpX>Zw6!nQ7`vY}2k7{bzeEa!*wf$~t|ALr0ZejJe8#c2bjbBeawuAQMQ z=I>>CP1o=0B8YK&A3hsWUJq}be`txj!kfb18H`Iku2NF9OmrD?I`hyuIq?YY5?H*Y cdgm=IwILCf(Uy;)W?sAbv`J2c;&Pe%11n&0w*UYD literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64.cu deleted file mode 100644 index 2052984d7469dd096406d7d96a847228bad28e89..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1909 zcmbVNYfsxS6#brG;T37xP+3JTdR(3`Oycze%DC@Ap-%E zlGL%!xxJ5bZ=wS}M<1?-qaQan>2x@{y-5EYN1YDh3#kl48)77v3#8_VdMAjr(SbH( zkrOl;3{b`i;H={mX>XS=ot4Hi)H=HJ(p#1tX<|su95byqaI7<&x+5YuvEU3)xTL~o z47IGG9qKdEp-QgP82~yw*Md{-(gn-&^kF~U`=4$M(VCer)N?6XQ52$GF(a8kTv|sY z1-lj4F=>@iI<>u&pPqK(xejkjyWw^jHsh0IdmNvJJ5739xWXzkI+z4^(Te3Rr*Ti)GX`eI5rh>Wg*s(Pb#zVn!Cf&r)LzJa3uJ*o%K;OW- ztP~27vCPA*=p{D_&`I4Xrs)^YC@}`g@fl! z)$IPRJ`ZJ%K8BxlT6{EK_h{c3WAC(gV~ufP%~qzW#;F+!N|-q4qC9m7zBaI^oj&jF caO(^yEF%_{rFWmtUizMCrK1BTGd_?00_@IkjsO4v diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu new file mode 100644 index 0000000000000000000000000000000000000000..d65d8dcafaf8152a80fb2afbab3740801aa05ab9 GIT binary patch literal 1907 zcmbVNYj4^x6#brG;ZbSYDmv7VW|hY(b*!b+222I1`(~NMfJI_kw!@>pe%CQ+)2!{- zq!1kYoSXYN_a->vbM$6-HTXIjk?Gao_LBTM4LTjf7eZ==HdG5P7D&!fQrL_d6C5c` z7db_PejlZr0?t^TAXS*eEZ&}m=k7=1XbYEHg+wHAwA{oIfq!jQ zanwQ!CYYv{rD(RRvRQ3P9TYA$)Q31^R#+|t`iugE>`pNOxvCf#yO1zW*_tN|;~Jp{ zaP)}d!bUS<+s;aDJEJ>USd6D-Xruo2=5E`Iql@-a*g@?6_IC39zuC=q!z;VtKRE0I zdx7P%hC90Vcs*@$gW7T&8_jwtsMm{`dl7BpF95|U_Jb?RTNfqmXBH9Q< zP~MtZZGz*djoc6Le}jTSxVm8z#}v=xqJR+XQZt%j`~}+JGEIMQ!Km)be8vTJg1VVF z)k2WD@=~d(qAGVO0)$RuA0>jGs2AnW=XoZnEm=*+d{v0Qr4w6f4oIqRZwA4#v7KjC}XqF`h*3o53 z(RegMTvEVgUNU4uwtDWY5XMpK=r-t(34Ua$qp9%1wO+x=ypzlyQ6*^x$^b=brUDnJ zl{wa7fzu8%dPC*_=*ZM6$)qPsk*4H+pOpHOt)tco_lX6mRVzuNwQJ$DP)O>`Q_Vnb z1@=N`#u-bpK9MIUz3#%smE=Nu>GyH_xO=iIUoz{0;K|obz8&3+hHDol8w)LVpCRQSG7XRZueKT6ZfSUW}RXSv2XWV)4icS^Klt`>pq7K-$` zWw`Dq-UYxtPS+}gS3!Ac)gJ7hQXgWyot;8;djb6b7HDH6N2yqHt)Y~lXT&jYX;n*j z*obzN#aJB;|nEH=o3@N4iP8Ns!H9Xp|9XwN(BuvQWyE` zZS0r}ksEvB+nC=oABlH0)_pH}%DxiMv#R4n`yhu7(bl+iW+|(J77v~lRg?R>x*7@} zy^o&_$uEqTH7ZMD61)w5VlmF8WtFI^WNLz9CQLolPf9$3n+7Je)2Fq)*m*~D$wJft9iO|S6R|+AMuXv(@tDj8!<#Ge>pW_;kXTA(INDMpg1O=RRB0)O0tLOHDFoqgux4FzMXJ?uknsLWX;|Pv*MkMZxrh-~f1}H)%Y0fxm zSd+ApoY7XI3w>*Sz6*&3=fH}8^M$g~kf-r6-clBY;W>!?gX zZV7f=T4j_bHYVcYqSIbzKNE~gC*0mIpSLe|hnIBY9Xv22Z6cFyyxMkS0)J!I-ME34 zTyjGlXK21J{aG$56Xo7I)CWscnmeH+`h+5c++K4D-YOXwd!sOBd?ON$afQ$WIC{Vd z>EbzYO)sUgoYK9@9mcc5xN(2?^>)|m#+S{E;~{*fE2?=9;@?E&r|#7{ zxEnW-e*ynDC`g5;)2`#V7O7h1V4_n3w60B$a$9 z16_n4q4RFA4?#Ut!|~^Qkt*s6R$Zmw`=HJ|Rmu8MnqK=N3bCIR8W)i1R@~jCyy9h~wx*s@`1MYFUmLVMFsS`TggZ)$La$j$?cm?IY9r}ci+ek6+-f%D)k@4*E zBQ%A`ao7hg+9<(L#+OV=s66O3{#&%dvXwk6c*%#_EjN3Wth;;zDg(y4hoY`*V?0QC z2AySUl2gP8G`GUYBN*2rg=XqWi(+&v&nJp!`ObMP=Zkfoa#!H*o6}Qv9FWY076j>Z zinhwEt)@wc-_!Jpte@pY5&ZVr|2Cw!K3+QiP!W^d>D)~;#)TYP3Mnflm<%zUIw+is jI0SbM3|Zj7VG)nlqsC-{}2mL|dr+c<;cyp_yf(O6Ok&HzO!7N=aG zhO?~2Qchb;t1Vdopd}L%OC~*8i6kM%eNy;Oc9t3=>?f8=6*rZfYB)K^7W<1RbYdYK6X`2qAwkLP3~H2F5`R#$0SA6Bt(r1AwDvoK(KI zAif!_RJLP!(5c6Go~fEVJPwp-WM;K)Q5YHPRSNq2+6u}s{tm@{%lR*<1hqq*ETp2Lc0gNV@_MMGzRYZvZv=M~ zi_@SYXn|@{5L(8X`V3q5D35#Y%_7!pb5fA^p-)1E%`w=+W|UP^q_@58yIjoPTcK$}z@wu}ENa%q+V^ zoWS(TM4rL8moZGsKw4(2b9tdk0xOKhb2&ep50s}2e;h(D=_rug<)a3Q=MwF--8e&8 zp1+sr64f%T+bvj6{l(quxs8v#}kX|B|`Gh(bof%74 zDMh2<5OGNX=e%UdWN!W1Sz(N$*2P1nGAH{ehel+#ul;(A10U3KH73I8M)%1eI;+n4dxz8mlRJ<{)$i6g!qgHTTxkKj97QzHhD_%`asZPf2Im1siy zy;G53GPl~aD2z<>Duej^+6u}s{tm@{!}%|%1hqq*E~KKNb|ALY6!lO`eU;ff-w5s~ z7tcjS&;s?OAhb+0^*OfbQK5U`%_7lk^Q<86LZ5^Ro9FK4fZ0wyJ|?0aYO+XaT(}WL z%h1aixC|KQpPILJjPY$EQWzaG z%gzxeFun31Phi~11g2#mE%Vi>yig^96-MK!oFC2y%43c{4xyLgD3RP1qXvrS9PPB- zI73;HznAGXTfeK9pwR7o_}P;DeR%8svK^+GH<_OrjB`7#GE%im5i;a-=Amm+;t@P1 fuz1V#-dmbmLn15-Up|JKdF|=bCOQ*}%VqQriYajd literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu new file mode 100644 index 0000000000000000000000000000000000000000..2a4de50ee5d9d12ff4cf96db9621258c7fe436e3 GIT binary patch literal 1867 zcmbVNU2obj6n)RH@TfFx6&)%thCEiOV=bLDU>Zo>n`LYR7Kv>+HXr@*v1K*+*!%o1x+Q*!5N@P#Zu1& zYUGx+nCG;`wAzvd09rCLsbtcTmB=#kxKB#|$<9(^g#E<4RB0ngg4&I+N~B1t+)>5A zZUuHi<=SdP@-C76ey6=O;h*F}Iq42z`?B5NciQ`2m-M>jqD|C#$B1J6^{(GQE1`s? zPH?n1RAH<(wM~2wg@zzNr`}1e&^MGI+h=<0HZV;OGe_mFq5u zYX&Pv+bKP0?=YShnkIMm!{&3qX}sb7?tVY$U;P*P@lv>}(>}?^sNxlT3`q#Z9Kwb1 z<|rz4{nWBG##1v4WgLSrH8m9xGt0hwt{sG8!gvI~vo$qh09kLMPP>VE-FhVI&|ddc zB$zC$)-4JnQ@!$_&#kSX9OLg$>DQe9l1flJ)Y(ER8cGMWWhR!xQRb_}7U@QCN4a<& zR0Q>ZoD_tXsiv;L);)@GkKQa&%{I^K@isI`D6o0qZULC>MJyS;sl%n z7j3oVC=)BL6tox2k?>oLxoS*2Z3N3@gI0%wqiVbQ04@V2cTb&rbd2#$Dl(WFGt15q zCos4QktZ;2WeS5bke0>jR9>i&zzS3GRL+mw1{-QyR(Y&a$0(gNU>Zo>Kg-w#EE3zY9X|T>yN(HksIAbX zkX-C@uJ5;#3pq#chW){h(TGg@gFkoVZ!>8$kX$Hj1=>)nq*@?7M=bX#buPIumM$`i z2E88Qk^;_o$&kt2^0mFFjHBAsW3F;1_?4lSX2J_wJA>oAmCRq!R8j}V07WX6=3Jnf zv#h~#P8&?CHJJgRAv2RoCOuh*EF*_?vbUdXEHy^hPb`-zttCnDb|tJ5DU!;0su<`k z!Cok*tv1B9iM+YFZp}^jC%I5w`dwIVwr;kaHaw(w2cbpOa>IyX&CYVu?9|XgC}F7= z9L;uRAj?H(*IDMmJu;$C!jzOd(zv zua3e}4FI)liSgL)4EVxq8VK7X>yO?$5tax4sn)f|gZg!9A?e+th3>fF1oA+oN<9;eK7#$PKE)gd% zy$UDKVBE_Trez?_3+Y^5sFJ`6qw!qM59b5rslXqG&`W;ok=(^m1I2TRHrlS7p)BU_ zW%`k>-{nmZZ^$g&tQbDcdEw3^|>6=$edp1eXa2Z>ipT ZOABpC1Tot3G1Sa!SD!Y?g-~41lYbitaUcKy literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu new file mode 100644 index 0000000000000000000000000000000000000000..b4c9d9ecb72b442f96731937a1b7ef4f3d4c7db0 GIT binary patch literal 1872 zcmbVN+in^$5PjdTaEVm45h=<-)5xWfQbI_j0#!ju-z;Mru<+WJ*UKee-|?CyK$8en zg2mWpX6Kx_I60G3^se9S-39~uV|hpXV%eK zMp3WRL0nS6 zfoi#94OVd4U`8*=1ON@0StXhDWG1qV?DokvezJ1ZTH!viLTXh@lE8K$oE8d6oqMVo z_^rTR=-fDCN!}vz`ns7;ZTKg-&|dmY*uG4!*HFc_8f2<0Fb-@m=3*(CznaaGG6b{qVSMH`AN{vfjNFUe-)c3Nq4o1s^pDT8u$2 zj2C;d91B1ln`1n*olwb9g`quS5iz6e%jctmP)`{5;5nL8E839wI=X6w@39ic(1 z;avf^!{V%x*DsWpR&9X%i25S*)lsHmXg{`tAY`m>I7rv)_WNW!xcLfQpLhZ1V2hSU za+HY~*BUwuhDmsqv#y#I&l|z>wnwYO_Flc+e1MPvll$l9-8;s(t3(E)W8~O5;smBw zN%91Xd#PYr2BT@YI@K;zNw5l|@l>0;+y|4V62BWlFY(wSc^*d%6wf(Y8MnwSWifv* zoA3GhRow(pxA);^Ny_`-t@m%+VOV%u_@TwP^pjUks&5TdvpM Z(b5|dVHsoj7;5IVtIycvOlU5r$zSdgaTWjo literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu new file mode 100644 index 0000000000000000000000000000000000000000..c5139b49cee8adc677dd52261c419d9f03d788b9 GIT binary patch literal 1867 zcmbVNZExBz5dNND;ZbSYDmqlkI^?lR9iw#8pwmF=zFEdLVBy%7?eNlHzjI956txwa z6cTfO?tIU^IXjVa^lmg5{uqzRbTIsLP5z!|trn6CZJaa^4gWafw!K~e9s6YJS!*4aE&8$d~lM5GpZ!@;0&lp%~Zh! zYIw(5tl+f8j9!x&09rD)N-`P9LgYDll#{*xWaFr{!hK?e)T)tXk?l%2EfkX42dWv^ zt-wKOZ=A8jcZlqEFWPe(|4A-%kYN|a=k4zHvI7q(-*M3qF!qFf& zn(eAERzl;lB9cOVAYfDxq|xXb$`A@WD>MYD(!e-!!I+D+WCG)wLJ#2R2`6>v%t&ZP zD>bpAJ5vOVrzNNH?R~%b9B`aoe|vks?RBsIOZo9oIP0Q)l#hwUEBF+W7>X%`3*%KS zDu?{3V@r(hY(JK93c{T|V39DB{LANqjaW<=58*voQY(6p^(MOP#NS~g4xtHk3=&Mj zbJ0GuD6CXwQ2@WNbb@k>zeA;8bN)+eL0#YEGpT7T9pIMRR1USwSBWjtjo^&^;(1UJ z)PFrG2+ftDp~N0~l;WPenJL3o&+73eHc2e7Y3XhWm~7<3gA&bHk}0L_Ju7@*i;93N{;>9&w!Y(k>L=y;b1f()A7|;X!tA$*auy- zHj<-UEV$OtUNA@EyBu@Xn0OLI%lihcHhZ;dyZQhw1ICA^&aG`@JWwKssWEZv6mbHB zt0Z}Z#*I`kD5KK6Tpgax@=l0%nABhgc9DTSRjW2I*(&=b?dy)P*jyfGA7D`)=HpD8S7D&$#^-hvJj}D9_ zi;ST0aEO>7fXkhr$oPEu(phPYq1MrzS3c+L$Pi01?znBefs;Ecs5>H3kQ|%=icnN~ z#!)M`v_m~39jetjodKXDG9v{QE?w{}OCR>rz5nUP5@WdiLcLJ36-6Q16}O5@BvkH* zqF}cIJFaqVwMp|{Do#(k@!W(r!MJk5?ZS3^(%&A(r{PZfy)9f}6{m{!d&^C~*Fp=f zxFwD=G}~1&R-4*JJ_Ln^5P+8639ZmK6d~kxhAR-MVqhF%!I<&2pd8~GVE}OSh!e{7 zW~pljE46J&cG^3Pr$tOR!|ji|?V#T~Z$E{-i9OujCO`k1-TZI(%WnJ*hs(fTV3jHb zb*7*o#;cl8O-YEQON=LG7%IEeWnvn7l$a#@deS%u^@Z^mp2?CJK7c}KBglR6XsX%- z_j_#=Zot0=1%q&P<0g(9A@#yTh;C^)VHkgh`oCc8mr$J8q0VMPkx)<&US`T7sntW3 z=py|H9rw%l5VS!(9sj=1r6#UmHC-yTFWs3*O;=CK^eXgDnfqzsaREs-;{IOpcBs`d zv2p20H{Pe~Vd1O?+|zQcQh4p96GrYJ{)9Lm`tAG-y4!J>2k?T{S}>IH1yc%I31&uk z79*`136C2gj(sy$hl5%foqyVE25M@U8IQeM17SRpJcBurSbB(93ZrW8dkFLmyelDL zKnCi(SiO%OY9VmLOne{nGIxP^Ut>MYqUYk<<5^x!oDd(>&>`AryUGorRno%2^QLNc ze^;M|QlpRIXPp)wjMqKdx5UIdZ)JnzbF9 zlsJxk&h`CvbR_2J&G33~H5!rW_2BlB{5p+VEhH9F8ICs8NFf(U%~6uOlv*1dX+swo zMT34HF+l-mok)<*?ee9)AdI2L$z3jU%h`#hhGyJx(>Q=*oe_yUp{bx2lmUv6Nt!c` z8rCGOBxkghD7hvx0JKD=Q;`Tq7Cg(y!#dgPPd0{X&CQ1-7cy-`k+*imjpQj3(mE;= zkXwQsmsT02iR}<^e%_Af+W!g0r4w%Fm*cbU_B1~CFX?u+@c3CwWYX;{H{A|_ud%Cc zr-2q+azh!_VKK?;S&wx0gRI+)w~B8(L|60 z#ZdQZ9o+3Sk$(aIHz){%rybXET#HmKaxl>?4W|s_FHi`VjQtRjQ`1-3Oi1bz3e3xN zNRmoElz}e7kI-p1*oUAVs^R$af1WDp3RYdE;QOG?ELF+sNt)jHA_}pe78)0j$wu7Y zr@ZNNH3T*einQZ>xE?0X^dFq5tcVPd7y4=@WEnY#nZ~H#s<2F)Q3Sue^`A8_i50JHdF@>I`i}Pk%N7Jw z+THjvXEJkqW}FnueqY>hgf?eh%LpEfq=k^L`9JP*agASSCN0vI83NKvi6`0IB$@~#jl4cMLK%{0W zaDiHxV;vSa?J%P^WDW)$nOY^8^kga0l-#eAGJdjk)LP*_u^_c-B}vS7Eu0n#Nu7DB z8ThS?z0jF)#*(a0$&_#0^dobz8&3+hHDol8xlpl(2_kYZ{Jq0}@{9d<4O zgpLPAjtI)2ekA|S7s^ndqncqA{1+k3m0{~gQoW3|Q=I=Sk2*J*Zspyb678th;&8h~ zAiZvRWcLH_0@yv4Y;}xRL3wG_9ypj%A7ag&okDec0sR0bXk#QtsaSHYp`4&+#Bbix zs+RDu5c4Ptvf3Qhq;&S7v~;eLQ{cF0;la2SRhXNd5v{J8td( diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu new file mode 100644 index 0000000000000000000000000000000000000000..52f4334d0d60c3e665f63abc68ebc4b1be1a9491 GIT binary patch literal 1903 zcmbVN*=`y!6n&qsaEVm45h)5#8o4x5N(rr6pbALon`LGUSa@v93roJf*JCCj(<_+-Mr7I_++2}g=Sic1_(Cef(1sex zNgJupXd_i}O=bXS@LUU?az_>{&&mBd+3Qa>hHA~syVP?jYDp5TT`?n>KwMf!WeRdj zuw&9HqcpK?!Y?jb&AARU!-Nk@=go`l{w1CE79N<9CXs2Uz1(!#1pdaZI_(--Fv$#c zEJL$h>C19anZyUnP!Ft7!aJ@c`h*gM+)gtIo+=p_2ZJ!q*qWyd;{!q$;OGIzrEAZK zs|P6u%YyEdcNk9#!$!UB*V}Ek)4r@fh8@N3ZEq&u|C?Ps8;)!>|3Tqs*fXr?rQn?@ zco5^&fupJ}p(b5oJl4IC-q9~(UGYc6O!5!!D;punFdo2nvZR`IA^vq#eyU!rf;;Ux z3NPUQ1_i0`w3<~M(_E;92NT`WFq&cf8H(U4%YJalsOhPE#w862bptsICgwUO%0W() z!7fq&q4Q3(5kWm1hUCxrTqx=aR#mO&|ESVTsC4xxRj)%4#rRJPtqaIx!|(0{tA~7z z!EHuGTFrgP?g!3zz&%dcGK@nn9M@t8KA2GFL(NrU7M1(9?-L<*LzRIC2mRrYjHZ_# zp(!Me!9H-&TJa3!Y>`O`)d#&Me2Z3Cwvva1AbDTEt}gUM8CZbzcneYkC)EhSH;*n?cG>oT*$Gdkg{r`$q>_tgW}1F iLvYu?;3d&pXJ{b}u`rCfybU??+|8(Ta>8WB=gDsn|9PeW literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu new file mode 100644 index 0000000000000000000000000000000000000000..c2d41fdca7127d66fd326ae7cbe20209b6adb7b0 GIT binary patch literal 1872 zcmbVN+in^$5PjdTaEVm45h>!*ZRFBODIug%fhr)SZGp?ZpSKLy^ z6ExdZBvzZ+Mn0s3dJw*r-U+SH4-_Hfc7`jUsItH~WP)+R*Mf134-C2hM=v;`TxUjH zJ$O0TmUO4R!+4r!G`XasJLKZY)-N$Fq8D#*OP;wC5#8~oh+&0T_}7VwOe%*cTSl&g~pw8 zk>E1-dg_rIA@#yTer{2OPO4Re_NyEOp=v{&0DXi0a7d=3oA1!|k>hX< zwrH(If>ORnl!7jUVG_RitgB|l%SP~A_Gxw4J<#0ECkPoZnR{vAgJX>QlBZm`iDj24 zAuzr2B+sz87ZRprFq-G9bM1msf>jue=i1z5E|@&$`28SyEsqkJWW~4v@m!*fwyVrg zR@Cox^8;Uhs+XYP?PK^^ll*;n@BXqS#@-q4#s=fuPgXgpTBb-DYC3VyIVo`n9uru+ d=X&cb&AlNWmPIU|gJ#}(`m~8IxJtx4`Ug$uaPj~E literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu new file mode 100644 index 0000000000000000000000000000000000000000..331686a161e7dda5eee03578f8c4bbcc6a40a434 GIT binary patch literal 1872 zcmbVN+in^$5PjdTaEVm45h=<-Y2?yKDIug%fhr)SZ ziiZ6@N(2R5<^)5=XREi)3S$XsTs(T^GoD-+YH7+Hw~Z4xnX`hq3n~T8z!{(j#iUO- zYGjtRm``YnX|*A90JKDEq+r64B~Mdw*e9j`WNWE0+~ zTY(){nYP-HtV6`rRXbjo@XvDT!`5YdwQI+_ZijR`fP5AJK}ozM#XKoLT2Z@2=9Dh9?O5{whR5sYJ8BlG}{o^e9C&YZYr zuu|KWbg#X`c$P;rzJKgDUjk0!^>_D=yIwoK{xA07R=BJkpX6i_#vAxTlc2>6^ul;u zljT?dYS{|oiRp(bE>xJ9qbLf@H2wDV=pfV*#zXi{SJdzxB)*B-aT9er$4VSSyPZ>! zU^2IQY>^uw_0mIpZe=-5F#ZXJev>4>h2qrqbvhS{hT3_6x`g;rQ`AH){Z)4JoFllW zT)Y$(K?~FqgU~{1>T>L{SB3V4M{}v!`bAORg+2*YHp^Yk0n@E`dXl^u>as{{T(}X0 z%A0?#=D8ZIJc8kMyi%6LWZ199du4g9D>IL f7VnwfIZJbEh=*n2%jZxtZ#{k5L}y$jViElVAn0)D literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu new file mode 100644 index 0000000000000000000000000000000000000000..e0d8b1e87498d27bf829501e2b140cc3b9c207b9 GIT binary patch literal 1867 zcmbVN+in^$5PjdTaEVm45h)5#id-5gC4^KePz9v)%`#pC7GB%(+FbJW9q%RCtda=z zg*Em$<2iG4bS75l!(=qRnNG=KG`_zke=nm}3yHPTmZLqjN~kr`E5y8$G)<#3W9d4f zXgnOESWv*FPB3JAx_Rp?Hx{GD#e-Kq3ML#`^CTfp`=s!n>@78h+b_%uB^yx`)NZ*|Tq2=TM->CR zCD?J5YO4)NyF~Q+o%YIvf0j!hwl3TKL#KV{bxE&VEZRhE_KYaj-|YJhwC0Li>UfNn z$1;TFrnZp}g3u70*U~$o75af9gxt|^1pz7<7zZ5~$9yLk$GAcm031EzgmT>_am`?* zvMuRRdx!BN(=)w$7&c!5PU8&^cMpd_|LVWUPnW`7o%TsS<|^L6=OGE9SU|Wi-d3Vg z*H10mU_3L!P{Mf-W~Qbh$IO#&Uuy@UkT4#@?|efIA3)ZdsMBtuUbh~JI<(h46$vIY zt96Uq2&vZ|^tp}YG{*QRRQYur{}zfl!V9*Pq5B_=P4O7_d-W+_K-PqBCz zSOoQ7O$JkX4@Avc^jG}6xkwkIRni1;_*@PW+=-%t?k?mP@P>- zex3Dz%C?Fw#JEh~#7ifPJc9f=bv`s!Emyf?zls?UayHR11aCZ=Ovqw-^&J{MavaWq zi*{PXDBfImUP-dBT;OTXv3O z0)s0Pc>?2BNEnoXw8~be@iR+x&Xa&A%=D32NbX$rljqd>-KK5L+K&e2}mZE7gX z^Y=QvV(U-&2;_157(P3aJq_>OUo^zbJLBEVV4T@WOCx2&%|%v-OXHqjYZu~P)jrJxUHSQ$(BLx$V%z2iP{XW_GPu7+i!|fO5g_5--3DGXORa_#Waz_;d zyCv9hm20aF$=gI+U0t?jCj7Hp`moh(U2QsT_>kfm7A>L{Yep1nb{6Yqr-tTSaZ4Rf z(R5oTvE0-)@gXGC1NXJ`PH2UGpadbeHC#c8N(ROu5{y&65{zSfKunyMHr;0D`oGxwx5CRVTPHag!*~N9a}shffqY@S zJV;A50o1Yu#v{`UDvntgnMxE9GtRz!tsDd`Vcdu3ctH*CLgDME(+=NiCaTbgIt2+P z;d9xlTI5DZJ@*iwTUbt0jDLdAZ_@O)P@LMH&Za`qpq)q1CB&DRsELE}m(?w5j^I7J z#c^U0w7_9u5SmF%U4gB76>}dwno7- literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu new file mode 100644 index 0000000000000000000000000000000000000000..ba7aacb262a94bc2abf6b2d654919d867ee74b57 GIT binary patch literal 1872 zcmbVNZExBz5dNND;ZbSYDmv7_y2@jfI!5WF0aHQhzFEdGVBy%7WAoBqzjI6|u(m>z zAQ=0(%iVL&os%;$Meq9E-jBh6jJv&uTk_{JX*7_SDQ!7gQLBWSAw5ORJ4uToIWv~d za*BGL4$1@tT;T*m#uxL~&Z4sn)h-^r@&(T>47D`pj@#M^Hiff-xeF=$5sFEl zaa1cTYcQYD2GeRuCID!N+(^NMBQu`oWVcVY@spLM#&G+Id7)%2NkX;@ZWWhEsKQaj zz;6Y1Tou}CLy8s=*VoN-YQjIur4L(|>Gk@m1s_sA!y+YWzG6hNW;$Or(;Av_#VvI_ zLz7KKVzsGl;zLTP1L14wozMz>LkU7|W4Ho}DhrH5CKzXYDHz9i&!7!(^o$eAwI;;X zgO|N+NjKU%jK^h0gZsx${Sa`PU1xp&xNbMooBy)jy%k>8Oiv0jrtu0sauT>0178>~ z_GmdKfLb=kcxXC7#gT=fIm#kpM){Y|M+ZSm828{gnp4BuQ208!YK8AH6UWeqIt2+X z;qx!I+z6>>9`bW@%V~!3cM$q*mi-cnQ`^z`L?{}x^8mVp{BjdDu~+`8x@FA~yl1;O zBo;vn>=y>1snpb!*yB7S_c5Z0)NFAe%Dd1fL1p7Ip_>XtZ59RnQn`< zTR7_hcPP-RfPF8WFmeL~jHvUWw~ne6L;JZM1fgnuodJElZof~)gPX6=^@-zf4z_5i zMTT-d%anpHgJBY$<*ch_#q&n+yzSHKu)U|bn-35&U<&u#zBq-vR%GSqbBpmTEK5L_m(c+2(L aSz3BSJS<}@AA@FIyZW?A&bZ3NH2DjKA#nTv literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu new file mode 100644 index 0000000000000000000000000000000000000000..1bb9c179ebbe0f002a9c91c7a947a6eed97125c9 GIT binary patch literal 1867 zcmbVN?QYsI6#bv4@TfFx6&)&N9r9SEj#4^lz*LaBf0l6!SU9%j*nITqcO8>9joJ!L z3W>Qs=lXs-IS~u=VKf-tjK^d)7~Wr#zvoG-g~U>6%h8rvCDan>1!CSwn&-)hv2>YH zH0<|LDk$J`Cm1q5U%j;#m8Gb0_TZJzd3t83r5ShJHjd!r&I;zvs1!5@V}K$QlRo9B zkz3YcKBXYTT6}M_6zeu$wra{Z`a%^E|E~Vql$sv z670Cjwbh2?9U{8ji}u2Vf0j!hMCa}9?y>_9Dc)hxCTg{1M6vE_+ijpFSKLy^Q#9X~ zAuNU3CO$ZX`e3}4-U+SH50oI}_J%76P|3hJ_E3kE|V(gb{9Bg(I{ z9#Dno(1jS6=^J_Jgpqr2e@dMXjaAE4H0;-5285iAbPB;64n`v~8()2ghEE)aec+;v z7Aea3GF1v%4CY997h|p*6VHO6`Ou{0X0K9lSD(OSz~t_^dn?-*4%#CxI2F;<22Y+y%;Gfq$ApulaF6rg@w-P&%h*tL-{Bl*RnL zPOs?tQ(giwZXd&ELyD{6z4H$ZG4alLH!&C&dh$|9*)Y*%$m!HU-(m Zw9tllSVmhuhnjio*3%|A;VKo2TdR(3`K2%X`kj*oLIwgR zC8=ZIbG>`+xl44w=jg-raCCWdlTL@D+l%zian$J`zL3f=v>`@vxj<@;sCR-`8y#pv z7CAwq!2o5PfSq-mBJJ(+rL)pmhFV8=UV6*2BTWp+nPaB)28?xvQ+GrJCl-PM2$xj& zjG>k_v_pMHI#kJZIs=0a&$Zx`yL7?wJbl9x>;NoI&+ z8Jg`X3ad?JA|Db$0|;IT@3@la8;THeJIy2zR3R`9d0?EeHKz>Y8lVs6=+TZ#*PErT z9lX@G1=%U@FrF6q+zhrq?za7;ciw(#_9paTdpr60-{|Il!(VpecLZD}_5!M8DX=pI z1~FdMh-y+o3|(S8)`Otxl9#b=@KM@K@~1p!|(3}YX`lS z#f{5Ay79hb4-02J*gchORmN*C9M@t8988Gwq36!ept~J|c>phHt$2oVw#cM}c7mA^ zp2bM3M#AGp$YbA-)nUITrSngF&VWq?Gvl#eYepCk17o_AIA z`@8xylpK8wKkKykV7wmDzAeVyY464wJWTsU{O1LKHA~d a8B%ygEG*0DKA*kxEz?Rz2TW#s9{mMz^KSA0 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu new file mode 100644 index 0000000000000000000000000000000000000000..7fc1a3e33ed4d2f1c78f534e8beec6c6e875be53 GIT binary patch literal 1903 zcmbVNYj4^x6#brG;ZbSYDmql4tnyf;j<#&tfTgOQ$us+n5iAWvCi<+olwE41!sW5Wh#8e zP|cdOk@}1_QYBYp3V;UBwcsgtWX|%OJnWO5|72~b*35iJJ(r@EBq7=*Gm;6!rFB%M zV7CN2Cap3`6Wb>I{Jhni>F{Kj@L{WY*4dmk&%=*&+8emSs!3$pX)o5DHi3U*mz{PE z&6#9|I+mg7w#;R@sZ8QSWT*#GDB&Gf5`9JqLT;;>giMtTj6*;eXKck&hVdSu3vl#^ zGK~9hA1|n8T~I_Ffe4CQ zHLF!{r(H+k2l&51!600%W);UY7i#VyM7J=EW*C2gCb-P9A6znOdMclCNrRwnL9-BI zuA@-)G*K42kN`rbofsp6e%LR`pU*R)s4G}iw_^UIOH-lJ-dDm9DFecR26P`iQ3AcOtu z!GH`W7oT7#B#yy3aM4Qf4CQQ|NeSHtvnJe&QCNthw_400xL|;LpfL01zQ8o4x5N)uA4Kvj^^H_O-tEWEa5d%5K6JKkjptdfYn zEXF=(JZElBF611&8{Q3mj7DU7H~4c){x*|F1IdLpPM{5STIvNdbHuWssQ1Z*b##$Z zH0bvcmlSY5NQP|Ym#>`_#yF~7J!U%df?rwcXexqmwKF(AILX2lRg!ve1}IW9m2rV; z-mwPDIBhVa*JK8OhD@!JOa`(LX-W?JWbZ%OIBKnMpI9cfswGKOyAnOu_WMCXcVa&x^GJ)|Cp$BmE zj01(CGb5oM%^dA3x-(h8c$%vlJv{d7r-1W#{q4i!w%6?5RM`J}E!@^_o#kbr;}v`n zl^BgFL=5BAQDCYOppGpu9@~B_=^_kcTai&Ym4jGV7!Tk(SyC%{kor1mx9X_d zsfMBo?RL&Zg2~)!)uOOcnMDTr!qN%KG5!u!f6MtVsRea?lg^~3v3fvTYKw|E%KafZ zdBzc3vR|A=6+tr`X9c0TGBo7as!K)S3wLJ9u+>RT-p7uKH8#!t%>k2*e0ox%9;>p* zX{&If+1+RAu$WWj0eD)UW%`Dh3eu__$e+-V#onsqs_^aSekR164Go9j4eo|RG9BG~ zg#nNR0q4L)Ya=;I#e!=MT?TU{e)F+cj*aJyXnEhK=JPTgDe+$2IF2T7@Uzb&sXR2Vx2@*oRa5qe)te6PdWZDh+fjMM{-|G8>pU3 zv@vewEoDXiUZx+}`dvN>Mcm%UpEb#!ins3HH^ev#HVb2mac;+#M#_dMBtuRo0s1B- j0l~Whi?>W~gQK}MB;vC0?RvPvTQ zvKafE@ywZvlM}f>?5k-y!f(?N2njT2}?otAou%mT4IDC&K3VjW#( z6pe>N#3cos50W99`_*e_r7@0LXZN|zz2Ik-I+}?fT!29-~7;h{UKoNTbm=lpqv#R%lSGW?&qH!kCM-WCG&@!Ue$5 z6AmH_{W%HkXy#yF(VfWy#2+nf7g`#In^-f(+!zrEPn33^i6UVq*Xfze@a6hd#jPF(zl=0Oo%y~7!Kl%N0SMe z-Cll$0gwa%=fFj4BRR^%l4}iJ26H8Ti?LUaji-%hx$4vU@ZzB2E$&UokUiglE-p>_z)?N1%5w>UW%haa$imx zsGd`_G48=z%F6t`Oh2&oyM7c(-QLHaH7TBox9+bR;w}$14|f*h!j7+v)D2UH3^|zQid-5grG!)}P!*)~%`&zD3$Jb2UM~6ij(6FFtdfYn zu*N=TJZElBF60uu8xKc6CKECrj{e+`zs;o4Kysyx6KF@BmU@NE60s~O>V0xy9bKgq zjRph6B?X)hk|CS<^=oH^F^+0i_nFST;8&J9nu;J??F^0&PO@-Cm82e=0gBX2Wn7?| zcdWrOP8-bVEm;7dAyca)lYy*6nv%zTa`d0<9JN-sPb`yK)siHt-3X_JLQ?xcH3PdP zI0)^HGnQl>B741dYiZ-Z=DtZJ7dpsrh)2Fdnyucx-P(6Mq}w?zT12mRjA+(d?|L<~ z5?VMK1V@WQ8O?IjxFn0(&;WE8l?7=u`i2sO!odm+u__rDM@blSv6W0$Q%5tJB= zIRp&j%}HFU0icepF`n8%tmh&MQ(KWyU}ou;&y|B%RTz)pcebWh^dawc)Na*Lw^I#8 z7255biv*Lo)v85dr827w^o6w(lw(~msn!++}|89+sTIqCF-##i=4I! zH$ZZJMfqJ)g*#LsBRVq;Y@Re1LEI1XaQ#)d=aM#J%#%qQ1h zq4$#@;2gMUYa~ahSaGeP#bBhw-+bbg6XR(kT0S;udDuUxx9bn!GGKgo>fV!MjE733 zFg#|CT_R3khUFs9VBAUtGc%Hw`RZI=tdhuz1M*zX4<91sA;&*Xp_gm+2?AewXh-5x4j8vnBZ}@z(vvc9>?tW?^bE&h7ZpNZBrhWXS0(K-Z)s iAb3(>@s{a*a5T4uL|hiWe2g{o+5>28av?OA%j6#yfpiW4 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu new file mode 100644 index 0000000000000000000000000000000000000000..4502520f292873fa8db3f594ccdf064c89438253 GIT binary patch literal 1883 zcmb7F+in^$5PjdTaEVm45h;RD8o4x5N(re{lByu3ZK^cjMvc$L%ed4o82k$lrF>Y9YDM#tF2cPD{N&W{y}96!ktkv5qcs zibjJ0;*tW+2g#5v{PMNEWQ?Q6*?pl4FZh|Ij^-i=*EoXXgOezSg3}f=dQD~kXvy3v$z&i4k>}*`J=yzDHjY{=+$UB@tr}St)vkooLLsSrpqhc* z3LJ#?#u-bBE|C`(=bgEY|JvI&kzD8?!!B<5E@^izw!JQVNcoJ54$;dEBbv2)%T2r2 zKntOTqd{;q+g0(bUK*DbQ5_n95~GSBjYi*4hEUj9p&?oo1LG(PV=mT`35*X2eSo7U z94HLk841m3=HR`eJ5vOVr=_~voBKiYIp8?nV0&}F?YDcEHTM4=3#XlTj`A|;cm(ccwh;>p;}JY3OKL?QQr|?qZv56m zQHLhfF-R~8pYu-LqOek#MFIN4(h15j{ti`t#rZF(1$6_H&!ncYdO%xlQ$-x){xLaa z#t|H{Up$X0f@V0(3PN*bXehCDmr~%9J2Pe2>RC-*$Bv0LHZA=v0h5hX z0LX%Xec+!L^1igSir)<=Csn#?y;vdEckiX8)k#EMH+xz&lCgoM}*7^H}xGRD!!kxvqwBsuyRl_97kkd(kzR5{I haIe7PEz{fJXlV_JxJ>pXubqxRj literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu new file mode 100644 index 0000000000000000000000000000000000000000..2ead7f5da933bd2135c40afdc28f4dabee499769 GIT binary patch literal 1883 zcmb7F+in^$5PjdTaEVm4QBuT0)5uLDr6eJh3aJWG`eqs1fQ8q#Y%iC5eaCB-5Sm0( zg2k-Q8PA!Uvm-e{@A@~r>%oAGZhF73$e&i$Y$7?;#tF2fPD?#SW`bB56!ktkvW`v* zihA8H;*tW+2g#5v{p_{9WQ?Q6=|ibYFZijYjus*a*EoRVgOezTlG7$LdO^kjXv)GW$z&i?Q50nRo^1RlOGm90?h`AeR*fu+YUjdfp^(%*P|d(@ z4Guzk6FplpqFFniE!%km zO@$VY2EoyIUB|P2XNNjQmScHS$xHf6wgRH+->J#yA`4CDDuVD=+G9h+f%Z@aOiQyA{;o{WSU7GFN^ZN$RDxChVSj9PI4sXswyo%r1k z#XdBl4ncxR`1~7OVWl$D67+?c6O?289jg9{^Pf@+>bj;FOHE_-fVRS>irC5hHaS(s z5gf8vJdY}ZX4uUNLK9_ZsIdDYQ{a<3V`bRUOl=nvsyqOn*Jqu+eyM`AY7O#-G?cNo_Hvbc`>~k`F=u_lA$Yx;exHm6mtSE3 zWI@0_aM8j@jtVj5T0@t?T#3(W?A2rA=|!}>>C<}iVyEIRKY+`C@!_d~ceXLUQKEne zGIZ=1aRP&^5_tgQRw@{rku<4Rhw@^bL{^-VhjOlch?K_)za2y`>DVB-Ptyjf=NK)G zn|n)H%HPZM9b3QZOCiPWef(LF>Z*9_{7pmLm%*0d-eO$Y@wJhB?eNlHzjI7l8nqRZ zLSlT+o$t9fXD4!j-i?Q&ACn204M%^j$lrF>Y9YDQ#tF2cPD{N+W`S4{6!ktkv5qct zibjJ0;*tW+2g#5v{OUDXGR9Hk?5@y-7yQgpM{^N`YaF5DgOe-nF+vMhM(0mR!PH(Wix!d-8mo?@8Jrxc+?;Pc1qVWno1tkV! z1_8tP{va;(08qzP7~k4KtmhPkTU(=%Fw^|Y=b9i^6~-fYPgm56KIFZLF1qno4@DiC zP{$y_Bs}Mxxn^3hCwJz`u={5vc^%s&me{QHw**W#^5H>=W-Q8-(@t^& zD3?c6-vw13VCCMvAu+C!H!f6=R_%cNl!hWURxMM>v!DBM5HmJ596~o5j>lv+x%>*f zp9KMO&_!z_Im*S7YYi<1BPG7eiC0aGr;TWN-=vkWe^76iAHZe6`0&)d2V#teO5`v+ zrjDH=PGE+WB#+RzmI`KOR9cj)WA$Q{M6Ea=kJb6%LsWSv@sCsJB_4Yu_i5BX?VO^G zarfR*mh$(q`hl(A)qRlS_C9{rq`W2GI)C2|w?(i;xV0FUc6?=|YL`SAaykvrH8}|g ht`u0jWqKPNEv+FDm&uopv1VSo0FBK~gywRQ{R4YobRqx% literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu new file mode 100644 index 0000000000000000000000000000000000000000..94a82f8292420eebd9ceab59667d037ebcf49a16 GIT binary patch literal 1883 zcmb7F+in^$5PjdTaEVm45h;SuZRFBODM?7B0#!gt-z?)bV8v@&z66r5?|3hStdfYn zEXF=(JagvakT7{b<_2xo3_x3 zD{iUh8CpEnTGpG|CLu50oI}9}QQaRL#IR2840O9|YqV?-2$7N6$Ej z&i5C@x1*W8eMuj6@EFgF$R;;;!}d$SVZ7n?=59Oarsoaz|6U8Xo%Rm$vW(*me5p!Q zV-AX8e7`45Qvqt(8sn)M#&RxYn3_fyC1#d?``S2&MTPMQzOywod;qC$qtjj+rTwN9 zO=#Lb6bU8^t4)jC2&q>A;`3|EX@>DnsQ8O4`z;iwcBu1(P&8K0Efgojmz%OG_VT|= zP?2>67gdXw$|7in{mdY=l$!bi+jOfGzjSFKHM@UNm)EglVx7$kp9{cjBc7flZ^z0k z^V%!j=%!V+c8evI4uCIfv`*nTNH2_h1PNx;huB|@e3icas%ApW+gN8H;OJ^RCiBVp zcNhSP=Wq^O^q@tCa=yxxf=+|E62HaRtH;LkMzmb@YJE7^tGe?~;4)yGe{SNvV~npP z&$;q5%Z^b-V2~9-9>BO35(Z}^EsNEmyjUxd6{qB(oJ;2;<*C5$hS6(rRLIPg69=m2 z7;Ut@cZRYuf3MT`Z2hSpg;KYV@#ldQPsMxpS4}Yu-UL527#DV2ZKQ6RGGxf<%tP)FlCYH%Ri;r~Q literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu new file mode 100644 index 0000000000000000000000000000000000000000..0d17a91eb56edb5edb0b1d29783bf4df6d5d82a6 GIT binary patch literal 1883 zcmb7FU2obj6n)RH@TfFx6&)flR(V~ej<$5tXs3eIy;;UFVBy%7KY;ev?>ZqgjoOO6 z1Y@6beeby+PL9MJeHdSjE^lwi^lEf}LH>4=P6vsF(w3tQwMwW3(sRUum(;oB$XL3_ zDH;ujC=(QL&I^W2aLc#ON@E#noje2;9M4V+wKV6R+tvXb=dED=gi1jjI0F=+m<$<5 zEoWJWg^YHXR%R@H{8`eX{ePY%Dc~+b=8#C0j`nqg`>UxI{uZPZa~Z z71(p-wAF@$J`tyxEkzwWk6ESDi}yFTfrr`uk-9rVedzgwh4EjNrP)=QV0UfMzn zuDGS1XK40VX<2P*n}iq{8bTCW1~0TiKTv{@e>7ZyQWXQ^7!bx8Ukk=Dt`W`vj-GK4 zo$t?xZ$~q=eMuj6@EA{v$Zl^QhV7St!+68(&BOMrm!3D+|9dUm)=LlavW(*me5p!Q zV+x94ys8P)RDfEx#Q4q(V>y>H+?hrgC1#R;``S2&MTPMQzLO<2{0vgxM!mF+2K}ZK zP3WM1C=yH-R+|>N5mGM##OIfm(+uOEQ1KU8_FE`U?NH}4p=hk0dnitbFE?dX)bhVi zP?2>67wr}=l||4D^~@kNmzw$l+jOfGzjSFPHCw%?%j?)NvCgK2&jnzz5l>H&w_|0N zc}+_TDS0U8()mbvD)9SZ^jaJ{Wai3= z1J!ekHrlS7p{&f`>-3tfKh>j9>h>}ItV!`yym$YuDei(d!QUB-3p=hdQZ-E(GURmP mp?7lP5xgs~c+d3KTUuB{JT6OLKF6AQ>lM@{IpQi4^W-01b#&DL literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu new file mode 100644 index 0000000000000000000000000000000000000000..967b0341e5e437ea30f1c1d8c3a8ae78486dfd83 GIT binary patch literal 1878 zcmb7F*=`y!6n&qsaEVm45h-FRMJ|n$l7v($Pz9v)%`zSX79QL3!jiA=^$e4cQ4-M? z#@Odv-*Y!7XJUaqjBbY4<1v}t4DYYV-)7QiAhA^1a4l+|X54dIJAvc870h2yDX0TyfFcx=A?2v% zENifk(gxFNL*@Wzh|EaAgeOa$W#nm}9Q-F+OO4_73kyQYT9QPyYi<>nNGRv2Vqmuf zd#;?e+K|v8qStG;7AF2{?wZ7M8RE9B zdY+>BzKmqKscjOXE;IlQS_Ut)LO)P~kl!1wAWkI%<0uE?ly3y%7#|V(07uU_(B?aH z;_K1O(Y~a69X!UfT-W&SVNibwIE^>h-97C3y~_&ue{Y4$+O3nEEM&ZaFM<+-F@u0% zygrIcH2~DI6~+@Yh_zfqVPYyW3d}V7_O)^lD+=Qw{7zTY@IK_dj@qp{>UOH3s6xA) zQ;}dYw_3HxjgWd7K%ZY(PE(A3Lb+e1>2IMpwF8~ag`%-|?x8Y4UuKG;I73V%JMe0ODwZl?sE>9ZpGuHsXMP6Hl z8z4KsqWmtXbO6iu4h@NMnZQwyUKqKD08FV5vAHVQDm?pj7zZ(HBb`F{hBu=TnT;>M zL+>Y^!#Qx#MvD|>e3>c*Z3ZJH{^k>}oEXm=(ej~D%ftRr&0T&1mjUDaa|a(CV|*id z#+9F1c8*d4Gb|T*0^?Rln3<8Z$XBQGVx>e@9FV7SuAPsR#~lAOiC)ujK&Gx3H&8q0 zXshko8On@RB;09T`h! z1x15?ALW7q&UwL*32y$@S#p-6#_3~F!SVdmP)iH$xosR^ zTyaZ1&(UO4aanC@n}wJe>O&S<1~0TiKTw8{-x#idsmcQ5m=MM}Ukb)C-ZAI_9KGNm zJKvcQe-hp7>`S`Q!DBowGrPNe?4Rrd4zug8Zy(paR`;T2{lBk;x1F^QiZbQ#20pP8 z*%$+37%z5+sWU(=n`3-$`cck_hI>;>BVk6xx39H>C@PEx@Epyl;XNq*3F>y@w@yVJ znox%z!6kgo+I5fI2&rcQ^7C`cX^!zvQ2b?{{}zf<+tquC0OExF3X@c^Q~&1z zl~qUZqU~azSp?0nTN#9=Qd3`I>tQAGQ%Do3*R&36x*-uOo(+G>Kq6hTn~q2e0T93 z20-RHoP#Y|YLTOY&vK=p(_pT|XF2w&vGKAIJ#TxpI_&MJ?&1@K3>fEMnt10J<7>$a zuKdWdW0VsZWMz^ESlkE+gEJaU%hjQFQ7h3ZPRT=UuAGl1PbL0&7`>Lq7MZ&=aX>xC zXr=AK8Ol=qUN_(I^{2WN5^o>l&yti^#e47HHpPALCir`Uap}iZPO7F!DML+19(t!B i9>Ki=i}zfwy``l$#N#r>@;Peety@r=?1-yeOtXLR1a$KN literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu new file mode 100644 index 0000000000000000000000000000000000000000..8db5b1af5d5352ede9e2ec1ee3d9aad08961e2ec GIT binary patch literal 1883 zcmb7F+in^$5PjdTaEVm4QBuT0)5uLDr8FUx3aJ88`eqrg0Sm8f`NAb%-|?DFfF==@ zU@`WY**Rw}&W^+cz3X51ehda=bltnZB7a(0vx&r1Y0J@)S|!vJ=?P-NOX^&9WGtPQ z6!p4YR0s+<=LJJ1xY=uG$ytFKrw>5|$BR=#EiJj{wsC-s^Hwl_N~NFquZ6d@^Mj&HJYK;kRiYXr zPz>YwmN5GYP|Ic*-JTKjgwMa?awDXk2H@vsmeT^`?@;kqMe$20PHk70W1(oQo_i=x;8&Vd6G(` z1TWewc9liY4BMGOXd*TB6?UI!5hdObOsum}6>|j`F2&=cuE8e(gAR{M(Y&zgY?44HApa|KE(do%U6o+=Vm6vy!CYf0`{)^eKHzceuV*$ zc@F1bixyfGsN~Z^Dd;qqEAd&4y?SgsZ$!_VUab!=wyN&(1B471=bxK+>lou}$xE*M z(6VDx5Ex_?$pb8IgoMEvjV9IVP`g+w(JD^KLv4OIA59)B{L?UciN^+6xHNH~dXCXj z+qpB8rTo2YzUAw8eJLc}-p8K>sjiB*-oI&zyWmamcLw9ikE@;3O_L}?PKO?Prz9T1 hy#kB3T(7*Pl{duWGR5*S*34_Spf=eNSB02le*wbkbm;&9 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu new file mode 100644 index 0000000000000000000000000000000000000000..e4a28e3ef2a8e6b51318a2fbde7e0271426beae8 GIT binary patch literal 1878 zcmb7FZBN@U5dNND;T37x5UgtHSmkv{tU#Hx(ur1L-z?X0TdR(3`GppK{myX%Ap+D% zY2u!rJKsI`;_N^y(1-Em=;G#v%q~Z_=j3lEYqybDDs4I1P^*MmBE3K?cuAeh4veMC zf}+uIh;l&z=e%IZ1h;x?FB!{G>-a9H;COy)sHFw>+_v`6ao!5%kEs;Yfis{Y6q6z6 zsO2ncvyjs^)9Rkg0niqOk%9?Nmb@s)lbr1QCmTzR;r0s)LdjN^#c0>uDlU;w&Qry} zZUy#SIc>Eep+`i&f6`r;_^-3=5X)tVV%H;`Zhw2)gNKywxabnK+AyM6f3@ki(2^@| zspmPGKU7RsLT$4U<3dA-LCfHUR_F)H5b_U(E8wZpz&M73anA1r;~3Wz1^`FTIEc;n z=ENUGGc~!S4?1{^XJuSB*LTCCmw^5BhTH4A?Vx|wQ2yUj;jokLUQVVk-oPhPA{jFv z4C8f;mnH$!vK7V?GmN#ISeTeb7zr~izI|oYKOX*3q@n`+(KnSe1%CxQA__*g36>L zIB2(c$t;3SsAmSDh1ArS*rr>F{N&PHYPNn+mRGS|VwufKpG&}WBOV_mKZ=E!^4d*q zK-lFG)ptRq16V2BH6+GW0>?pmVdMi4Fr_}k=4xasdG>2J4r12EItThjm*X*+-JE@g z-p@RTeb7bsTI8tU%UmgFGZ-oHT~54eVmu3?`Q_c<=mOOH6_{!A}gvr5#rpsaht53^|>8=$wLh i1Xl_y-ZQ=RmX_8KkIUrC=U6juU4YtT2VCW1k^KV{w{!>q literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64.cu deleted file mode 100644 index 755c024e303b6554da24e8a723a8a13bbb16bdb8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1911 zcmbVNZExBz5dNND;ZbSYDmqliSmm-x9c!7iwM+%6`(_#2fQ4gQj?GJd{mu!YDWhsT zDG7dgp5yNE-8nguQ}piUYViH`mW;0kcbDYPY0_#TIn%}ow4zQ+Jws-SSn!JGd2(bO zon;ga`hCPD1zheWLpJ2|=k`it95qhvgATdiCzd*z2`^mZ6`b5V$@~dblIGwHP^4xm zaDf`RV=WdqZ84*lWCDPe%&d}3dNLDPMt19D>pxjJYOQdeSddyZk|ajE5Kaq)q|QCn z4D425FLZ94u_UBKo}G2tQyXWJ3+<)f#N|9Cr|q-#_9fkP4G+vnn`qWe=c{hoKr^9* zqh4?{*;Gnai^e4(#)bM2gHgdtqtRECAmlezXwa!*U>w82n2V)k0^=H?2XOR=gV=mJ zA-);S)YcW5EZdSh(iXI$%6IGvmS@*%+ zw29&i_?Mty6`oFeA1AC-W)>hsKX-z1jK4tCZ$Nws?}M*Rg`i%o`Uj7m(3PK0GMVjM-YA zY`b)%)83xfZs1)2+|z`u&hREEFRj{u03+%{EV+wwsBtf#E5HOTjpQg3Gp;pM6LgOF zEgDIUdWk2EzDCi43~M$gyL@2{f#&?=jF<@UEqT zHW{hYV)Qn4Ooqsfo$+nV@41h}hZ<|wjt<4Q#q+%CIWa!Sp<}c%ZjoEcszZw#hfUS& z{;n>FQlt0rXGw}{<7JPwH8Bj{20yeI7uNF1R8=!ILopLZ9ttQU9>IMBgWB$4Z?D$g Z(ZVtkaaely@%*_P%^I5=3C-m+`3v2oad7|u diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu new file mode 100644 index 0000000000000000000000000000000000000000..db1db695aa0898639b4af23e2a9e9b9196398622 GIT binary patch literal 1909 zcmbVNYj4^x6#brG;ZbSYDmqliSmm)w9c$UN0aHQhzF8(VV3F9CA3XZ&cO4T-S=+Hm zdD`dP+{d{$$%&YvSN-eWm%)IHuY0#wsU5+w&WOyN(Oggq&HzQoEcc9~ znl)J?^Ncn!C6{CZfQBe^E;8ZBj28vjt&^?)WM!z<+`P-Yka;ahLbMBRB+rqM)=`;( z-3siuw8|(=d`iT{MXNd0;Y~0uop775Y*TXHyjX8f(oWZKz#C~2nYGjTs+|(J8@p(y zH8kUr8|pYilTDSF75LP_)qB?!5V<`NQCF)$8sVa)haWE|r?LI>dJ z0S7_4bV6J`nAuy;=|*{n@wg0d&|QDMTX)*&W&JViD0X*!JNo_<`~N-;-)uGi!Q&X& zGpv}Wke@N+5aY#Ost$8PO*Y4PsJlVuF)>4ZkdTNO6(8OoYy`Do+=J_APBrg<0_v!` z9H#XU+)nE#oPhru6s*G0Y98Xa7P*>v2+_?ArwrrI&<$6N{ScB<(^bVpNE##s;T1Z{ zWKR&gT9pDo=)4^RM9>xc_4xCBnk(u`)}c=^`O%+AuCm1=rQU>Aipm?8E|-weO5ES) zydKmVE8C2Yw3^#`?H-)*fO{<1s)l`^JE8LpBw$3H4=s04u;|>k?X(Dm>njH7>s|Nz zWH7$`2%{l!9QJ{WmP#;G@EMa5nhz#UxRwL48YmA7!Sc3!tIf`y>@MGc%Yd=&p|AJ0 zF}}`u0rO>KvQxwejJPt$BN#U#hmjgc)AH$9UQkM4h530b=ZkfLa$n+i^XaKLwn%2< z5CS=!qLnfWt7#VN_cVRa*3as95%cys+?J&LiM;gvZA%Qj)7}j=#-$xw8L3((h72_w nIp~~%I0SzjczDV5+8J6}Lp(f2U)~1IJokT8IyvEziD~j1i+y}4 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu new file mode 100644 index 0000000000000000000000000000000000000000..f6b7d1e7233711c689e6f98c42e03fb38f2600ce GIT binary patch literal 1878 zcmb7F*=`y!6n&qsaEVm45h-FRja(Wjr3tB2pejh|n`LYR7BjZxv03um) zgR#%KzUOX^F60uu8{dt7OeSQ0H~Mo+{d`Bo3Vvm%rHOFDHqPLr&PwL4Xe?<8&HzO!7JDvG zBekr>Jf|(D)s`#((2|LXC6kVJ}LYsJ4=lb_7n3`#f>NmYB$0v5hJNmM->CR zCD;j-YO4+LJtD8KyPc&8|0Ea6Np}d_X^*r!*ZXc~-|vxruUK@5TJIQ9tleMl+WiJv z2_-Cbf}_QujAgm0ZRCS8Gz1ko_D*VrzM%*qcQ8Uhq)GO4&!;IZ1V6pY@Pzn;|=!@kNZKpe^X=s@3nASw{w=4xsF%xc~n9) z<`6NAH~s(P|kT6rluw%$IOy1pKAx9s4yPEcebWR3?TJQ)a^7;zgG`M z9op}miv*LI)w)GtWUN;n^trValwbP*$1Maju%j}JP?4*egpngW353N;8Rqoo)VjP5&jWvhZjqb){GN0Uhh2D>x zfOFuYt(F`mV#Sq$CWDa@zS+bpC&u$euw1lhc{r#P+|37Y88E4P?%&EW#&@wuV1Udl zyF{G849i5G!MK+(%*;SqW~*~~p-ciR49IgiKT;PcPZ|C=hhEZAAbFY(8z`Pjw9|Hz z8p`tgy-csz`dz*UdEDNIpDoE=iMQ@A`eEvw@os7`&g`V6k+NTM$&k~TgSJVCL-3@) e;w{sAXK7{)iLlIl`50>EwFgj}=t3wim(f3U)^l|L literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu new file mode 100644 index 0000000000000000000000000000000000000000..1353ffd1766e38fbd6f347c35c71a8cc943c0d32 GIT binary patch literal 1878 zcmb7F+in^$5PjdTaEVm45h=<-Y2?yKDNRVF0#!ju-z;Mru-LUNugxW2-|=3^W|c(r zWij?ScZKPD40za9O#A%8EUP6x@A(pI1ywMwcL(o4j=OK6rwXU5W1 zO3`RIL|jt9Wll0=e71h=tT4t=>*B#Hp9y|psHLfJ!nRJ}WX?+FE@&cY2F?IQDwcRI zP%E>n!#t-Qrqz}#0ML=CNhFhwtVEiU<31_+h=;~HTAaP)+O z=v;3>TsxSl?I-l0y~B8(M>e^C7`C4SPU8*t_YeC)H@&D989aRQwI+zoZh>4t2VaiiYZe*iuteL@oEn;R|;biDsK;HF+1hCDhnF_csU3cJlEt5$#ZwMNZ?w zji6$F$8q7T2VA~aHYCPX_QpPO(j*6neny=St<^|X;o8q~9E6mOH3#iRx8pIHPp-d0 z??+C+IdIWdOO8^p;z~i2!AJ?;eBxCT<7p#UF59#^9MlT#`UAKOn9M!(Z|xZ4+eD-= zKxUSmBTit3j?PQgas$YtbA*VA3ZIcp* h;7NhSTc-EU(%c#nVOjX{G1Sa!51=;DnNVCVqkl-=b87$q literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu new file mode 100644 index 0000000000000000000000000000000000000000..dbdc1bd6a0277830c0450532ce726525f6c09b34 GIT binary patch literal 1873 zcmb7F*=`y!6n&qsaEVm45h)5#id-5gr3tB2pejh|n`LYR7BjZxg(Y9#>sb;qN+SBg z82g;-d+z4sOfJ#8@y+PRWJ2aQqd(W=?`6_zA-Pi83bdnENwq?HiCFL{buKwGmaa02 zM#CZEk^;_o$&d+d{n}Y!jHAZIeNe#(eqpGknef6kPT)9iCG!_FmDGVVK#_{2feX}d zmbF;mw8gaAk_7--GBc@U(vy|QGV-`jO8?2uQe%Yu#DY|5BT1s#jj&3jNGj*4Vqmud zd!d}R+K|vCvfuBtmnQy`TqrO7A#S@axor3Mo%X)hCB1IBXcM*GF``(1z3Vs7N+@Be z7aT1PRV1rTZIcjnp&@9{Y4B1j^bI8l`GXM(;#4s(j&d;OVk?=z_=qq7IC{c?Hs4(k z-;8FC_EUP$!DBqnbxrQ>ht21J(|E)E-Ti*hzp9b{_g1*9(>}?`LdGljA}BE!a|js5 zo1?hY13)cXV>~s(Sj$BerluyNz|68QpKAxPqA(u8?`%zt7(m{esMBtuUbh~KI<(h4 z6$vJDt96UQ$W*Ta=<{nUD989al>0U3zoZh>4t2JWipJsrZJ8+w;wbfx$;lIr;F5Cj zJgNw~;5aJ?EmKW>j;*^?1io-*k!rSiR+6`|Rbq+FbANNdY$qQcQqhb>S>&``xB-&$ zE2^)z0Z{o?*^3xgxf_SnOOqZz`;7V!`>K|x!m*#_B!~$cYYvec-HgX%KDqh|ZJ&4n z=fFiiYo;j2Gb<|=EJTU7Ec?|a@nKR;ozv;u0DXvfN}n*d5?}UzDY#}vtwr2 zIpPGyS1$4d#;r_YTt?C|U!BT}MG{$YHlE7);e4b#TarHzZ{1&Z!!&pk{M2Ba+i{hVs#^-lkkgrmrpbs$@S4El dEz^5%X>JXPxGa467;EOWw@;hoOeijw$v>X5a*6-| literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu new file mode 100644 index 0000000000000000000000000000000000000000..a7f2d4790f4e2f036342eeca3d0b9345b45e71bc GIT binary patch literal 1878 zcmb7FYj4^x6#brG;ZbSYDmnyZtnyf;j!`;kz%-D$Zzql$su z66}P^wbh3BHjy_s*R8n;|0Ea6Nw*8zd7Cs_H``7dKBRbtMT@BAh7rY@o#m$4siB2X z!cr$Vn(fM1mYdopJ}5&yP@z-rq*mw~N)U28BNRlcWMCWwVa&x^GJ)|Cp$l;Ij01J9 zJtM9j%pC2fbf>+;cv>hMJv{d6r-1W#z3s!}w%hF7R@ndhT6o)a>nty$j#uz8Dj^zE zh#1DJqrg-nKrLHhJT|>h&M^#QQ;`udlkCgq%0Vb9j0fH%$;i4}2_`$KYyj3ao- zesLOA1dVW<6@=!grmnzNU5bH^?#xonRwp%iAG#&f*tGDs08BRW=_wWUP?a&Kt>{Lx zv(MCF;j9PTX?d2}8~W5qlkPzMggPHutCFhd+Ryzs2q_zC4zU~bheI+Q-F}7MPn>{r z;G(sb9A#p`m4YUNkrJN8#49Jp^G2||Z`1OydsJ|@AHZe6&>m cOmCf~g*7C?GWzl{)XZxapf<^cP+ZQFf7$1Ai2wiq literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu new file mode 100644 index 0000000000000000000000000000000000000000..7c3a1ef3020ed7d56ec10d265e1b8a00f98f736d GIT binary patch literal 1878 zcmb7F+in^$5PjdTaEVm45h=<-)5xWfQbI_j0#!ju-z;Mru-LUNugxW2-|?CyK$D0{ zuo(Ny?3^Xe7sj^y<1hxxdmB^4(g`Gk@m1s_sA!y+YWzG6hNW;$Or(;Auy zB`kGBF=Rc(q)OK_}k&1@u0dBd86|tB5U2@8dBY4Sn zai}VSM%d2^LeorBS7MJzM&V;PlT5S4K~3I=ZV5FuF5@i$qm_Jm%0xX>Wz1z-2%fiXS{=6c3hw3ugbbL%J@@b4F~;3YPq4U`8O+RJG%Z)B+J!O+R$)M%YI9e(VDePrcXQ|^9$O?Y;;@0@IY%pP7lolL z=I>?mJzu}7`ylG}KKv|6c}u+Y{%t=Dy))ho4aTLPqH^zC-jE2(7|X{{Gp}8M+9YQ}aXC%?0<7@kg6c1Z^`XA1V1y>(n5G)8%O9kZzc0*G?&zYGoT_B%L5mv z;Vf&hz-fzVwI(wFv}9p&$)qO>Q557+PWJwjjits2`-ugq@H`Oz2QRfk-%y5--x;AGPL&46kq*XOtR)i|A1L$yj-GJ9&39(R zH=~&Yc}{mac#Nkdukr1Dzxf<+oL+x>d%x{L4!l08q=87~h$GtmPDiJ5!^PFq7iT=h{ZBD2#{jo-C;mJ;-|#U3TKH9*R0Np^ia< zNq8>Wb&JBtTrUFf^Ghoz$M`#x`!(mkq!QHjbup8Q#^M2Pg-Hc*kow2ulnF<0$bRuW zstCH^Fe?bnb4`7Tt-F*0pWK<{nysFdQOLbC`3|^Gs1LEPYKcmY{oGH2n6Q!N5V_%CG$PaS)mLcy%nR5D zU9{Gcqe3jWQqW;AP2#&8cGa+W5=6`U9<4Te2jzD40bB-*^H0rtu#NE`7X{3YiDjpV z6Bu75$s;swWDespD$UE)v3jvcqE?)Z$LjoWKB_#F_{Smi5|2HSyEJK_bWYJm+m$nv zrTo3Deqif&brqzzy^o(YDKCh(&fj;#UGOIOJA-j)$5lqEZb_6OrxOoNQxK2fHi5-k crnlbG(i#$RnSA*eYv#4PPn+ySC@$yOKLU($o&W#< literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64.cu deleted file mode 100644 index 277b3308aaab5b2883a3b4caa6841c300c839c1d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1911 zcmbVNZBN@U5dNND;T37xP+3J9I%GN|R!S#r(1})J-z+zFTdR(3Id)(8^*bj`30)B| zDKC!wJeRxkbLZ$tOwoti{^0uVj*R<*`z!M2G-|bwm?>>JT2ZTnnjt+!sdtj*d30nf zon;gadOgGh1zhe#ij2?aFRhiv7;2n6c;$1>P7Jj)dEF-&lvh|;=EH#GPFR2$wHlip*yWm!FiG<1> zRVmo5z>ce2TWyF>h&VrQ$5Ru&1mnsHw+YjELgKT|`ZPWdH|Zp6xWXtVD(ximRVQhn z8CTp=#~GSzs!Uds+D1Oag?bQ!mfi`i&^Htzy~B81#C6wO|9Dt;JIQ79IqXgB-uiy@^AP+0eh&X^$M5jCjO+zg znWvDSG2{^AMJ-kPIiWV4V>~pyAn!6UL$jBV5;Mxap6x9JkzqW5=V(q1?}8MX2zI}C z?5A}f+)0`!+<<=#3I^e7$NM;Lgw!(+A-cKclwtfGir|W|UqW$edpes4MT4Xuyv&p` zsR?4YSA_r&I_;DJA}EIXe*F7Am72POweM4z{L-I^)al}xQg1>9mC73zE*Fr|N<2PF z-VACjcQ!5^X~*0B+6|obfO{_3Y7cL{bi&9DBw$3H4<&bb0X6P8bOrc8OD!16_>3t9 z)dZa*Jd4IwHHN2&5XZJ2tHo|jN0*r-ws@9TJtxEm8ahTRZ5O$rX|-u# z;jpQi-QU##QRe7lcr8hBY`pH#wkC$&8SjP$dZadiLy diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu new file mode 100644 index 0000000000000000000000000000000000000000..65b56c5e7a92bcded8d6246d7b7ec831a25c6036 GIT binary patch literal 1909 zcmbVNYj4^x6#brG;ZbSYDmqk1v&!o#b+l#EMmrUx?we&|0~U#G`N5;Ve%CRfl(ik3 z6oO-)b8{c(-XsTNjNbIGdY3mhWO&uPyCA=glU56fiByK81vOI02~uN}dsk3vlLKw& zq@<|V?II>9;H(ol(%w#AI*Y~_Y8>5r=`CkRni^Vi$4z4oj&(-l?uZtGT5tv^Lgs~M z95t-TTe)Ynl`A1$!4Fd{U-}UwdUqS?u9HGNfM%+aU*$wgtU&z z9PHL$$E8(9Y2s5NPER}Su?|myap{Cxg>9RX_DQxpZl8u9$sKPs;B>IdJgxpGV2??ti7>BqpW_&Jkj`0@Z z4B+Sy2SK`YL|ikN+1fAYN_mIzunO>|yZm~;Jj>Ga=2O^S?C$bz@clXV|GgaE*=hfS z$1$=OSTRo_KSRhN#b9HKU2v8*QTPG=Z%{A@SEs#;<60DI;vqygHJmbxzd$!!F!n=8PEA*pBOz&! z6ogmmD3dKgY-&{r0HNb728f_5w(Ifd^SDsdRjgf~V)COuqeA7gCrZ5ztrV3vtX!@j zgN1l_D0nldHCDDA9qF{!_1Y|)@ql|O*t&*&UpS$Q6(nFloewRyQ?TgVxAn9Lh3hK@ z>FZtf`{ZVL{s~4y;y9cG7tNJmsN@qSB{UyQoN%uOVm(kEH-hDL`__kNTe3TU2QCA~ zy2rlWI>z{_;3dqLfyoaMBQWBsAopNgivmV!AdRclzPzB6zzXwoU(Tg-Ri;%hu2Oa}o3QHvG*=^$~gP{p*&v^-gA*qfl*A$U;=tlH(@SS)WexGLjJ~`NntAE_sC06`B@^T1H=q-IEdT%j literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64.cu deleted file mode 100644 index 5773bc47ab4162c947411809b126aa3e32340a35..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1911 zcmbVN+in^$5PjdTaEVm45h)66Y2?sIDIr8^6RLoezFEd=z`|==UYkq4zT>@+4JAra zX?OAEoMC2sW*i-f8TxQL7+&4ok;!0qe@XtFMy(bSbEPduYigBHbEId8c_(R>MMuWc zc}mf+-$#j{fXkd<$oOpW(q3UKL5-6KuYAUn6GJUcx#PC+22SRzVD5xUK{Id$C_*vm z6OJ00Wi93t+G1L*$P@rAks2wOaAeNYlJgj zgPGd8q+9JB#*;jzyZ+|pVbklz7tQCeH?jMh`|;01?Em|7_|H!J9Ud2vy}&B+6!J5H z9AdnzrD{JX)UpM}BhwGkE)p{`dkHBpKulbX7mweMGP0EJIesoC0ry4cG6NWz#pAIk3H9BSNg=n61FD=iX~@_C{ZR1|cM z@XZ@r)fk=@LL6mHR-3(=mM%V(IfHa+=p0WaTLWP{kUZtejV(Jy34w-H`W^y(1MgZ$ zXp@0D%SZ2H2SEgG=#1}UUS%#2A8V{#OF9%^iBGbs?}Ye3L&s>X?J_fzRfiTf4x6gk z{asxUMUFm(pB2fkjn_RY>tf`c@or==&aGvYsj6;jhJq5t4vHux4#9l`gWB$4ZwDJ^ ZX>J+uFf6?LeE!mnX0(ZpxJtw<`U_;)aaRBU diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu new file mode 100644 index 0000000000000000000000000000000000000000..80f50629694a5e7995abde900c10ec4197d6a85a GIT binary patch literal 1909 zcmbVNYj4^x6#brG;ZbSYDmqliSmm)w9c$UN0aHQhzF8(QVBy%7A3XZ&cO4T-S=+Hm zdD`dP+{d{$$qApLSN-eWm%)IHuY0#wR$}Mg^x9oB;}#neaJ7 zHEXg)=5yM}lw6Vt02;i|f@j>387m61TPIup$;wcznR%CaE=4U#LbMBJBol~B>!{4Y zZUuHsT4j_bJ|+C(qSc)0@Mf6sVacZCym_(So}`_w;ea>NBr`3^85#vjGC@0CS1}Wsap^(gjnb( zl|51H>Q#yWq4RbO5Yz&#diRmZ*;j%%@jB#fx@q3sTe7M=UHofe^ZeU(EBd)NIw z8H_JK!e~ewgMHwlrQ$g%*esV48W1K|acRd^Myj@nAwx|^ n4!Wlx4#6J>9$qrNc7~SL5DSmdm$yMP&;1{jPEMH2`84?r5GH&N literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu new file mode 100644 index 0000000000000000000000000000000000000000..0429ab43b5cccd54ec642283cdeb42fe7d791fa9 GIT binary patch literal 1878 zcmb7F+in^$5PjdTaEVm45h>zQ8o4x5N(iY`pbALon`OKPEWEbmwYlW$JKkl_nY%qMhC4ZYyqk+UyY0J@;S|!vH=>=lmNt&k7g|T#* zP&DlKQ7kCnQYRQPK3%gKiOJp47XpH7fRNmC}g|lR&j}hN*z@U z{1#xxRjREv#CM3ezHYY`Cj7Hp`mmLDNV9dlYqxgY4(WCdixyFv*Ap2X4OhTYVS#Z-2IH7-1mhT&40-@ZFF44| zb>_s?gPYR6q*! zyPb29;4<@C^~jBodg&oQx3ZkZ82<#d-^TH8p*XdDoy>)zK|PNkPRK7YxhhKaKNcvf zI)aN1i&JJ1G(x#D2rZj8Hv(V~JQFP$)Q4-8DH^P#;e<;p|*br=VsY$F{5f5X9OL}ufg@6h{^ z<8ThPXro1p626R;f<}Xp6295QD<;OvM(})S)#9*M65Y)w2pKS`d+FiQF~$SQ6RzCU zvP%>bm|(o$|*Y9=nlCM9-dywI0F=+nDhxp zjm)wZ^9gM+tu|y1fR;#&6ihg>(Egl`1n7#|UO07uU_h|G27 z#5IGNqkT#D+B=MAc~s;3$A0rA;51%;cmKHSwd3m=`+u*6+uHF-UKVk@fiF}EYRo_} zjMqoP)D@tXtuUULekkQahKZ?#QDCO&x39H>P*4~T;X7SX!+VhWCThn`)a}%zs6)G* zQ;}dYw_3NzjgWfjAwIXVoF*9mglfM@lHWpcYWq5!3q?cqJV0?me5om_;wb-z1m#&r za8bE%T}!pdq+ig{RvzKOy-_@_~;npf#fMy zZfe;%N(jubJjfFmcS6F<45UTAI+YhnC9uMPJe6~sxj=c!@rPOTS{x-Z$%=sk#dD6f z+O9K0S&_fj=|{HyRPRBd+sE*;A^9uu-u-1yOuRGRO$^4lovboa^-K{m&cq6Rn2g3Z(lTYPQJXy@iuE`9egmz!;+8rd zqvf%TV!5epy%G+aT9N(RP33dS+t3C1z55C#B8&p6QKx=Z4k z!Axae(xdhc<3*-vdiOAFz66}c8y@Z+4uk$xjr_m2!eyQINlxZ6-oWQU3Bg!Ez%brc z;!+O)wQPg&%nU;v=TVrMnv5JXPriMv9fWGacnrVu4K;iKd2galyNP<;dMN79UiVZa zn9Qu!Epj8IUVG5zHkQ*EO_4a8KopBY5RrC)x({gYC literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu new file mode 100644 index 0000000000000000000000000000000000000000..35d1f344a6cd0dd9f432036726aa3c762b110168 GIT binary patch literal 1878 zcmb7F+in^$5PjdTaEVm45h;RD8o4x5N)l43KoyYEH_LbpSa@yAYjerhcf4i^tdfXI zuo(NC*}0qqx5FWs-1Q%C$lqqtXdp3H+H$m}RtYsndWM*HlID4GW-Og& z6!m*OlnM&C+zEz^&lhiExwC?~3n~T8!5N?k#iUO; zs^yk7m``bgX|*C#05n8qq+r64InOe(-zPi&$=Xt5xc$PsP_mXJA=)Lkic2I^?xl{)n#jD!avKU4_kSgG+S4jP8&X?c!oubsKuHQ#hRVPy4k6rIal0L z$5S-jmU%2UwM~483iTiYExi+3p&uwg$ZZW*kfoA=aR>(El&=Ki7#|S207ox4h|IO8 z#MOhDgMCT2+B=LVMO4Fwr(XRSa2l_-d3f4%o1N$i@UR zhVk+sG1UxE%N7`qOfN_|reS0%VMNS0`}VbR5CnyBAD-g{HM|R@ucJ;oe5tWM5 zl*RhJPCu~qr@Rkh-adw(6)A3s_ujwjiII24yOF`Tu#=ZY%AScKLrupHS|=k8!Ic7w d_e^h`rG+)b!!r8vIcVms3s9TnjH^`4l7DbQb8i3u literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu new file mode 100644 index 0000000000000000000000000000000000000000..9ac7388e3befc0c62f994bf55804f17c9a0b2ddd GIT binary patch literal 1878 zcmb7F+in^$5PjdTaEVm45h=<-)5xWfQc6gr0#!gt-z?)bVBxhbugxW2-|?CyK$D0{ zuo(Ny?3^&3TFj#7gP#bKrmn;6q7#V zs8(3kU_PS_rqz;60MHP*k%9?FW<1ZyZl7%9Co4;h;r0{rLdjZ^1hxxq6_-e;!coP* zZv}Q-720Y;e2a+d>t;GN;h*KwhpnPTF4ODvRSQ0(e1=6z)O^K=V$F2EYNj58HK+>V2gw!Y3lzh1ZBC*#4*SLpr3aX1HC zwA3O)IiF=pL8HM)3D0ukRTJZRBY57nYIWG&E4rHx5Hes2_uRvK#~61d&$)6V%g#|o zV1|_>Pq4TX5@u#FnwG0m?Lw&pt1uu>wfRxFVDePrceCgv9$REq#DN3FbBF8jMRnMdhUGnMfIOI&#oDIdKTC6j;3F cdhIMNy&)c!F_w>^W?s7hwMovn%EUDJ3qvw9joOMy zAu-qIT;FdeCt`stGD)|vJ^GW9=!57PtOdsG~Hm4fDA3{Zq((x)6X za?4uGr?kbi+K@Q_S|T%2FyY9OXBl~tlY{+aYpF5ZeqmlH*+`P$?V4M~B@!xkR58$7 zf*n`6w%QQiA)?#8XfI6oXSwu2ly}H^ySuyWz(b06ShR^+Z5dIlyV`aeXvr0~)bSL} z_hl4Ip|*(+KA}F?ucdcFEA#^;2)VuC3Sv|;Fb=L@obrue9ODY12XOR^1HW8nPFypn zsmLYWYws|g6+Vsc9{SCffa7@m-QB~k*S)Hd|Myfl?4o^?kH(mZtuUULeyHOZg^8)T5i!&3+t=Dgs3weu@Sd)y;XTNE6J2(~uO5mzG@_0{f=YNU z+I5ZG2&tDI{BtYIX^QbrDD~?!{Vf!ywy(3fP&5?JJya(6mzh`;mGnO)s7N}3gAR+A zkwwr2)yyEYkea#zTQ@5PKbka`nyp`yT?8;&BEC`wpeOhkzDkXRI2}}k|?w%XCvW@XT@{B7twd@q7 z1jbk4WCnYZpfZITnNQn5(>0h{h}ga7~l literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64.cu deleted file mode 100644 index d9f265758249e4062f26b659a19b8594568315c6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1911 zcmbVNYj4^x6#brG;ZbSYDmv7VvdU$ZI!c+ewM+%6`(_!(fQ4gQj?JULe%A@1DWhsT zDG82!&gDMNy@`&*6n(hu53cX-$hbeazaoE5qgD%vnbMY{6}3vJ8PZe4ypuG~qa$PK zETd@9>!DOoz~xRbWPCn<>8v!CqQ=RCS3c+IiJ_Ke+;Q7@11EP@Fn2pxjpY7Dnum={Vmq9{bW;8t;ogvuRN z4D425$5pPaHpC}HoS(PjsR?hEOCPrKgv4i^^=W(_ZqiBCaD`P&6ze4ORVQhn8CTp= z$5S-fR9UPxwT*m;3H88#Exi+3p>HTc$ZZT)kfVx$aR>$DlrIJ27}p40fTJfI#O0C+ zam`?+wlC>Mdx!D3i0Q7k{_(Kxc9P5HbJ&~Mz4iU*=OOn0{W<(+JAQ}9Wn?d~$~=Yq zj3I{@FKVgU&k40`j`7g+g0#!T49#9bO3WzxdbW2E1cvbdo})Q6ybDrjBB*}x*iY*| zxRW$dxB>qf6b!=Ej`wlg2&rctLUeP>X^Qc8Xn?CU{UsEqwx_d+P&7#D9`p(!W~P)% zO%%I&6(T_Bv{MF&pc(2l`S*P)HFX7R->Khks7m(3PJU&X^ z42mtQ8<&o><894$3uisxo{P4s!K{_=|j;EHbfiUh%o^j z??y-%lYu%dR_|j6K?H7?jPGM!=PnQ*YpmT!Iuzd)pXSxv3Gso3j?qfnMQ$joiWUwI zo2uFUU40&_o$=2);M4sGT16 dw!e0k7M2kY%hJ2g7cYI$j5g5`SE-mre*t-!aa#ZY diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu new file mode 100644 index 0000000000000000000000000000000000000000..55acfe28f84c8122dfe1396cc25a050214c46623 GIT binary patch literal 1909 zcmbVNYj4^x6#brG;ZbSYDmqk1v&!o#b+l#E2Av90_suei0Sm{r`~cctzw4M#%G!=i z3c<0@xw(&XZ;}H(L2vq3z01LXjIMgO7v$G*(rO_-mC7)*q(*W%MQVa_?*z3rInahq zONx5kE-E+$oOL`$+S}PnXVF-J8b^0tddrFSktr)nve_qV|H;x&t(p0ddoD#INkX)9W+W4cOY5l2 z!EOz9Oj>1>CO#$n^t98S=9{Lmey7_`XhOy{Sy%LvW}Ifhge}R}y_j2}15(GYQG685oC{FfQ1F=M3X5!WqEP zBMt&|>6o}?FtfEU=)Lj|<53mdpu75dw>rzx^X607UhMAbcKH1{_W!*c-q~sYgU2zl z7g#Y*AwMI?A;$BqRPE-3ntX=wjqV1G$Hd&|orFZpu>5$ka}bnOM>_3w%{B{XJm8*+wytB}3&*v%ha?QC^P%l_iWZ&www@NDczsns z3VT=mJ{gS8Kf!279D{S0-f8b{G{%)3TN|m{ sCWZ_(9XjZqk~jok99X<&dgTnQtRWVb(UV!Z delta 17 YcmbQi_n3D>4CCfxMjhtOds(8H06A3!y8r+H diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu index 50b324d0a50d6d033eb16a6f7fd400a6713560bc..f49957b0500741e26dcaf08bc50a28e11b013b4a 100644 GIT binary patch delta 70 zcmaFNJA-dS3?pl@Nl|h3<|sxzW<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwuV!Z delta 17 YcmbQi_n3D>4CCfxMjhtOds(8H06A3!y8r+H diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu index 8e749a6cb244586af7489cbf67a15e59fa097c2d..08f98ebd73c9ece40ef5f56ba50f41981abd3292 100644 GIT binary patch delta 70 zcmaFDJCkoiEF){ONl|h3=4eJeW<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwuxoB#j- diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu index 66bfdd0adda1da611e7687669661cf43c6b7b6ad..116d198129e7e6ca84709070b195a390b2ac4f18 100644 GIT binary patch delta 70 zcmaFNJA-dS3?pl@Nl|h3<|sxzW<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwuV!Z delta 17 YcmbQi_n3D>4CCfxMjhtOds(8H06A3!y8r+H diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu index 0e0f2263be8689ab34ebb2ef7fce1dcad489ee2c..772e9be85c6ca94ad44e1d8e9f0956b753e07906 100644 GIT binary patch delta 53 zcmaFJJB@EcG$U)WNl|h3=14|e=E)zKMI}9R3vx1(GfUi4b8~(3Q&O$0++BTrTA delta 17 YcmbQn_mFo(G~?zZMs4QJyIG={066glvH$=8 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu index f06c69bda0738768c751f1ac32f95ad3e884c048..b2f084855d35b8c2f9f945bd0c832d1062f0c5f1 100644 GIT binary patch delta 53 zcmaFJJB@EcG$U)WNl|h3=14|e=E)zKMI}9R3vx1(GfUi4b8~(3Q&O$0++BTrTA delta 17 YcmbQn_mFo(G~?zZMs4QJyIG={066glvH$=8 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu index 7f43a2c237f3377df607c31435997c8d8bc913da..995a362b6155dcefbaed0500b13e1d6736caabcc 100644 GIT binary patch delta 70 zcmaFNJA-dS3?pl@Nl|h3<|sxzW<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwuV!Z delta 17 YcmbQi_n3D>4CCfxMjhtOds(8H06A3!y8r+H diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu index 95ef360946a9519902440e2cfc5400b6f1d3192c..5e1d157f9fde2741881610b9e538d9cadac9cf6a 100644 GIT binary patch delta 53 zcmaFJJB@EcG$U)WNl|h3=14|e=E)zKMI}9R3vx1(GfUi4b8~(3Q&O$0++BTrTA delta 17 YcmbQn_mFo(G~?zZMs4QJyIG={066glvH$=8 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu index 1e2a14b8ecbbaf0e48d38cb4e9b13213ba650ddb..7d758dd7daab6285189876fa16821d87d3a8b5cf 100644 GIT binary patch delta 53 zcmaFJJB@EcG$U)WNl|h3=14|e=E)zKMI}9R3vx1(GfUi4b8~(3Q&O$0++BTrTA delta 17 YcmbQn_mFo(G~?zZMs4QJyIG={066glvH$=8 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu index 7106d1f7c0d01ecb0e563da1128c0b3ca66dd1b8..d0b56d975ee741499f8983b00f487e5a8c046b70 100644 GIT binary patch delta 55 zcmaFLJBx2aBqM9GNl|h3=5R)1W=5UK6PQFKJ#!0kGLth)+*5ONee+XNt*qQ#eSPEo LLN<4?L^A;Z^@b79 delta 17 YcmbQm_mp=-B;)1;MnmS!`&puy06BpLzyJUM diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu index ecf89ca6187b72eb8ad16627d8008fef8ba75a95..1963beb0a3f4aed1a7969386ba5cc455818f2a2e 100644 GIT binary patch delta 70 zcmaFNJA-dS1S4y*Nl|h3<}gMhW<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwu1mos-Mg!)}ds(8H06856w*UYD diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu index c1d06a193795a77d2234843381f3aaaa63a42cc2..cb6b94682e58c3e8aa06efcd97ff6400a6a28b6d 100644 GIT binary patch delta 70 zcmaFNJA-dS1S4y*Nl|h3<}gMhW<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwu1mos-Mg!)}ds(8H06856w*UYD diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu index eeb92cda4a7b42b999f51359090f113f67b6988e..f3db7a995dcfd7774a5d9c09d7f36e9753e17cb8 100644 GIT binary patch delta 55 zcmaFDJCkoiBqM9GNl|h3=5R(MW=5UK6PQFKJ#!0kGLth)+*5ONee+XNt*qQ#eSPEo LLN<4@L^A;Z^zRYO delta 17 YcmbQq_k?#tB;)1;Mg!)}`&gox06A6#yZ`_I diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu index defd1f2da49725db8b1eb270d15d8c77de61c88f..f3dc291384e49ac1888f82ad87c2a567c8c74292 100644 GIT binary patch delta 70 zcmcb^JCScgC?jjKNl|h3=3quWW<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwu1mos-Mg!)}ds(8H06856w*UYD diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu index cff3a8df7b757cb5383f5b15c0731daab0e5265c..f40cfa4c97219dd64129eb47895d60442b75b3ed 100644 GIT binary patch delta 53 zcmaFJJB@EcI3sJaNl|h3=1@jM=E)zKMI}9R3vx1(GfUi4b8~(3Q&O$0++BTr1mos-Mg!)}ds(8H06856w*UYD diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu index ea8722723e03324ae0e6bcf466c9ec4748d83a4f..e6f63ba878099393e9396b8ed8ca37c48c8d0678 100644 GIT binary patch delta 53 zcmaFJJB@EcI3sJaNl|h3=1@jM=E)zKMI}9R3vx1(GfUi4b8~(3Q&O$0++BTrW<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwub%7 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu index 79b40aadaaea8c9c76cdecdadaeba8a8c0d99b49..24f333e4e6c1dff42dabe7132ca56e688e85d190 100644 GIT binary patch delta 70 zcmaFFJDqPs7$a-4Nl|h3<`70>W<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwub%7 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu index 3072bdb4bd252bf7f8745f10c2eccce5c76140bf..4a7c4f822b0c9c69fbcf50b57c145fc006c7504d 100644 GIT binary patch delta 70 zcmaFNJA-dSI3sJaNl|h3=1@jsW<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwuIOFCxMnmS!ds(8H067l@wg3PC diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu index 842ca3044a35272fcea478e59f3dfbd16f130094..695dbb8dee0dc461da42c720f202ebde3becaf10 100644 GIT binary patch delta 70 zcmcc3JArRQFe7WSNl|h3=0HY$W<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwu_02Cb-qyPW_ delta 17 YcmbQhcbj)ZFyrPZMqTF3TUnx+05^XHk^lez diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu index d2c18579a431649618353e71268e31788fdfe1c8..ff4780c96973f716689438a85789e78b10013f4b 100644 GIT binary patch delta 70 zcmaFFJDqPs7$a-4Nl|h3<`70>W<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwub%7 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu index 831e696c51e271603527d3875cfcf1323c2c94ba..3e169ceca5807668e3cd376a7d3683853951147b 100644 GIT binary patch delta 53 zcmaFBJC$!kC?jjKNl|h3=3qu6=E)zKMI}9R3vx1(GfUi4b8~(3Q&O$0++BTrW<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwub%7 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu index 3b419be37e741918b11327c0544b341eebf77d5a..bc521eeddd4e4171a88745a5448b8e7cb545b08f 100644 GIT binary patch delta 53 zcmaFBJC$!kC?jjKNl|h3=3qu6=E)zKMI}9R3vx1(GfUi4b8~(3Q&O$0++BTrPtN;K2 delta 17 YcmbQpcZYXFIOFCxMlI&e+gPHR05{?Vng9R* diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu index e5b3eb1e45197bd7d8506164e55e93bff444ed36..7b51944c7b5f1d04469b21fd0a76cfa8b2dcd2ad 100644 GIT binary patch delta 70 zcmcb^JCScgI3sJaNl|h3=1@i*W<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwuPtN;K2 delta 17 YcmbQpcZYXFIOFCxMlI&e+gPHR05{?Vng9R* diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu index 2cd897a8d8e884e788247deb15001a8956a33076..04846642ef263b4843d0075abc74e19be5e9c765 100644 GIT binary patch delta 53 zcmcb|JDG1o1S4y*Nl|h3<}gNG=E?7wMI}9R3vx1(GfUi4b8~(3Q&O$0++BTrPtN;K2 delta 17 YcmbQpcZYXFIOFCxMlI&e+gPHR05{?Vng9R* diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu index 386de0770143ea297726b38c35dbc6a7a6c17534..8d147553242f57484645c69b020afe55856b316b 100644 GIT binary patch delta 70 zcmcb^JCScgI3sJaNl|h3=1@i*W<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwuPtN;K2 delta 17 YcmbQpcZYXFIOFCxMlI&e+gPHR05{?Vng9R* diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu index 27bdaeb55b389616281d2e438dfcfa0a59ea13b7..eb0ddf9a2cf4f3f61b4813dad15a00e3bd9a0b3e 100644 GIT binary patch delta 53 zcmaFBJC$!kC?jjKNl|h3=3qu6=E)zKMI}9R3vx1(GfUi4b8~(3Q&O$0++BTrp8x;= diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu index 2bbee6fdcdc8178084082cee2e1b328076591f66..197018a8de7a123e90898269473cd678ff33d75c 100644 GIT binary patch delta 53 zcmcb|JDG1o2qSBp8x;= diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu index ba87340723e2fcaf05f64d1d9dfadd1b6fa34aa8..9501783650c2af10523836bd6a012483f5e64166 100644 GIT binary patch delta 53 zcmcc5JB4pUC?jjKNl|h3=3quc=E)zKMI}9R3vx1(GfUi4b8~(3Q&O$0++BTrp8x;= diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu index 28a74223541617348c33b7449afff48cbbaec734..7986b73a5bfea8d45a4f74270c6cabd1eaee76dd 100644 GIT binary patch delta 70 zcmcb^JCScgFe7WSNl|h3=0HXRW<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwup8x;= diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu index a5f3721ebe04ef63fc072537f9061d7e29383d29..a1b665d1263eedc8b2753f41a3aff5fb56d400f2 100644 GIT binary patch delta 70 zcmcb^JCScgFe7WSNl|h3=0HXRW<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwu@~ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu index 566dc284a6ce0f29ab6e0dbee84dfb879f4a154d..e63d982c0da996bf5ab274d660aec71c5741f5d6 100644 GIT binary patch delta 53 zcmZqR+sd~gl#w;rq^LN1b1-8N^JGC5QAy9-f}G6c%o6w1++5%MlvFD#cUNEEc)yU% IOIf0s0LMrX5C8xG delta 17 YcmdnW*TA@~ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu index 2467cc4ed014c1b1be99f740cebe8c01b500b7f3..696542f2a27350d463788c862a9f75a2f707a270 100644 GIT binary patch delta 70 zcmZqV+s3ycjFC0jq^LN1a|mM)vz`u@0uUsZmgFQB7h72+=jWAKS$XCb<{9 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu index 0514f278163df46c7a0de4ecc58475b22d582812..f70e8d25d50921702713e53324f15de46f37bf87 100644 GIT binary patch delta 53 zcmZqR+sd~gl#w;rq^LN1b1-8N^JGC5QAy9-f}G6c%o6w1++5%MlvFD#cUNEEc)yU% IOIf0s0LMrX5C8xG delta 17 YcmdnW*TA@~ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu index a86e72b10f5a437535be0e1dd83a5054ab943790..de958f955ea536dc0b7a45e56792afb9deab1822 100644 GIT binary patch delta 53 zcmeyzyP0o82qSBOV diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu index ae88eaf21a1fe6c72e6e989f138cd20a9c4835fb..3a1958321830d6e4f9a28297f5f2f14e7cb93c04 100644 GIT binary patch delta 53 zcmeyzyP0o82qSBOV diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu index ab6c8b00485a21f2ebeee71a7eff7667d9581ccf..7fd8b76f2dc2c1eead8d6e04e2ca2c3ff540c5b5 100644 GIT binary patch delta 53 zcmZqR+sd~gl#w;rq^LN1b1-8N^JGC5QAy9-f}G6c%o6w1++5%MlvFD#cUNEEc)yU% IOIf0s0LMrX5C8xG delta 17 YcmdnW*TA@~ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu index 3ff004f9b6d505c3ee4aa743a3f1a4903492c808..4205a742b66a425b8113a0dd8abe573f0c719c8d 100644 GIT binary patch delta 53 zcmeyzyP0o82qSBOV diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu index de94428e1978ff13cffa7f78e7e9de7d35aab98b..431b43caedbc007ae92ebfd98f93b356143f17f4 100644 GIT binary patch delta 53 zcmeyzyP0o82qSBOV diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu index 86859c9a827f139d0f51ed37c1de186c5670a471..3d6ada6f3657d389eb0f8a65e87d6800b44eedc4 100644 GIT binary patch delta 53 zcmeyzyP0o81S4y*Nl|h3<}gNI=E?jlqLQAu1v#0?nI-P2xw*diDXCUg?ykPR@qQtj J7qdh&0RZ5{5r_Z) delta 17 YcmdnY_m6i&1mos-MsMcLH&~*X06nG#BLDyZ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu index 724f1026114a3889051d376a5ef1d2ea96f05fa3..77384ac627c5dae591c35b7572dc0e8a1810438f 100644 GIT binary patch delta 70 zcmeyvyOD20I3sJaNl|h3=1@i-W<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwuOV diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu index e5e7a1445b8395d0ae2cf3c5bf1040991cdd6e4c..d69a9e9e908fe8ebcd060adcc1e0e3a892ecd65f 100644 GIT binary patch delta 70 zcmey$yNYi^2qSBOV diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu index 1737acc2ece05ecc377d201d30ad5b410754f6e8..91885956bab15093f146f729aebfa20007dc37fa 100644 GIT binary patch delta 70 zcmeyvyOD20Fe7WSNl|h3=0L^(W<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwub2><{9 delta 17 YcmZ3*_my`;0ORIxMla^g=UJke06Vh<_W%F@ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu index fe55edfe17a57af1ec9590921a50cc4ab356d69f..649d27ed3ba1fe4326ed6cc0dbe57bf1028b3290 100644 GIT binary patch delta 70 zcmeyvyOD20Fe7WSNl|h3=0L^(W<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwu2W<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwuOV delta 17 YcmZ3__ltK!5aZ@ZMqlR5S6HH%06eA!4FCWD diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu index 03b3dd1e7a266aec9772b95f331db7cf06671d23..799219ab6f63f43df043011ad9e62d86fbbabb0c 100644 GIT binary patch delta 70 zcmeyxyPj`D5F=}{Nl|h3<^V>2W<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwuOV delta 17 YcmZ3__ltK!5aZ@ZMqlR5S6HH%06eA!4FCWD diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu index df33dc24f44f55911a0d4ce93f0f7e7d17208f2b..a0fe7fb28c0f62b8cd37e2ac8f376c110ee33140 100644 GIT binary patch delta 70 zcmeyvyOD20Fe7WSNl|h3=0L^(W<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwu2W<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwuOV delta 17 YcmZ3__ltK!5aZ@ZMqlR5S6HH%06eA!4FCWD diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu index c92680aa369aa0d82f6a028c3ca79a6cc908a86d..47634aee66867df1ab19d96ff2560d9e1a72e3be 100644 GIT binary patch delta 70 zcmeyxyPj`D5F=}{Nl|h3<^V>2W<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwuOV delta 17 YcmZ3__ltK!5aZ@ZMqlR5S6HH%06eA!4FCWD diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu index eb5f8857a96776f58d1eded83bb45ff6c475f8b9..52a026bace5593c46c2798c4be45a5ae6e13db37 100644 GIT binary patch delta 71 zcmeytyOwW5BqLjVvPn^K_T~siS7tpOE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJeMV!2>?b`7k&T$ delta 17 YcmZ3>_k(vsB;)1;Mi=JImsp~i06ek=3;+NC diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index 119ca59e2348ab6235d42f3071d8c49399eb3c59..6c59c066a56cacfdb93ee34ad0b5a5106c8ae5dd 100644 GIT binary patch delta 70 zcmZqX+rhUXijg(hq^LN1a|B}mvz`u@0uUsZmgFQB7h72+=jWAKS$XCbNEx&QzG diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu index f06f8f31c6abad7184003202a2d55ec1b9336432..e080b713aa723d9faf6df41353a0c71e1a2ea249 100644 GIT binary patch delta 71 zcmeyyyP9u91S4B~vPn^K_U3R#7iK*jE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJewt&2>?Wf7j*yt delta 17 YcmZ3@_lD7g(a106b0x0{{R3 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index ec713ff7f8c479783f67a9bb5b0ed5d5954a296b..97dae1927464632849fdf165a68ab257d9e2c1f9 100644 GIT binary patch delta 53 zcmZqV+s3ycl94spq^LN1b2y_v^JGC5QAy9-f}G6c%o6w1++5%MlvFD#cUNEEc)yU% I%UGhB0LY;c7ytkO delta 17 YcmdnS*T}aal5ukaqc8L3+bq#c05-z~u>b%7 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu index 9affaa3cc2db7a7a088f7f21433ef17928b30b6f..cbff4aaab4c48c584b0e4d3d7831b5bb1ea83037 100644 GIT binary patch delta 71 zcmeyyyP9u91S4B~vPn^K_U3R#7iK*jE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJewt&2>?Wf7j*yt delta 17 YcmZ3@_lD7g(a106b0x0{{R3 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index 92afe9742e7ae37955acde964f2409f4522b5225..7027e8866434a04af26683087535f37b4a8b33bc 100644 GIT binary patch delta 53 zcmZqV+s3ycl94spq^LN1b2y_v^JGC5QAy9-f}G6c%o6w1++5%MlvFD#cUNEEc)yU% I%UGhB0LY;c7ytkO delta 17 YcmdnS*T}aal5ukaqc8L3+bq#c05-z~u>b%7 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu index 9e331632fba0d2ae3dc8a85dcbf6061216db26b2..3b8ea536547a7d43475fcbe7da4906db86cf4217 100644 GIT binary patch delta 71 zcmey)yM}K=BqLjVvPn^K_T~si7iK*jE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJclKk2>?Zq7kU5y delta 17 YcmZ3(_nmh`B;)1;MrY>D7g?g206d2V2mk;8 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu index 026d72a924780fdcb92beac45e4f93d497d61e21..97b4ce07df7af40ed4320424505ba6e5c2d004a1 100644 GIT binary patch delta 53 zcmaFQyM%8;C?i{ZvPn^K_T~^q2j)pEqLQAu1v#0?nI-P2xw*diDXCUg?ykPR@qQtj Jr?5mb0RZBy5t#r0 delta 17 YcmZ3&_nvn{DC6cBMtkPXCt0GI06O;t?Wf7j*yt delta 17 YcmZ3@_lD7g(a106b0x0{{R3 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu index 65ae55c58560d7fb419fe2b8421d19a672232edb..61fdf2bf6a8d0dd0e1d3f581242eb0d9cd25f1a0 100644 GIT binary patch delta 53 zcmZqV+s3ycl94spq^LN1b2y_v^JGC5QAy9-f}G6c%o6w1++5%MlvFD#cUNEEc)yU% I%UGhB0LY;c7ytkO delta 17 YcmdnS*T}aal5ukaqc8L3+bq#c05-z~u>b%7 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu index d9267ca2f98d12d7b8deba9e89fd02140b7e67d0..065b4d978f74f7d3e9ad9cd1ff23a3f303b45737 100644 GIT binary patch delta 71 zcmeyuyOM81I3rtpvPn^K_U14~XJ$PeE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJd-7w2>?R27i<6k delta 17 YcmZ3<_l0*uIOFCxMknUY=UAed06Xdh`Tzg` diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index cb957cd373aa4e63db10d43f78f4d55255731669..45e2ecd18d6985208af3fd058f217202196eaab8 100644 GIT binary patch delta 53 zcmey*yM=E<1S4y*Nl|h3<}gM-=E?jlqLQAu1v#0?nI-P2xw*diDXCUg?ykPR@qQtj Jm#{=L0RZ7u5sUx; delta 17 YcmdnO_n&t|1mos-Mjz(QH(8>Y06ozLCjbBd diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu index 53f215f1d520af7c36a586ca90ee6d04b7306c78..66394d5f84992bfe2fe8c1ed709ebe2735537558 100644 GIT binary patch delta 71 zcmeyuyOM81I3rtpvPn^K_U14~XJ$PeE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJd-7w2>?R27i<6k delta 17 YcmZ3<_l0*uIOFCxMknUY=UAed06Xdh`Tzg` diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu index c865ecbca9ad414330cce1fcbd0dbbf49fb21066..7d0383cb0a35abfdd2cbe782f527724dcd1101df 100644 GIT binary patch delta 53 zcmey*yM=E<1S4y*Nl|h3<}gM-=E?jlqLQAu1v#0?nI-P2xw*diDXCUg?ykPR@qQtj Jm#{=L0RZ7u5sUx; delta 17 YcmdnO_n&t|1mos-Mjz(QH(8>Y06ozLCjbBd diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu index a6fcb3eb8224cb18173c5b7e218ef86b87dcd363..c11049145c5711923c7b8c3d227fdad4b02a660b 100644 GIT binary patch delta 71 zcmeyyyP9u91S4B~vPn^K_U3R#7iK*jE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJewt&2>?Wf7j*yt delta 17 YcmZ3@_lD7g(a106b0x0{{R3 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index 0008c79f681c2f3b4e24975e6adb15feeb9c1343..9216e8c12c9c539efa713449e776d9d29dbbd869 100644 GIT binary patch delta 53 zcmZqV+s3ycl94spq^LN1b2y_v^JGC5QAy9-f}G6c%o6w1++5%MlvFD#cUNEEc)yU% I%UGhB0LY;c7ytkO delta 17 YcmdnS*T}aal5ukaqc8L3+bq#c05-z~u>b%7 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu index 36d4ab6e8615d9805c2803d52a5fd8e2e757a733..3faa0456e8366a7496cacdbf4f35d8b4a1e4c696 100644 GIT binary patch delta 71 zcmeyuyOM81I3rtpvPn^K_U14~XJ$PeE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJd-7w2>?R27i<6k delta 17 YcmZ3<_l0*uIOFCxMknUY=UAed06Xdh`Tzg` diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index 707e4e72f520d685ce0045d891d985903d27fac9..2657a4f168155179596e386cc37e452e37e50cd0 100644 GIT binary patch delta 53 zcmey*yM=E<1S4y*Nl|h3<}gM-=E?jlqLQAu1v#0?nI-P2xw*diDXCUg?ykPR@qQtj Jm#{=L0RZ7u5sUx; delta 17 YcmdnO_n&t|1mos-Mjz(QH(8>Y06ozLCjbBd diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu index 77b89b0348cbbada9e359b0e18c0336c5d58c269..def60f3dd00b2f2e3253dabb3e9e0a62eb446932 100644 GIT binary patch delta 71 zcmeyuyOM81I3rtpvPn^K_U14~XJ$PeE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJd-7w2>?R27i<6k delta 17 YcmZ3<_l0*uIOFCxMknUY=UAed06Xdh`Tzg` diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index 287d21c26774d164431f804a33fb09af07f0cfc4..8d37fb3196aa7a7b43a65e33e9be2f99a10e6fd6 100644 GIT binary patch delta 53 zcmey*yM=E<1S4y*Nl|h3<}gM-=E?jlqLQAu1v#0?nI-P2xw*diDXCUg?ykPR@qQtj Jm#{=L0RZ7u5sUx; delta 17 YcmdnO_n&t|1mos-Mjz(QH(8>Y06ozLCjbBd diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu index cd8282937254c74560704cab7852d45b7729b869..f743166dfd08ae7f78a5aa3835f10bda353b9d90 100644 GIT binary patch delta 71 zcmeyuyOM81G$UJlvPn^K_U0%?duBZyE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJd-7w2>?UJ7jOUo delta 17 YcmZ3<_l0*uG~?zZMmy%s=UAed06ZcE{r~^~ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index 06575697c08f31e78d1650240a82ce9acbb0fb76..5dfe6b5410c1f6ae5ac54991a580a3c30f2a8476 100644 GIT binary patch delta 53 zcmey*yM=E<3?pl@Nl|h3<|sxl=E?jlqLQAu1v#0?nI-P2xw*diDXCUg?ykPR@qQtj Jm#{=L0RZA15s&}? delta 17 YcmdnO_n&t|4CCfxMi1uAH(8>Y06qx@D*ylh diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu index 27f290f840b7bb2bea13845e8e38368ee433e60f..65154c27d6d82c242c33eae4db08b563d1bc4c04 100644 GIT binary patch delta 71 zcmeywyPR)B6eC-FvPn^K_U1@NJ7zr{E(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJe?()2>?O%7iRzf delta 17 YcmZ3^_lb8y6yxSZMqB31XIP?{06V?~^#A|> diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index d3cfd4b3deeb408ef3027837d9fa678d1a4a48d9..0aeb15f73723ad1cb2a00a1b20cc4a631681a738 100644 GIT binary patch delta 70 zcmey%yNPc@G$U)WNl|h3=14|QW<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwu?O%7iRzf delta 17 YcmZ3^_lb8y6yxSZMqB31XIP?{06V?~^#A|> diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index 869fac226cbeac140307ebbda37d70d0d7a31631..0cd9d76889e970c856519a250787c527f6cb603a 100644 GIT binary patch delta 70 zcmey%yNPc@G$U)WNl|h3=14|QW<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwu?R?7i<6k delta 17 YcmZ3%_nCJ?G~?zZMqB31XIY||06X^u`Tzg` diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu index c0c2308026dc2d40eae70be996505b43bde6bab7..95f37bffe142d1a0696f54e139bd0b9bb2dd4ea2 100644 GIT binary patch delta 71 zcmaFOyMS*)I3rtpvPn^K_U14~D`q_%E(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJb@*e2>?657f1jA delta 17 YcmZ3$_nLP@IOFCxMoZ?+M_Hno06J#{)&Kwi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu index 9271b3b3bbaf6636decf0a78baf8223e7b46f917..3de086bebeccab27ce8c6bc827da7f9382cec969 100644 GIT binary patch delta 71 zcmeywyPR)B6eC-FvPn^K_U1@NJ7zr{E(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJe?()2>?O%7iRzf delta 17 YcmZ3^_lb8y6yxSZMqB31XIP?{06V?~^#A|> diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu index 0e37f1c74093ec189f3b50919281585101fd6bf7..f346d1c128d0f98405b098e044d9d52f25ed100e 100644 GIT binary patch delta 70 zcmey%yNPc@G$U)WNl|h3=14|QW<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwu;M1& diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index ad4aca7b51321907f1234449d3d60c5774548351..94062d99e1dae6afc68c6eb59f4e1a76a1db88de 100644 GIT binary patch delta 70 zcmey(yMb>*6eDZ0Nl|h3<_Ja)W<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwu;M1& diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu index f1b452ea75dc331e2bbfb5b953ebc6c327f0020d..a4581d29423e0e122557785425a769e035989c9e 100644 GIT binary patch delta 70 zcmey(yMb>*6eDZ0Nl|h3<_Ja)W<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwu?O%7iRzf delta 17 YcmZ3^_lb8y6yxSZMqB31XIP?{06V?~^#A|> diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index a377250dbcc70ae824bf8c024586f38416130be1..b2513493066d3c9c2125a1fd215ac6a55ec4ff35 100644 GIT binary patch delta 70 zcmey%yNPc@G$U)WNl|h3=14|QW<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwu;M1& diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index 73a472e7591d8d9c082975f6cfb46f79b1b2e4e5..1a46c48ef091a6921465183ecc458ad38c7a3065 100644 GIT binary patch delta 70 zcmey(yMb>*6eDZ0Nl|h3<_Ja)W<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwu;M1& diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index 9a22a896f2062468a5a12c8432409941a89c4976..4fd9bb43b74fd9008a9869a0fa83e53bfb85d327 100644 GIT binary patch delta 70 zcmey(yMb>*6eDZ0Nl|h3<_Ja)W<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwu?R27i<6k delta 17 YcmZ3<_l0*uIOFCxMknUY=UAed06Xdh`Tzg` diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index d03293a22a07a5f0b9b2b182851e5e4c9fdd3d26..05b313d0094512ec0f1bc42cc300cdbe8bc0dbd8 100644 GIT binary patch delta 53 zcmey*yM=E<1S4y*Nl|h3<}gM-=E?jlqLQAu1v#0?nI-P2xw*diDXCUg?ykPR@qQtj Jm#{=L0RZ7u5sUx; delta 17 YcmdnO_n&t|1mos-Mjz(QH(8>Y06ozLCjbBd diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu index 54c059adc5aa6230f15d6c0eb65ec930975c76a2..5c2f505c9982c5595dca678ea2c5df07dcd9e939 100644 GIT binary patch delta 71 zcmeywyPR)B7$aMJvPn^K_U2GVCuThzE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJe?()2>?Lm7h?bb delta 17 YcmZ3^_lb8y7~|$xMn~q&XIP?{06T^S@c;k- diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index 0ae92d4f02f84cbb24a54c3be2169e4d3c7cda80..8efd1931059c5ff49f2eb505c7c472813662260e 100644 GIT binary patch delta 55 zcmey%yNPc@I3sJaNl|h3=1@joW=5UK3z?Lm7h?bb delta 17 YcmZ3^_lb8y7~|$xMn~q&XIP?{06T^S@c;k- diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index c1b497ace9ac4f9e8a9a3496af67c198b8237589..371cb6a7db29cddf9d123708ee36a3e61e3bc82d 100644 GIT binary patch delta 55 zcmey%yNPc@I3sJaNl|h3=1@joW=5UK3z?Ox7ia(g delta 17 YcmZ3%_nCJ?IOFCxMn~q&XIY||06V`0_5c6? diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu index bb989bf77bbfff7e98a13a89a2f448f609f792b6..6acff4ccb7b6c0d5c4060e39958f81fc57bebbbd 100644 GIT binary patch delta 71 zcmaFOyMS*)Fe6)hvPn^K_U0f)J7zr{E(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJb@*e2>?2<7eoL6 delta 17 YcmZ3$_nLP@FyrPZMqB31M_Hno06H%P(f|Me diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu index c48fb744e9e05238a68b1584e14f949ed5999aa4..35c8fcdfc0a8ce1b5474a7d00a5794060122258c 100644 GIT binary patch delta 71 zcmeywyPR)B7$aMJvPn^K_U2GVCuThzE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJe?()2>?Lm7h?bb delta 17 YcmZ3^_lb8y7~|$xMn~q&XIP?{06T^S@c;k- diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu index 68f9d8ccdd43a02ad5dfe60ad5c35ddf08b99eb1..b6cab90ee67916e5b90f6e9d0010d322b5df56d5 100644 GIT binary patch delta 55 zcmey%yNPc@I3sJaNl|h3=1@joW=5UK3z*7$a-4Nl|h3<`707W=5UK3z$SCJ#!0kGLth)+*5ONee+XNt*qQ#eSPEo LLN+g8iDm)-1QQZ9 delta 17 YcmdnM_nUV^7~|$xMla^gS6QN&06hr?6#xJL diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu index 05e3e2b60560d05481b8cc068ba2c45a055cbf9e..0c4d8206e4e45f7fcb012ad52a05dbd243921317 100644 GIT binary patch delta 53 zcmeysyOeK3C?i{ZvPn^K_T~^qN9IW^qLQAu1v#0?nI-P2xw*diDXCUg?ykPR@qQtj Jr?Nyd0RZDY5uE@4 delta 17 YcmZ3=_knjqDC6cBMhE82r&ywy06QWD=l}o! diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu index ba2ef184c886506990ad05ae81c44ca35286cec3..5f436fa1f2f30b44cc97f88941c2bfee997c9eaf 100644 GIT binary patch delta 55 zcmey(yMb>*7$a-4Nl|h3<`707W=5UK3z$SCJ#!0kGLth)+*5ONee+XNt*qQ#eSPEo LLN+g8iDm)-1QQZ9 delta 17 YcmdnM_nUV^7~|$xMla^gS6QN&06hr?6#xJL diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu index ed2e4f271e3769ee2bba9f9494e775e0dbd50084..aa49f1be6ee624ca1ded5e8da49aeda6f2cfc800 100644 GIT binary patch delta 71 zcmeywyPR)B7$aMJvPn^K_U2GVCuThzE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJe?()2>?Lm7h?bb delta 17 YcmZ3^_lb8y7~|$xMn~q&XIP?{06T^S@c;k- diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index e214104496db7c7483e0123522c26af1bd94bb62..15bdd42a5a5f928b61772d3e0b093dfd1b7d3d5d 100644 GIT binary patch delta 55 zcmey%yNPc@I3sJaNl|h3=1@joW=5UK3z*7$a-4Nl|h3<`707W=5UK3z$SCJ#!0kGLth)+*5ONee+XNt*qQ#eSPEo LLN+g8iDm)-1QQZ9 delta 17 YcmdnM_nUV^7~|$xMla^gS6QN&06hr?6#xJL diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu index 4fde39a8f32ec1dd27ef1cd89ee9af7f907afd95..e3d8de90c76d3bb9e5724a451a27057bbc922bbb 100644 GIT binary patch delta 53 zcmeysyOeK3C?i{ZvPn^K_T~^qN9IW^qLQAu1v#0?nI-P2xw*diDXCUg?ykPR@qQtj Jr?Nyd0RZDY5uE@4 delta 17 YcmZ3=_knjqDC6cBMhE82r&ywy06QWD=l}o! diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index 8b13b712fa6da5cc71acae81db53bd3f6796b786..11bd4fd6bd26735a345ac38adf984dd987965b8a 100644 GIT binary patch delta 55 zcmey(yMb>*7$a-4Nl|h3<`707W=5UK3z$SCJ#!0kGLth)+*5ONee+XNt*qQ#eSPEo LLN+g8iDm)-1QQZ9 delta 17 YcmdnM_nUV^7~|$xMla^gS6QN&06hr?6#xJL diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu index 8b8f3ac42b65bc00b3205f9d7afe456e19a66375..a1aebf28a26330464e5dcb8ea1d23d4e00afa613 100644 GIT binary patch delta 71 zcmey&yMk{+C?i{ZvPn^K_T~^q7iK*jE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJcA{g2>?N87iIte delta 17 YcmZ3%_nCJ?DC6cBMrY>DXIY||06U`v^Z)<= diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index c4a85c476e49d54abcccc425351edc9fa3059d73..f1411857237f9d2f041585fa0a49468e07beab2b 100644 GIT binary patch delta 55 zcmeyzyP0o87$a-4Nl|h3<`71IW=5UK3z$SCJ#!0kGLth)+*5ONee+XNt*qQ#eSPEo LLN+gEiDm)-1=tcn delta 17 YcmdnY_m6i&7~|$xMqlR5H&~*X06mHZApigX diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu index 971d50169b50c2d855bd94ee03760ed3c6c99d04..45394e91b32661aea3fb12d6d53847a5ff54ad0c 100644 GIT binary patch delta 71 zcmey!yNqu`2qRm3vPn^K_U2$lXJ$PeE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJdGuq2>?Hs7hM1V delta 17 YcmZ3+_mOu)2;=5xMknUYr&*$z06RYg>i_@% diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index 5d5fb2c612dc49da328ef90d0ccde0e0e152ff84..80fe01d40c652db0929d08ffb298ec94fc666290 100644 GIT binary patch delta 70 zcmeyvyOD20C?jjKNl|h3=3quYW<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwu?Hs7hM1V delta 17 YcmZ3+_mOu)2;=5xMknUYr&*$z06RYg>i_@% diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index 26d1bd97bb02305b9f530919f203a52393db5b6a..9afa9cca93eac2b99e85ec29b6bd6d236a92eb2d 100644 GIT binary patch delta 70 zcmeyvyOD20C?jjKNl|h3=3quYW<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwu?K%7h(Va delta 17 YcmZ3^_lb8yDC6cBMknUYXIP?{06TaE@Bjb+ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu index 6111f919728c22c2ec7f60fdbb98b0119400131f..f98c6e3fddbde58778d93793f24caf6c64287acb 100644 GIT binary patch delta 56 zcmaFGJD+buAR}9RvPn^K_T~UaduB$R$rG7GBt3Hrax#-MOWad)bA9tuQmw4qU44Dy M{X#bPvqUoi01LSiJpcdz delta 17 YcmbQw_lkEzAmio;Mmy%sM_8hn06FLd%m4rY diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu index 68794f6b575cdcbdd429229d244f7ce5f96f1f62..2a169bd1d3ab1779be140c05efc48237408be0d5 100644 GIT binary patch delta 71 zcmey!yNqu`2qRm3vPn^K_U2$lXJ$PeE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJdGuq2>?Hs7hM1V delta 17 YcmZ3+_mOu)2;=5xMknUYr&*$z06RYg>i_@% diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu index 16e415f6456ab4ce510c3c5e8e75280d938a07ae..943e23c6bc0c224ad433f3ac9229260adad04d81 100644 GIT binary patch delta 70 zcmeyvyOD20C?jjKNl|h3=3quYW<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwu?Hs7hM1V delta 17 YcmZ3+_mOu)2;=5xMknUYr&*$z06RYg>i_@% diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index 5897f989e82827c335c09ec5171dab4f692980bd..90e777c80e146f8496343c04bc59fe9a81b0fd0d 100644 GIT binary patch delta 70 zcmeyvyOD20C?jjKNl|h3=3quYW<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwu5t{%2 delta 17 YcmZ3&_nvn{IOFCxMqB31Ct0GI06P-}<^TWy diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index d37d0edb3b49d965018ee1cc206f769d8f657215..fa043faa88cfd7f5898b9fa5f53973b54159b0b8 100644 GIT binary patch delta 70 zcmeyxyPj`D1S4y*Nl|h3<}gN2W<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwu(^b delta 17 YcmZ3__ltK!1mos-MtA1TS6HH%06h8z6951J diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu index 837f1704579a1c833800c743f0454127661d569b..ec624a3a0ff7646f3afd91cb0c387a9dbe627ec4 100644 GIT binary patch delta 71 zcmaFMyNGW?7$aMJvPn^K_U2GVTV_2SE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJc%Wm2>?9^7f%2H delta 17 YcmZ3)_m+1<7~|$xMjPhM$62D806MP)-2eap diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index f7856b22c88cf244aafbcba4b3820cf5193e535a..545b04787b5311c44d2f614ff6eb8a5503ae4fca 100644 GIT binary patch delta 55 zcmeytyOwW5I3sJaNl|h3=1@itW=5UK3z_k(vsIOFCxMmOfomsp~i06dlk3IG5A diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu index 341fd2e5ea225a42fb6c5d91bba1d299e737da8d..67b97d901363b75599f3ceeff783eda93bf7305f 100644 GIT binary patch delta 71 zcmaFMyNGW?7$aMJvPn^K_U2GVTV_2SE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJc%Wm2>?9^7f%2H delta 17 YcmZ3)_m+1<7~|$xMjPhM$62D806MP)-2eap diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index a318b38c7415d397068169045389ecd7c2663073..ba4e2f2d1d24858a3d64861d32f15f5230a69baa 100644 GIT binary patch delta 55 zcmeytyOwW5I3sJaNl|h3=1@itW=5UK3z_k(vsIOFCxMmOfomsp~i06dlk3IG5A diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu index 8e0176529ec4080907343009dd23a22b9bf58334..aa7116c4683046a3fb355866cae0f091092da8af 100644 GIT binary patch delta 53 zcmaFIyO?i7I3rtpvPn^K_U14~TjohDqLQAu1v#0?nI-P2xw*diDXCUg?ykPR@qQtj JC$mH|0RZBG5tje} delta 17 YcmZ3?_l|c%IOFCxMjPhMCs?AH06ORe;s5{u diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu index 79e3d032a50c88376c34cf6ac88efc04b5e6023f..3471b9fad1cfb98a97b108c05d3c4443cbfa3f83 100644 GIT binary patch delta 71 zcmaFHJDYDqFe6)hvPn^K_U0f)OJ+SCE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-L+|3fr1OPVD7cc+- delta 17 YcmbQu_l$Q#FyrPZMhoW62Uwz+06AC%z5oCK diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu index 00847daa5513f7601afcc2d7510f04730cc42bc2..ceed408d0e09b94fae94fb36135f1a4ee010498f 100644 GIT binary patch delta 71 zcmaFMyNGW?7$aMJvPn^K_U2GVTV_2SE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJc%Wm2>?9^7f%2H delta 17 YcmZ3)_m+1<7~|$xMjPhM$62D806MP)-2eap diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu index a9aa737ece968d5b481627988805f39ff43c340d..b7db126103326ad4220ae6de1edbb3324dfb489a 100644 GIT binary patch delta 55 zcmeytyOwW5I3sJaNl|h3=1@itW=5UK3z_k(vsIOFCxMmOfomsp~i06dlk3IG5A diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu index e3c88e3dfd237b0c7ddb93bef51bfbf492c4a083..9fcc2f40ee67ace6fb8c09aff920de6833b23479 100644 GIT binary patch delta 71 zcmaFOyMS*)C?i{ZvPn^K_T~^q8)iKnE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJb@*e2>?4d7e)X8 delta 17 YcmZ3$_nLP@DC6cBMr-EHM_Hno06I$r)Bpeg diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index ecaa42c04e0aaf4a4f9455f31e95d88a627b8678..94e3fc9730fdc00da4dc4dee5abc6f11244030dd 100644 GIT binary patch delta 55 zcmeyyyP9u97$a-4Nl|h3<`71AW=5UK3z$SCJ#!0kGLth)+*5ONee+XNt*qQ#eSPEo LLN?E4iDm)-0U#0~ delta 17 YcmZ3@_l?4d7e)X8 delta 17 YcmZ3$_nLP@DC6cBMr-EHM_Hno06I$r)Bpeg diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu index 9eab2b78078540adef7e5390e39fc07c2a79550f..d0fda4ee3011e86bfe2bb9841cc26836c141841f 100644 GIT binary patch delta 55 zcmeyyyP9u97$a-4Nl|h3<`71AW=5UK3z$SCJ#!0kGLth)+*5ONee+XNt*qQ#eSPEo LLN?E4iDm)-0U#0~ delta 17 YcmZ3@_l?9^7f%2H delta 17 YcmZ3)_m+1<7~|$xMjPhM$62D806MP)-2eap diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index 125506b0ef3d3a0a094bbaa061dfe79271b7c869..7f930475debdb3e5d3bedd5fe63b822379aa93bb 100644 GIT binary patch delta 55 zcmeytyOwW5I3sJaNl|h3=1@itW=5UK3z_k(vsIOFCxMmOfomsp~i06dlk3IG5A diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu index c81ee566747f7ac65ac5239858f8395cef56d006..6f8847812853a09e71d2c77bd0814119244749a7 100644 GIT binary patch delta 71 zcmaFOyMS*)C?i{ZvPn^K_T~^q8)iKnE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJb@*e2>?4d7e)X8 delta 17 YcmZ3$_nLP@DC6cBMr-EHM_Hno06I$r)Bpeg diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index bbc3e0625a43608f7b098a996a11c3ad88a5a730..1925a7603b086c1ab8f5be6eb0d6155705eca13f 100644 GIT binary patch delta 55 zcmeyyyP9u97$a-4Nl|h3<`71AW=5UK3z$SCJ#!0kGLth)+*5ONee+XNt*qQ#eSPEo LLN?E4iDm)-0U#0~ delta 17 YcmZ3@_l?4d7e)X8 delta 17 YcmZ3$_nLP@DC6cBMr-EHM_Hno06I$r)Bpeg diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index 54730939a65c21841719edcf31fe03e99cd217a0..757894163af79403e19198fda89e57bafc9c8fec 100644 GIT binary patch delta 55 zcmeyyyP9u97$a-4Nl|h3<`71AW=5UK3z$SCJ#!0kGLth)+*5ONee+XNt*qQ#eSPEo LLN?E4iDm)-0U#0~ delta 17 YcmZ3@_l?6z7fS#D delta 17 YcmZ3)_m+1<5aZ@ZMhE82$62D806KRC*#H0l diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index 5ad993005a5c749b75be735f314363d38abc339c..0d6b624bcb3f6ed517ddbe0bb252a66b8bad7373 100644 GIT binary patch delta 70 zcmeytyOwW5Fe7WSNl|h3=0HXtW<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwu_k(vsFyrPZMla^gmsp~i06bm>1^@s6 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu index 6968d4fcbc41b04bc69c575d5d474e9d26ecdd98..746dd2996d01aedad809f858707f96025bb263f1 100644 GIT binary patch delta 71 zcmaFMyNGW?5F=ZBvPn^K_U1rFM`k@8E(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJc%Wm2>?6z7fS#D delta 17 YcmZ3)_m+1<5aZ@ZMhE82$62D806KRC*#H0l diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index 2bfdea160219cb9a494c867594b44e7f462055b5..6619b6135031fc820fafb10113f59ab66d0ee0b6 100644 GIT binary patch delta 70 zcmeytyOwW5Fe7WSNl|h3=0HXtW<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwu_k(vsFyrPZMla^gmsp~i06bm>1^@s6 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu index c458632a521dc26d2d2f9853ab4e169afde4bbea..824d258028a8cbe6a1e92850117b830bc92be0da 100644 GIT binary patch delta 53 zcmaFIyO?i7Fe6)hvPn^K_U0f)N9IW^qLQAu1v#0?nI-P2xw*diDXCUg?ykPR@qQtj JC$mH|0RZ8-5t9G_ delta 17 YcmZ3?_l|c%FyrPZMhE82Cs?AH06MS*-T(jq diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu index f1023cbd6b5913bf7eebb0870ea2e1da682c5bf7..72d7da1288e36edba0af4bb5545585655434dac2 100644 GIT binary patch delta 71 zcmaFHJDYEVKOwMjPhM2Uwz+068E9x&QzG diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu index 26af7b6cdabb2ae20788822094f023ef18271eee..bcc78062155ce085a5236b9b790bea6c69133058 100644 GIT binary patch delta 71 zcmaFMyNGW?5F=ZBvPn^K_U1rFM`k@8E(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LJc%Wm2>?6z7fS#D delta 17 YcmZ3)_m+1<5aZ@ZMhE82$62D806KRC*#H0l diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu index baf6788eee55f4d2606a99bf6692141272fd7562..b4cb06c14fc314d55c30cb5644bdd7d69fc1bf2d 100644 GIT binary patch delta 70 zcmeytyOwW5Fe7WSNl|h3=0HXtW<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwu_k(vsFyrPZMla^gmsp~i06bm>1^@s6 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu index 8651eeaff0cc8a1ab87d337728a1a2f705fb38b5..95002a672a5577e46856c088eb765516dcdff00e 100644 GIT binary patch delta 56 zcmaFOyMS*)AR}9RvPn^K_T~Ua2WCc{$rG7GBt3Hrax#-MOWad)bA9tuQmw4qU44Dy M{X#ZRV2Ne|01Q$RK>z>% delta 17 YcmZ3$_nLP@Amio;MtkPXM_Hno06G%|&;S4c diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index ab66f27747df7d6a778b3c4591ebf8e01b1d3fe0..8ea2592e5691971a5e860350a38a9e73105a8f7f 100644 GIT binary patch delta 70 zcmeyyyP9u95F=}{Nl|h3<^V=-W<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwuz>% delta 17 YcmZ3$_nLP@Amio;MtkPXM_Hno06G%|&;S4c diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu index cfbc9dc7b222282c4c04dee21eaea7f944e23378..8aa4d1214323e5d08b03bb5893f8f784387a061b 100644 GIT binary patch delta 70 zcmeyyyP9u95F=}{Nl|h3<^V=-W<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwu?6z7fS#D delta 17 YcmZ3)_m+1<5aZ@ZMhE82$62D806KRC*#H0l diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index 624df6b1714f9897a19877e5f2f2a9b5c6f280e3..45e4d7e4a00b148392a2a520b3af5abadcb91a59 100644 GIT binary patch delta 70 zcmeytyOwW5Fe7WSNl|h3=0HXtW<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwu_k(vsFyrPZMla^gmsp~i06bm>1^@s6 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu index 9ce8d0676c65d5f246f1636889def8526e356a42..1d12d500e99f6d56fbfe30183415b4a310c0d34e 100644 GIT binary patch delta 56 zcmaFOyMS*)AR}9RvPn^K_T~Ua2WCc{$rG7GBt3Hrax#-MOWad)bA9tuQmw4qU44Dy M{X#ZRV2Ne|01Q$RK>z>% delta 17 YcmZ3$_nLP@Amio;MtkPXM_Hno06G%|&;S4c diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index dd78af304ab200056a340ce3ec70fbc63f3fd8d8..2666fe1c6d51c3a48230796af5614f692f8eb394 100644 GIT binary patch delta 70 zcmeyyyP9u95F=}{Nl|h3<^V=-W<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwuz>% delta 17 YcmZ3$_nLP@Amio;MtkPXM_Hno06G%|&;S4c diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu index 408169ddefa8f86e3b3f9f0b1242e201b8c9cfea..c177dd203631af31f14a873f0b124a5e06cc2a54 100644 GIT binary patch delta 70 zcmeyyyP9u95F=}{Nl|h3<^V=-W<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwub%7 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu index ce2faaf9571981f2d69cd25242e65cd2d7680db8..e602b4fbb00fd267465105bd60bf2c95ec9c1b1c 100644 GIT binary patch delta 70 zcmZqT+s?Nkl#w;rq^LN1b1-8Fvz`u@0uUsZmgFQB7h72+=jWAKS$XCbb%7 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu index a8b5ef0f837d4126149b316305a3fc84fc1019c4..9a11ca1d19a8ca84cd77114bc678c671d361c6a6 100644 GIT binary patch delta 70 zcmZqX+rhUXjFC0jq^LN1a|mMyvz`u@0uUsZmgFQB7h72+=jWAKS$XCb*5F=}{Nl|h3<^aY3W<4D)1t3T+Ey+nNF1E5t&d)2evhvI=$jMC3EOAfG Y&GpSsNwub%7 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu index a01dc2b983754110fcdb8090ee532c20e9e46e32..f5ca0a8600b016ac7d6e307835dae96005971951 100644 GIT binary patch delta 53 zcmZqR+sd~ggpoDbq^LN1a}Z-N^JGC5QAy9-f}G6c%o6w1++5%MlvFD#cUNEEc)yU% IOIf0s0LK;)4*&oF delta 17 YcmdnW*TAb%7 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu index 0742f4c76588461b33874bd716acf93238aacc14..ab9dd01644ac09d50761ded8671acf1c4d499df1 100644 GIT binary patch delta 53 zcmZqR+sd~ggpoDbq^LN1a}Z-N^JGC5QAy9-f}G6c%o6w1++5%MlvFD#cUNEEc)yU% IOIf0s0LK;)4*&oF delta 17 YcmdnW*TAY06ozLCjbBd diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu index d93d719192c7894b719e1f2a6ac989ba893baf66..29ec4813645cc1a2042965092113d1a2e35df1c9 100644 GIT binary patch delta 70 zcmey)yM}K=2qSBY06m!oBLDyZ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu similarity index 92% rename from dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu rename to dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu index cc8985f328c3b960ffd0ec9202b991df8bffeba3..8518aabba701ee37ce656980d1a55b9a8471a9a3 100644 GIT binary patch delta 70 zcmey)yM}K=03&O%Nl|h3WhONh~h5vP#a+E3>ll%q_^tOwKHE aPtDEs%}+_SvT}Fz^^Nxn*}Rh_nh5|)Z5OQo delta 17 YcmX@jH-&FQGUMhP#xUm1A6TN906M$|4FCWD diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 90% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu index db1c950cc05a6f3e36903d8e7e2bd8727f9facae..4327a877f1367c1977ebc24ddc73d9db36d2fe04 100644 GIT binary patch delta 54 zcmbQqcY$w13L{&xu~AWR_U1&!SmwzJETWR0xdl0y$(beYskynn`6;PZR_?C8zVUt` KoAHhONh~h5vP#a+E3>ll%q_^tOwKHE aPtDEs%}+_SvT}Fz^^Nxn*}Rh_nh5|)Z5OQo delta 17 YcmX@jH-&FQGUMhP#xUm1A6TN906M$|4FCWD diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 90% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu index 9b646f5ab2764c7ac184aa19a693e4f41b811697..8cd3d419613071ccae39237f18f832b6df06d9c9 100644 GIT binary patch delta 54 zcmbQqcY$w13L{&xu~AWR_U1&!SmwzJETWR0xdl0y$(beYskynn`6;PZR_?C8zVUt` KoAHI5+i$jvawN7arWi}#&Bjm9WDhRNG>hONh~h5vP#a+E3>ll%q_^tOwKHE aPtDEs%}+_SvT}Fz^^Nxn*}R=4nh5|&m=~k~ delta 17 YcmX@fH;HdU665A<#!%+X?^vRl06JI(1ONa4 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu index 52bf3538c5d6e2d94c91c6af4b0b5276066f09ca..de3b726c5d7da3007b6c5e54c93c713005abbe1c 100644 GIT binary patch delta 71 zcmbQvcaCpEG9z2Eu~AWR_T~h}7-l^kE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LyoV*42>>%D7g+!R delta 17 YcmX@dH=S=oGUMhP#z^MPpIM@r06RSf82|tP diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu index 6adce8136bf8e4dcdc57afebfc7c4e233853707c..747240fc503d503e49ec30d82c7b4e54719ccedb 100644 GIT binary patch delta 72 zcmbQjcbacQG9!C@vawN7arWj!#t3FT9WDhRNG>hONh~h5vP#a+E3>ll%q_^tOwKHE aPtDEs%}+_SvT}Fz^^Nxn*}Rh_nh5|)Z5OQo delta 17 YcmX@jH-&FQGUMhP#xUm1A6TN906M$|4FCWD diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 90% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu index 94d1442388b21e038962849750506e6ccaa8779d..1a7151f394612daa603f900fd6562eccf893477a 100644 GIT binary patch delta 54 zcmbQqcY$w13L{&xu~AWR_U1&!SmwzJETWR0xdl0y$(beYskynn`6;PZR_?C8zVUt` KoAH>xx7f=8I delta 17 YcmX@ZHhONh~h5vP#a+E3>ll%q_^tOwKHE aPtDEs%}+_SvT}Fz^^Nxn*}Rn{nh5|$!xx(X delta 17 YcmX@kH-T?MBID*P#t`PsZ&;$406Fvp`v3p{ diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu index 676de64e02abff8b75d0dba19cab6235b08091e4..9ed169ed6f4b1e7a28048edde06834b9f621de6e 100644 GIT binary patch delta 72 zcmbQlcam>I5+i$jvawN7arWi}#&Bjm9WDhRNG>hONh~h5vP#a+E3>ll%q_^tOwKHE aPtDEs%}+_SvT}Fz^^Nxn*}R=4nh5|&m=~k~ delta 17 YcmX@fH;HdU665A<#!%+X?^vRl06JI(1ONa4 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu index ac1ea8b0e1c0022f46ee45710b0a4fadc4619fe4..86f0b0342f4e4b38c3b2ed28ee507a3260874dcc 100644 GIT binary patch delta 71 zcmbQvcaCpEG9z2Eu~AWR_T~h}7-l^kE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LyoV*42>>%D7g+!R delta 17 YcmX@dH=S=oGUMhP#z^MPpIM@r06RSf82|tP diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu index be1cd355cfa82565936ff006faf28f9f8d254f8b..f8e4490549fb5c8e2efee202e2e8da49dc2b4dfd 100644 GIT binary patch delta 72 zcmbQhcbsoSA|rczvawN7arWkT#xQ0*9WDhRNG>hONh~h5vP#a+E3>ll%q_^tOwKHE aPtDEs%}+_SvT}Fz^^Nxn*}Rn{nh5|$!xx(X delta 17 YcmX@kH-T?MBID*P#t`PsZ&;$406Fvp`v3p{ diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu index 58b209be3baa95b5a9eed928f760e0b6e12c976b..77815d13010ed2d6488b9c9baf6efe7f6aa93a49 100644 GIT binary patch delta 71 zcmbQrcZP365+hr(u~AWR_U3rTXl6YfE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-Lyo)882>>xx7f=8I delta 17 YcmX@ZHhONh~h5vP#a+E3>ll%q_^tOwKHE aPtDEs%}+_SvT}Fz^^Nxn*}Rn{nh5|%*B74v delta 17 YcmX@kH-T?MD&yun#sKEcZ&;$406HuM{{R30 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 90% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu index 7a9c681beb34818ecde569dc8dd67455a206b056..95ed7a79c9f77ae480e4849741cd9a1f0af94acb 100644 GIT binary patch delta 71 zcmbQrcZP368Y5e>u~AWR_U2^9aArLnE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-Lyo)882>>!?7gPWM delta 17 YcmX@ZHhONh~h5vP#a+E3>ll%q_^tOwKHE aPtDEs%}+_SvT}Fz^^Nxn*}Rn{nh5|%*B74v delta 17 YcmX@kH-T?MD&yun#sKEcZ&;$406HuM{{R30 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 90% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu index 5c7d5a8194cfeaa231bff1b0c29ff7482ccc668a..e0e78d1fb5aed2c5644a5d1d998ece5b92664bf6 100644 GIT binary patch delta 71 zcmbQrcZP368Y5e>u~AWR_U2^9aArLnE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-Lyo)882>>!?7gPWM delta 17 YcmX@ZH)2P0vDD5 delta 17 YcmX@g*T=UZg>iE(qd)WJS1i#?06EA7_5c6? diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 90% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu index 5981be60db40eea7c162096f4e885ff185a9d095..1ab249ba67d0fb341f7fb08bd5d23005cda7fd97 100644 GIT binary patch delta 71 zcmbQtcZzRADkEF6u~AWR_U0tUFlId+E(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-Lyn`j02>>vb7fS#D delta 17 YcmX@bH<@okD&yun#$e{n?^&Xm06MJ(3jhEB diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu index 051fe0aae0d9a6a33d58fe626ee7b7d6e9e5adba..f7e1bbb65cbfc97713a51e0660c45eb213c13e97 100644 GIT binary patch delta 72 zcmbQhcbsoSDkFP*vawN7arWk9#vo=r9WDhRNG>hONh~h5vP#a+E3>ll%q_^tOwKHE aPtDEs%}+_SvT}Fz^^Nxn*}Rn{nh5|%*B74v delta 17 YcmX@kH-T?MD&yun#sKEcZ&;$406HuM{{R30 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 90% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu index 61c208631534fd4343b281bc28bf414aba574c93..8340a760c0b55c7d9845a35697a9175cddd3a3d2 100644 GIT binary patch delta 71 zcmbQrcZP368Y5e>u~AWR_U2^9aArLnE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-Lyo)882>>!?7gPWM delta 17 YcmX@ZH>p}7eW94 delta 17 YcmX@XH<52c3ghNn#vta+Z&{+506Iwq0ssI2 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_32x16x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_32x16x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu index 259fa2b46fcdbfd7efacb80e1341b6290ba6b2cb..a848d1a65cd7105104f49840d52ff0bc521ef997 100644 GIT binary patch delta 72 zcmeC-JIuEsnUOs{+1RM4ID2y)2NEf)2P0vDD5 delta 17 YcmX@g*T=UZg>iE(qd)WJS1i#?06EA7_5c6? diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 90% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu index f499f162a2340d0606ec5d2a641be361d9595513..3424de30c69a8412be88c3eb0fad5cc1ac8b7323 100644 GIT binary patch delta 71 zcmbQtcZzRADkEF6u~AWR_U0tUFlId+E(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-Lyn`j02>>vb7fS#D delta 17 YcmX@bH<@okD&yun#$e{n?^&Xm06MJ(3jhEB diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu index 4b5ce90c0390c061b8073df5ae6d55049ec6f68f..8056afff14f15d35e697ff891467d7e13cdbd153 100644 GIT binary patch delta 72 zcmeC-JIuEsnUOs{+1RM4ID2y)2NEf>p}7eW94 delta 17 YcmX@XH<52c3ghNn#vta+Z&{+506Iwq0ssI2 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu index 57a858e8e096ee453931c7422dd8d7805fe1b332..a860522844e466dbc7c31981f2eeed5873ca6f0e 100644 GIT binary patch delta 72 zcmbQhcbsoSA|rczvawN7arWkT#xQ0*9WDhRNG>hONh~h5vP#a+E3>ll%q_^tOwKHE aPtDEs%}+_SvT}Fz^^Nxn*}Rn{nh5|$!xx(X delta 17 YcmX@kH-T?MBID*P#t`PsZ&;$406Fvp`v3p{ diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu index 24be928057cdf9eb340bcf475ce2875ee4360659..cf313aa22ab7607c315aab261097eabd5edbc658 100644 GIT binary patch delta 71 zcmbQrcZP365+hr(u~AWR_U3rTXl6YfE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-Lyo)882>>xx7f=8I delta 17 YcmX@ZHhONh~h5vP#a+E3>ll%q_^tOwKHE aPtDEs%}+_SvT}Fz^^Nxn*}Rn{nh5|$!xx(X delta 17 YcmX@kH-T?MBID*P#t`PsZ&;$406Fvp`v3p{ diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu index 5f5efcc6a15c1dc33cd3c3790bda0f3fe36bd4f3..bf4044e50914d9d9d21da4ee3f3ee56a0b9ca6c0 100644 GIT binary patch delta 71 zcmbQrcZP365+hr(u~AWR_U3rTXl6YfE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-Lyo)882>>xx7f=8I delta 17 YcmX@ZH)2N?iZ2( delta 17 YcmX@g*T=UZfpK#tV=(jPS1i#?06CBa@&Et; diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu index bc85e8e976b8d7f17502fc307a34d74262ad0c1a..94d728ef5c2b54c7d27aebdae55ca8000921e5e3 100644 GIT binary patch delta 71 zcmbQtcZzRAA|qR}u~AWR_U1UoC}uq!E(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-Lyn`j02>>sK7e@d9 delta 17 YcmX@bH<@okBID*P#&G7%?^&Xm06KLB2LJ#7 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu index 7f6d69a539ff80ab85085646e5d87f70a1da4bb5..61936ad484ea8e28682fc3c39b92374a439800d4 100644 GIT binary patch delta 72 zcmbQhcbsoSA|rczvawN7arWkT#xQ0*9WDhRNG>hONh~h5vP#a+E3>ll%q_^tOwKHE aPtDEs%}+_SvT}Fz^^Nxn*}Rn{nh5|$!xx(X delta 17 YcmX@kH-T?MBID*P#t`PsZ&;$406Fvp`v3p{ diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu index bd99ad629e0bd2e994a3fcf127944958dc817230..bdded4456134bb6472106ba555d8b1611f894889 100644 GIT binary patch delta 71 zcmbQrcZP365+hr(u~AWR_U3rTXl6YfE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-Lyo)882>>xx7f=8I delta 17 YcmX@ZH>m&7d`+0 delta 17 YcmX@XH<52c0^{aP#xUm1Z&{+506Gx`{r~^~ diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_32x16x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_32x16x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu index e9c07ff1215a39e41275fc4576360f29caa6dcfd..01200c9e96fc11b297b35b80c14cf00e4fdfa2c4 100644 GIT binary patch delta 72 zcmeC-JIuEso{>F1+1RM4ID2y}V+gaJ4wnKDB$t-tBo-H2StaM^m04MN<`(2+CTEto ar{?DR=BK1uS-HFV`o{Z(Y~IKc%>)2M85fBF delta 17 YcmX@i*Tc6Vo^f*qV-WM^7c9|C068oL=>Px# diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu index e579272659ec9149a41c84d77d4d219d4ae77f12..c00a1f2a940d6d616ee99b51a2f4deb4289ef586 100644 GIT binary patch delta 72 zcmeC)2N?iZ2( delta 17 YcmX@g*T=UZfpK#tV=(jPS1i#?06CBa@&Et; diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu index 2f6b5d1a478f426d46e541d1979bfcd2b9697cdb..53d97da91dc48b350a0b2e4e66425c27d42547b9 100644 GIT binary patch delta 71 zcmbQtcZzRAA|qR}u~AWR_U1UoC}uq!E(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-Lyn`j02>>sK7e@d9 delta 17 YcmX@bH<@okBID*P#&G7%?^&Xm06KLB2LJ#7 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu index f216358a4382ef1532dcd86767cab0315b9c4c21..b9829d3cb23857c961165aa93624a0aa0b94fdc2 100644 GIT binary patch delta 72 zcmeC-JIuEso{>F1+1RM4ID2y}V+gaJ4wnKDB$t-tBo-H2StaM^m04MN<`(2+CTEto ar{?DR=BK1uS-HFV`o{Z(Y~IKc%>)2M85fBF delta 17 YcmX@i*Tc6Vo^f*qV-WM^7c9|C068oL=>Px# diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu index 8cd3ff76e39b811aa35e6342cfc4a1c2034e6495..112feff5d42250ed46281ccf80301b9d83182e6f 100644 GIT binary patch delta 71 zcmbQpcY<$20wY_pu~AWR_U2f|NM=19E(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-Lyp1KA2>>m&7d`+0 delta 17 YcmX@XH<52c0^{aP#xUm1Z&{+506Gx`{r~^~ diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu index f1476e65620b34f4479327615fe060d330df6895..c190a1ed52012fc7204e6a89fc413ad87526ee9b 100644 GIT binary patch delta 72 zcmeC@JI1#mo{>F1+1RM4ID2y}V>q*(4wnKDB$t-tBo-H2StaM^m04MN<`(2+CTEto ar{?DR=BK1uS-HFV`o{Z(Y~I2W%>)2OaTk>U delta 17 YcmX@c*Uz^ho^f*qV<_|H*DTRY06DD%^#A|> diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu index 36c0437db73054fcb8135b8f48405b182f8d03c3..c0b7c5675e52fdaf2bcd96fcae43d02d231ec18b 100644 GIT binary patch delta 71 zcmbQjcbacQ0wY_pu~AWR_U2f|7-l^kE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LyptuG2>>t%7fJvC delta 17 YcmX@jH-&FQ0^{aP#z^MPA6TN906LNe3IG5A diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu index 954f44e81e0b94069d596595f3b87ed4e34ccd6f..95fd179e9cf6f695745908dd18b384eecb769581 100644 GIT binary patch delta 72 zcmeC@JI1#mo{>F1+1RM4ID2y}V>q*(4wnKDB$t-tBo-H2StaM^m04MN<`(2+CTEto ar{?DR=BK1uS-HFV`o{Z(Y~I2W%>)2OaTk>U delta 17 YcmX@c*Uz^ho^f*qV<_|H*DTRY06DD%^#A|> diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu index 3bbe65d5ea722b0c141c2bf2afff22796b558fa9..d33faf869640873c223f9a32245fcf9a3929f25d 100644 GIT binary patch delta 71 zcmbQjcbacQ0wY_pu~AWR_U2f|7-l^kE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LyptuG2>>t%7fJvC delta 17 YcmX@jH-&FQ0^{aP#z^MPA6TN906LNe3IG5A diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu index e9ab71c6cf494580278007e987352b5c70406ffc..2f1a0e66f1450b416d66d6639d3233e767ef82ca 100644 GIT binary patch delta 72 zcmeC>JHodij*&e++1RM4ID2ypV;HlZ4wnKDB$t-tBo-H2StaM^m04MN<`(2+CTEto ar{?DR=BK1uS-HFV`o{Z(Y~I8Y%>)2MoEMA$ delta 17 YcmX@Y*UPsdj&XB3V+ix+mn_jt069qo>;M1& diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu index 47ff8c2816ac5fe384c241f39544e0c59ba5aa43..0d78e2bb8779e548639fe2bec90137d323649279 100644 GIT binary patch delta 71 zcmbQlcam>IJR@7Ou~AWR_U0JIXl6YfE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LyqzVQ2>>oQ7eN33 delta 17 YcmX@fH;HdUJmcmJ#t7!k?^vRl06H!P0RR91 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu index f6a8fea0b7b7739a55feb38fbb3a80d63783014d..3c128645f261956bdcfeaefb42fedb1dbdbbb966 100644 GIT binary patch delta 72 zcmeC@JI1#mo{>F1+1RM4ID2y}V>q*(4wnKDB$t-tBo-H2StaM^m04MN<`(2+CTEto ar{?DR=BK1uS-HFV`o{Z(Y~I2W%>)2OaTk>U delta 17 YcmX@c*Uz^ho^f*qV<_|H*DTRY06DD%^#A|> diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu index be9d5f0086fd2fdaee013b1e3058729efd285a80..f807bd3429dfe6e6bb96d3f07660b35bf23aa258 100644 GIT binary patch delta 71 zcmbQjcbacQ0wY_pu~AWR_U2f|7-l^kE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LyptuG2>>t%7fJvC delta 17 YcmX@jH-&FQ0^{aP#z^MPA6TN906LNe3IG5A diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu index 7f9b454f5c18d0a0855c0c5e3c46fe16d8a4338e..89604a7dfc2876e3f717e074a31f62b24ce85ce5 100644 GIT binary patch delta 71 zcmbQhcbsoS93xw@u~AWR_U35DC}uq!E(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-Lyp<)I2>>i;7dQX_ delta 17 YcmX@kH-T?M9OLG6#&G7%Z&;$406EG9_y7O^ diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_32x16x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_32x16x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu index e49d4672258e8642818a09b38ade176466272b0c..90271408d3a3066d86774bfd47378a64c0fb52cf 100644 GIT binary patch delta 57 zcmeC?JH)pkmXSR^+1RM4ID2z6V<JHodij*&e++1RM4ID2ypV;HlZ4wnKDB$t-tBo-H2StaM^m04MN<`(2+CTEto ar{?DR=BK1uS-HFV`o{Z(Y~I8Y%>)2MoEMA$ delta 17 YcmX@Y*UPsdj&XB3V+ix+mn_jt069qo>;M1& diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu index 36bd53e1970ef44ba1b60f71480973a70a9a37bf..e8685e45a27053891bebc17668c464f310de0d7f 100644 GIT binary patch delta 71 zcmbQlcam>IJR@7Ou~AWR_U0JIXl6YfE(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-LyqzVQ2>>oQ7eN33 delta 17 YcmX@fH;HdUJmcmJ#t7!k?^vRl06H!P0RR91 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu index bf0f2d37ea388f3b12c01a0e01570af3e0045510..eb9cf84796eb7b0db9c2df0f9ddb9a282641bd37 100644 GIT binary patch delta 57 zcmeC?JH)pkmXSR^+1RM4ID2z6V<>i;7dQX_ delta 17 YcmX@kH-T?M9OLG6#&G7%Z&;$406EG9_y7O^ diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu index 2f7395fed3393b15a5f29e858b281f0932888513..827651580e2bb37706ce6f0d15a66a8a6b583a64 100644 GIT binary patch delta 72 zcmeC?JH)pkk&!(<+1RM4ID2zEV<5Ah4wnKDB$t-tBo-H2StaM^m04MN<`(2+CTEto ar{?DR=BK1uS-HFV`o{Z(Y~H{U%>)2L+ZTrb delta 17 YcmX@a*Uh&fk#Tbtqd)WJ=Pc1o06856=Kufz diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu index 3f3d43b84ab45b7384ccd8f1b70a9c47218d5882..c0ae8376329e4272a89e27fba1db6cff6185e915 100644 GIT binary patch delta 71 zcmbQhcbsoS5+hr(u~AWR_U3rTFlId+E(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-Lyp<)I2>>m47d!v} delta 17 YcmX@kH-T?M665A<#$e{nZ&;$406GE%`~Uy| diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu index cfc62fa610f267b2e546e0957d284480610d44cc..2b66bb58a37e7512b40034f99815eda9250152fa 100644 GIT binary patch delta 72 zcmeC?JH)pkk&!(<+1RM4ID2zEV<5Ah4wnKDB$t-tBo-H2StaM^m04MN<`(2+CTEto ar{?DR=BK1uS-HFV`o{Z(Y~H{U%>)2L+ZTrb delta 17 YcmX@a*Uh&fk#Tbtqd)WJ=Pc1o06856=Kufz diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu index 3a6c6eb197a38cbd12ed44a364c1bf6c8e6937aa..aee8a5d19f54427a5c724c8c3edad21bb9861cef 100644 GIT binary patch delta 71 zcmbQhcbsoS5+hr(u~AWR_U3rTFlId+E(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-Lyp<)I2>>m47d!v} delta 17 YcmX@kH-T?M665A<#$e{nZ&;$406GE%`~Uy| diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu index 1599f6d766e95b39299c176224225e0cb08dde77..67e3717101cf077ecee50b093b1b63f817dde4de 100644 GIT binary patch delta 55 zcmeC=JHWRgfss8v+1RM4ID2y(V*vAHK^9R-&)kBX%;d}x_te~6-~5zRD=T+bU*CAY Kkj?8@qL~2hxDnF; delta 17 YcmX@W*U7gbfpK#tqaX9;r!3J-064h?-T(jq diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu index 6aefb1060b5276f11831a61826c315abe4087e6c..a9514f6cbc74e23cc673ab8d978d5886c9543d89 100644 GIT binary patch delta 71 zcmeC)2L+ZTrb delta 17 YcmX@a*Uh&fk#Tbtqd)WJ=Pc1o06856=Kufz diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu index 7b798cd2326515d646f6f7bf9eaae42edb284eb3..70ed2f42d18d1f5c2bfd215536da2c0667adba0c 100644 GIT binary patch delta 71 zcmbQhcbsoS5+hr(u~AWR_U3rTFlId+E(IV+E-lGPEH1XPO3u$Kv$FEcEy&4C&Ma|H Z&CT`APf4}1a(DIhjrR-Lyp<)I2>>m47d!v} delta 17 YcmX@kH-T?M665A<#$e{nZ&;$406GE%`~Uy| diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu index 5119b75ff0c9ce9eb782ecd19ece9577555835ae..6a5c7d2ee0309e29d2e1174e0e4e1a9e67fe5b60 100644 GIT binary patch delta 71 zcmeC-JIuEsfsrlQ*r=#Ddvh#f2(z9JmjVzZmzLxt78hGtCFkdrSy_4J7UX0mXO_69 Z=H~k5r=(h0xx4!M#`}eA-pCTo1OO@67b*Y% delta 17 YcmX@i*Tc6VfpK#tV<7Y97c9|C0697Z>Hq)$ diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_32x16x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_32x16x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu index f07ee7af8ee9937e7a7593dcb808b5471ffc161f..98e4034e6b4d9458fa7a67d25eedbcc195a83040 100644 GIT binary patch delta 55 zcmZqY+sC&do{>F1+1RM4ID2y}qd)Uxeil(l&)kBX%;d}x_te~6-~5zRD=T+bU*CAY Kkj-mYqL~2g77@q* delta 17 YcmdnT*Uq;go^f*qqc8L3$1KrI060|z)c^nh diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu index a86754dc5918d75b692fd301bf15219cb6ad3d05..70708175223832ff5bc9e2211467e00b576c2a40 100644 GIT binary patch delta 55 zcmeC=JHWRgfss8v+1RM4ID2y(V*vAHK^9R-&)kBX%;d}x_te~6-~5zRD=T+bU*CAY Kkj?8@qL~2hxDnF; delta 17 YcmX@W*U7gbfpK#tqaX9;r!3J-064h?-T(jq diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu index 07d3a81d545bb5a148ef4d0f9bbe1db506c46d46..84a88a6bf48b8d95a64636b8dab75aaea8128a4d 100644 GIT binary patch delta 71 zcmeCF1+1RM4ID2y}qd)Uxeil(l&)kBX%;d}x_te~6-~5zRD=T+bU*CAY Kkj-mYqL~2g77@q* delta 17 YcmdnT*Uq;go^f*qqc8L3$1KrI060|z)c^nh diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu index d5fd88917a13070e55bbe7b627a531f15d7696bb..46ed4028fb37779cf22c65c646fa7106f4ef7686 100644 GIT binary patch delta 71 zcmeC-JIuEsfsrlQ*r=#Ddvh#f2(z9JmjVzZmzLxt78hGtCFkdrSy_4J7UX0mXO_69 Z=H~k5r=(h0xx4!M#`}eA-pCTo1OO@67b*Y% delta 17 YcmX@i*Tc6VfpK#tV<7Y97c9|C0697Z>Hq)$ diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu index a0d811bf4fd523dfd65ea8476c4bbffa5ab67020..07f2e0a743a77ca13ddd15b59c24345ce8d8eb38 100644 GIT binary patch delta 57 zcmeC?JH)pkmXSR^+1RM4ID2z6V<>i;7dQX_ delta 17 YcmX@kH-T?M9OLG6#&G7%Z&;$406EG9_y7O^ diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu index 4f7a3e28a02bd11967f2e82d2671cbadb9c59328..ba90c2af3abc945c5dc062120ed07c4e01b118e8 100644 GIT binary patch delta 57 zcmeC?JH)pkmXSR^+1RM4ID2z6V<>i;7dQX_ delta 17 YcmX@kH-T?M9OLG6#&G7%Z&;$406EG9_y7O^ diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu index 650b53b7e54303bdd3ef1d75730ee248c7815af2..594b5e669358f49ed16a71202064f4a07f45c4c3 100644 GIT binary patch delta 55 zcmeC=JHWRghLJr!+1RM4ID2yxV+iwPK^9R-&)kBX%;d}x_te~6-~5zRD=T+bU*CAY Kkj?8@qL~2g=n>BV delta 17 YcmX@W*U7gbhH-N$V-WM^r!3J-062jK+5i9m diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu index a6c1709c2c960ad219225a4282541b290c9258fc..2548d3735ccd809475558ff78c2facc35dabe5be 100644 GIT binary patch delta 71 zcmeC>i;7dQX_ delta 17 YcmX@kH-T?M9OLG6#&G7%Z&;$406EG9_y7O^ diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu index b3bba802c7b25f85ca8ad7626e83fd881da7ba10..5876b00426ecd55f6ed0112e0aabb63cc1e4e9e8 100644 GIT binary patch delta 71 zcmeC-JIuEshLJ7V*r=#DdvhdX1hbwFmjVzZmzLxt78hGtCFkdrSy_4J7UX0mXO_69 Z=H~k5r=(h0xx4!M#`}eA-pCTo1OO<=7bXAz delta 17 YcmX@i*Tc6VhH-N$V<_|H7c9|C0678$<^TWy diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_32x16x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_32x16x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu index fff4773fa31488b6e46ca0d071a9c4cded52cdc2..d88ee03e44d68d0f4d46ff4454c9745e1329cd34 100644 GIT binary patch delta 55 zcmZqY+sC&dnvp#|+1RM4ID2y>V=(h%eil(l&)kBX%;d}x_te~6-~5zRD=T+bU*CAY Kkj-mYqL~2fMiImS delta 17 YcmdnT*Uq;gnsIXqV<7Y9$1KrI05}~5(EtDd diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu index 32455afbf1be5ce2912d57517a1885919cecb07d..13ab6aed77170f347c1c0d6c8dc331dd0b4554a7 100644 GIT binary patch delta 55 zcmeC=JHWRghLJr!+1RM4ID2yxV+iwPK^9R-&)kBX%;d}x_te~6-~5zRD=T+bU*CAY Kkj?8@qL~2g=n>BV delta 17 YcmX@W*U7gbhH-N$V-WM^r!3J-062jK+5i9m diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu index 169f71792d44b8e843fdacf044a8b75e905003d0..e0b8a0fd88c4e3f41b5e268ee2b366799c995b55 100644 GIT binary patch delta 71 zcmeCV=(h%eil(l&)kBX%;d}x_te~6-~5zRD=T+bU*CAY Kkj-mYqL~2fMiImS delta 17 YcmdnT*Uq;gnsIXqV<7Y9$1KrI05}~5(EtDd diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu similarity index 91% rename from dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu rename to dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu index 7c9302c97884a0faf45a9fb890ab0f1328f7fd30..8a5b7e8b037ae4f37a410bb11cc7e12b363d7f87 100644 GIT binary patch delta 71 zcmeC-JIuEshLJ7V*r=#DdvhdX1hbwFmjVzZmzLxt78hGtCFkdrSy_4J7UX0mXO_69 Z=H~k5r=(h0xx4!M#`}eA-pCTo1OO<=7bXAz delta 17 YcmX@i*Tc6VhH-N$V<_|H7c9|C0678$<^TWy diff --git a/dnn/src/cuda/conv_bias/opr_impl.h b/dnn/src/cuda/conv_bias/opr_impl.h index 07eaefd1a..43e612d2e 100644 --- a/dnn/src/cuda/conv_bias/opr_impl.h +++ b/dnn/src/cuda/conv_bias/opr_impl.h @@ -67,6 +67,9 @@ public: class AlgoInt4NCHW64IMMAImplicitGemmBase; class AlgoInt4Int4NCHW64IMMAImplicitGemm; class AlgoUInt4Int4NCHW64IMMAImplicitGemm; + class AlgoInt4NHWCIMMAImplicitGemmBase; + class AlgoInt4Int4NHWCIMMAImplicitGemm; + class AlgoUInt4Int4NHWCIMMAImplicitGemm; class AlgoBFloat16; class AlgoPack; diff --git a/dnn/src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cu b/dnn/src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cu index 1d054ae4f..63ca450e6 100644 --- a/dnn/src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cu +++ b/dnn/src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cu @@ -62,7 +62,7 @@ void megdnn::cuda::cutlass_wrapper::do_deconv_int8_implicit_gemm_dp4a_ncdiv4hw4( ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ cutlass::conv::threadblock:: \ ConvolutionDgradNCxHWxThreadblockSwizzle, \ - stage_, 4, aligned_>; \ + stage_, 4, aligned_, true, cutlass::arch::OpMultiplyAdd>; \ typename Deconvolution::ConvolutionParameter conv_param( \ param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ diff --git a/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp b/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp index 10f0c70d6..15380a34d 100644 --- a/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp +++ b/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp @@ -48,7 +48,8 @@ bool ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm:: // TODO: support dialtion available &= (fm.dilation[0] == 1 && fm.dilation[1] == 1); // FIXME: too large filter size is not supported now - available &= fm.spatial[0] * fm.spatial[1] <= 64; + available &= fm.spatial[0] * fm.spatial[1] <= + (uint32_t)(848 / (2 * m_algo_param.warp_k / 4) - 2); // only support sm_61 or later, platform should have fast native int8 // support available &= is_compute_capability_required(6, 1); diff --git a/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp b/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp index a8f2fd9d8..ccf466abd 100644 --- a/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp +++ b/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp @@ -50,7 +50,7 @@ bool ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm:: // TODO: support dialtion available &= (fm.dilation[0] == 1 && fm.dilation[1] == 1); // FIXME: too large filter size is not supported now - available &= fm.spatial[0] * fm.spatial[1] <= 64; + available &= fm.spatial[0] * fm.spatial[1] <= (848 / (2 * 8 / 4) - 2); // only support sm_61 or later, platform should have fast native int8 // support available &= is_compute_capability_required(6, 1); diff --git a/dnn/src/cuda/convolution/backward_data/int8/kimpl/cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x128x16_1_nc4hw4.cu b/dnn/src/cuda/convolution/backward_data/int8/kimpl/cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x128x16_1_nc4hw4_k4rsc4.cu similarity index 92% rename from dnn/src/cuda/convolution/backward_data/int8/kimpl/cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x128x16_1_nc4hw4.cu rename to dnn/src/cuda/convolution/backward_data/int8/kimpl/cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x128x16_1_nc4hw4_k4rsc4.cu index 63d8b005d12b85d227605f38e1ce943ebd54a88e..c0c8a8faf03dedfd5c801ef3d2938182bca4bfc4 100644 GIT binary patch delta 71 zcmdnO`;vD<3L{&5wn diff --git a/dnn/src/cuda/convolution/backward_data/int8/kimpl/cutlass_simt_s8_idgrad_identity_s8_64x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/convolution/backward_data/int8/kimpl/cutlass_simt_s8_idgrad_identity_s8_64x128x32_64x32x32_2_nc4hw4_k4rsc4.cu similarity index 92% rename from dnn/src/cuda/convolution/backward_data/int8/kimpl/cutlass_simt_s8_idgrad_identity_s8_64x128x32_64x32x32_2_nc4hw4.cu rename to dnn/src/cuda/convolution/backward_data/int8/kimpl/cutlass_simt_s8_idgrad_identity_s8_64x128x32_64x32x32_2_nc4hw4_k4rsc4.cu index 5eaa5d469f5c63686b53d679be09d077adcb0370..1da05f648284b2cdb9c76c458bd0ded28ee9f79d 100644 GIT binary patch delta 71 zcmdnY`+|2vG9z1jwn diff --git a/dnn/test/cuda/conv_test_utils.cpp b/dnn/test/cuda/conv_test_utils.cpp index f6c4abe39..387e97052 100644 --- a/dnn/test/cuda/conv_test_utils.cpp +++ b/dnn/test/cuda/conv_test_utils.cpp @@ -100,6 +100,11 @@ std::vector get_det_first_bench_args(size_t batch) { std::vector args; args.emplace_back(BenchArgs{batch, 4, 736, 1280, 16, 3, 2}); args.emplace_back(BenchArgs{batch, 16, 384, 640, 16, 3, 1}); + args.emplace_back(BenchArgs{batch, 16, 384, 640, 32, 3, 2}); + args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 3, 1}); + args.emplace_back(BenchArgs{batch, 32, 384, 640, 64, 3, 2}); + args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 1, 1}); + args.emplace_back(BenchArgs{batch, 32, 384, 640, 64, 1, 2}); return args; } -- GitLab