From 5a14a892246fad6329f99751036b576ca8b5a004 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Mon, 31 May 2021 11:41:30 +0800 Subject: [PATCH] refactor(dnn/cuda): refactor cutlass kernel generator for gemm and gemv GitOrigin-RevId: 11d78ab2270f0720d7d79e186124a1254c467980 --- dnn/scripts/Makefile | 14 ++++---- ...sh_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1826 -> 1769 bytes ...ish_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1824 -> 1767 bytes ...ish_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1824 -> 1767 bytes ...sh_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu | Bin 1825 -> 1768 bytes ...hswish_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu | Bin 1817 -> 1760 bytes ...ish_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu | Bin 1824 -> 1767 bytes ...wish_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu | Bin 1822 -> 1765 bytes ...wish_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu | Bin 1822 -> 1765 bytes ...ish_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1824 -> 1767 bytes ...wish_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1822 -> 1765 bytes ...wish_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1822 -> 1765 bytes ...ty_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1822 -> 1765 bytes ...ity_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1820 -> 1763 bytes ...ity_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1820 -> 1763 bytes ...ty_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu | Bin 1821 -> 1764 bytes ...entity_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu | Bin 1813 -> 1756 bytes ...ity_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu | Bin 1820 -> 1763 bytes ...tity_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu | Bin 1818 -> 1761 bytes ...tity_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu | Bin 1818 -> 1761 bytes ...ity_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1820 -> 1763 bytes ...tity_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1818 -> 1761 bytes ...tity_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1818 -> 1761 bytes ...lu_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1822 -> 1765 bytes ...elu_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1820 -> 1763 bytes ...elu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1820 -> 1763 bytes ...lu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu | Bin 1821 -> 1764 bytes ...1_relu_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu | Bin 1813 -> 1756 bytes ...elu_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu | Bin 1820 -> 1763 bytes ...relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu | Bin 1818 -> 1761 bytes ...relu_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu | Bin 1818 -> 1761 bytes ...elu_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1820 -> 1763 bytes ...relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1818 -> 1761 bytes ...relu_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1818 -> 1761 bytes ...sh_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1821 -> 1764 bytes ...ish_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1819 -> 1762 bytes ...ish_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1819 -> 1762 bytes ...sh_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu | Bin 1820 -> 1763 bytes ...hswish_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu | Bin 1812 -> 1755 bytes ...ish_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu | Bin 1819 -> 1762 bytes ...wish_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu | Bin 1817 -> 1760 bytes ...wish_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu | Bin 1817 -> 1760 bytes ...ish_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1819 -> 1762 bytes ...wish_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1817 -> 1760 bytes ...wish_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1817 -> 1760 bytes ...ty_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1817 -> 1760 bytes ...ity_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1815 -> 1758 bytes ...ity_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1815 -> 1758 bytes ...ty_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu | Bin 1816 -> 1759 bytes ...entity_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu | Bin 1808 -> 1751 bytes ...ity_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu | Bin 1815 -> 1758 bytes ...tity_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu | Bin 1813 -> 1756 bytes ...tity_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu | Bin 1813 -> 1756 bytes ...ity_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1815 -> 1758 bytes ...tity_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1813 -> 1756 bytes ...tity_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1813 -> 1756 bytes ...lu_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1817 -> 1760 bytes ...elu_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1815 -> 1758 bytes ...elu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1815 -> 1758 bytes ...lu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu | Bin 1816 -> 1759 bytes ...p_relu_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu | Bin 1808 -> 1751 bytes ...elu_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu | Bin 1815 -> 1758 bytes ...relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu | Bin 1813 -> 1756 bytes ...relu_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu | Bin 1813 -> 1756 bytes ...elu_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1815 -> 1758 bytes ...relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1813 -> 1756 bytes ...relu_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu | Bin 1813 -> 1756 bytes ...sh_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1851 -> 1794 bytes ...ish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1849 -> 1792 bytes ...ish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1849 -> 1792 bytes ...sh_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu | Bin 1850 -> 1793 bytes ...hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu | Bin 1842 -> 1785 bytes ...ish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu | Bin 1849 -> 1792 bytes ...wish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu | Bin 1847 -> 1790 bytes ...wish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu | Bin 1847 -> 1790 bytes ...ish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1849 -> 1792 bytes ...wish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1847 -> 1790 bytes ...wish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1847 -> 1790 bytes ...ty_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1847 -> 1790 bytes ...ity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1845 -> 1788 bytes ...ity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1845 -> 1788 bytes ...ty_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu | Bin 1846 -> 1789 bytes ...entity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu | Bin 1838 -> 1781 bytes ...ity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu | Bin 1845 -> 1788 bytes ...tity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu | Bin 1843 -> 1786 bytes ...tity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu | Bin 1843 -> 1786 bytes ...ity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1845 -> 1788 bytes ...tity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1843 -> 1786 bytes ...tity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1843 -> 1786 bytes ...lu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1847 -> 1790 bytes ...elu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1845 -> 1788 bytes ...elu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1845 -> 1788 bytes ...lu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu | Bin 1846 -> 1789 bytes ...p_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu | Bin 1838 -> 1781 bytes ...elu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu | Bin 1845 -> 1788 bytes ...relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu | Bin 1843 -> 1786 bytes ...relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu | Bin 1843 -> 1786 bytes ...elu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1845 -> 1788 bytes ...relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1843 -> 1786 bytes ...relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1843 -> 1786 bytes ..._hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu | Bin 1841 -> 1784 bytes ...8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1852 -> 1795 bytes ...1_hswish_s8_128x32x32_64x32x32_2_nc4hw4.cu | Bin 1839 -> 1782 bytes ...s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1850 -> 1793 bytes ...1_hswish_s8_128x64x32_64x32x32_2_nc4hw4.cu | Bin 1839 -> 1782 bytes ...s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1850 -> 1793 bytes ..._hswish_s8_16x128x16_16x128x16_1_nc4hw4.cu | Bin 1840 -> 1783 bytes ..._1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu | Bin 1832 -> 1775 bytes ...1_hswish_s8_32x128x32_32x64x32_2_nc4hw4.cu | Bin 1839 -> 1782 bytes ...s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu | Bin 1850 -> 1793 bytes ...x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4.cu | Bin 1837 -> 1780 bytes ..._s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu | Bin 1848 -> 1791 bytes ...x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4.cu | Bin 1837 -> 1780 bytes ..._s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu | Bin 1848 -> 1791 bytes ...1_hswish_s8_64x128x32_64x32x32_2_nc4hw4.cu | Bin 1839 -> 1782 bytes ...s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1850 -> 1793 bytes ...x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4.cu | Bin 1837 -> 1780 bytes ..._s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1848 -> 1791 bytes ...x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4.cu | Bin 1837 -> 1780 bytes ..._s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1848 -> 1791 bytes ...dentity_s8_128x128x32_64x32x32_2_nc4hw4.cu | Bin 1837 -> 1780 bytes ...8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1848 -> 1791 bytes ...identity_s8_128x32x32_64x32x32_2_nc4hw4.cu | Bin 1835 -> 1778 bytes ...s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1846 -> 1789 bytes ...identity_s8_128x64x32_64x32x32_2_nc4hw4.cu | Bin 1835 -> 1778 bytes ...s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1846 -> 1789 bytes ...dentity_s8_16x128x16_16x128x16_1_nc4hw4.cu | Bin 1836 -> 1779 bytes ...x1_identity_s8_16x64x8_16x64x8_2_nc4hw4.cu | Bin 1828 -> 1771 bytes ...identity_s8_32x128x32_32x64x32_2_nc4hw4.cu | Bin 1835 -> 1778 bytes ...s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu | Bin 1846 -> 1789 bytes ..._identity_s8_32x32x32_32x32x32_2_nc4hw4.cu | Bin 1833 -> 1776 bytes ..._s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu | Bin 1844 -> 1787 bytes ..._identity_s8_32x64x32_32x64x32_2_nc4hw4.cu | Bin 1833 -> 1776 bytes ..._s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu | Bin 1844 -> 1787 bytes ...identity_s8_64x128x32_64x32x32_2_nc4hw4.cu | Bin 1835 -> 1778 bytes ...s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1846 -> 1789 bytes ..._identity_s8_64x32x32_64x32x32_2_nc4hw4.cu | Bin 1833 -> 1776 bytes ..._s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1844 -> 1787 bytes ..._identity_s8_64x64x32_64x32x32_2_nc4hw4.cu | Bin 1833 -> 1776 bytes ..._s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1844 -> 1787 bytes ...x1_relu_s8_128x128x32_64x32x32_2_nc4hw4.cu | Bin 1837 -> 1780 bytes ...8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1848 -> 1791 bytes ...1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4.cu | Bin 1835 -> 1778 bytes ...s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1846 -> 1789 bytes ...1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4.cu | Bin 1835 -> 1778 bytes ...s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1846 -> 1789 bytes ...x1_relu_s8_16x128x16_16x128x16_1_nc4hw4.cu | Bin 1836 -> 1779 bytes ...op_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4.cu | Bin 1828 -> 1771 bytes ...1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4.cu | Bin 1835 -> 1778 bytes ...s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu | Bin 1846 -> 1789 bytes ..._1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4.cu | Bin 1833 -> 1776 bytes ..._s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu | Bin 1844 -> 1787 bytes ..._1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4.cu | Bin 1833 -> 1776 bytes ..._s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu | Bin 1844 -> 1787 bytes ...1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4.cu | Bin 1835 -> 1778 bytes ...s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1846 -> 1789 bytes ..._1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4.cu | Bin 1833 -> 1776 bytes ..._s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1844 -> 1787 bytes ..._1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4.cu | Bin 1833 -> 1776 bytes ..._s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1844 -> 1787 bytes ..._hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu | Bin 1836 -> 1779 bytes ...8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1847 -> 1790 bytes ...p_hswish_s8_128x32x32_64x32x32_2_nc4hw4.cu | Bin 1834 -> 1777 bytes ...s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1845 -> 1788 bytes ...p_hswish_s8_128x64x32_64x32x32_2_nc4hw4.cu | Bin 1834 -> 1777 bytes ...s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1845 -> 1788 bytes ..._hswish_s8_16x128x16_16x128x16_1_nc4hw4.cu | Bin 1835 -> 1778 bytes ...prop_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu | Bin 1827 -> 1770 bytes ...p_hswish_s8_32x128x32_32x64x32_2_nc4hw4.cu | Bin 1834 -> 1777 bytes ...s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu | Bin 1845 -> 1788 bytes ...op_hswish_s8_32x32x32_32x32x32_2_nc4hw4.cu | Bin 1832 -> 1775 bytes ..._s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu | Bin 1843 -> 1786 bytes ...op_hswish_s8_32x64x32_32x64x32_2_nc4hw4.cu | Bin 1832 -> 1775 bytes ..._s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu | Bin 1843 -> 1786 bytes ...p_hswish_s8_64x128x32_64x32x32_2_nc4hw4.cu | Bin 1834 -> 1777 bytes ...s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1845 -> 1788 bytes ...op_hswish_s8_64x32x32_64x32x32_2_nc4hw4.cu | Bin 1832 -> 1775 bytes ..._s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1843 -> 1786 bytes ...op_hswish_s8_64x64x32_64x32x32_2_nc4hw4.cu | Bin 1832 -> 1775 bytes ..._s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1843 -> 1786 bytes ...dentity_s8_128x128x32_64x32x32_2_nc4hw4.cu | Bin 1832 -> 1775 bytes ...8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1843 -> 1786 bytes ...identity_s8_128x32x32_64x32x32_2_nc4hw4.cu | Bin 1830 -> 1773 bytes ...s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1841 -> 1784 bytes ...identity_s8_128x64x32_64x32x32_2_nc4hw4.cu | Bin 1830 -> 1773 bytes ...s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1841 -> 1784 bytes ...dentity_s8_16x128x16_16x128x16_1_nc4hw4.cu | Bin 1831 -> 1774 bytes ...op_identity_s8_16x64x8_16x64x8_2_nc4hw4.cu | Bin 1823 -> 1766 bytes ...identity_s8_32x128x32_32x64x32_2_nc4hw4.cu | Bin 1830 -> 1773 bytes ...s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu | Bin 1841 -> 1784 bytes ..._identity_s8_32x32x32_32x32x32_2_nc4hw4.cu | Bin 1828 -> 1771 bytes ..._s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu | Bin 1839 -> 1782 bytes ..._identity_s8_32x64x32_32x64x32_2_nc4hw4.cu | Bin 1828 -> 1771 bytes ..._s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu | Bin 1839 -> 1782 bytes ...identity_s8_64x128x32_64x32x32_2_nc4hw4.cu | Bin 1830 -> 1773 bytes ...s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1841 -> 1784 bytes ..._identity_s8_64x32x32_64x32x32_2_nc4hw4.cu | Bin 1828 -> 1771 bytes ..._s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1839 -> 1782 bytes ..._identity_s8_64x64x32_64x32x32_2_nc4hw4.cu | Bin 1828 -> 1771 bytes ..._s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1839 -> 1782 bytes ...op_relu_s8_128x128x32_64x32x32_2_nc4hw4.cu | Bin 1832 -> 1775 bytes ...8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1843 -> 1786 bytes ...rop_relu_s8_128x32x32_64x32x32_2_nc4hw4.cu | Bin 1830 -> 1773 bytes ...s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1841 -> 1784 bytes ...rop_relu_s8_128x64x32_64x32x32_2_nc4hw4.cu | Bin 1830 -> 1773 bytes ...s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1841 -> 1784 bytes ...op_relu_s8_16x128x16_16x128x16_1_nc4hw4.cu | Bin 1831 -> 1774 bytes ...ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4.cu | Bin 1823 -> 1766 bytes ...rop_relu_s8_32x128x32_32x64x32_2_nc4hw4.cu | Bin 1830 -> 1773 bytes ...s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu | Bin 1841 -> 1784 bytes ...prop_relu_s8_32x32x32_32x32x32_2_nc4hw4.cu | Bin 1828 -> 1771 bytes ..._s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu | Bin 1839 -> 1782 bytes ...prop_relu_s8_32x64x32_32x64x32_2_nc4hw4.cu | Bin 1828 -> 1771 bytes ..._s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu | Bin 1839 -> 1782 bytes ...rop_relu_s8_64x128x32_64x32x32_2_nc4hw4.cu | Bin 1830 -> 1773 bytes ...s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1841 -> 1784 bytes ...prop_relu_s8_64x32x32_64x32x32_2_nc4hw4.cu | Bin 1828 -> 1771 bytes ..._s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1839 -> 1782 bytes ...prop_relu_s8_64x64x32_64x32x32_2_nc4hw4.cu | Bin 1828 -> 1771 bytes ..._s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu | Bin 1839 -> 1782 bytes ...sh_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1853 -> 1796 bytes ...ish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1851 -> 1794 bytes ...ish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1851 -> 1794 bytes ...sh_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu | Bin 1852 -> 1795 bytes ...hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu | Bin 1844 -> 1787 bytes ...ish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu | Bin 1851 -> 1794 bytes ...wish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu | Bin 1849 -> 1792 bytes ...wish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu | Bin 1849 -> 1792 bytes ...ish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1851 -> 1794 bytes ...wish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1849 -> 1792 bytes ...wish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1849 -> 1792 bytes ...ty_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1849 -> 1792 bytes ...ity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1847 -> 1790 bytes ...ity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1847 -> 1790 bytes ...ty_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu | Bin 1848 -> 1791 bytes ...entity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu | Bin 1840 -> 1783 bytes ...ity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu | Bin 1847 -> 1790 bytes ...tity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu | Bin 1845 -> 1788 bytes ...tity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu | Bin 1845 -> 1788 bytes ...ity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1847 -> 1790 bytes ...tity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1845 -> 1788 bytes ...tity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1845 -> 1788 bytes ...lu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1849 -> 1792 bytes ...elu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1847 -> 1790 bytes ...elu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1847 -> 1790 bytes ...lu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu | Bin 1848 -> 1791 bytes ...p_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu | Bin 1840 -> 1783 bytes ...elu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu | Bin 1847 -> 1790 bytes ...relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu | Bin 1845 -> 1788 bytes ...relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu | Bin 1845 -> 1788 bytes ...elu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1847 -> 1790 bytes ...relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1845 -> 1788 bytes ...relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1845 -> 1788 bytes ...swish_s8_128x128x64_64x64x64_2_nc32hw32.cu | Bin 1869 -> 1812 bytes ...8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu | Bin 1874 -> 1817 bytes ...swish_s8_128x256x64_64x64x64_2_nc32hw32.cu | Bin 1869 -> 1812 bytes ...8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu | Bin 1874 -> 1817 bytes ...hswish_s8_128x64x64_64x32x64_2_nc32hw32.cu | Bin 1867 -> 1810 bytes ...s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu | Bin 1872 -> 1815 bytes ...swish_s8_256x128x64_64x64x64_2_nc32hw32.cu | Bin 1869 -> 1812 bytes ...8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu | Bin 1874 -> 1817 bytes ..._s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu | Bin 1870 -> 1813 bytes ..._hswish_s8_32x64x64_32x16x64_2_nc32hw32.cu | Bin 1865 -> 1808 bytes ...hswish_s8_64x128x64_32x64x64_2_nc32hw32.cu | Bin 1867 -> 1810 bytes ...s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu | Bin 1872 -> 1815 bytes ..._hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu | Bin 1865 -> 1808 bytes ..._s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu | Bin 1870 -> 1813 bytes ...ntity_s8_128x128x64_64x64x64_2_nc32hw32.cu | Bin 1865 -> 1808 bytes ...8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu | Bin 1870 -> 1813 bytes ...ntity_s8_128x256x64_64x64x64_2_nc32hw32.cu | Bin 1865 -> 1808 bytes ...8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu | Bin 1870 -> 1813 bytes ...entity_s8_128x64x64_64x32x64_2_nc32hw32.cu | Bin 1863 -> 1806 bytes ...s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu | Bin 1868 -> 1811 bytes ...ntity_s8_256x128x64_64x64x64_2_nc32hw32.cu | Bin 1865 -> 1808 bytes ...8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu | Bin 1870 -> 1813 bytes ..._s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu | Bin 1866 -> 1809 bytes ...dentity_s8_32x64x64_32x16x64_2_nc32hw32.cu | Bin 1861 -> 1804 bytes ...entity_s8_64x128x64_32x64x64_2_nc32hw32.cu | Bin 1863 -> 1806 bytes ...s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu | Bin 1868 -> 1811 bytes ...dentity_s8_64x64x64_32x32x64_2_nc32hw32.cu | Bin 1861 -> 1804 bytes ..._s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu | Bin 1866 -> 1809 bytes ..._relu_s8_128x128x64_64x64x64_2_nc32hw32.cu | Bin 1865 -> 1808 bytes ...8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu | Bin 1870 -> 1813 bytes ..._relu_s8_128x256x64_64x64x64_2_nc32hw32.cu | Bin 1865 -> 1808 bytes ...8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu | Bin 1870 -> 1813 bytes ...1_relu_s8_128x64x64_64x32x64_2_nc32hw32.cu | Bin 1863 -> 1806 bytes ...s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu | Bin 1868 -> 1811 bytes ..._relu_s8_256x128x64_64x64x64_2_nc32hw32.cu | Bin 1865 -> 1808 bytes ...8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu | Bin 1870 -> 1813 bytes ..._s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu | Bin 1866 -> 1809 bytes ...x1_relu_s8_32x64x64_32x16x64_2_nc32hw32.cu | Bin 1861 -> 1804 bytes ...1_relu_s8_64x128x64_32x64x64_2_nc32hw32.cu | Bin 1863 -> 1806 bytes ...s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu | Bin 1868 -> 1811 bytes ...x1_relu_s8_64x64x64_32x32x64_2_nc32hw32.cu | Bin 1861 -> 1804 bytes ..._s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu | Bin 1866 -> 1809 bytes ...swish_s8_128x128x64_64x64x64_2_nc32hw32.cu | Bin 1864 -> 1807 bytes ...8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu | Bin 1869 -> 1812 bytes ...swish_s8_128x256x64_64x64x64_2_nc32hw32.cu | Bin 1864 -> 1807 bytes ...8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu | Bin 1869 -> 1812 bytes ...hswish_s8_128x64x64_64x32x64_2_nc32hw32.cu | Bin 1862 -> 1805 bytes ...s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu | Bin 1867 -> 1810 bytes ...swish_s8_256x128x64_64x64x64_2_nc32hw32.cu | Bin 1864 -> 1807 bytes ...8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu | Bin 1869 -> 1812 bytes ..._s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu | Bin 1865 -> 1808 bytes ..._hswish_s8_32x64x64_32x16x64_2_nc32hw32.cu | Bin 1860 -> 1803 bytes ...hswish_s8_64x128x64_32x64x64_2_nc32hw32.cu | Bin 1862 -> 1805 bytes ...s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu | Bin 1867 -> 1810 bytes ..._hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu | Bin 1860 -> 1803 bytes ..._s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu | Bin 1865 -> 1808 bytes ...ntity_s8_128x128x64_64x64x64_2_nc32hw32.cu | Bin 1860 -> 1803 bytes ...8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu | Bin 1865 -> 1808 bytes ...ntity_s8_128x256x64_64x64x64_2_nc32hw32.cu | Bin 1860 -> 1803 bytes ...8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu | Bin 1865 -> 1808 bytes ...entity_s8_128x64x64_64x32x64_2_nc32hw32.cu | Bin 1858 -> 1801 bytes ...s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu | Bin 1863 -> 1806 bytes ...ntity_s8_256x128x64_64x64x64_2_nc32hw32.cu | Bin 1860 -> 1803 bytes ...8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu | Bin 1865 -> 1808 bytes ..._s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu | Bin 1861 -> 1804 bytes ...dentity_s8_32x64x64_32x16x64_2_nc32hw32.cu | Bin 1856 -> 1799 bytes ...entity_s8_64x128x64_32x64x64_2_nc32hw32.cu | Bin 1858 -> 1801 bytes ...s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu | Bin 1863 -> 1806 bytes ...dentity_s8_64x64x64_32x32x64_2_nc32hw32.cu | Bin 1856 -> 1799 bytes ..._s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu | Bin 1861 -> 1804 bytes ..._relu_s8_128x128x64_64x64x64_2_nc32hw32.cu | Bin 1860 -> 1803 bytes ...8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu | Bin 1865 -> 1808 bytes ..._relu_s8_128x256x64_64x64x64_2_nc32hw32.cu | Bin 1860 -> 1803 bytes ...8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu | Bin 1865 -> 1808 bytes ...p_relu_s8_128x64x64_64x32x64_2_nc32hw32.cu | Bin 1858 -> 1801 bytes ...s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu | Bin 1863 -> 1806 bytes ..._relu_s8_256x128x64_64x64x64_2_nc32hw32.cu | Bin 1860 -> 1803 bytes ...8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu | Bin 1865 -> 1808 bytes ..._s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu | Bin 1861 -> 1804 bytes ...op_relu_s8_32x64x64_32x16x64_2_nc32hw32.cu | Bin 1856 -> 1799 bytes ...p_relu_s8_64x128x64_32x64x64_2_nc32hw32.cu | Bin 1858 -> 1801 bytes ...s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu | Bin 1863 -> 1806 bytes ...op_relu_s8_64x64x64_32x32x64_2_nc32hw32.cu | Bin 1856 -> 1799 bytes ..._s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu | Bin 1861 -> 1804 bytes ...cuinl => cutlass_matrix_mul_wrapper.cuinl} | 0 ...ix_mul_wrapper_batched_gemv_strided.cuinl} | 0 ...utlass_simt_sgemm_128x128_8x2_nn_align1.cu | Bin 0 -> 1813 bytes ...utlass_simt_sgemm_128x128_8x2_nt_align1.cu | Bin 0 -> 1810 bytes ...utlass_simt_sgemm_128x128_8x2_tn_align1.cu | Bin 0 -> 1810 bytes ...utlass_simt_sgemm_128x128_8x2_tt_align1.cu | Bin 0 -> 1807 bytes ...cutlass_simt_sgemm_128x32_8x2_nn_align1.cu | Bin 0 -> 1805 bytes ...cutlass_simt_sgemm_128x32_8x2_nt_align1.cu | Bin 0 -> 1802 bytes ...cutlass_simt_sgemm_128x32_8x2_tn_align1.cu | Bin 0 -> 1802 bytes ...cutlass_simt_sgemm_128x32_8x2_tt_align1.cu | Bin 0 -> 1799 bytes ...cutlass_simt_sgemm_128x64_8x2_nn_align1.cu | Bin 0 -> 1805 bytes ...cutlass_simt_sgemm_128x64_8x2_nt_align1.cu | Bin 0 -> 1802 bytes ...cutlass_simt_sgemm_128x64_8x2_tn_align1.cu | Bin 0 -> 1802 bytes ...cutlass_simt_sgemm_128x64_8x2_tt_align1.cu | Bin 0 -> 1799 bytes ...cutlass_simt_sgemm_16x128_8x2_nn_align1.cu | Bin 0 -> 1805 bytes ...cutlass_simt_sgemm_16x128_8x2_nt_align1.cu | Bin 0 -> 1802 bytes ...cutlass_simt_sgemm_16x128_8x2_tn_align1.cu | Bin 0 -> 1802 bytes ...cutlass_simt_sgemm_16x128_8x2_tt_align1.cu | Bin 0 -> 1799 bytes .../cutlass_simt_sgemm_16x32_8x2_nn_align1.cu | Bin 0 -> 1797 bytes .../cutlass_simt_sgemm_16x32_8x2_nt_align1.cu | Bin 0 -> 1794 bytes .../cutlass_simt_sgemm_16x32_8x2_tn_align1.cu | Bin 0 -> 1794 bytes .../cutlass_simt_sgemm_16x32_8x2_tt_align1.cu | Bin 0 -> 1791 bytes .../cutlass_simt_sgemm_16x64_8x2_nn_align1.cu | Bin 0 -> 1797 bytes .../cutlass_simt_sgemm_16x64_8x2_nt_align1.cu | Bin 0 -> 1794 bytes .../cutlass_simt_sgemm_16x64_8x2_tn_align1.cu | Bin 0 -> 1794 bytes .../cutlass_simt_sgemm_16x64_8x2_tt_align1.cu | Bin 0 -> 1791 bytes ...cutlass_simt_sgemm_256x32_8x2_nn_align1.cu | Bin 0 -> 1805 bytes ...cutlass_simt_sgemm_256x32_8x2_nt_align1.cu | Bin 0 -> 1802 bytes ...cutlass_simt_sgemm_256x32_8x2_tn_align1.cu | Bin 0 -> 1802 bytes ...cutlass_simt_sgemm_256x32_8x2_tt_align1.cu | Bin 0 -> 1799 bytes ...cutlass_simt_sgemm_256x64_8x2_nn_align1.cu | Bin 0 -> 1805 bytes ...cutlass_simt_sgemm_256x64_8x2_nt_align1.cu | Bin 0 -> 1802 bytes ...cutlass_simt_sgemm_256x64_8x2_tn_align1.cu | Bin 0 -> 1802 bytes ...cutlass_simt_sgemm_256x64_8x2_tt_align1.cu | Bin 0 -> 1799 bytes ...cutlass_simt_sgemm_32x128_8x2_nn_align1.cu | Bin 0 -> 1805 bytes ...cutlass_simt_sgemm_32x128_8x2_nt_align1.cu | Bin 0 -> 1802 bytes ...cutlass_simt_sgemm_32x128_8x2_tn_align1.cu | Bin 0 -> 1802 bytes ...cutlass_simt_sgemm_32x128_8x2_tt_align1.cu | Bin 0 -> 1799 bytes ...cutlass_simt_sgemm_32x256_8x2_nn_align1.cu | Bin 0 -> 1805 bytes ...cutlass_simt_sgemm_32x256_8x2_nt_align1.cu | Bin 0 -> 1802 bytes ...cutlass_simt_sgemm_32x256_8x2_tn_align1.cu | Bin 0 -> 1802 bytes ...cutlass_simt_sgemm_32x256_8x2_tt_align1.cu | Bin 0 -> 1799 bytes .../cutlass_simt_sgemm_32x32_8x2_nn_align1.cu | Bin 0 -> 1797 bytes .../cutlass_simt_sgemm_32x32_8x2_nt_align1.cu | Bin 0 -> 1794 bytes .../cutlass_simt_sgemm_32x32_8x2_tn_align1.cu | Bin 0 -> 1794 bytes .../cutlass_simt_sgemm_32x32_8x2_tt_align1.cu | Bin 0 -> 1791 bytes .../cutlass_simt_sgemm_32x64_8x2_nn_align1.cu | Bin 0 -> 1797 bytes .../cutlass_simt_sgemm_32x64_8x2_nt_align1.cu | Bin 0 -> 1794 bytes .../cutlass_simt_sgemm_32x64_8x2_tn_align1.cu | Bin 0 -> 1794 bytes .../cutlass_simt_sgemm_32x64_8x2_tt_align1.cu | Bin 0 -> 1791 bytes ...cutlass_simt_sgemm_64x128_8x2_nn_align1.cu | Bin 0 -> 1805 bytes ...cutlass_simt_sgemm_64x128_8x2_nt_align1.cu | Bin 0 -> 1802 bytes ...cutlass_simt_sgemm_64x128_8x2_tn_align1.cu | Bin 0 -> 1802 bytes ...cutlass_simt_sgemm_64x128_8x2_tt_align1.cu | Bin 0 -> 1799 bytes ...cutlass_simt_sgemm_64x256_8x2_nn_align1.cu | Bin 0 -> 1805 bytes ...cutlass_simt_sgemm_64x256_8x2_nt_align1.cu | Bin 0 -> 1802 bytes ...cutlass_simt_sgemm_64x256_8x2_tn_align1.cu | Bin 0 -> 1802 bytes ...cutlass_simt_sgemm_64x256_8x2_tt_align1.cu | Bin 0 -> 1799 bytes .../cutlass_simt_sgemm_64x32_8x2_nn_align1.cu | Bin 0 -> 1797 bytes .../cutlass_simt_sgemm_64x32_8x2_nt_align1.cu | Bin 0 -> 1794 bytes .../cutlass_simt_sgemm_64x32_8x2_tn_align1.cu | Bin 0 -> 1794 bytes .../cutlass_simt_sgemm_64x32_8x2_tt_align1.cu | Bin 0 -> 1791 bytes .../cutlass_simt_sgemm_64x64_8x2_nn_align1.cu | Bin 0 -> 1797 bytes .../cutlass_simt_sgemm_64x64_8x2_nt_align1.cu | Bin 0 -> 1794 bytes .../cutlass_simt_sgemm_64x64_8x2_tn_align1.cu | Bin 0 -> 1794 bytes .../cutlass_simt_sgemm_64x64_8x2_tt_align1.cu | Bin 0 -> 1791 bytes .../cutlass_simt_sgemm_8x32_8x2_nn_align1.cu | Bin 0 -> 1788 bytes .../cutlass_simt_sgemm_8x32_8x2_nt_align1.cu | Bin 0 -> 1785 bytes .../cutlass_simt_sgemm_8x32_8x2_tn_align1.cu | Bin 0 -> 1785 bytes .../cutlass_simt_sgemm_8x32_8x2_tt_align1.cu | Bin 0 -> 1782 bytes ..._split_k_parallel_128x128_8x2_nn_align1.cu | Bin 0 -> 1808 bytes ..._split_k_parallel_128x128_8x2_nt_align1.cu | Bin 0 -> 1805 bytes ..._split_k_parallel_128x128_8x2_tn_align1.cu | Bin 0 -> 1805 bytes ..._split_k_parallel_128x128_8x2_tt_align1.cu | Bin 0 -> 1802 bytes ...m_split_k_parallel_128x32_8x2_nn_align1.cu | Bin 0 -> 1800 bytes ...m_split_k_parallel_128x32_8x2_nt_align1.cu | Bin 0 -> 1797 bytes ...m_split_k_parallel_128x32_8x2_tn_align1.cu | Bin 0 -> 1797 bytes ...m_split_k_parallel_128x32_8x2_tt_align1.cu | Bin 0 -> 1794 bytes ...m_split_k_parallel_128x64_8x2_nn_align1.cu | Bin 0 -> 1800 bytes ...m_split_k_parallel_128x64_8x2_nt_align1.cu | Bin 0 -> 1797 bytes ...m_split_k_parallel_128x64_8x2_tn_align1.cu | Bin 0 -> 1797 bytes ...m_split_k_parallel_128x64_8x2_tt_align1.cu | Bin 0 -> 1794 bytes ...m_split_k_parallel_16x128_8x2_nn_align1.cu | Bin 0 -> 1800 bytes ...m_split_k_parallel_16x128_8x2_nt_align1.cu | Bin 0 -> 1797 bytes ...m_split_k_parallel_16x128_8x2_tn_align1.cu | Bin 0 -> 1797 bytes ...m_split_k_parallel_16x128_8x2_tt_align1.cu | Bin 0 -> 1794 bytes ...mm_split_k_parallel_16x32_8x2_nn_align1.cu | Bin 0 -> 1792 bytes ...mm_split_k_parallel_16x32_8x2_nt_align1.cu | Bin 0 -> 1789 bytes ...mm_split_k_parallel_16x32_8x2_tn_align1.cu | Bin 0 -> 1789 bytes ...mm_split_k_parallel_16x32_8x2_tt_align1.cu | Bin 0 -> 1786 bytes ...mm_split_k_parallel_16x64_8x2_nn_align1.cu | Bin 0 -> 1792 bytes ...mm_split_k_parallel_16x64_8x2_nt_align1.cu | Bin 0 -> 1789 bytes ...mm_split_k_parallel_16x64_8x2_tn_align1.cu | Bin 0 -> 1789 bytes ...mm_split_k_parallel_16x64_8x2_tt_align1.cu | Bin 0 -> 1786 bytes ...m_split_k_parallel_256x32_8x2_nn_align1.cu | Bin 0 -> 1800 bytes ...m_split_k_parallel_256x32_8x2_nt_align1.cu | Bin 0 -> 1797 bytes ...m_split_k_parallel_256x32_8x2_tn_align1.cu | Bin 0 -> 1797 bytes ...m_split_k_parallel_256x32_8x2_tt_align1.cu | Bin 0 -> 1794 bytes ...m_split_k_parallel_256x64_8x2_nn_align1.cu | Bin 0 -> 1800 bytes ...m_split_k_parallel_256x64_8x2_nt_align1.cu | Bin 0 -> 1797 bytes ...m_split_k_parallel_256x64_8x2_tn_align1.cu | Bin 0 -> 1797 bytes ...m_split_k_parallel_256x64_8x2_tt_align1.cu | Bin 0 -> 1794 bytes ...m_split_k_parallel_32x128_8x2_nn_align1.cu | Bin 0 -> 1800 bytes ...m_split_k_parallel_32x128_8x2_nt_align1.cu | Bin 0 -> 1797 bytes ...m_split_k_parallel_32x128_8x2_tn_align1.cu | Bin 0 -> 1797 bytes ...m_split_k_parallel_32x128_8x2_tt_align1.cu | Bin 0 -> 1794 bytes ...m_split_k_parallel_32x256_8x2_nn_align1.cu | Bin 0 -> 1800 bytes ...m_split_k_parallel_32x256_8x2_nt_align1.cu | Bin 0 -> 1797 bytes ...m_split_k_parallel_32x256_8x2_tn_align1.cu | Bin 0 -> 1797 bytes ...m_split_k_parallel_32x256_8x2_tt_align1.cu | Bin 0 -> 1794 bytes ...mm_split_k_parallel_32x32_8x2_nn_align1.cu | Bin 0 -> 1792 bytes ...mm_split_k_parallel_32x32_8x2_nt_align1.cu | Bin 0 -> 1789 bytes ...mm_split_k_parallel_32x32_8x2_tn_align1.cu | Bin 0 -> 1789 bytes ...mm_split_k_parallel_32x32_8x2_tt_align1.cu | Bin 0 -> 1786 bytes ...mm_split_k_parallel_32x64_8x2_nn_align1.cu | Bin 0 -> 1792 bytes ...mm_split_k_parallel_32x64_8x2_nt_align1.cu | Bin 0 -> 1789 bytes ...mm_split_k_parallel_32x64_8x2_tn_align1.cu | Bin 0 -> 1789 bytes ...mm_split_k_parallel_32x64_8x2_tt_align1.cu | Bin 0 -> 1786 bytes ...m_split_k_parallel_64x128_8x2_nn_align1.cu | Bin 0 -> 1800 bytes ...m_split_k_parallel_64x128_8x2_nt_align1.cu | Bin 0 -> 1797 bytes ...m_split_k_parallel_64x128_8x2_tn_align1.cu | Bin 0 -> 1797 bytes ...m_split_k_parallel_64x128_8x2_tt_align1.cu | Bin 0 -> 1794 bytes ...m_split_k_parallel_64x256_8x2_nn_align1.cu | Bin 0 -> 1800 bytes ...m_split_k_parallel_64x256_8x2_nt_align1.cu | Bin 0 -> 1797 bytes ...m_split_k_parallel_64x256_8x2_tn_align1.cu | Bin 0 -> 1797 bytes ...m_split_k_parallel_64x256_8x2_tt_align1.cu | Bin 0 -> 1794 bytes ...mm_split_k_parallel_64x32_8x2_nn_align1.cu | Bin 0 -> 1792 bytes ...mm_split_k_parallel_64x32_8x2_nt_align1.cu | Bin 0 -> 1789 bytes ...mm_split_k_parallel_64x32_8x2_tn_align1.cu | Bin 0 -> 1789 bytes ...mm_split_k_parallel_64x32_8x2_tt_align1.cu | Bin 0 -> 1786 bytes ...mm_split_k_parallel_64x64_8x2_nn_align1.cu | Bin 0 -> 1792 bytes ...mm_split_k_parallel_64x64_8x2_nt_align1.cu | Bin 0 -> 1789 bytes ...mm_split_k_parallel_64x64_8x2_tn_align1.cu | Bin 0 -> 1789 bytes ...mm_split_k_parallel_64x64_8x2_tt_align1.cu | Bin 0 -> 1786 bytes ...emm_split_k_parallel_8x32_8x2_nn_align1.cu | Bin 0 -> 1783 bytes ...emm_split_k_parallel_8x32_8x2_nt_align1.cu | Bin 0 -> 1780 bytes ...emm_split_k_parallel_8x32_8x2_tn_align1.cu | Bin 0 -> 1780 bytes ...emm_split_k_parallel_8x32_8x2_tt_align1.cu | Bin 0 -> 1777 bytes ...trix_mul_fp32_simt_128x128x8_32x64x8_nn.cu | Bin 1648 -> 0 bytes ...mt_128x128x8_32x64x8_nn_splitk_parallel.cu | Bin 1587 -> 0 bytes ...trix_mul_fp32_simt_128x128x8_32x64x8_nt.cu | Bin 1651 -> 0 bytes ...mt_128x128x8_32x64x8_nt_splitk_parallel.cu | Bin 1590 -> 0 bytes ...trix_mul_fp32_simt_128x128x8_32x64x8_tn.cu | Bin 1651 -> 0 bytes ...mt_128x128x8_32x64x8_tn_splitk_parallel.cu | Bin 1590 -> 0 bytes ...trix_mul_fp32_simt_128x128x8_32x64x8_tt.cu | Bin 1654 -> 0 bytes ...mt_128x128x8_32x64x8_tt_splitk_parallel.cu | Bin 1593 -> 0 bytes ...atrix_mul_fp32_simt_128x32x8_64x32x8_nn.cu | Bin 1647 -> 0 bytes ...imt_128x32x8_64x32x8_nn_splitk_parallel.cu | Bin 1586 -> 0 bytes ...atrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu | Bin 1650 -> 0 bytes ...imt_128x32x8_64x32x8_nt_splitk_parallel.cu | Bin 1589 -> 0 bytes ...atrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu | Bin 1650 -> 0 bytes ...imt_128x32x8_64x32x8_tn_splitk_parallel.cu | Bin 1589 -> 0 bytes ...atrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu | Bin 1653 -> 0 bytes ...imt_128x32x8_64x32x8_tt_splitk_parallel.cu | Bin 1592 -> 0 bytes ...atrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu | Bin 1647 -> 0 bytes ...imt_128x64x8_64x32x8_nn_splitk_parallel.cu | Bin 1586 -> 0 bytes ...atrix_mul_fp32_simt_128x64x8_64x32x8_nt.cu | Bin 1650 -> 0 bytes ...imt_128x64x8_64x32x8_nt_splitk_parallel.cu | Bin 1589 -> 0 bytes ...atrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu | Bin 1650 -> 0 bytes ...imt_128x64x8_64x32x8_tn_splitk_parallel.cu | Bin 1589 -> 0 bytes ...atrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu | Bin 1653 -> 0 bytes ...imt_128x64x8_64x32x8_tt_splitk_parallel.cu | Bin 1592 -> 0 bytes ...atrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu | Bin 1647 -> 0 bytes ...imt_16x128x8_16x64x8_nn_splitk_parallel.cu | Bin 1586 -> 0 bytes ...atrix_mul_fp32_simt_16x128x8_16x64x8_nt.cu | Bin 1650 -> 0 bytes ...imt_16x128x8_16x64x8_nt_splitk_parallel.cu | Bin 1589 -> 0 bytes ...atrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu | Bin 1650 -> 0 bytes ...imt_16x128x8_16x64x8_tn_splitk_parallel.cu | Bin 1589 -> 0 bytes ...atrix_mul_fp32_simt_16x128x8_16x64x8_tt.cu | Bin 1653 -> 0 bytes ...imt_16x128x8_16x64x8_tt_splitk_parallel.cu | Bin 1592 -> 0 bytes ...matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu | Bin 1646 -> 0 bytes ...simt_16x32x8_16x32x8_nn_splitk_parallel.cu | Bin 1585 -> 0 bytes ...matrix_mul_fp32_simt_16x32x8_16x32x8_nt.cu | Bin 1649 -> 0 bytes ...simt_16x32x8_16x32x8_nt_splitk_parallel.cu | Bin 1588 -> 0 bytes ...matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu | Bin 1649 -> 0 bytes ...simt_16x32x8_16x32x8_tn_splitk_parallel.cu | Bin 1588 -> 0 bytes ...matrix_mul_fp32_simt_16x32x8_16x32x8_tt.cu | Bin 1652 -> 0 bytes ...simt_16x32x8_16x32x8_tt_splitk_parallel.cu | Bin 1591 -> 0 bytes ...matrix_mul_fp32_simt_16x64x8_16x64x8_nn.cu | Bin 1646 -> 0 bytes ...simt_16x64x8_16x64x8_nn_splitk_parallel.cu | Bin 1585 -> 0 bytes ...matrix_mul_fp32_simt_16x64x8_16x64x8_nt.cu | Bin 1649 -> 0 bytes ...simt_16x64x8_16x64x8_nt_splitk_parallel.cu | Bin 1588 -> 0 bytes ...matrix_mul_fp32_simt_16x64x8_16x64x8_tn.cu | Bin 1649 -> 0 bytes ...simt_16x64x8_16x64x8_tn_splitk_parallel.cu | Bin 1588 -> 0 bytes ...matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu | Bin 1652 -> 0 bytes ...simt_16x64x8_16x64x8_tt_splitk_parallel.cu | Bin 1591 -> 0 bytes ...atrix_mul_fp32_simt_256x32x8_64x16x8_nn.cu | Bin 1647 -> 0 bytes ...imt_256x32x8_64x16x8_nn_splitk_parallel.cu | Bin 1586 -> 0 bytes ...atrix_mul_fp32_simt_256x32x8_64x16x8_nt.cu | Bin 1650 -> 0 bytes ...imt_256x32x8_64x16x8_nt_splitk_parallel.cu | Bin 1589 -> 0 bytes ...atrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu | Bin 1650 -> 0 bytes ...imt_256x32x8_64x16x8_tn_splitk_parallel.cu | Bin 1589 -> 0 bytes ...atrix_mul_fp32_simt_256x32x8_64x16x8_tt.cu | Bin 1653 -> 0 bytes ...imt_256x32x8_64x16x8_tt_splitk_parallel.cu | Bin 1592 -> 0 bytes ...atrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu | Bin 1647 -> 0 bytes ...imt_256x64x8_64x32x8_nn_splitk_parallel.cu | Bin 1586 -> 0 bytes ...atrix_mul_fp32_simt_256x64x8_64x32x8_nt.cu | Bin 1650 -> 0 bytes ...imt_256x64x8_64x32x8_nt_splitk_parallel.cu | Bin 1589 -> 0 bytes ...atrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu | Bin 1650 -> 0 bytes ...imt_256x64x8_64x32x8_tn_splitk_parallel.cu | Bin 1589 -> 0 bytes ...atrix_mul_fp32_simt_256x64x8_64x32x8_tt.cu | Bin 1653 -> 0 bytes ...imt_256x64x8_64x32x8_tt_splitk_parallel.cu | Bin 1592 -> 0 bytes ...atrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu | Bin 1647 -> 0 bytes ...imt_32x128x8_32x64x8_nn_splitk_parallel.cu | Bin 1586 -> 0 bytes ...atrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu | Bin 1650 -> 0 bytes ...imt_32x128x8_32x64x8_nt_splitk_parallel.cu | Bin 1589 -> 0 bytes ...atrix_mul_fp32_simt_32x128x8_32x64x8_tn.cu | Bin 1650 -> 0 bytes ...imt_32x128x8_32x64x8_tn_splitk_parallel.cu | Bin 1589 -> 0 bytes ...atrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu | Bin 1653 -> 0 bytes ...imt_32x128x8_32x64x8_tt_splitk_parallel.cu | Bin 1592 -> 0 bytes ...atrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu | Bin 1647 -> 0 bytes ...imt_32x256x8_16x64x8_nn_splitk_parallel.cu | Bin 1586 -> 0 bytes ...atrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu | Bin 1650 -> 0 bytes ...imt_32x256x8_16x64x8_nt_splitk_parallel.cu | Bin 1589 -> 0 bytes ...atrix_mul_fp32_simt_32x256x8_16x64x8_tn.cu | Bin 1650 -> 0 bytes ...imt_32x256x8_16x64x8_tn_splitk_parallel.cu | Bin 1589 -> 0 bytes ...atrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu | Bin 1653 -> 0 bytes ...imt_32x256x8_16x64x8_tt_splitk_parallel.cu | Bin 1592 -> 0 bytes ...matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu | Bin 1646 -> 0 bytes ...simt_32x32x8_32x32x8_nn_splitk_parallel.cu | Bin 1585 -> 0 bytes ...matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu | Bin 1649 -> 0 bytes ...simt_32x32x8_32x32x8_nt_splitk_parallel.cu | Bin 1588 -> 0 bytes ...matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu | Bin 1649 -> 0 bytes ...simt_32x32x8_32x32x8_tn_splitk_parallel.cu | Bin 1588 -> 0 bytes ...matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu | Bin 1652 -> 0 bytes ...simt_32x32x8_32x32x8_tt_splitk_parallel.cu | Bin 1591 -> 0 bytes ...matrix_mul_fp32_simt_32x64x8_32x64x8_nn.cu | Bin 1646 -> 0 bytes ...simt_32x64x8_32x64x8_nn_splitk_parallel.cu | Bin 1585 -> 0 bytes ...matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu | Bin 1649 -> 0 bytes ...simt_32x64x8_32x64x8_nt_splitk_parallel.cu | Bin 1588 -> 0 bytes ...matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu | Bin 1649 -> 0 bytes ...simt_32x64x8_32x64x8_tn_splitk_parallel.cu | Bin 1588 -> 0 bytes ...matrix_mul_fp32_simt_32x64x8_32x64x8_tt.cu | Bin 1652 -> 0 bytes ...simt_32x64x8_32x64x8_tt_splitk_parallel.cu | Bin 1591 -> 0 bytes ...atrix_mul_fp32_simt_64x128x8_32x64x8_nn.cu | Bin 1647 -> 0 bytes ...imt_64x128x8_32x64x8_nn_splitk_parallel.cu | Bin 1586 -> 0 bytes ...atrix_mul_fp32_simt_64x128x8_32x64x8_nt.cu | Bin 1650 -> 0 bytes ...imt_64x128x8_32x64x8_nt_splitk_parallel.cu | Bin 1589 -> 0 bytes ...atrix_mul_fp32_simt_64x128x8_32x64x8_tn.cu | Bin 1650 -> 0 bytes ...imt_64x128x8_32x64x8_tn_splitk_parallel.cu | Bin 1589 -> 0 bytes ...atrix_mul_fp32_simt_64x128x8_32x64x8_tt.cu | Bin 1653 -> 0 bytes ...imt_64x128x8_32x64x8_tt_splitk_parallel.cu | Bin 1592 -> 0 bytes ...atrix_mul_fp32_simt_64x256x8_32x64x8_nn.cu | Bin 1647 -> 0 bytes ...imt_64x256x8_32x64x8_nn_splitk_parallel.cu | Bin 1586 -> 0 bytes ...atrix_mul_fp32_simt_64x256x8_32x64x8_nt.cu | Bin 1650 -> 0 bytes ...imt_64x256x8_32x64x8_nt_splitk_parallel.cu | Bin 1589 -> 0 bytes ...atrix_mul_fp32_simt_64x256x8_32x64x8_tn.cu | Bin 1650 -> 0 bytes ...imt_64x256x8_32x64x8_tn_splitk_parallel.cu | Bin 1589 -> 0 bytes ...atrix_mul_fp32_simt_64x256x8_32x64x8_tt.cu | Bin 1653 -> 0 bytes ...imt_64x256x8_32x64x8_tt_splitk_parallel.cu | Bin 1592 -> 0 bytes ...matrix_mul_fp32_simt_64x32x8_64x32x8_nn.cu | Bin 1646 -> 0 bytes ...simt_64x32x8_64x32x8_nn_splitk_parallel.cu | Bin 1585 -> 0 bytes ...matrix_mul_fp32_simt_64x32x8_64x32x8_nt.cu | Bin 1649 -> 0 bytes ...simt_64x32x8_64x32x8_nt_splitk_parallel.cu | Bin 1588 -> 0 bytes ...matrix_mul_fp32_simt_64x32x8_64x32x8_tn.cu | Bin 1649 -> 0 bytes ...simt_64x32x8_64x32x8_tn_splitk_parallel.cu | Bin 1588 -> 0 bytes ...matrix_mul_fp32_simt_64x32x8_64x32x8_tt.cu | Bin 1652 -> 0 bytes ...simt_64x32x8_64x32x8_tt_splitk_parallel.cu | Bin 1591 -> 0 bytes ...matrix_mul_fp32_simt_64x64x8_32x64x8_nn.cu | Bin 1646 -> 0 bytes ...simt_64x64x8_32x64x8_nn_splitk_parallel.cu | Bin 1585 -> 0 bytes ...matrix_mul_fp32_simt_64x64x8_32x64x8_nt.cu | Bin 1649 -> 0 bytes ...simt_64x64x8_32x64x8_nt_splitk_parallel.cu | Bin 1588 -> 0 bytes ...matrix_mul_fp32_simt_64x64x8_32x64x8_tn.cu | Bin 1649 -> 0 bytes ...simt_64x64x8_32x64x8_tn_splitk_parallel.cu | Bin 1588 -> 0 bytes ...matrix_mul_fp32_simt_64x64x8_32x64x8_tt.cu | Bin 1652 -> 0 bytes ...simt_64x64x8_32x64x8_tt_splitk_parallel.cu | Bin 1591 -> 0 bytes .../matrix_mul_fp32_simt_8x32x8_8x32x8_nn.cu | Bin 1644 -> 0 bytes ...2_simt_8x32x8_8x32x8_nn_splitk_parallel.cu | Bin 1583 -> 0 bytes .../matrix_mul_fp32_simt_8x32x8_8x32x8_nt.cu | Bin 1647 -> 0 bytes ...2_simt_8x32x8_8x32x8_nt_splitk_parallel.cu | Bin 1586 -> 0 bytes .../matrix_mul_fp32_simt_8x32x8_8x32x8_tn.cu | Bin 1647 -> 0 bytes ...2_simt_8x32x8_8x32x8_tn_splitk_parallel.cu | Bin 1586 -> 0 bytes .../matrix_mul_fp32_simt_8x32x8_8x32x8_tt.cu | Bin 1650 -> 0 bytes ...2_simt_8x32x8_8x32x8_tt_splitk_parallel.cu | Bin 1589 -> 0 bytes ...mv_batched_strided_1x128_16_tt_align2x4.cu | 31 ++++++++++++++++++ ...mv_batched_strided_1x128_16_tt_align4x2.cu | 31 ++++++++++++++++++ ...emv_batched_strided_1x128_2_tt_align1x1.cu | 31 ++++++++++++++++++ ...mv_batched_strided_1x128_32_tt_align4x4.cu | 31 ++++++++++++++++++ ...emv_batched_strided_1x128_4_tt_align1x2.cu | 31 ++++++++++++++++++ ...emv_batched_strided_1x128_4_tt_align2x1.cu | 31 ++++++++++++++++++ ...emv_batched_strided_1x128_8_tt_align1x4.cu | 31 ++++++++++++++++++ ...emv_batched_strided_1x128_8_tt_align2x2.cu | 31 ++++++++++++++++++ ...emv_batched_strided_1x128_8_tt_align4x1.cu | 31 ++++++++++++++++++ ...mv_batched_strided_1x32_128_tt_align4x4.cu | 31 ++++++++++++++++++ ...emv_batched_strided_1x32_16_tt_align1x2.cu | 31 ++++++++++++++++++ ...emv_batched_strided_1x32_16_tt_align2x1.cu | 31 ++++++++++++++++++ ...emv_batched_strided_1x32_32_tt_align1x4.cu | 31 ++++++++++++++++++ ...emv_batched_strided_1x32_32_tt_align2x2.cu | 31 ++++++++++++++++++ ...emv_batched_strided_1x32_32_tt_align4x1.cu | 31 ++++++++++++++++++ ...emv_batched_strided_1x32_64_tt_align2x4.cu | 31 ++++++++++++++++++ ...emv_batched_strided_1x32_64_tt_align4x2.cu | 31 ++++++++++++++++++ ...gemv_batched_strided_1x32_8_tt_align1x1.cu | 31 ++++++++++++++++++ ...emv_batched_strided_1x64_16_tt_align1x4.cu | 31 ++++++++++++++++++ ...emv_batched_strided_1x64_16_tt_align2x2.cu | 31 ++++++++++++++++++ ...emv_batched_strided_1x64_16_tt_align4x1.cu | 31 ++++++++++++++++++ ...emv_batched_strided_1x64_32_tt_align2x4.cu | 31 ++++++++++++++++++ ...emv_batched_strided_1x64_32_tt_align4x2.cu | 31 ++++++++++++++++++ ...gemv_batched_strided_1x64_4_tt_align1x1.cu | 31 ++++++++++++++++++ ...emv_batched_strided_1x64_64_tt_align4x4.cu | 31 ++++++++++++++++++ ...gemv_batched_strided_1x64_8_tt_align1x2.cu | 31 ++++++++++++++++++ ...gemv_batched_strided_1x64_8_tt_align2x1.cu | 31 ++++++++++++++++++ ...imt_gemv_batched_strided_1x128x16_1x2x4.cu | 26 --------------- ...imt_gemv_batched_strided_1x128x16_1x4x2.cu | 26 --------------- ...simt_gemv_batched_strided_1x128x2_1x1x1.cu | 26 --------------- ...imt_gemv_batched_strided_1x128x32_1x4x4.cu | 26 --------------- ...simt_gemv_batched_strided_1x128x4_1x1x2.cu | 26 --------------- ...simt_gemv_batched_strided_1x128x4_1x2x1.cu | 26 --------------- ...simt_gemv_batched_strided_1x128x8_1x1x4.cu | 26 --------------- ...simt_gemv_batched_strided_1x128x8_1x2x2.cu | 26 --------------- ...simt_gemv_batched_strided_1x128x8_1x4x1.cu | 26 --------------- ...imt_gemv_batched_strided_1x32x128_1x4x4.cu | 26 --------------- ...simt_gemv_batched_strided_1x32x16_1x1x2.cu | 26 --------------- ...simt_gemv_batched_strided_1x32x16_1x2x1.cu | 26 --------------- ...simt_gemv_batched_strided_1x32x32_1x1x4.cu | 26 --------------- ...simt_gemv_batched_strided_1x32x32_1x2x2.cu | 26 --------------- ...simt_gemv_batched_strided_1x32x32_1x4x1.cu | 26 --------------- ...simt_gemv_batched_strided_1x32x64_1x2x4.cu | 26 --------------- ...simt_gemv_batched_strided_1x32x64_1x4x2.cu | 26 --------------- ..._simt_gemv_batched_strided_1x32x8_1x1x1.cu | 26 --------------- ...simt_gemv_batched_strided_1x64x16_1x1x4.cu | 26 --------------- ...simt_gemv_batched_strided_1x64x16_1x2x2.cu | 26 --------------- ...simt_gemv_batched_strided_1x64x16_1x4x1.cu | 26 --------------- ...simt_gemv_batched_strided_1x64x32_1x2x4.cu | 26 --------------- ...simt_gemv_batched_strided_1x64x32_1x4x2.cu | 26 --------------- ..._simt_gemv_batched_strided_1x64x4_1x1x1.cu | 26 --------------- ...simt_gemv_batched_strided_1x64x64_1x4x4.cu | 26 --------------- ..._simt_gemv_batched_strided_1x64x8_1x1x2.cu | 26 --------------- ..._simt_gemv_batched_strided_1x64x8_1x2x1.cu | 26 --------------- 665 files changed, 844 insertions(+), 709 deletions(-) rename dnn/src/cuda/matrix_mul/{fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl => cutlass_matrix_mul_wrapper.cuinl} (100%) rename dnn/src/cuda/matrix_mul/{fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl => cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl} (100%) create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn_splitk_parallel.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x16_1x2x4.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x16_1x4x2.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x2_1x1x1.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x32_1x4x4.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x4_1x1x2.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x4_1x2x1.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x8_1x1x4.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x8_1x2x2.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x8_1x4x1.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x128_1x4x4.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x16_1x1x2.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x16_1x2x1.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x32_1x1x4.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x32_1x2x2.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x32_1x4x1.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x64_1x2x4.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x64_1x4x2.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x8_1x1x1.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x16_1x1x4.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x16_1x2x2.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x16_1x4x1.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x32_1x2x4.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x32_1x4x2.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x4_1x1x1.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x64_1x4x4.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x8_1x1x2.cu delete mode 100644 dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x8_1x2x1.cu diff --git a/dnn/scripts/Makefile b/dnn/scripts/Makefile index 88076331..e852e7b4 100644 --- a/dnn/scripts/Makefile +++ b/dnn/scripts/Makefile @@ -37,21 +37,21 @@ all: ${PARAM_DEFS} ${ELEMWISE_IMPL} ${CUDA_CONV_IMPL} $(CUDA_MATMUL_IMPL) ../src/cuda/elemwise_multi_type/kimpl: gen_elemwise_multi_type_kern_impls.py ./$^ --type cuda $@ -../src/cuda/conv_bias/int8/kimpl: gen_cuda_conv_bias_kern_impls.py cutlass_generator +../src/cuda/conv_bias/int8/kimpl: gen_cuda_conv_bias_kern_impls.py cutlass_generator/generator.py ./gen_cuda_conv_bias_kern_impls.py --type dp4a $@ - python3 ./cutlass_generator/generator.py --operations all --type simt $@ + python3 ./cutlass_generator/generator.py --operations conv2d --type simt $@ -../src/cuda/conv_bias/int8_imma/kimpl: gen_cuda_conv_bias_kern_impls.py cutlass_generator +../src/cuda/conv_bias/int8_imma/kimpl: gen_cuda_conv_bias_kern_impls.py cutlass_generator/generator.py ./gen_cuda_conv_bias_kern_impls.py --type imma $@ python3 ./cutlass_generator/generator.py --operations conv2d --type tensorop8816 $@ ../src/cuda/batch_conv_bias/int8/kimpl: gen_cuda_batch_conv_bias_kern_impls.py ./$^ --type dp4a $@ -../src/cuda/matrix_mul/fp32_simt/kimpl: gen_cutlass_matmul_kern_impls.py - ./$^ $@ +../src/cuda/matrix_mul/fp32_simt/kimpl: cutlass_generator/generator.py + python3 ./cutlass_generator/generator.py --operations gemm --type simt $@ -../src/cuda/matrix_mul/fp32_simt_gemv/kimpl: gen_cutlass_gemv_batched_strided_kern_impls.py - ./$^ $@ +../src/cuda/matrix_mul/fp32_simt_gemv/kimpl: cutlass_generator + python3 ./cutlass_generator/generator.py --operations gemv --type simt $@ .PHONY: all diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu index e4e04e8413e468a5171f0b387ea027383dbe86eb..828ad12a3bd149b7e1abd585f761ae4585089bde 100644 GIT binary patch delta 91 zcmZ3)_mX$RWR}U3SfVC>Vle_@<;lLRqF}n4RdDhmR)fjPY+{q&v5E<1=9OqEl;;;^ l7Z)TZr|KwhO;%)c2Fo_E=}z9vCIMzYXOm>)n*5(l834#I9sd9T delta 156 zcmaFKyNGYYWR}TOSRz>zfMBvLtLkI{R(=@A3dT^KT*WGg;H+ciWd*5Mm^=Zf;XW&e z9GIt&nOCBvP@Z3uU0jfuoT{V11rY`*6Q3-=rVZC5F*%4$2V!0gn-r4pS~iJ^{^FDG Hv2gU@-z><;mWxqF}m{RdDhGR)fikY+{q&u!;$0=9OqEl;;;^ w7Z)TZr|KwhO_pbK2Fup5=}z9rCIMzYWs_v&n&>CNU65bErJS0Vl9|Q@02PNIt^fc4 delta 156 zcmaFPyMS-QM3%{uSRz>zfMBvTtLkJvR(=@A3dT^KT)`@c;H+WgWd*5MnEZi7Zt^u& zemO8tAv3Q;OQAf!D7&~IF*#L7feRuGQYJo`k4+n{Nn&yUn-0XhDmE!3;ni#s6aB>} I-(lkh09G|6<^TWy diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu index 42e7b2174f259d35964b1476682a7b8dc8152a1e..60ed11c6db3f2cfc58ac9e574c37b6d064a4e363 100644 GIT binary patch delta 102 zcmZ3$_nddbM3%`DSfVC>U@-z><;mWxqF}m{RdDhGR)fikY+{q&u!;$0=9OqEl;;;^ w7Z)TZr|KwhO_pbK2Fup5=}z9rCIMzYWs_v&n&>CNU65bErJS0Vl9|Q@02PNIt^fc4 delta 156 zcmaFPyMS-QM3%{uSRz>zfMBvTtLkJvR(=@A3dT^KT)`@c;H+WgWd*5MnEZi7Zt^u& zemO8tAv3Q;OQAf!D7&~IF*#L7feRuGQYJo`k4+n{Nn&yUn-0XhDmE!3;ni#s6aB>} I-(lkh09G|6<^TWy diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu index 170b469b45c0a5e6b3df184bc93ca752d0f33a73..c6b29e41e0e3dd48a011e09be14149ef1a8c53c5 100644 GIT binary patch delta 91 zcmZ3;_kwrBB$mk&S)wL?WHAC_<;gy*qF}moUgRMmZ-^!tjd#lSowjp1&Gb>l$ltprH~Sz1f+EoxF*Z9s!ra+$~!rc zO?XZBPz+{(w8pbZPQK43!N@hy KPhv73J2wCtcp@$U delta 186 zcmaFBJCkoi7t7>smPi%_Aeh|1sydmSl^?;eWEF*TM4d7di?tL|;*)^1jsh1*2Uw30 zM9BtLJz0oYW?qSwLV12sc5y*saw}=W)U@-z><;mWxqF}m{RdDhGR)fikY+{q&u!;$0=9OqEl;;;^ w7Z)TZr|KwhO_pbK2Fup5=}z9rCIMzYWs_v&n&>CNU65bErJS0Vl9|Q@02PNIt^fc4 delta 156 zcmaFPyMS-QM3%{uSRz>zfMBvTtLkJvR(=@A3dT^KT)`@c;H+WgWd*5MnEZi7Zt^u& zemO8tAv3Q;OQAf!D7&~IF*#L7feRuGQYJo`k4+n{Nn&yUn-0XhDmE!3;ni#s6aB>} I-(lkh09G|6<^TWy diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu index 4e41d46c53deea28d7f1eae223b228fb96cc2626..c9346d31a97befe9c2976c3fc19198cfba68b2f6 100644 GIT binary patch delta 117 zcmbQo_mp=-Kg;AkmZ-_^Sd4&Jd9o*~D41?%6`VYe)nKwbo7m(AR(1uK;u0-|l=zh5 z5*-CD1%=E!Ah$ffD7&~IF*y||G+CC-8LYU5O?UEoHVH8MF`FbK*F--F?t=URF6Gp` Il*}|P0Os-}IRF3v delta 168 zcmaFLJCAQeKg;9^ERieU@-z><;mWxqF}m{RdDhGR)fikY+{q&u!;$0=9OqEl;;;^ w7Z)TZr|KwhO_pbK2Fup5=}z9rCIMzYWs_v&n&>CNU65bErJS0Vl9|Q@02PNIt^fc4 delta 156 zcmaFPyMS-QM3%{uSRz>zfMBvTtLkJvR(=@A3dT^KT)`@c;H+WgWd*5MnEZi7Zt^u& zemO8tAv3Q;OQAf!D7&~IF*#L7feRuGQYJo`k4+n{Nn&yUn-0XhDmE!3;ni#s6aB>} I-(lkh09G|6<^TWy diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu index 70d1149d659f4f42a50b79df12eb2823595de9cb..eab74ead9a40e9150337e3b544f6c66c908dacef 100644 GIT binary patch delta 117 zcmbQo_mp=-Kg;AkmZ-_^Sd4&Jd9o*~D41?%6`VYe)nKwbo7m(AR(1uK;u0-|l=zh5 z5*-CD1%=E!Ah$ffD7&~IF*y||G+CC-8LYU5O?UEoHVH8MF`FbK*F--F?t=URF6Gp` Il*}|P0Os-}IRF3v delta 168 zcmaFLJCAQeKg;9^ERieFU#Z}mZ-@|tjd%5S^2@VJF6&=HUYEuv+_>9#Hu5dnOCBvP@Z3uU0jfu zoT{V1r7-y;i}++?HtorufOIvR&g8Xh5@7ZtHVH)# delta 165 zcmaFNJBM#WFU#aUmPi%_AeijWsydmAl^?;eVHJgO%wP=F$tzfSA)G!o$;o-FY{D+Z zC0Yt8@hQb6ItpAMlYnO3hiUC&lbFoKrj5|y!=|GIQIuSok{Dc4l$w|uU!qV9W*`jc LV>6t5kWC2ytY0Wp diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu index 7aa181061659beb5bbde061510a377fa86c6f7b9..50b324d0a50d6d033eb16a6f7fd400a6713560bc 100644 GIT binary patch delta 107 zcmbQk_n3D>FU#Z}mZ-@|tjd%5S^2@VJF6&=HUYEuv+_>9#Hu5dnOCBvP@Z3uU0jfu zoT{V1r7-y;i}++?HtorufOIvR&g8Xh5@7ZtHVH)# delta 165 zcmaFNJBM#WFU#aUmPi%_AeijWsydmAl^?;eVHJgO%wP=F$tzfSA)G!o$;o-FY{D+Z zC0Yt8@hQb6ItpAMlYnO3hiUC&lbFoKrj5|y!=|GIQIuSok{Dc4l$w|uU!qV9W*`jc LV>6t5kWC2ytY0Wp diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu index b1c7e2e8d4f80930d3afc6cc642fe542eae1643b..8e749a6cb244586af7489cbf67a15e59fa097c2d 100644 GIT binary patch delta 106 zcmbQs_k?#tAIs!kmZ-_etjd!GSoy)U2dgNMHUYB_u<}m6%&H@pnOCBvP@Z3uU0jfu yoT{V1HTfc|#AFjTZJ@09x@(Z|>Q}a?X)3^X8)*+(+ delta 165 zcmaFDJC|=mAIs!^mPi%_AebD$sydmQl^?;eWfg^S%wP=F$tziTA)J0T$;tVwY{D+Z zC0Yt8@hQb6ItpAMlYnMDfNAY#lbFoSrj5|y%ci3QQIuSok{Dc4l$w|uU!qV9W*`jc LXEU69h)oFqv|1== diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu index 51b82ca72c3790666bd4cc0e41af95a2af0b7be7..a81e5ae844f5f8a589006d50d1cf4dbf5fc4991b 100644 GIT binary patch delta 123 zcmbQrcZYXF8_VQYmZ-_Htjd!)Sowjp1(;pODhQ@`u<}k$WR;!#f|Xw=Gp|HTp*+7R zySN}RIaNo2OJQ;XtN3IcHf^91@yR7@I^xNtDT%=)MX8Co@g)kyUTgj|YC zK&q?YY858?v&v0A&B`wa(VLkER8XE@lwDkqn4F5DOnfpkn>H)N27gwG$*ycV5QFpC aq>zLcut`kx7w0a>FW^#6%}dEl;{pI^Tq;xm diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu index ab877a149e41c22bcf179a8e6638726187b77379..66bfdd0adda1da611e7687669661cf43c6b7b6ad 100644 GIT binary patch delta 107 zcmbQk_n3D>FU#Z}mZ-@|tjd%5S^2@VJF6&=HUYEuv+_>9#Hu5dnOCBvP@Z3uU0jfu zoT{V1r7-y;i}++?HtorufOIvR&g8Xh5@7ZtHVH)# delta 165 zcmaFNJBM#WFU#aUmPi%_AeijWsydmAl^?;eVHJgO%wP=F$tzfSA)G!o$;o-FY{D+Z zC0Yt8@hQb6ItpAMlYnO3hiUC&lbFoKrj5|y!=|GIQIuSok{Dc4l$w|uU!qV9W*`jc LV>6t5kWC2ytY0Wp diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu index 6fa4962bf5460ae4444ef8b2bdf972913cbed2bf..0e0f2263be8689ab34ebb2ef7fce1dcad489ee2c 100644 GIT binary patch delta 107 zcmbQm_mFo(H_PNMmZ-@}tjd#lS^0sq1(@B;DhQ_cvhq$&ViTWi&&oFWEl{3`O&G`* xn{3FY4WymLl1oz(gG-816LaHB6pFzN9R;q*32c&+AFxRFU#Z}mZ-@|tjd%5S^2@VJF6&=HUYEuv+_>9#Hu5dnOCBvP@Z3uU0jfu zoT{V1r7-y;i}++?HtorufOIvR&g8Xh5@7ZtHVH)# delta 165 zcmaFNJBM#WFU#aUmPi%_AeijWsydmAl^?;eVHJgO%wP=F$tzfSA)G!o$;o-FY{D+Z zC0Yt8@hQb6ItpAMlYnO3hiUC&lbFoKrj5|y!=|GIQIuSok{Dc4l$w|uU!qV9W*`jc LV>6t5kWC2ytY0Wp diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu index a54b21f203aba0924731a47630f3569248d73eb5..95ef360946a9519902440e2cfc5400b6f1d3192c 100644 GIT binary patch delta 107 zcmbQm_mFo(H_PNMmZ-@}tjd#lS^0sq1(@B;DhQ_cvhq$&ViTWi&&oFWEl{3`O&G`* xn{3FY4WymLl1oz(gG-816LaHB6pFzN9R;q*32c&+AFxRFU#Z}mZ-@|tjd%5S^2@VJF6&=HUYEuv+_>9#Hu5dnOCBvP@Z3uU0jfu zoT{V1r7-y;i}++?HtorufOIvR&g8Xh5@7ZtHVH)# delta 165 zcmaFNJBM#WFU#aUmPi%_AeijWsydmAl^?;eVHJgO%wP=F$tzfSA)G!o$;o-FY{D+Z zC0Yt8@hQb6ItpAMlYnO3hiUC&lbFoKrj5|y!=|GIQIuSok{Dc4l$w|uU!qV9W*`jc LV>6t5kWC2ytY0Wp diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu index 5ce46a975a27f5bd41115b5597d1e6aa563a72f5..c1d06a193795a77d2234843381f3aaaa63a42cc2 100644 GIT binary patch delta 107 zcmbQk_n3D>FU#Z}mZ-@|tjd%5S^2@VJF6&=HUYEuv+_>9#Hu5dnOCBvP@Z3uU0jfu zoT{V1r7-y;i}++?HtorufOIvR&g8Xh5@7ZtHVH)# delta 165 zcmaFNJBM#WFU#aUmPi%_AeijWsydmAl^?;eVHJgO%wP=F$tzfSA)G!o$;o-FY{D+Z zC0Yt8@hQb6ItpAMlYnO3hiUC&lbFoKrj5|y!=|GIQIuSok{Dc4l$w|uU!qV9W*`jc LV>6t5kWC2ytY0Wp diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu index 64669859f7649dde63c4dea4be8d7b937db26185..eeb92cda4a7b42b999f51359090f113f67b6988e 100644 GIT binary patch delta 106 zcmbQs_k?#tAIs!kmZ-_etjd!GSoy)U2dgNMHUYB_u<}m6%&H@pnOCBvP@Z3uU0jfu yoT{V1HTfc|#AFjTZJ@09x@(Z|>Q}a?X)3^X8)*+(+ delta 165 zcmaFDJC|=mAIs!^mPi%_AebD$sydmQl^?;eWfg^S%wP=F$tziTA)J0T$;tVwY{D+Z zC0Yt8@hQb6ItpAMlYnMDfNAY#lbFoSrj5|y%ci3QQIuSok{Dc4l$w|uU!qV9W*`jc LXEU69h)oFqv|1== diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu index c1bf656194b27f7d5502aa2ba542d11f6d21133b..defd1f2da49725db8b1eb270d15d8c77de61c88f 100644 GIT binary patch delta 123 zcmbQrcZYXF8_VQYmZ-_Htjd!)Sowjp1(;pODhQ@`u<}k$WR;!#f|Xw=Gp|HTp*+7R zySN}RIaNo2OJQ;XtN3IcHf^91@yR7@I^xNtDT%=)MX8Co@g)kyUTgj|YC zK&q?YY858?v&v0A&B`wa(VLkER8XE@lwDkqn4F5DOnfpkn>H)N27gwG$*ycV5QFpC aq>zLcut`kx7w0a>FW^#6%}dEl;{pI^Tq;xm diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu index 8145c4ed795f19adbed7b2e55159370373d2ad60..79f65f6c59a1435256d5a5a700092cee91770e2b 100644 GIT binary patch delta 107 zcmbQk_n3D>FU#Z}mZ-@|tjd%5S^2@VJF6&=HUYEuv+_>9#Hu5dnOCBvP@Z3uU0jfu zoT{V1r7-y;i}++?HtorufOIvR&g8Xh5@7ZtHVH)# delta 165 zcmaFNJBM#WFU#aUmPi%_AeijWsydmAl^?;eVHJgO%wP=F$tzfSA)G!o$;o-FY{D+Z zC0Yt8@hQb6ItpAMlYnO3hiUC&lbFoKrj5|y!=|GIQIuSok{Dc4l$w|uU!qV9W*`jc LV>6t5kWC2ytY0Wp diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu index 818d658f8cc9a59e74a482e6be768811757de478..cff3a8df7b757cb5383f5b15c0731daab0e5265c 100644 GIT binary patch delta 107 zcmbQm_mFo(H_PNMmZ-@}tjd#lS^0sq1(@B;DhQ_cvhq$&ViTWi&&oFWEl{3`O&G`* xn{3FY4WymLl1oz(gG-816LaHB6pFzN9R;q*32c&+AFxRFU#Z}mZ-@|tjd%5S^2@VJF6&=HUYEuv+_>9#Hu5dnOCBvP@Z3uU0jfu zoT{V1r7-y;i}++?HtorufOIvR&g8Xh5@7ZtHVH)# delta 165 zcmaFNJBM#WFU#aUmPi%_AeijWsydmAl^?;eVHJgO%wP=F$tzfSA)G!o$;o-FY{D+Z zC0Yt8@hQb6ItpAMlYnO3hiUC&lbFoKrj5|y!=|GIQIuSok{Dc4l$w|uU!qV9W*`jc LV>6t5kWC2ytY0Wp diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu index ca6df441bbdc50ee74e02e25e84d672328438b40..ea8722723e03324ae0e6bcf466c9ec4748d83a4f 100644 GIT binary patch delta 107 zcmbQm_mFo(H_PNMmZ-@}tjd#lS^0sq1(@B;DhQ_cvhq$&ViTWi&&oFWEl{3`O&G`* xn{3FY4WymLl1oz(gG-816LaHB6pFzN9R;q*32c&+AFxRx@(Z|>Q}a?X)3^X8)*+(+ delta 165 zcmaFDJC|=mAIs!^mPi%_AebD$sydmQl^?;eWfg^S%wP=F$tziTA)J0T$;tVwY{D+Z zC0Yt8@hQb6ItpAMlYnMDfNAY#lbFoSrj5|y%ci3QQIuSok{Dc4l$w|uU!qV9W*`jc LXEU69h)oFqv|1== diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu index a1c93fbc0fa54fb24cb4fc47932161d5099a6bb5..049b42c38fe2899ec933ad7639a250da4738f819 100644 GIT binary patch delta 88 zcmbQu_lS2x56k3kmZ-_ftjd%5Sowjp1(@B!DhQ_cvGPt%W)q+6z{)oH9Z;T`O&G`* en{33U4WykXuVIq_%Rgk3VC0(UCox%oof`mf)fij= delta 174 zcmaFFJDYDq56k3UmPi%_AeijOsydmIl^?;eW)+2T%wP=F$;(-JA)Jq_5|bCO@~Ng2 zm*^;Pfs6ow%)AmUh4TEO?BasNFU#Z}mZ-@|tjd%5S^2@VJF6&=HUYEuv+_>9#Hu5dnOCBvP@Z3uU0jfu zoT{V1r7-y;i}++?HtorufOIvR&g8Xh5@7ZtHVH)# delta 165 zcmaFNJBM#WFU#aUmPi%_AeijWsydmAl^?;eVHJgO%wP=F$tzfSA)G!o$;o-FY{D+Z zC0Yt8@hQb6ItpAMlYnO3hiUC&lbFoKrj5|y!=|GIQIuSok{Dc4l$w|uU!qV9W*`jc LV>6t5kWC2ytY0Wp diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu index 931c330a1dc8d0cf3a975df098535b2d9a9cc9eb..842ca3044a35272fcea478e59f3dfbd16f130094 100644 GIT binary patch delta 114 zcmbQjcbj)ZE6d~-mZ-@xtjd$wS^0sq1(;pSDhQ^xv+_uE`OH&eqONvqxbK^@CiopzsGDAkL$sgI20qCkD A=>Px# delta 175 zcmcc3JB4pUE6e0ImPi%_AehY0swx~@l&qzY5?@@DtfRmM5(SG}!4*u-VHHGh7O?WN zLe%)N%1vIt%C44DTmsjqppcnYqNPxtUzA;3keHl`keKYpDn6NsO&hLRVzLXH4#eC% THYp_G`D_vs{lzC=VB-b==YA)~ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu index a22f895f4adfd4d0a03948bbae893eb8264b0fe3..d2c18579a431649618353e71268e31788fdfe1c8 100644 GIT binary patch delta 88 zcmbQu_lS2x56k3kmZ-_ftjd%5Sowjp1(@B!DhQ_cvGPt%W)q+6z{)oH9Z;T`O&G`* en{33U4WykXuVIq_%Rgk3VC0(UCox%oof`mf)fij= delta 174 zcmaFFJDYDq56k3UmPi%_AeijOsydmIl^?;eW)+2T%wP=F$;(-JA)Jq_5|bCO@~Ng2 zm*^;Pfs6ow%)AmUh4TEO?BasNMmZ-^!tjd#lSowjp1&Gb>l$ltprH~Sz1f+EoxF*Z9s!ra+$~!rc zO?XZBPz+{(w8pbZPQK43!N@hy KPhv73J2wCtcp@$U delta 186 zcmaFBJCkoi7t7>smPi%_Aeh|1sydmSl^?;eWEF*TM4d7di?tL|;*)^1jsh1*2Uw30 zM9BtLJz0oYW?qSwLV12sc5y*saw}=W)MmZ-^!tjd#lSowjp1&Gb>l$ltprH~Sz1f+EoxF*Z9s!ra+$~!rc zO?XZBPz+{(w8pbZPQK43!N@hy KPhv73J2wCtcp@$U delta 186 zcmaFBJCkoi7t7>smPi%_Aeh|1sydmSl^?;eWEF*TM4d7di?tL|;*)^1jsh1*2Uw30 zM9BtLJz0oYW?qSwLV12sc5y*saw}=W)MmZ-^!tjd#lSowjp1&Gb>l$ltprH~Sz1f+EoxF*Z9s!ra+$~!rc zO?XZBPz+{(w8pbZPQK43!N@hy KPhv73J2wCtcp@$U delta 186 zcmaFBJCkoi7t7>smPi%_Aeh|1sydmSl^?;eWEF*TM4d7di?tL|;*)^1jsh1*2Uw30 zM9BtLJz0oYW?qSwLV12sc5y*saw}=W)MmZ-^!tjd#lSowjp1&Gb>l$ltprH~Sz1f+EoxF*Z9s!ra+$~!rc zO?XZBPz+{(w8pbZPQK43!N@hy KPhv73J2wCtcp@$U delta 186 zcmaFBJCkoi7t7>smPi%_Aeh|1sydmSl^?;eWEF*TM4d7di?tL|;*)^1jsh1*2Uw30 zM9BtLJz0oYW?qSwLV12sc5y*saw}=W)MmZ-^!tjd#lSowjp1&Gb>l$ltprH~Sz1f+EoxF*Z9s!ra+$~!rc zO?XZBPz+{(w8pbZPQK43!N@hy KPhv73J2wCtcp@$U delta 186 zcmaFBJCkoi7t7>smPi%_Aeh|1sydmSl^?;eWEF*TM4d7di?tL|;*)^1jsh1*2Uw30 zM9BtLJz0oYW?qSwLV12sc5y*saw}=W)8(iIzfneo=ODL1J<$iZb!ZtZdq>FijGZ-Pv>?rWUeEAqg*HlbGl)&RvjS Nz@?m;my(&r1psB`DU|>K diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu index aa06ab816bbc5521abf566b98d01f5f8ac3aec4a..fbf8c3b9ea553d64bc0a0d413298343e81a7a0d0 100644 GIT binary patch delta 117 zcmbQvcaL{N2g~GkmZ-_{tjd$QSowjp1(@ByDhQ@`vGPuiXA__NgjG~9Gp|HTp*+7R zySN}RIaNo2YjP}`#AH1-ZJ-KgvE8(iIzfneo=ODL1J<$iZb!ZtZdq>FijGZ-Pv>?rWUeEAqg*HlbGl)&RvjS Nz@?m;my(&r1psB`DU|>K diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu index e2ee8d5bbed6885805674830869acb640a696e0f..73cd3839de13e639ff2f77bdecef0d7676205efd 100644 GIT binary patch delta 129 zcmbQicb|7dC(GmxmZ-@Jtjd$QS^0sq1&Gb>l$ltprH~Sz1f+EoxF*Z7s!ra`$~!rM zO?>iGR#Cysyb>*i^8BLg;)2BFRFH}|Hi^mlY}!B-&SJ@>DT%=)MX8Co@g)kyUir^Hm3L-e?SlPL%bQHKidVsptv+Bu0 zn3;JcS_} IpJC$$0GOdBlmGw# diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu index 1563792c526217d0726976e54cc5eee4141aeac1..2f26acf8f00972cc0be39c876f2d76214cd5a6be 100644 GIT binary patch delta 117 zcmbQvcaL{N2g~GkmZ-_{tjd$QSowjp1(@ByDhQ@`vGPuiXA__NgjG~9Gp|HTp*+7R zySN}RIaNo2YjP}`#AH1-ZJ-KgvE8(iIzfneo=ODL1J<$iZb!ZtZdq>FijGZ-Pv>?rWUeEAqg*HlbGl)&RvjS Nz@?m;my(&r1psB`DU|>K diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu index 5dfc96fa0f5ce1ac3d6863a6e653e835d29be98a..7b9219be72e6ffe8ee48361a8adaafdd20f21158 100644 GIT binary patch delta 123 zcmbQrcZYXF8_VQYmZ-_Htjd!)Sowjp1(;pODhQ@`u<}k$WR;!#f|Xw=Gp|HTp*+7R zySN}RIaNo2OJQ;XtN3IcHf^91@yR7@I^xNtDT%=)MX8Co@g)kyUTgj|YC zK&q?YY858?v&v0A&B`wa(VLkER8XE@lwDkqn4F5DOnfpkn>H)N27gwG$*ycV5QFpC aq>zLcut`kx7w0a>FW^#6%}dEl;{pI^Tq;xm diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu index f0f7aa164c8524c4f4820e4819b40bfbde32d6dd..e5b3eb1e45197bd7d8506164e55e93bff444ed36 100644 GIT binary patch delta 123 zcmbQrcZYXF8_VQYmZ-_Htjd!)Sowjp1(;pODhQ@`u<}k$WR;!#f|Xw=Gp|HTp*+7R zySN}RIaNo2OJQ;XtN3IcHf^91@yR7@I^xNtDT%=)MX8Co@g)kyUTgj|YC zK&q?YY858?v&v0A&B`wa(VLkER8XE@lwDkqn4F5DOnfpkn>H)N27gwG$*ycV5QFpC aq>zLcut`kx7w0a>FW^#6%}dEl;{pI^Tq;xm diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu index 84fab7d2ef71b7e8d91f797b601965d700c09570..2cd897a8d8e884e788247deb15001a8956a33076 100644 GIT binary patch delta 117 zcmbQvcaL{N2g~GkmZ-_{tjd$QSowjp1(@ByDhQ@`vGPuiXA__NgjG~9Gp|HTp*+7R zySN}RIaNo2YjP}`#AH1-ZJ-KgvE8(iIzfneo=ODL1J<$iZb!ZtZdq>FijGZ-Pv>?rWUeEAqg*HlbGl)&RvjS Nz@?m;my(&r1psB`DU|>K diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu index d26dfab162ab79ddd8df10ef27c20c037ccf4aad..29e25395c40e5f84accf1abad8c9b66bb975d65f 100644 GIT binary patch delta 123 zcmbQrcZYXF8_VQYmZ-_Htjd!)Sowjp1(;pODhQ@`u<}k$WR;!#f|Xw=Gp|HTp*+7R zySN}RIaNo2OJQ;XtN3IcHf^91@yR7@I^xNtDT%=)MX8Co@g)kyUTgj|YC zK&q?YY858?v&v0A&B`wa(VLkER8XE@lwDkqn4F5DOnfpkn>H)N27gwG$*ycV5QFpC aq>zLcut`kx7w0a>FW^#6%}dEl;{pI^Tq;xm diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu index 05ab9918a166889be3406d0f5e9df6f2e1fca4c8..386de0770143ea297726b38c35dbc6a7a6c17534 100644 GIT binary patch delta 123 zcmbQrcZYXF8_VQYmZ-_Htjd!)Sowjp1(;pODhQ@`u<}k$WR;!#f|Xw=Gp|HTp*+7R zySN}RIaNo2OJQ;XtN3IcHf^91@yR7@I^xNtDT%=)MX8Co@g)kyUTgj|YC zK&q?YY858?v&v0A&B`wa(VLkER8XE@lwDkqn4F5DOnfpkn>H)N27gwG$*ycV5QFpC aq>zLcut`kx7w0a>FW^#6%}dEl;{pI^Tq;xm diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu index 26a9fff78f7ef1bf5046d3a773171377cc0bccbf..27bdaeb55b389616281d2e438dfcfa0a59ea13b7 100644 GIT binary patch delta 119 zcmbQq_kedp7t7>MmZ-^!tjd#lSowjp1&Gb>l$ltprH~Sz1f+EoxF*Z9s!ra+$~!rc zO?XZBPz+{(w8pbZPQK43!N@hy KPhv73J2wCtcp@$U delta 186 zcmaFBJCkoi7t7>smPi%_Aeh|1sydmSl^?;eWEF*TM4d7di?tL|;*)^1jsh1*2Uw30 zM9BtLJz0oYW?qSwLV12sc5y*saw}=W)8(iIzfneo=ODL1J<$iZb!ZtZdq>FijGZ-Pv>?rWUeEAqg*HlbGl)&RvjS Nz@?m;my(&r1psB`DU|>K diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu index c9537b49b548862866669bf56c584f3ffe1e0fc7..2bbee6fdcdc8178084082cee2e1b328076591f66 100644 GIT binary patch delta 117 zcmbQvcaL{N2g~GkmZ-_{tjd$QSowjp1(@ByDhQ@`vGPuiXA__NgjG~9Gp|HTp*+7R zySN}RIaNo2YjP}`#AH1-ZJ-KgvE8(iIzfneo=ODL1J<$iZb!ZtZdq>FijGZ-Pv>?rWUeEAqg*HlbGl)&RvjS Nz@?m;my(&r1psB`DU|>K diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu index d3a814d8f3bb660dd92f2fd6f43dafdf2fa48060..ba87340723e2fcaf05f64d1d9dfadd1b6fa34aa8 100644 GIT binary patch delta 129 zcmbQicb|7dC(GmxmZ-@Jtjd$QS^0sq1&Gb>l$ltprH~Sz1f+EoxF*Z7s!ra`$~!rM zO?>iGR#Cysyb>*i^8BLg;)2BFRFH}|Hi^mlY}!B-&SJ@>DT%=)MX8Co@g)kyUir^Hm3L-e?SlPL%bQHKidVsptv+Bu0 zn3;JcS_} IpJC$$0GOdBlmGw# diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu index 842249dd683fd9373cb2a05a6a61a0cbe35b903f..84b1ffbafd051cf5ed3abff2881d8312c46c3652 100644 GIT binary patch delta 117 zcmbQvcaL{N2g~GkmZ-_{tjd$QSowjp1(@ByDhQ@`vGPuiXA__NgjG~9Gp|HTp*+7R zySN}RIaNo2YjP}`#AH1-ZJ-KgvE8(iIzfneo=ODL1J<$iZb!ZtZdq>FijGZ-Pv>?rWUeEAqg*HlbGl)&RvjS Nz@?m;my(&r1psB`DU|>K diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu index b955bfed242acc1a3aa9063b5cbce76a3998aed9..28a74223541617348c33b7449afff48cbbaec734 100644 GIT binary patch delta 123 zcmbQrcZYXF8_VQYmZ-_Htjd!)Sowjp1(;pODhQ@`u<}k$WR;!#f|Xw=Gp|HTp*+7R zySN}RIaNo2OJQ;XtN3IcHf^91@yR7@I^xNtDT%=)MX8Co@g)kyUTgj|YC zK&q?YY858?v&v0A&B`wa(VLkER8XE@lwDkqn4F5DOnfpkn>H)N27gwG$*ycV5QFpC aq>zLcut`kx7w0a>FW^#6%}dEl;{pI^Tq;xm diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu index 48ed0e5b0d9be511ba24fc7d9273a9c7e0edd601..fafc7a187557da12f18f1a5e34dbefae650b7058 100644 GIT binary patch delta 123 zcmbQrcZYXF8_VQYmZ-_Htjd!)Sowjp1(;pODhQ@`u<}k$WR;!#f|Xw=Gp|HTp*+7R zySN}RIaNo2OJQ;XtN3IcHf^91@yR7@I^xNtDT%=)MX8Co@g)kyUTgj|YC zK&q?YY858?v&v0A&B`wa(VLkER8XE@lwDkqn4F5DOnfpkn>H)N27gwG$*ycV5QFpC aq>zLcut`kx7w0a>FW^#6%}dEl;{pI^Tq;xm diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu index 36a1438a4a56b05657d0bc5312e348907a58ff22..8e1835f814b06a961de1f1b27cf7b338705a7908 100644 GIT binary patch delta 117 zcmbQvcaL{N2g~GkmZ-_{tjd$QSowjp1(@ByDhQ@`vGPuiXA__NgjG~9Gp|HTp*+7R zySN}RIaNo2YjP}`#AH1-ZJ-KgvE8(iIzfneo=ODL1J<$iZb!ZtZdq>FijGZ-Pv>?rWUeEAqg*HlbGl)&RvjS Nz@?m;my(&r1psB`DU|>K diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu index 7bfded1c1d4e9abc3f1465439f0206c62d770309..a5f3721ebe04ef63fc072537f9061d7e29383d29 100644 GIT binary patch delta 123 zcmbQrcZYXF8_VQYmZ-_Htjd!)Sowjp1(;pODhQ@`u<}k$WR;!#f|Xw=Gp|HTp*+7R zySN}RIaNo2OJQ;XtN3IcHf^91@yR7@I^xNtDT%=)MX8Co@g)kyUTgj|YC zK&q?YY858?v&v0A&B`wa(VLkER8XE@lwDkqn4F5DOnfpkn>H)N27gwG$*ycV5QFpC aq>zLcut`kx7w0a>FW^#6%}dEl;{pI^Tq;xm diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu index 509dbff0f411388db0db40a45e48ae5194366478..1401390e883dabf6c997ae3c619f9f8601e3ba2d 100644 GIT binary patch delta 123 zcmbQrcZYXF8_VQYmZ-_Htjd!)Sowjp1(;pODhQ@`u<}k$WR;!#f|Xw=Gp|HTp*+7R zySN}RIaNo2OJQ;XtN3IcHf^91@yR7@I^xNtDT%=)MX8Co@g)kyUTgj|YC zK&q?YY858?v&v0A&B`wa(VLkER8XE@lwDkqn4F5DOnfpkn>H)N27gwG$*ycV5QFpC aq>zLcut`kx7w0a>FW^#6%}dEl;{pI^Tq;xm diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu index 5f413f7a035c8c28e77d8002f5c2a964ce7927d7..41d64cb9379f604eaf99d2fc8ef269e8ce02e6f7 100644 GIT binary patch delta 88 zcmdnZ*TlDB4a?-!EK!r4S(PX2vGN0H3ov^Lt00)Z$I3gonN56h0xR2OJ~sZz%51_w fzS!g-Hf!R QS&zizqim9sAFwF_0BZ;<3jhEB diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu index b3e51ecfe5b1b3b2ef7ee32f8a525edbb51e9329..a41c02948452abcfcf417a392ffaab621f0b2824 100644 GIT binary patch delta 119 zcmdnV*TA=770cw6EK!pkS(PX2u<`?G3lN*%DKoKHOCcpb2}tWGa80&nRh@i?m3ML@ zoA~57R<_AJZ2XfI*@S_7vB?2!+CbV_EV(o#F}S2CH8D57M4=eW0BNmflbrmYO@fhY LqMyWMJ$7yYSOz07 delta 186 zcmZqR+sU_K70cw+ERie2JmlUNY=Ej#O e6oVPadL$2JmlUNY=Ej#O e6oVPadL$SV5oWw4{$TiVVVzNFvHvqOU9{>OV delta 175 zcmZqV+r_tGHOu5RERie2JmlUNY=Ej#O T6oVPadL$i1{uw diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu index ae9957f3d5c8b0913b518b742f1331dcb8f3140e..c4d1697edeb254bb4cb048063960d6a77a622671 100644 GIT binary patch delta 91 zcmdnQ_mg+SVwTB^SfVB~u^Iuf^5j@nQ7}E5RdDhqR)fjLY+{rDv5E<1=9OqEl;;;^ l7Z)TZr|KwhO*Uk62Fp%h)17>nO#;mR&L+voHJP7X82~1%8#4d^ delta 156 zcmey#yNPeZVwTBESRz>zfMBvNtLkJ0R(=@A3dT^K+{G%0;Ot}NWd*5Mn7ja};XNyd z9GIt&nOCBvP@Z3uU0jfuoT{V11rY`*6Q8WWrVZC5F*%7%2V!0in-r4pUN(t|{^FD0 Hv2g2JmlUNY=Ej#O e6oVPadL$t3YN($St3~!fM61<36xfy9L_2VXZN!T!r5u2$PM*jng(Q54O=6t3YN($St3~!fM61<36xfy9L_2VXZN!T!r5u2$PM*jng(Q54O=62JmlUNY=Ej#O e6oVPadL$t3YN($St3~!fM61<36xfy9L_2VXZN!T!r5u2$PM*jng(Q54O=6t3YN($St3~!fM61<36xfy9L_2VXZN!T!r5u2$PM*jng(Q54O=6t3YN($St3~!fM61<36xfy9L_2VXZN!T!r5u2$PM*jng(Q54O=6*tCI4#3xT-(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl6lVIeU L=qE8*i=7()SpFrU delta 184 zcmeyvyOnRlGM359St3~!fMBu#tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+msJqX5ppRm z0jaKnt5uks&nh?hH7mayL~mvuP(gWqQFd`bVsa{qGV#gEY}%|48}eBtCa1FLKn(6@ alR^?cz$P)#U!1!jzko|QH7_MIjSB!%Yb=xi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu index f9923db3a5fbe8694bbb05ff733f0dbc1ed8a9a4..7b28b0d5f56f77079e4654b12fe3d994448bcbc3 100644 GIT binary patch delta 123 zcmdnW_lI}GGM33pS)wM}vMNv3VC4tW7GU-~RzWa*gOzu3BdhFW4mN(F%)AmUh4TEO z?BasN*tCI4#3xT-(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl6lVIeU L=qE8*i=7()SpFrU delta 184 zcmeyvyOnRlGM359St3~!fMBu#tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+msJqX5ppRm z0jaKnt5uks&nh?hH7mayL~mvuP(gWqQFd`bVsa{qGV#gEY}%|48}eBtCa1FLKn(6@ alR^?cz$P)#U!1!jzko|QH7_MIjSB!%Yb=xi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu index cd4b2c1c33612ba082c611d99e3b88ec1e860b31..2f5bfcfc8b0c8f5f336c689226ca38623730806b 100644 GIT binary patch delta 123 zcmdnS_m_9Wa+b-*|dR5#3xT>(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl1lVIeU L=qE8*o1Gf~Ul}E} delta 184 zcmey%yNz$da+b*}SRz>zfMBvAtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+k5v%P5ppRm z0jaKnt5uj>z$!QS4J*GKL~mvuP(gWqQFd`bVsa{qGV#eOY}%|48wyw@Ca1CKKn$M1 aCWRz?kWFHuzc_b6egT(qYF*0suS_C0PIf delta 168 zcmey$yN++ee3r=zSRz>zfM9YWtLkJKR(=G>k5v@LF@rHwC+}kAg>V+INltEHWfOKO zF40m*iBBmm(NW+6nFKWJBTVZ8Hi^kHY}yDNacnwD5JkzQDT%=)MX8Co@g)kyU*tCI4#3xT-(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl6lVIeU L=qE8*i=7()SpFrU delta 184 zcmeyvyOnRlGM359St3~!fMBu#tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+msJqX5ppRm z0jaKnt5uks&nh?hH7mayL~mvuP(gWqQFd`bVsa{qGV#gEY}%|48}eBtCa1FLKn(6@ alR^?cz$P)#U!1!jzko|QH7_MIjSB!%Yb=xi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu index 4638be4850c56f215c8e5326cda0933fb0b42199..1bd0a97f5607e09956ec3580ba5452f9842c93d7 100644 GIT binary patch delta 109 zcmdnY_ltMK5|+t}S)wL0vl;=h^5i&HQ7}D+RdDiVR)fip*u*EXi3(=sm1rrH=NDxc z7bGU9>L_qce!wO%*^5mZq+s$SHXX6#(v-yDlA_ea-1riOVlV@u%#e|5vH-g>0G(7K A(EtDd delta 167 zcmeyxyP0pp5|+tJSt3~!fMBv7tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+n^h3O+0V+$ z3Q@C=Rdw}g{t&Nd Q65huqG0|Uq@_ROJ0JcdfQUCw| diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu index 00b183041b68d2a50dfcab59182c0f775713ffed..4a5a33fc568b639075a27bd0a16a858a85ef0924 100644 GIT binary patch delta 109 zcmdnY_ltMK5|+t}S)wL0vl;=h^5i&HQ7}D+RdDiVR)fip*u*EXi3(=sm1rrH=NDxc z7bGU9>L_qce!wO%*^5mZq+s$SHXX6#(v-yDlA_ea-1riOVlV@u%#e|5vH-g>0G(7K A(EtDd delta 167 zcmeyxyP0pp5|+tJSt3~!fMBv7tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+n^h3O+0V+$ z3Q@C=Rdw}g{t&Nd Q65huqG0|Uq@_ROJ0JcdfQUCw| diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu index 7d62cdee8e10a0b5c697e83d18c353c4eaf998fa..c20d58475d3c87964bf6e67fbc6e61be41978a46 100644 GIT binary patch delta 123 zcmdnW_lI}GGM33pS)wM}vMNv3VC4tW7GU-~RzWa*gOzu3BdhFW4mN(F%)AmUh4TEO z?BasN*tCI4#3xT-(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl6lVIeU L=qE8*i=7()SpFrU delta 184 zcmeyvyOnRlGM359St3~!fMBu#tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+msJqX5ppRm z0jaKnt5uks&nh?hH7mayL~mvuP(gWqQFd`bVsa{qGV#gEY}%|48}eBtCa1FLKn(6@ alR^?cz$P)#U!1!jzko|QH7_MIjSB!%Yb=xi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu index bde2cae9044296e777f3a1e3b5d6dc031670f1f5..747219ed588803e852309b0c703c9c527cfef3e4 100644 GIT binary patch delta 109 zcmdnY_ltMK5|+t}S)wL0vl;=h^5i&HQ7}D+RdDiVR)fip*u*EXi3(=sm1rrH=NDxc z7bGU9>L_qce!wO%*^5mZq+s$SHXX6#(v-yDlA_ea-1riOVlV@u%#e|5vH-g>0G(7K A(EtDd delta 167 zcmeyxyP0pp5|+tJSt3~!fMBv7tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+n^h3O+0V+$ z3Q@C=Rdw}g{t&Nd Q65huqG0|Uq@_ROJ0JcdfQUCw| diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu index e86ebdbf501ad6226e8c0aecf693c96e9582dded..a7b322a56829b8ae7d1ab66151fe6f6452918192 100644 GIT binary patch delta 109 zcmdnY_ltMK5|+t}S)wL0vl;=h^5i&HQ7}D+RdDiVR)fip*u*EXi3(=sm1rrH=NDxc z7bGU9>L_qce!wO%*^5mZq+s$SHXX6#(v-yDlA_ea-1riOVlV@u%#e|5vH-g>0G(7K A(EtDd delta 167 zcmeyxyP0pp5|+tJSt3~!fMBv7tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+n^h3O+0V+$ z3Q@C=Rdw}g{t&Nd Q65huqG0|Uq@_ROJ0JcdfQUCw| diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu index 8f0174024bbeab8df718380e7e7e24ce8d07c81a..fe01d0b800933b85538c13fab98d87c55cd4259b 100644 GIT binary patch delta 117 zcmdna_m6kO3YN*sS)wM}vno&4V&wt3YN($St3~!fM61<36xfy9L_2VXZN!T!r5u2$PM*jng(Q54O=6*tCI4#3xT-(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl6lVIeU L=qE8*i=7()SpFrU delta 184 zcmeyvyOnRlGM359St3~!fMBu#tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+msJqX5ppRm z0jaKnt5uks&nh?hH7mayL~mvuP(gWqQFd`bVsa{qGV#gEY}%|48}eBtCa1FLKn(6@ alR^?cz$P)#U!1!jzko|QH7_MIjSB!%Yb=xi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu index ad695f14bee5cd30b8c66b58abd4d134bf939203..e8f671ced486e72855fe4fad626ab0d6439ad941 100644 GIT binary patch delta 123 zcmdnW_lI}GGM33pS)wM}vMNv3VC4tW7GU-~RzWa*gOzu3BdhFW4mN(F%)AmUh4TEO z?BasN*tCI4#3xT-(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl6lVIeU L=qE8*i=7()SpFrU delta 184 zcmeyvyOnRlGM359St3~!fMBu#tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+msJqX5ppRm z0jaKnt5uks&nh?hH7mayL~mvuP(gWqQFd`bVsa{qGV#gEY}%|48}eBtCa1FLKn(6@ alR^?cz$P)#U!1!jzko|QH7_MIjSB!%Yb=xi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu index b2d58453f4ace71c2a77822323b2ae7c4fadf75e..d4bcd815899662b99870cbacf3775794a6bfdf51 100644 GIT binary patch delta 123 zcmdnS_m_9Wa+b-*|dR5#3xT>(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl1lVIeU L=qE8*o1Gf~Ul}E} delta 184 zcmey%yNz$da+b*}SRz>zfMBvAtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+k5v%P5ppRm z0jaKnt5uj>z$!QS4J*GKL~mvuP(gWqQFd`bVsa{qGV#eOY}%|48wyw@Ca1CKKn$M1 aCWRz?kWFHuzc_b6egT(qYF*0suS_C0PIf delta 168 zcmey$yN++ee3r=zSRz>zfM9YWtLkJKR(=G>k5v@LF@rHwC+}kAg>V+INltEHWfOKO zF40m*iBBmm(NW+6nFKWJBTVZ8Hi^kHY}yDNacnwD5JkzQDT%=)MX8Co@g)kyU*tCI4#3xT-(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl6lVIeU L=qE8*i=7()SpFrU delta 184 zcmeyvyOnRlGM359St3~!fMBu#tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+msJqX5ppRm z0jaKnt5uks&nh?hH7mayL~mvuP(gWqQFd`bVsa{qGV#gEY}%|48}eBtCa1FLKn(6@ alR^?cz$P)#U!1!jzko|QH7_MIjSB!%Yb=xi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu index 788940cb1600fd16752c84dcbccc82f1bd3aa287..9f5e15d54aea5c662f5371a59ab04f504ac689bb 100644 GIT binary patch delta 109 zcmdnY_ltMK5|+t}S)wL0vl;=h^5i&HQ7}D+RdDiVR)fip*u*EXi3(=sm1rrH=NDxc z7bGU9>L_qce!wO%*^5mZq+s$SHXX6#(v-yDlA_ea-1riOVlV@u%#e|5vH-g>0G(7K A(EtDd delta 167 zcmeyxyP0pp5|+tJSt3~!fMBv7tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+n^h3O+0V+$ z3Q@C=Rdw}g{t&Nd Q65huqG0|Uq@_ROJ0JcdfQUCw| diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu index bbae6c49111393251747ac6e086ead11aab6ac7c..03b3dd1e7a266aec9772b95f331db7cf06671d23 100644 GIT binary patch delta 109 zcmdnY_ltMK5|+t}S)wL0vl;=h^5i&HQ7}D+RdDiVR)fip*u*EXi3(=sm1rrH=NDxc z7bGU9>L_qce!wO%*^5mZq+s$SHXX6#(v-yDlA_ea-1riOVlV@u%#e|5vH-g>0G(7K A(EtDd delta 167 zcmeyxyP0pp5|+tJSt3~!fMBv7tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+n^h3O+0V+$ z3Q@C=Rdw}g{t&Nd Q65huqG0|Uq@_ROJ0JcdfQUCw| diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu index 65e571ffbed9dca77f977e4fb144f99f9b30677c..df33dc24f44f55911a0d4ce93f0f7e7d17208f2b 100644 GIT binary patch delta 123 zcmdnW_lI}GGM33pS)wM}vMNv3VC4tW7GU-~RzWa*gOzu3BdhFW4mN(F%)AmUh4TEO z?BasN*tCI4#3xT-(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl6lVIeU L=qE8*i=7()SpFrU delta 184 zcmeyvyOnRlGM359St3~!fMBu#tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+msJqX5ppRm z0jaKnt5uks&nh?hH7mayL~mvuP(gWqQFd`bVsa{qGV#gEY}%|48}eBtCa1FLKn(6@ alR^?cz$P)#U!1!jzko|QH7_MIjSB!%Yb=xi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu index 3b30ce4d06e30776ae32cd42cf0f3212b70f97d8..2cd0a451d0fc770624a28de5ba44d889cbb8d3ed 100644 GIT binary patch delta 109 zcmdnY_ltMK5|+t}S)wL0vl;=h^5i&HQ7}D+RdDiVR)fip*u*EXi3(=sm1rrH=NDxc z7bGU9>L_qce!wO%*^5mZq+s$SHXX6#(v-yDlA_ea-1riOVlV@u%#e|5vH-g>0G(7K A(EtDd delta 167 zcmeyxyP0pp5|+tJSt3~!fMBv7tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+n^h3O+0V+$ z3Q@C=Rdw}g{t&Nd Q65huqG0|Uq@_ROJ0JcdfQUCw| diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu index dabc15a994f1f1e07cf852ece4503cbe050e3851..c92680aa369aa0d82f6a028c3ca79a6cc908a86d 100644 GIT binary patch delta 109 zcmdnY_ltMK5|+t}S)wL0vl;=h^5i&HQ7}D+RdDiVR)fip*u*EXi3(=sm1rrH=NDxc z7bGU9>L_qce!wO%*^5mZq+s$SHXX6#(v-yDlA_ea-1riOVlV@u%#e|5vH-g>0G(7K A(EtDd delta 167 zcmeyxyP0pp5|+tJSt3~!fMBv7tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+n^h3O+0V+$ z3Q@C=Rdw}g{t&Nd Q65huqG0|Uq@_ROJ0JcdfQUCw| diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu index 1a6b0c287c04155d64ffd8d7e6d50f5bf18e2399..eb5f8857a96776f58d1eded83bb45ff6c475f8b9 100644 GIT binary patch delta 91 zcmdnU_k(xCB9_SuS)wL0vKj%g^5hs+Q7}D=RdDh~R)fh#Y+{rDvWf|2=9OqEl;;;^ l7Z)TZr|KwhO*UY22Fv!d=}tbxCIM!DW0Pd$n#{+p3;-h18z2Ax delta 160 zcmeytyOD3hB9_UESt3~!fMBu?tLkKVR(=@A3dT^K+{r44;Ou4PWd*5Mn9Rs3H+e5B z`{Z}59CBb0h0MGXErs&@qU_>=#N<>R1ulp%NU8W_c{XjhMv2LZY&sA#yV<0Wg!iyX MO!OC@{FaRy0LYmns{jB1 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu index 6a0c4006f4fc15be2062632b1c5b37115845e9fb..119ca59e2348ab6235d42f3071d8c49399eb3c59 100644 GIT binary patch delta 107 zcmdnP*UYzJEz9IJEK!r2Sd}O1v+{%KbXHLyZ31TBXXTyziB(4^Gp|HTp*+7RySN}R zIaNo2OJTAgtN7$#HtoqmKzcTt&g8Rf5@0qHy96WGL_Z1cg8Tw5<43_O>)17>PO#;mR!Y0YcHPKIkyCA=SOF1V+KNlw1cDk=-o zte}vYSE8j*o?nz*T#%TYs-wUK5uUt|O=7Yvn>IpAJe!UZL@v2BB{8_9C^a!RzC@uI Q%s|#>IMH8x@@qD30LpqR6aWAK diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu index 0935981426b1d601781a9239f424c5014a2e4a79..ec713ff7f8c479783f67a9bb5b0ed5d5954a296b 100644 GIT binary patch delta 106 zcmdnR*T}bFHOu5xEK!r4Sd}O1vho9I3ov^zt00)Z%gQ^siA{WRJS*E|UN-*8N^HVF xzS!hIHfSV5oWw4{$TiVVVzNFvHvqOU9{>OV delta 175 zcmZqV+r_tGHOu5RERie2JmlUNY=Ej#O T6oVPadL$i1{uw diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4.cu index 1ae71af530ee23a8ba65f43ada6f49e125b5d220..9affaa3cc2db7a7a088f7f21433ef17928b30b6f 100644 GIT binary patch delta 102 zcmZ3__l43_O>)17>PO#;mR!Y0YcHPKIkyCA=SOF1V+KNlw1cDk=-o zte}vYSE8j*o?nz*T#%TYs-wUK5uUt|O=7Yvn>IpAJe!UZL@v2BB{8_9C^a!RzC@uI Q%s|#>IMH8x@@qD30LpqR6aWAK diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu index c0039b40c33ede6067a2945f05d4937edfe995c9..92afe9742e7ae37955acde964f2409f4522b5225 100644 GIT binary patch delta 106 zcmdnR*T}bFHOu5xEK!r4Sd}O1vho9I3ov^zt00)Z%gQ^siA{WRJS*E|UN-*8N^HVF xzS!hIHfSV5oWw4{$TiVVVzNFvHvqOU9{>OV delta 175 zcmZqV+r_tGHOu5RERie2JmlUNY=Ej#O T6oVPadL$i1{uw diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4.cu index b33e34fa426eb0a7c35c06a966c942e4084fd8c7..9e331632fba0d2ae3dc8a85dcbf6061216db26b2 100644 GIT binary patch delta 99 zcmdnM_nmjcLYB!3SfVDe8ckwVo*d083ZiDR3Qpd@YB1T5O>FWXRx!cMyb>*i^8BLg v;)2BFR2>Db$@*;0VA(!4-N^^pB*5&iY?6#z6a6H(3-SxNlvDFkGSj#KPCXvZ delta 152 zcmey)yMb@RLYB#kSRz>zfMBvVtLkJqR(=@A3dT^K+`%e{;Ot@LWd*5Mn8YeK`57y} z9GIh!nOCBvP@Z3uU0jfuoT{V11rY|S5}z!`rVY~~F*$)v2Vz(kn-oHHH=D#nfAPt0 G*th|wI3qRy diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu index e6a7fd337ecdbd74c867583a73bdfdf5ea9d8ec6..026d72a924780fdcb92beac45e4f93d497d61e21 100644 GIT binary patch delta 129 zcmZ3%_nvpdOqR(rSfVByuqscMX5|Oc79cjiQ)XhZmO@H=5|GwW;F_$*syg{JEAQk2 zHu1?{Sw#gi^GdW7%JYk|iwhEyQ$Z^7*d!*~vuOiWIEy8hrX&WJ6s0ES#+N7*gBcJd Tl9S)DNicFv^plt@%gzk|Dq|;p delta 179 zcmaFQyMk}SOqR*BSRz>zfMD`R784jlb+QMmD1y_#Dv03RV`b;6(ox_7=>h86&#EU2 zVP@u)XepHE7iAY0BqpaKtB{y1#-(q$o8pH@-xn7|cM{BRP39 Vo5Vzaaqfcr0xsp$yp+r|E&%y{FggGL diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4.cu index b615514f4ae0f36e07a172769b0e829f22eea49b..f9311df3d26a654472f6a4fb7430a552d5a3bf68 100644 GIT binary patch delta 102 zcmZ3__l43_O>)17>PO#;mR!Y0YcHPKIkyCA=SOF1V+KNlw1cDk=-o zte}vYSE8j*o?nz*T#%TYs-wUK5uUt|O=7Yvn>IpAJe!UZL@v2BB{8_9C^a!RzC@uI Q%s|#>IMH8x@@qD30LpqR6aWAK diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu index 5df14d2268122af633d341ac30c677bfdeed2695..65ae55c58560d7fb419fe2b8421d19a672232edb 100644 GIT binary patch delta 106 zcmdnR*T}bFHOu5xEK!r4Sd}O1vho9I3ov^zt00)Z%gQ^siA{WRJS*E|UN-*8N^HVF xzS!hIHfSV5oWw4{$TiVVVzNFvHvqOU9{>OV delta 175 zcmZqV+r_tGHOu5RERie2JmlUNY=Ej#O T6oVPadL$i1{uw diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4.cu index 245921f09f6dbdfda1ead827bb35fe1b70a5b006..d9267ca2f98d12d7b8deba9e89fd02140b7e67d0 100644 GIT binary patch delta 106 zcmZ3>_l0-EJeJ9GS)wKvvno$kVC4tX5v-y>+62tLz{)%MF{_SXW?qSwLV12sc5y*s za;lC3*W`z+5|drnw1KkXlY7{7ChudD0JA@{NicFv^poH&$S>ehPR&cnOydFoc4i{e delta 165 zcmeyuyOwXmJeJAxSt3~!fM9Y0tLkKFR(=G>msJ$TF@rHwC+}qCg>dGxNlvb3WfOKO zF40m*iBBmm(NW+6nFKWJ15E3DHi^m7Y}yDNv1~d@5JkzQDT%=)MX8Co@g)kyU{B05lmXr~m)} diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu index a5d7831ed54b7bea18dbcbfea557dea7567a59e0..cb957cd373aa4e63db10d43f78f4d55255731669 100644 GIT binary patch delta 129 zcmdnN_n&veN|wnhSfVC7uqsd1X5|Oc79cjiQ)XhZmO@H=5|GwW;F@g5syg{LEAQk6 zHu1@BJ T$;toNBpA6S`bkXIW#zfM7Bss|k#uIyr(>6v3ImDv02`V`b;6(ox_7=>h6G&#EU2 zVP@u)XepHE7iAY0BqpaKtB{zi#-(q$o8pH@-xn7|cM{BRTmn Vo5Vzaaqfcr0xsp$yp+r|E&w%gElvOc diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4.cu index 487b2b1230a9c56d6dbe3c0c2685410343802dec..53f215f1d520af7c36a586ca90ee6d04b7306c78 100644 GIT binary patch delta 106 zcmZ3>_l0-EJeJ9GS)wKvvno$kVC4tX5v-y>+62tLz{)%MF{_SXW?qSwLV12sc5y*s za;lC3*W`z+5|drnw1KkXlY7{7ChudD0JA@{NicFv^poH&$S>ehPR&cnOydFoc4i{e delta 165 zcmeyuyOwXmJeJAxSt3~!fM9Y0tLkKFR(=G>msJ$TF@rHwC+}qCg>dGxNlvb3WfOKO zF40m*iBBmm(NW+6nFKWJ15E3DHi^m7Y}yDNv1~d@5JkzQDT%=)MX8Co@g)kyU{B05lmXr~m)} diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu index 99f78418b098fe7d2dec9c9be0c7ef696d593d16..c865ecbca9ad414330cce1fcbd0dbbf49fb21066 100644 GIT binary patch delta 129 zcmdnN_n&veN|wnhSfVC7uqsd1X5|Oc79cjiQ)XhZmO@H=5|GwW;F@g5syg{LEAQk6 zHu1@BJ T$;toNBpA6S`bkXIW#zfM7Bss|k#uIyr(>6v3ImDv02`V`b;6(ox_7=>h6G&#EU2 zVP@u)XepHE7iAY0BqpaKtB{zi#-(q$o8pH@-xn7|cM{BRTmn Vo5Vzaaqfcr0xsp$yp+r|E&w%gElvOc diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4.cu index e00d4cf5df04dafd5e4f43d23bb9b501daf397f8..a6fcb3eb8224cb18173c5b7e218ef86b87dcd363 100644 GIT binary patch delta 102 zcmZ3__l43_O>)17>PO#;mR!Y0YcHPKIkyCA=SOF1V+KNlw1cDk=-o zte}vYSE8j*o?nz*T#%TYs-wUK5uUt|O=7Yvn>IpAJe!UZL@v2BB{8_9C^a!RzC@uI Q%s|#>IMH8x@@qD30LpqR6aWAK diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu index 2af0e3842c461517b38646f37e167486b1bb8cfd..0008c79f681c2f3b4e24975e6adb15feeb9c1343 100644 GIT binary patch delta 106 zcmdnR*T}bFHOu5xEK!r4Sd}O1vho9I3ov^zt00)Z%gQ^siA{WRJS*E|UN-*8N^HVF xzS!hIHfSV5oWw4{$TiVVVzNFvHvqOU9{>OV delta 175 zcmZqV+r_tGHOu5RERie2JmlUNY=Ej#O T6oVPadL$i1{uw diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4.cu index c85127be1ddc4e6d6abebdc77ecd1e1eb8dc7591..36d4ab6e8615d9805c2803d52a5fd8e2e757a733 100644 GIT binary patch delta 106 zcmZ3>_l0-EJeJ9GS)wKvvno$kVC4tX5v-y>+62tLz{)%MF{_SXW?qSwLV12sc5y*s za;lC3*W`z+5|drnw1KkXlY7{7ChudD0JA@{NicFv^poH&$S>ehPR&cnOydFoc4i{e delta 165 zcmeyuyOwXmJeJAxSt3~!fM9Y0tLkKFR(=G>msJ$TF@rHwC+}qCg>dGxNlvb3WfOKO zF40m*iBBmm(NW+6nFKWJ15E3DHi^m7Y}yDNv1~d@5JkzQDT%=)MX8Co@g)kyU{B05lmXr~m)} diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu index 3d675e636efb9c61d772a6c3f4934f27b1849d40..707e4e72f520d685ce0045d891d985903d27fac9 100644 GIT binary patch delta 129 zcmdnN_n&veN|wnhSfVC7uqsd1X5|Oc79cjiQ)XhZmO@H=5|GwW;F@g5syg{LEAQk6 zHu1@BJ T$;toNBpA6S`bkXIW#zfM7Bss|k#uIyr(>6v3ImDv02`V`b;6(ox_7=>h6G&#EU2 zVP@u)XepHE7iAY0BqpaKtB{zi#-(q$o8pH@-xn7|cM{BRTmn Vo5Vzaaqfcr0xsp$yp+r|E&w%gElvOc diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4.cu index aef4c3dc5a62529ca6c27634f2dbd583d7882d85..77b89b0348cbbada9e359b0e18c0336c5d58c269 100644 GIT binary patch delta 106 zcmZ3>_l0-EJeJ9GS)wKvvno$kVC4tX5v-y>+62tLz{)%MF{_SXW?qSwLV12sc5y*s za;lC3*W`z+5|drnw1KkXlY7{7ChudD0JA@{NicFv^poH&$S>ehPR&cnOydFoc4i{e delta 165 zcmeyuyOwXmJeJAxSt3~!fM9Y0tLkKFR(=G>msJ$TF@rHwC+}qCg>dGxNlvb3WfOKO zF40m*iBBmm(NW+6nFKWJ15E3DHi^m7Y}yDNv1~d@5JkzQDT%=)MX8Co@g)kyU{B05lmXr~m)} diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu index 98aaf39ad03f90f77cd7d35de67f48ad4988cc6c..287d21c26774d164431f804a33fb09af07f0cfc4 100644 GIT binary patch delta 129 zcmdnN_n&veN|wnhSfVC7uqsd1X5|Oc79cjiQ)XhZmO@H=5|GwW;F@g5syg{LEAQk6 zHu1@BJ T$;toNBpA6S`bkXIW#zfM7Bss|k#uIyr(>6v3ImDv02`V`b;6(ox_7=>h6G&#EU2 zVP@u)XepHE7iAY0BqpaKtB{zi#-(q$o8pH@-xn7|cM{BRTmn Vo5Vzaaqfcr0xsp$yp+r|E&w%gElvOc diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4.cu index 07f655c7250ad139b6ae80c1e5dd8b93d7b14691..cd8282937254c74560704cab7852d45b7729b869 100644 GIT binary patch delta 106 zcmZ3>_l0-EJeJ9GS)wKvvno$kVC4tX5v-y>+62tLz{)%MF{_SXW?qSwLV12sc5y*s za;lC3*W`z+5|drnw1KkXlY7{7ChudD0JA@{NicFv^poH&$S>ehPR&cnOydFoc4i{e delta 165 zcmeyuyOwXmJeJAxSt3~!fM9Y0tLkKFR(=G>msJ$TF@rHwC+}qCg>dGxNlvb3WfOKO zF40m*iBBmm(NW+6nFKWJ15E3DHi^m7Y}yDNv1~d@5JkzQDT%=)MX8Co@g)kyU{B05lmXr~m)} diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu index 5e4591f174c7ba5d21ba332108c274bef0445d8e..06575697c08f31e78d1650240a82ce9acbb0fb76 100644 GIT binary patch delta 129 zcmdnN_n&veN|wnhSfVC7uqsd1X5|Oc79cjiQ)XhZmO@H=5|GwW;F@g5syg{LEAQk6 zHu1@BJ T$;toNBpA6S`bkXIW#zfM7Bss|k#uIyr(>6v3ImDv02`V`b;6(ox_7=>h6G&#EU2 zVP@u)XepHE7iAY0BqpaKtB{zi#-(q$o8pH@-xn7|cM{BRTmn Vo5Vzaaqfcr0xsp$yp+r|E&w%gElvOc diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4.cu index f5db58c89fe17a853c7b63dec9b7e63976dd84d5..27f290f840b7bb2bea13845e8e38368ee433e60f 100644 GIT binary patch delta 88 zcmZ3@_lbAI9G1zmS)wKzvno%PW90|Z7GU-iRzWa*j+J+EF`M}009Lli|A6wsY{Edk e*kmU*Z6NJDc@LWeSpFlM1S8i(KZ(f-?A!pf)fv11 delta 174 zcmeywyP9vq9G1y*St3~!fM9YQtLkJ)R(=G>n^hFXF@rHwCvRuvg>V?zBqncQ5 zF40lo0vQ1UnRz8z3g!7l*~JBk$*FLO$&75y2yIbpx=Ik`$)zcY!6ikhiMjD53dLXs QvL1=aTiGNhUtm)L06|+SS^xk5 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu index 7119977b062fc3392786aad85caa24bde2b85d51..d3cfd4b3deeb408ef3027837d9fa678d1a4a48d9 100644 GIT binary patch delta 123 zcmdnS_m_9Wa+b-*|dR5#3xT>(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl1lVIeU L=qE8*o1Gf~Ul}E} delta 184 zcmey%yNz$da+b*}SRz>zfMBvAtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+k5v%P5ppRm z0jaKnt5uj>z$!QS4J*GKL~mvuP(gWqQFd`bVsa{qGV#eOY}%|48wyw@Ca1CKKn$M1 aCWRz?kWFHuzc_b6egT(qYFn^hFXF@rHwCvRuvg>V?zBqncQ5 zF40lo0vQ1UnRz8z3g!7l*~JBk$*FLO$&75y2yIbpx=Ik`$)zcY!6ikhiMjD53dLXs QvL1=aTiGNhUtm)L06|+SS^xk5 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu index 2b283b3521128a576d5a58487600eef671790e8c..869fac226cbeac140307ebbda37d70d0d7a31631 100644 GIT binary patch delta 123 zcmdnS_m_9Wa+b-*|dR5#3xT>(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl1lVIeU L=qE8*o1Gf~Ul}E} delta 184 zcmey%yNz$da+b*}SRz>zfMBvAtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+k5v%P5ppRm z0jaKnt5uj>z$!QS4J*GKL~mvuP(gWqQFd`bVsa{qGV#eOY}%|48wyw@Ca1CKKn$M1 aCWRz?kWFHuzc_b6egT(qYFzfM9YwtLkJaR(=G>hgB5DF@rHwC+}e8g>dGvNlva~WfOKO zF40m*iBBmm(NW+6nFKWJJxuF7Hi^knY}yDNF>E?Y5JkzQDT%=)MX8Co@g)kyUzfMBvbtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+hE)*3*}%%n z3Q-fsDmQrpE4x}saS2?bfn^hFXF@rHwCvRuvg>V?zBqncQ5 zF40lo0vQ1UnRz8z3g!7l*~JBk$*FLO$&75y2yIbpx=Ik`$)zcY!6ikhiMjD53dLXs QvL1=aTiGNhUtm)L06|+SS^xk5 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu index 3634e7f9e5df6f65bc54ad4cd183c156e1cd008c..0e37f1c74093ec189f3b50919281585101fd6bf7 100644 GIT binary patch delta 123 zcmdnS_m_9Wa+b-*|dR5#3xT>(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl1lVIeU L=qE8*o1Gf~Ul}E} delta 184 zcmey%yNz$da+b*}SRz>zfMBvAtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+k5v%P5ppRm z0jaKnt5uj>z$!QS4J*GKL~mvuP(gWqQFd`bVsa{qGV#eOY}%|48wyw@Ca1CKKn$M1 aCWRz?kWFHuzc_b6egT(qYF362 zAW9Cf>d8XHGV@Bb6w339vWp86lT#5&53ov17H89jD0g6URe~r_E=@@cE-6Y)%#ANm eCa>0jBKKUnRz8z3g!7l z*~JBk$*DRDT$3NNNlf-;(*~=U%%&rjT$+*?TvC*pm>XZBPz+{3lo>K|O%`NV1^`ZA BBufAQ delta 175 zcmey(yM=GVQkKcfSRz>zfMBvdtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+hgA^4Il#)x z3Q?2CDmVE6E4x}saS2?bfu diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4.cu index 649d3ca6d216b355a6d1e47e864147b5a2ecef91..27263d129e678eaa3434c79d3e71bab1d0bb07f3 100644 GIT binary patch delta 119 zcmZ3<_knlAESAYLS)wKzvMNuOVdV$X79cjiQ)XhZmO@H=5|GwW;F_$@syg`$EAQk& zHu1@RtZbA20M!bz2?P0JlO5Q!fwZ$&a%oCpa7j^WVs3niLNS;D(wfgEIr%-C1S8i( KKZ(h5?A!ozs3UFw delta 186 zcmeysyOM9iESAZ$St3~!fMD_jR@KSkto#U$C#xu&BkGiySgfUx5}yR5briTjI>362 zAW9Cf>d8XHGV@Bb6w339vWp86lT#5&53ov17H89jD0g6URe~r_E=@@cE-6Y)%#ANm eCa>0jBKKUnRz8z3g!7l z*~JBk$*DRDT$3NNNlf-;(*~=U%%&rjT$+*?TvC*pm>XZBPz+{3lo>K|O%`NV1^`ZA BBufAQ delta 175 zcmey(yM=GVQkKcfSRz>zfMBvdtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+hgA^4Il#)x z3Q?2CDmVE6E4x}saS2?bfu diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4.cu index 6990d171bb9061b1214f73276f545781532abeed..859887dd76660a34aff0b18314fcfee5eecd25ec 100644 GIT binary patch delta 88 zcmZ3@_lbAI9G1zmS)wKzvno%PW90|Z7GU-iRzWa*j+J+EF`M}009Lli|A6wsY{Edk e*kmU*Z6NJDc@LWeSpFlM1S8i(KZ(f-?A!pf)fv11 delta 174 zcmeywyP9vq9G1y*St3~!fM9YQtLkJ)R(=G>n^hFXF@rHwCvRuvg>V?zBqncQ5 zF40lo0vQ1UnRz8z3g!7l*~JBk$*FLO$&75y2yIbpx=Ik`$)zcY!6ikhiMjD53dLXs QvL1=aTiGNhUtm)L06|+SS^xk5 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu index 41e38dfeb48f119becf141497ad1c83782178824..a377250dbcc70ae824bf8c024586f38416130be1 100644 GIT binary patch delta 123 zcmdnS_m_9Wa+b-*|dR5#3xT>(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl1lVIeU L=qE8*o1Gf~Ul}E} delta 184 zcmey%yNz$da+b*}SRz>zfMBvAtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+k5v%P5ppRm z0jaKnt5uj>z$!QS4J*GKL~mvuP(gWqQFd`bVsa{qGV#eOY}%|48wyw@Ca1CKKn$M1 aCWRz?kWFHuzc_b6egT(qYF362 zAW9Cf>d8XHGV@Bb6w339vWp86lT#5&53ov17H89jD0g6URe~r_E=@@cE-6Y)%#ANm eCa>0jBKKUnRz8z3g!7l z*~JBk$*DRDT$3NNNlf-;(*~=U%%&rjT$+*?TvC*pm>XZBPz+{3lo>K|O%`NV1^`ZA BBufAQ delta 175 zcmey(yM=GVQkKcfSRz>zfMBvdtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+hgA^4Il#)x z3Q?2CDmVE6E4x}saS2?bfu diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4.cu index a98f74414371e953fddd10c73b221c0f1ed86a86..8e9ad219d555b4bb25fb7a4b68ee67847bc329b5 100644 GIT binary patch delta 119 zcmZ3<_knlAESAYLS)wKzvMNuOVdV$X79cjiQ)XhZmO@H=5|GwW;F_$@syg`$EAQk& zHu1@RtZbA20M!bz2?P0JlO5Q!fwZ$&a%oCpa7j^WVs3niLNS;D(wfgEIr%-C1S8i( KKZ(h5?A!ozs3UFw delta 186 zcmeysyOM9iESAZ$St3~!fMD_jR@KSkto#U$C#xu&BkGiySgfUx5}yR5briTjI>362 zAW9Cf>d8XHGV@Bb6w339vWp86lT#5&53ov17H89jD0g6URe~r_E=@@cE-6Y)%#ANm eCa>0jBKKUnRz8z3g!7l z*~JBk$*DRDT$3NNNlf-;(*~=U%%&rjT$+*?TvC*pm>XZBPz+{3lo>K|O%`NV1^`ZA BBufAQ delta 175 zcmey(yM=GVQkKcfSRz>zfMBvdtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+hgA^4Il#)x z3Q?2CDmVE6E4x}saS2?bfu diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4.cu index 371ca3f38da74f177b5799dda8755367fc1a3669..948d86fa050897b7c832acf2776e3d1491f57bcb 100644 GIT binary patch delta 106 zcmZ3>_l0-EJeJ9GS)wKvvno$kVC4tX5v-y>+62tLz{)%MF{_SXW?qSwLV12sc5y*s za;lC3*W`z+5|drnw1KkXlY7{7ChudD0JA@{NicFv^poH&$S>ehPR&cnOydFoc4i{e delta 165 zcmeyuyOwXmJeJAxSt3~!fM9Y0tLkKFR(=G>msJ$TF@rHwC+}qCg>dGxNlvb3WfOKO zF40m*iBBmm(NW+6nFKWJ15E3DHi^m7Y}yDNv1~d@5JkzQDT%=)MX8Co@g)kyU{B05lmXr~m)} diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu index 21deb1d2803328459c5315c469a76f2aeac9538f..d03293a22a07a5f0b9b2b182851e5e4c9fdd3d26 100644 GIT binary patch delta 129 zcmdnN_n&veN|wnhSfVC7uqsd1X5|Oc79cjiQ)XhZmO@H=5|GwW;F@g5syg{LEAQk6 zHu1@BJ T$;toNBpA6S`bkXIW#zfM7Bss|k#uIyr(>6v3ImDv02`V`b;6(ox_7=>h6G&#EU2 zVP@u)XepHE7iAY0BqpaKtB{zi#-(q$o8pH@-xn7|cM{BRTmn Vo5Vzaaqfcr0xsp$yp+r|E&w%gElvOc diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4.cu index 197da7ddcccdddb006a2a8c9cd1ffc7f5caef90a..54c059adc5aa6230f15d6c0eb65ec930975c76a2 100644 GIT binary patch delta 88 zcmZ3@_lbAI9G1zmS)wKzvno%PW90|Z7GU-iRzWa*j+J+EF`M}009Lli|A6wsY{Edk e*kmU*Z6NJDc@LWeSpFlM1S8i(KZ(f-?A!pf)fv11 delta 174 zcmeywyP9vq9G1y*St3~!fM9YQtLkJ)R(=G>n^hFXF@rHwCvRuvg>V?zBqncQ5 zF40lo0vQ1UnRz8z3g!7l*~JBk$*FLO$&75y2yIbpx=Ik`$)zcY!6ikhiMjD53dLXs QvL1=aTiGNhUtm)L06|+SS^xk5 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu index 3108af793481c287b212234bebe574e1d8e74ebc..0ae92d4f02f84cbb24a54c3be2169e4d3c7cda80 100644 GIT binary patch delta 123 zcmdnS_m_9Wa+b-*|dR5#3xT>(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl1lVIeU L=qE8*o1Gf~Ul}E} delta 184 zcmey%yNz$da+b*}SRz>zfMBvAtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+k5v%P5ppRm z0jaKnt5uj>z$!QS4J*GKL~mvuP(gWqQFd`bVsa{qGV#eOY}%|48wyw@Ca1CKKn$M1 aCWRz?kWFHuzc_b6egT(qYFn^hFXF@rHwCvRuvg>V?zBqncQ5 zF40lo0vQ1UnRz8z3g!7l*~JBk$*FLO$&75y2yIbpx=Ik`$)zcY!6ikhiMjD53dLXs QvL1=aTiGNhUtm)L06|+SS^xk5 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu index 681d138b0fe2162611c4c9becdd74bdb93fa4afc..c1b497ace9ac4f9e8a9a3496af67c198b8237589 100644 GIT binary patch delta 123 zcmdnS_m_9Wa+b-*|dR5#3xT>(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl1lVIeU L=qE8*o1Gf~Ul}E} delta 184 zcmey%yNz$da+b*}SRz>zfMBvAtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+k5v%P5ppRm z0jaKnt5uj>z$!QS4J*GKL~mvuP(gWqQFd`bVsa{qGV#eOY}%|48wyw@Ca1CKKn$M1 aCWRz?kWFHuzc_b6egT(qYFzfM9YwtLkJaR(=G>hgB5DF@rHwC+}e8g>dGvNlva~WfOKO zF40m*iBBmm(NW+6nFKWJJxuF7Hi^knY}yDNF>E?Y5JkzQDT%=)MX8Co@g)kyUzfMBvbtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+hE)*3*}%%n z3Q-fsDmQrpE4x}saS2?bfn^hFXF@rHwCvRuvg>V?zBqncQ5 zF40lo0vQ1UnRz8z3g!7l*~JBk$*FLO$&75y2yIbpx=Ik`$)zcY!6ikhiMjD53dLXs QvL1=aTiGNhUtm)L06|+SS^xk5 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu index 267c16d8df5a3ec4155a1e01617536e2f5d4db6f..68f9d8ccdd43a02ad5dfe60ad5c35ddf08b99eb1 100644 GIT binary patch delta 123 zcmdnS_m_9Wa+b-*|dR5#3xT>(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl1lVIeU L=qE8*o1Gf~Ul}E} delta 184 zcmey%yNz$da+b*}SRz>zfMBvAtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+k5v%P5ppRm z0jaKnt5uj>z$!QS4J*GKL~mvuP(gWqQFd`bVsa{qGV#eOY}%|48wyw@Ca1CKKn$M1 aCWRz?kWFHuzc_b6egT(qYF362 zAW9Cf>d8XHGV@Bb6w339vWp86lT#5&53ov17H89jD0g6URe~r_E=@@cE-6Y)%#ANm eCa>0jBKKUnRz8z3g!7l z*~JBk$*DRDT$3NNNlf-;(*~=U%%&rjT$+*?TvC*pm>XZBPz+{3lo>K|O%`NV1^`ZA BBufAQ delta 175 zcmey(yM=GVQkKcfSRz>zfMBvdtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+hgA^4Il#)x z3Q?2CDmVE6E4x}saS2?bfu diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4.cu index adbdd0c42de2772bebac137729bdff3388d5450e..05e3e2b60560d05481b8cc068ba2c45a055cbf9e 100644 GIT binary patch delta 119 zcmZ3<_knlAESAYLS)wKzvMNuOVdV$X79cjiQ)XhZmO@H=5|GwW;F_$@syg`$EAQk& zHu1@RtZbA20M!bz2?P0JlO5Q!fwZ$&a%oCpa7j^WVs3niLNS;D(wfgEIr%-C1S8i( KKZ(h5?A!ozs3UFw delta 186 zcmeysyOM9iESAZ$St3~!fMD_jR@KSkto#U$C#xu&BkGiySgfUx5}yR5briTjI>362 zAW9Cf>d8XHGV@Bb6w339vWp86lT#5&53ov17H89jD0g6URe~r_E=@@cE-6Y)%#ANm eCa>0jBKKUnRz8z3g!7l z*~JBk$*DRDT$3NNNlf-;(*~=U%%&rjT$+*?TvC*pm>XZBPz+{3lo>K|O%`NV1^`ZA BBufAQ delta 175 zcmey(yM=GVQkKcfSRz>zfMBvdtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+hgA^4Il#)x z3Q?2CDmVE6E4x}saS2?bfu diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4.cu index 52a7d8634d384600642ffedc76e4f9fd40f08dc7..ed2e4f271e3769ee2bba9f9494e775e0dbd50084 100644 GIT binary patch delta 88 zcmZ3@_lbAI9G1zmS)wKzvno%PW90|Z7GU-iRzWa*j+J+EF`M}009Lli|A6wsY{Edk e*kmU*Z6NJDc@LWeSpFlM1S8i(KZ(f-?A!pf)fv11 delta 174 zcmeywyP9vq9G1y*St3~!fM9YQtLkJ)R(=G>n^hFXF@rHwCvRuvg>V?zBqncQ5 zF40lo0vQ1UnRz8z3g!7l*~JBk$*FLO$&75y2yIbpx=Ik`$)zcY!6ikhiMjD53dLXs QvL1=aTiGNhUtm)L06|+SS^xk5 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu index 029d6c070dcae3990a0806e4a2175af9c402b341..e214104496db7c7483e0123522c26af1bd94bb62 100644 GIT binary patch delta 123 zcmdnS_m_9Wa+b-*|dR5#3xT>(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl1lVIeU L=qE8*o1Gf~Ul}E} delta 184 zcmey%yNz$da+b*}SRz>zfMBvAtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+k5v%P5ppRm z0jaKnt5uj>z$!QS4J*GKL~mvuP(gWqQFd`bVsa{qGV#eOY}%|48wyw@Ca1CKKn$M1 aCWRz?kWFHuzc_b6egT(qYF362 zAW9Cf>d8XHGV@Bb6w339vWp86lT#5&53ov17H89jD0g6URe~r_E=@@cE-6Y)%#ANm eCa>0jBKKUnRz8z3g!7l z*~JBk$*DRDT$3NNNlf-;(*~=U%%&rjT$+*?TvC*pm>XZBPz+{3lo>K|O%`NV1^`ZA BBufAQ delta 175 zcmey(yM=GVQkKcfSRz>zfMBvdtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+hgA^4Il#)x z3Q?2CDmVE6E4x}saS2?bfu diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4.cu index c2a7d416c897b736bdbbc2a47b8f5651b4392c47..4fde39a8f32ec1dd27ef1cd89ee9af7f907afd95 100644 GIT binary patch delta 119 zcmZ3<_knlAESAYLS)wKzvMNuOVdV$X79cjiQ)XhZmO@H=5|GwW;F_$@syg`$EAQk& zHu1@RtZbA20M!bz2?P0JlO5Q!fwZ$&a%oCpa7j^WVs3niLNS;D(wfgEIr%-C1S8i( KKZ(h5?A!ozs3UFw delta 186 zcmeysyOM9iESAZ$St3~!fMD_jR@KSkto#U$C#xu&BkGiySgfUx5}yR5briTjI>362 zAW9Cf>d8XHGV@Bb6w339vWp86lT#5&53ov17H89jD0g6URe~r_E=@@cE-6Y)%#ANm eCa>0jBKKUnRz8z3g!7l z*~JBk$*DRDT$3NNNlf-;(*~=U%%&rjT$+*?TvC*pm>XZBPz+{3lo>K|O%`NV1^`ZA BBufAQ delta 175 zcmey(yM=GVQkKcfSRz>zfMBvdtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+hgA^4Il#)x z3Q?2CDmVE6E4x}saS2?bfu diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu index cf81352682d0b35e7ec7396ff11fa695edeefa88..8b8f3ac42b65bc00b3205f9d7afe456e19a66375 100644 GIT binary patch delta 107 zcmZ3(_nCLYT$agmSfVBuu_{lNXXOXe;jE%S+62r#&&oUb5vz_+W?qSwLV12sc5y*s za;lC3m%?O5R`JQsY}%8VfOI#T&g8vp5@7ZxHVHzfM9YwtLkJaR(=G>hgB5DF@rHwC+}e8g>dGvNlva~WfOKO zF40m*iBBmm(NW+6nFKWJJxuF7Hi^knY}yDNF>E?Y5JkzQDT%=)MX8Co@g)kyUt3YN($St3~!fM61<36xfy9L_2VXZN!T!r5u2$PM*jng(Q54O=6W7+g}6nwT43qEHNG=qPYaE?|?K{DDn^k!zx##AJDPZUFWBAT9s^ delta 175 zcmey!yNYkaY?jG$SRz>zfMD`OR@KQ8to#U$7po|oBkGiySgfUx5}yR5briTjI>362 zAW9Ch>d8XHGV@Bb6w339vWp86lT#5&53)*3mSEF{D0gIYRe~r_E=@@cE-6Y)%#ANm TC*tCI4#3xT-(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl6lVIeU L=qE8*i=7()SpFrU delta 184 zcmeyvyOnRlGM359St3~!fMBu#tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+msJqX5ppRm z0jaKnt5uks&nh?hH7mayL~mvuP(gWqQFd`bVsa{qGV#gEY}%|48}eBtCa1FLKn(6@ alR^?cz$P)#U!1!jzko|QH7_MIjSB!%Yb=xi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4.cu index 048db4ad7ee7697193050c05b27a48e878b1e4ce..ee0c2a2e2056fa240a052e319bcd07815e2859ce 100644 GIT binary patch delta 107 zcmZ3*_mOwQY?jHhSfVByu_{lNW#tFb7GU;dRzWa*mX&vM5u5mAe^$21e}VEsY{Edk x*kngGZ6NI|mRy>W7+g}6nwT43qEHNG=qPYaE?|?K{DDn^k!zx##AJDPZUFWBAT9s^ delta 175 zcmey!yNYkaY?jG$SRz>zfMD`OR@KQ8to#U$7po|oBkGiySgfUx5}yR5briTjI>362 zAW9Ch>d8XHGV@Bb6w339vWp86lT#5&53)*3mSEF{D0gIYRe~r_E=@@cE-6Y)%#ANm TC*tCI4#3xT-(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl6lVIeU L=qE8*i=7()SpFrU delta 184 zcmeyvyOnRlGM359St3~!fMBu#tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+msJqX5ppRm z0jaKnt5uks&nh?hH7mayL~mvuP(gWqQFd`bVsa{qGV#gEY}%|48}eBtCa1FLKn(6@ alR^?cz$P)#U!1!jzko|QH7_MIjSB!%Yb=xi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4.cu index 8419e72370f7de1f800c5d490bfb3216590a3769..68585a3a886bbf02e7526ffb6f7096a70bcf0fd3 100644 GIT binary patch delta 88 zcmZ3@_lbAI9G1zmS)wKzvno%PW90|Z7GU-iRzWa*j+J+EF`M}009Lli|A6wsY{Edk e*kmU*Z6NJDc@LWeSpFlM1S8i(KZ(f-?A!pf)fv11 delta 174 zcmeywyP9vq9G1y*St3~!fM9YQtLkJ)R(=G>n^hFXF@rHwCvRuvg>V?zBqncQ5 zF40lo0vQ1UnRz8z3g!7l*~JBk$*FLO$&75y2yIbpx=Ik`$)zcY!6ikhiMjD53dLXs QvL1=aTiGNhUtm)L06|+SS^xk5 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu index e4e4cd1b16f8a9a8d29cafde284b5fefd9f65943..6111f919728c22c2ec7f60fdbb98b0119400131f 100644 GIT binary patch delta 109 zcmZ3?_lkGJ6qd=8S)wL?W-$U{<;i}mqF}m*RdDiRR)fix*u*D)U= delta 167 zcmaFGyO?jo6qdgoN_=rqvW@~5NE9q?1y?Y+npF_NS}iLBz21=+OWnj|I%v*|$0t7Vfy Q5?;q9G0|Uq@_jaL07z*l5&!@I diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4.cu index 5567c7d8b95a6c81413a9f2f20e5db59386fbb05..68794f6b575cdcbdd429229d244f7ce5f96f1f62 100644 GIT binary patch delta 107 zcmZ3*_mOwQY?jHhSfVByu_{lNW#tFb7GU;dRzWa*mX&vM5u5mAe^$21e}VEsY{Edk x*kngGZ6NI|mRy>W7+g}6nwT43qEHNG=qPYaE?|?K{DDn^k!zx##AJDPZUFWBAT9s^ delta 175 zcmey!yNYkaY?jG$SRz>zfMD`OR@KQ8to#U$7po|oBkGiySgfUx5}yR5briTjI>362 zAW9Ch>d8XHGV@Bb6w339vWp86lT#5&53)*3mSEF{D0gIYRe~r_E=@@cE-6Y)%#ANm TC*tCI4#3xT-(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl6lVIeU L=qE8*i=7()SpFrU delta 184 zcmeyvyOnRlGM359St3~!fMBu#tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+msJqX5ppRm z0jaKnt5uks&nh?hH7mayL~mvuP(gWqQFd`bVsa{qGV#gEY}%|48}eBtCa1FLKn(6@ alR^?cz$P)#U!1!jzko|QH7_MIjSB!%Yb=xi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4.cu index 5bce7038679b63cdc61b99cef299c47b0a0e2e62..01b92bc1f3edbf13848a5a348e29f665775c2f0e 100644 GIT binary patch delta 129 zcmZ3%_nvpdOqR(rSfVByuqscMX5|Oc79cjiQ)XhZmO@H=5|GwW;F_$*syg{JEAQk2 zHu1?{Sw#gi^GdW7%JYk|iwhEyQ$Z^7*d!*~vuOiWIEy8hrX&WJ6s0ES#+N7*gBcJd Tl9S)DNicFv^plt@%gzk|Dq|;p delta 179 zcmaFQyMk}SOqR*BSRz>zfMD`R784jlb+QMmD1y_#Dv03RV`b;6(ox_7=>h86&#EU2 zVP@u)XepHE7iAY0BqpaKtB{y1#-(q$o8pH@-xn7|cM{BRP39 Vo5Vzaaqfcr0xsp$yp+r|E&%y{FggGL diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu index a0e5066b63666bba29808f2b0f4a989df21190cc..d0620ca07ec2c3f2eadf863375c0fc7a148b9070 100644 GIT binary patch delta 109 zcmdnY_ltMK5|+t}S)wL0vl;=h^5i&HQ7}D+RdDiVR)fip*u*EXi3(=sm1rrH=NDxc z7bGU9>L_qce!wO%*^5mZq+s$SHXX6#(v-yDlA_ea-1riOVlV@u%#e|5vH-g>0G(7K A(EtDd delta 167 zcmeyxyP0pp5|+tJSt3~!fMBv7tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+n^h3O+0V+$ z3Q@C=Rdw}g{t&Nd Q65huqG0|Uq@_ROJ0JcdfQUCw| diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4.cu index 087e818955ec590d0ce206eb98291c2b27dc85cd..5cdcce09615d2267d37629c58eb880bdf0bdc48e 100644 GIT binary patch delta 129 zcmZ3%_nvpdOqR(rSfVByuqscMX5|Oc79cjiQ)XhZmO@H=5|GwW;F_$*syg{JEAQk2 zHu1?{Sw#gi^GdW7%JYk|iwhEyQ$Z^7*d!*~vuOiWIEy8hrX&WJ6s0ES#+N7*gBcJd Tl9S)DNicFv^plt@%gzk|Dq|;p delta 179 zcmaFQyMk}SOqR*BSRz>zfMD`R784jlb+QMmD1y_#Dv03RV`b;6(ox_7=>h86&#EU2 zVP@u)XepHE7iAY0BqpaKtB{y1#-(q$o8pH@-xn7|cM{BRP39 Vo5Vzaaqfcr0xsp$yp+r|E&%y{FggGL diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu index 57ddadc2c3d5f4f3b1969249dab2ac1e67f8b029..a633169aebb03f1e151ad5ba08d4a4db41e35533 100644 GIT binary patch delta 109 zcmdnY_ltMK5|+t}S)wL0vl;=h^5i&HQ7}D+RdDiVR)fip*u*EXi3(=sm1rrH=NDxc z7bGU9>L_qce!wO%*^5mZq+s$SHXX6#(v-yDlA_ea-1riOVlV@u%#e|5vH-g>0G(7K A(EtDd delta 167 zcmeyxyP0pp5|+tJSt3~!fMBv7tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+n^h3O+0V+$ z3Q@C=Rdw}g{t&Nd Q65huqG0|Uq@_ROJ0JcdfQUCw| diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4.cu index 44120943354c2d02607357752c1f32ca2fd83961..9e7ae5fd4438ebe7e5bff59fc6561e521b0a6b74 100644 GIT binary patch delta 107 zcmZ3*_mOwQY?jHhSfVByu_{lNW#tFb7GU;dRzWa*mX&vM5u5mAe^$21e}VEsY{Edk x*kngGZ6NI|mRy>W7+g}6nwT43qEHNG=qPYaE?|?K{DDn^k!zx##AJDPZUFWBAT9s^ delta 175 zcmey!yNYkaY?jG$SRz>zfMD`OR@KQ8to#U$7po|oBkGiySgfUx5}yR5briTjI>362 zAW9Ch>d8XHGV@Bb6w339vWp86lT#5&53)*3mSEF{D0gIYRe~r_E=@@cE-6Y)%#ANm TC*tCI4#3xT-(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl6lVIeU L=qE8*i=7()SpFrU delta 184 zcmeyvyOnRlGM359St3~!fMBu#tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+msJqX5ppRm z0jaKnt5uks&nh?hH7mayL~mvuP(gWqQFd`bVsa{qGV#gEY}%|48}eBtCa1FLKn(6@ alR^?cz$P)#U!1!jzko|QH7_MIjSB!%Yb=xi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4.cu index 38ced9378e80e9dffe3535327f0cfbfe62eadd27..9c85633972545ea2060535aad7e1b54063b25c80 100644 GIT binary patch delta 129 zcmZ3%_nvpdOqR(rSfVByuqscMX5|Oc79cjiQ)XhZmO@H=5|GwW;F_$*syg{JEAQk2 zHu1?{Sw#gi^GdW7%JYk|iwhEyQ$Z^7*d!*~vuOiWIEy8hrX&WJ6s0ES#+N7*gBcJd Tl9S)DNicFv^plt@%gzk|Dq|;p delta 179 zcmaFQyMk}SOqR*BSRz>zfMD`R784jlb+QMmD1y_#Dv03RV`b;6(ox_7=>h86&#EU2 zVP@u)XepHE7iAY0BqpaKtB{y1#-(q$o8pH@-xn7|cM{BRP39 Vo5Vzaaqfcr0xsp$yp+r|E&%y{FggGL diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu index 7273648498ddc43eb17e387938712fb23b2be7b1..3277092dcdd849ecded1f354b35cc81fe17b6563 100644 GIT binary patch delta 109 zcmdnY_ltMK5|+t}S)wL0vl;=h^5i&HQ7}D+RdDiVR)fip*u*EXi3(=sm1rrH=NDxc z7bGU9>L_qce!wO%*^5mZq+s$SHXX6#(v-yDlA_ea-1riOVlV@u%#e|5vH-g>0G(7K A(EtDd delta 167 zcmeyxyP0pp5|+tJSt3~!fMBv7tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+n^h3O+0V+$ z3Q@C=Rdw}g{t&Nd Q65huqG0|Uq@_ROJ0JcdfQUCw| diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4.cu index d0b493332fd26f38a556d374c3b4a6feb11db08a..a1f282e9cb05e37981cf6a139ba4594e7bb818e5 100644 GIT binary patch delta 129 zcmZ3%_nvpdOqR(rSfVByuqscMX5|Oc79cjiQ)XhZmO@H=5|GwW;F_$*syg{JEAQk2 zHu1?{Sw#gi^GdW7%JYk|iwhEyQ$Z^7*d!*~vuOiWIEy8hrX&WJ6s0ES#+N7*gBcJd Tl9S)DNicFv^plt@%gzk|Dq|;p delta 179 zcmaFQyMk}SOqR*BSRz>zfMD`R784jlb+QMmD1y_#Dv03RV`b;6(ox_7=>h86&#EU2 zVP@u)XepHE7iAY0BqpaKtB{y1#-(q$o8pH@-xn7|cM{BRP39 Vo5Vzaaqfcr0xsp$yp+r|E&%y{FggGL diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu index b037361101ef734b92342d3fbbe80252ece4680a..ba6e50459be0b0a9c68f82172595fda30abb63bf 100644 GIT binary patch delta 109 zcmdnY_ltMK5|+t}S)wL0vl;=h^5i&HQ7}D+RdDiVR)fip*u*EXi3(=sm1rrH=NDxc z7bGU9>L_qce!wO%*^5mZq+s$SHXX6#(v-yDlA_ea-1riOVlV@u%#e|5vH-g>0G(7K A(EtDd delta 167 zcmeyxyP0pp5|+tJSt3~!fMBv7tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+n^h3O+0V+$ z3Q@C=Rdw}g{t&Nd Q65huqG0|Uq@_ROJ0JcdfQUCw| diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4.cu index 0992c97b05f3b40d7648fd46e2829ba4354dacf2..812274a743983b6ff8a58370810c5ba26b2cf04d 100644 GIT binary patch delta 129 zcmZ3%_nvpdOqR(rSfVByuqscMX5|Oc79cjiQ)XhZmO@H=5|GwW;F_$*syg{JEAQk2 zHu1?{Sw#gi^GdW7%JYk|iwhEyQ$Z^7*d!*~vuOiWIEy8hrX&WJ6s0ES#+N7*gBcJd Tl9S)DNicFv^plt@%gzk|Dq|;p delta 179 zcmaFQyMk}SOqR*BSRz>zfMD`R784jlb+QMmD1y_#Dv03RV`b;6(ox_7=>h86&#EU2 zVP@u)XepHE7iAY0BqpaKtB{y1#-(q$o8pH@-xn7|cM{BRP39 Vo5Vzaaqfcr0xsp$yp+r|E&%y{FggGL diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu index 9a37bc4b132746e55911dd99618607a248e25302..d37d0edb3b49d965018ee1cc206f769d8f657215 100644 GIT binary patch delta 109 zcmdnY_ltMK5|+t}S)wL0vl;=h^5i&HQ7}D+RdDiVR)fip*u*EXi3(=sm1rrH=NDxc z7bGU9>L_qce!wO%*^5mZq+s$SHXX6#(v-yDlA_ea-1riOVlV@u%#e|5vH-g>0G(7K A(EtDd delta 167 zcmeyxyP0pp5|+tJSt3~!fMBv7tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+n^h3O+0V+$ z3Q@C=Rdw}g{t&Nd Q65huqG0|Uq@_ROJ0JcdfQUCw| diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4.cu index 1354f552d15876c9e1635674a6887a94279a349a..837f1704579a1c833800c743f0454127661d569b 100644 GIT binary patch delta 123 zcmZ3+_m+3Vbe74}SfVEDu_{lNWaS6a7GQQit00&@$;vyqh*fs-PgZ`R%)AmUh4TEO z?BasN4+znrX&WJ6s0ES#+N7*gBehD5|iJsNicFv L^plt@&CU$~OdBP1 delta 184 zcmaFMyNqwcbe738SRz>zfMBvBtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+j#UuO5ppRm z0jaKnt5uksz$!QS4lBPLL~mvuP(gWqQFd`bVsa{qGV#eGY}%|48xmM0CWo==#N<>R1ulp%NU8W_c{XjhMv2LZY&sA#yV<0Wg!iyX MO!OC@{FaRy0LYmns{jB1 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4.cu index 3c1f051da6e8d962acb1ae8d4a0263b377a1274b..341fd2e5ea225a42fb6c5d91bba1d299e737da8d 100644 GIT binary patch delta 123 zcmZ3+_m+3Vbe74}SfVEDu_{lNWaS6a7GQQit00&@$;vyqh*fs-PgZ`R%)AmUh4TEO z?BasN4+znrX&WJ6s0ES#+N7*gBehD5|iJsNicFv L^plt@&CU$~OdBP1 delta 184 zcmaFMyNqwcbe738SRz>zfMBvBtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+j#UuO5ppRm z0jaKnt5uksz$!QS4lBPLL~mvuP(gWqQFd`bVsa{qGV#eGY}%|48xmM0CWo==#N<>R1ulp%NU8W_c{XjhMv2LZY&sA#yV<0Wg!iyX MO!OC@{FaRy0LYmns{jB1 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4.cu index 6adbbb06926fe40220f653e1d30b2355d920993d..8e0176529ec4080907343009dd23a22b9bf58334 100644 GIT binary patch delta 117 zcmZ3^_l|eN43^2$S)wNEvno%PV&wdYtiL7#y z@3QjCLAaTDC0YvQ`9;~q1&PV2D9Xeqi?V66!Zb-t4rkMWnA*rDg(SR*O=6WXaNlre`Dk=-o zte}vYSE8j*o?nz*T#%TYs-wUK5uQAeO=2=Hn>Ip=KbwvcL@v2BB{8_9C^a!RzC@uI Q%s|#>IMH8x@@+P50845q4+znrX&WJ6s0ES#+N7*gBehD5|iJsNicFv L^plt@&CU$~OdBP1 delta 184 zcmaFMyNqwcbe738SRz>zfMBvBtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+j#UuO5ppRm z0jaKnt5uksz$!QS4lBPLL~mvuP(gWqQFd`bVsa{qGV#eGY}%|48xmM0CWo==#N<>R1ulp%NU8W_c{XjhMv2LZY&sA#yV<0Wg!iyX MO!OC@{FaRy0LYmns{jB1 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4.cu index 4eb167ca107d2e8e5dff9dd4aa3a0c88049cb930..e3c88e3dfd237b0c7ddb93bef51bfbf492c4a083 100644 GIT binary patch delta 114 zcmZ3&_nLRZRF=t8SfVEDuqsa$XXOXd7GQQSt00&@&dNLaGMo71kF27CnRz8z3g!7l z*~JBk$*DRDT$3-dNldn8(*~<(X44T%E=@@cE-6Y)%#ANmCzfMBvbtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+hE)*3*}%%n z3Q-fsDmQrpE4x}saS2?bf43_O>)17>PO#;mR!Y0YcHPKIkyCA=SOF1V+KNlw1cDk=-o zte}vYSE8j*o?nz*T#%TYs-wUK5uUt|O=7Yvn>IpAJe!UZL@v2BB{8_9C^a!RzC@uI Q%s|#>IMH8x@@qD30LpqR6aWAK diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4.cu index 428d6cac9b37228536f7a514bf3ba37056b4fb52..9a6c225db4dc194304c3b1b97b410190e1166605 100644 GIT binary patch delta 114 zcmZ3&_nLRZRF=t8SfVEDuqsa$XXOXd7GQQSt00&@&dNLaGMo71kF27CnRz8z3g!7l z*~JBk$*DRDT$3-dNldn8(*~<(X44T%E=@@cE-6Y)%#ANmCzfMBvbtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+hE)*3*}%%n z3Q-fsDmQrpE4x}saS2?bf43_O>)17>PO#;mR!Y0YcHPKIkyCA=SOF1V+KNlw1cDk=-o zte}vYSE8j*o?nz*T#%TYs-wUK5uUt|O=7Yvn>IpAJe!UZL@v2BB{8_9C^a!RzC@uI Q%s|#>IMH8x@@qD30LpqR6aWAK diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4.cu index fc5ed0bc9c467797172cb6085f3480f3c4e4b247..67eed08e4fcb33c43aaad9001a18acca0043248c 100644 GIT binary patch delta 123 zcmZ3+_m+3Vbe74}SfVEDu_{lNWaS6a7GQQit00&@$;vyqh*fs-PgZ`R%)AmUh4TEO z?BasN4+znrX&WJ6s0ES#+N7*gBehD5|iJsNicFv L^plt@&CU$~OdBP1 delta 184 zcmaFMyNqwcbe738SRz>zfMBvBtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+j#UuO5ppRm z0jaKnt5uksz$!QS4lBPLL~mvuP(gWqQFd`bVsa{qGV#eGY}%|48xmM0CWo==#N<>R1ulp%NU8W_c{XjhMv2LZY&sA#yV<0Wg!iyX MO!OC@{FaRy0LYmns{jB1 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4.cu index acfdecd2766d1473ed9a397eba03ffca0663c6ca..c81ee566747f7ac65ac5239858f8395cef56d006 100644 GIT binary patch delta 114 zcmZ3&_nLRZRF=t8SfVEDuqsa$XXOXd7GQQSt00&@&dNLaGMo71kF27CnRz8z3g!7l z*~JBk$*DRDT$3-dNldn8(*~<(X44T%E=@@cE-6Y)%#ANmCzfMBvbtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+hE)*3*}%%n z3Q-fsDmQrpE4x}saS2?bf43_O>)17>PO#;mR!Y0YcHPKIkyCA=SOF1V+KNlw1cDk=-o zte}vYSE8j*o?nz*T#%TYs-wUK5uUt|O=7Yvn>IpAJe!UZL@v2BB{8_9C^a!RzC@uI Q%s|#>IMH8x@@qD30LpqR6aWAK diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4.cu index a077119bc411ce90ff58079003dd507e1393248d..3b04adaaec068f062a7a0eb768008c1ee5acea7d 100644 GIT binary patch delta 114 zcmZ3&_nLRZRF=t8SfVEDuqsa$XXOXd7GQQSt00&@&dNLaGMo71kF27CnRz8z3g!7l z*~JBk$*DRDT$3-dNldn8(*~<(X44T%E=@@cE-6Y)%#ANmCzfMBvbtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+hE)*3*}%%n z3Q-fsDmQrpE4x}saS2?bf43_O>)17>PO#;mR!Y0YcHPKIkyCA=SOF1V+KNlw1cDk=-o zte}vYSE8j*o?nz*T#%TYs-wUK5uUt|O=7Yvn>IpAJe!UZL@v2BB{8_9C^a!RzC@uI Q%s|#>IMH8x@@qD30LpqR6aWAK diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4.cu index 67f88e60337898ec342a6480000722bccb9fd01e..a639f5bd750a3f6c398180670fbfdf10fc70b446 100644 GIT binary patch delta 129 zcmZ3%_nvpdOqR(rSfVByuqscMX5|Oc79cjiQ)XhZmO@H=5|GwW;F_$*syg{JEAQk2 zHu1?{Sw#gi^GdW7%JYk|iwhEyQ$Z^7*d!*~vuOiWIEy8hrX&WJ6s0ES#+N7*gBcJd Tl9S)DNicFv^plt@%gzk|Dq|;p delta 179 zcmaFQyMk}SOqR*BSRz>zfMD`R784jlb+QMmD1y_#Dv03RV`b;6(ox_7=>h86&#EU2 zVP@u)XepHE7iAY0BqpaKtB{y1#-(q$o8pH@-xn7|cM{BRP39 Vo5Vzaaqfcr0xsp$yp+r|E&%y{FggGL diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu index e58863fa9b236170abaa0647e2995bf15d3a0605..043533beeb46f322724fdd84e8ae5276c314a50f 100644 GIT binary patch delta 109 zcmdnY_ltMK5|+t}S)wL0vl;=h^5i&HQ7}D+RdDiVR)fip*u*EXi3(=sm1rrH=NDxc z7bGU9>L_qce!wO%*^5mZq+s$SHXX6#(v-yDlA_ea-1riOVlV@u%#e|5vH-g>0G(7K A(EtDd delta 167 zcmeyxyP0pp5|+tJSt3~!fMBv7tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+n^h3O+0V+$ z3Q@C=Rdw}g{t&Nd Q65huqG0|Uq@_ROJ0JcdfQUCw| diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4.cu index 2afb875349723705772c28cb26d6e1aabf62c117..c1f70a4336dd086045d656dda301d7ab631916dd 100644 GIT binary patch delta 123 zcmZ3+_m+3Vbe74}SfVEDu_{lNWaS6a7GQQit00&@$;vyqh*fs-PgZ`R%)AmUh4TEO z?BasN4+znrX&WJ6s0ES#+N7*gBehD5|iJsNicFv L^plt@&CU$~OdBP1 delta 184 zcmaFMyNqwcbe738SRz>zfMBvBtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+j#UuO5ppRm z0jaKnt5uksz$!QS4lBPLL~mvuP(gWqQFd`bVsa{qGV#eGY}%|48xmM0CWo==#N<>R1ulp%NU8W_c{XjhMv2LZY&sA#yV<0Wg!iyX MO!OC@{FaRy0LYmns{jB1 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4.cu index 2a5a1643a3593ef5eea55e8c3ab2ba6dc70b1adc..6968d4fcbc41b04bc69c575d5d474e9d26ecdd98 100644 GIT binary patch delta 123 zcmZ3+_m+3Vbe74}SfVEDu_{lNWaS6a7GQQit00&@$;vyqh*fs-PgZ`R%)AmUh4TEO z?BasN4+znrX&WJ6s0ES#+N7*gBehD5|iJsNicFv L^plt@&CU$~OdBP1 delta 184 zcmaFMyNqwcbe738SRz>zfMBvBtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+j#UuO5ppRm z0jaKnt5uksz$!QS4lBPLL~mvuP(gWqQFd`bVsa{qGV#eGY}%|48xmM0CWo==#N<>R1ulp%NU8W_c{XjhMv2LZY&sA#yV<0Wg!iyX MO!OC@{FaRy0LYmns{jB1 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4.cu index 8f591465fd8677fb70bbd9a9acc9ef96283df9cc..c458632a521dc26d2d2f9853ab4e169afde4bbea 100644 GIT binary patch delta 117 zcmZ3^_l|eN43^2$S)wNEvno%PV&wdYtiL7#y z@3QjCLAaTDC0YvQ`9;~q1&PV2D9Xeqi?V66!Zb-t4rkMWnA*rDg(SR*O=6WXaNlre`Dk=-o zte}vYSE8j*o?nz*T#%TYs-wUK5uQAeO=2=Hn>Ip=KbwvcL@v2BB{8_9C^a!RzC@uI Q%s|#>IMH8x@@+P50845q4+znrX&WJ6s0ES#+N7*gBehD5|iJsNicFv L^plt@&CU$~OdBP1 delta 184 zcmaFMyNqwcbe738SRz>zfMBvBtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+j#UuO5ppRm z0jaKnt5uksz$!QS4lBPLL~mvuP(gWqQFd`bVsa{qGV#eGY}%|48xmM0CWo==#N<>R1ulp%NU8W_c{XjhMv2LZY&sA#yV<0Wg!iyX MO!OC@{FaRy0LYmns{jB1 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4.cu index 1e6323ed57b46ddaab2d658ccf813f794ade4048..8651eeaff0cc8a1ab87d337728a1a2f705fb38b5 100644 GIT binary patch delta 114 zcmZ3&_nLRZRF=t8SfVEDuqsa$XXOXd7GQQSt00&@&dNLaGMo71kF27CnRz8z3g!7l z*~JBk$*DRDT$3-dNldn8(*~<(X44T%E=@@cE-6Y)%#ANmCzfMBvbtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+hE)*3*}%%n z3Q-fsDmQrpE4x}saS2?bf43_O>)17>PO#;mR!Y0YcHPKIkyCA=SOF1V+KNlw1cDk=-o zte}vYSE8j*o?nz*T#%TYs-wUK5uUt|O=7Yvn>IpAJe!UZL@v2BB{8_9C^a!RzC@uI Q%s|#>IMH8x@@qD30LpqR6aWAK diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4.cu index 4fab26dd33885d3c1545271e73b6494b5e1b0608..1adccb33861084a8946588f755505d5ec4812a84 100644 GIT binary patch delta 114 zcmZ3&_nLRZRF=t8SfVEDuqsa$XXOXd7GQQSt00&@&dNLaGMo71kF27CnRz8z3g!7l z*~JBk$*DRDT$3-dNldn8(*~<(X44T%E=@@cE-6Y)%#ANmCzfMBvbtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+hE)*3*}%%n z3Q-fsDmQrpE4x}saS2?bf43_O>)17>PO#;mR!Y0YcHPKIkyCA=SOF1V+KNlw1cDk=-o zte}vYSE8j*o?nz*T#%TYs-wUK5uUt|O=7Yvn>IpAJe!UZL@v2BB{8_9C^a!RzC@uI Q%s|#>IMH8x@@qD30LpqR6aWAK diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4.cu index f150bf3298185cf6f28e9b1851f65665f349c946..86073427b837546835f3248c112469fded5ba29b 100644 GIT binary patch delta 123 zcmZ3+_m+3Vbe74}SfVEDu_{lNWaS6a7GQQit00&@$;vyqh*fs-PgZ`R%)AmUh4TEO z?BasN4+znrX&WJ6s0ES#+N7*gBehD5|iJsNicFv L^plt@&CU$~OdBP1 delta 184 zcmaFMyNqwcbe738SRz>zfMBvBtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+j#UuO5ppRm z0jaKnt5uksz$!QS4lBPLL~mvuP(gWqQFd`bVsa{qGV#eGY}%|48xmM0CWo==#N<>R1ulp%NU8W_c{XjhMv2LZY&sA#yV<0Wg!iyX MO!OC@{FaRy0LYmns{jB1 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4.cu index 3592fbcf3d5fcf1dfe1925edba33e1740441b0dd..9ce8d0676c65d5f246f1636889def8526e356a42 100644 GIT binary patch delta 114 zcmZ3&_nLRZRF=t8SfVEDuqsa$XXOXd7GQQSt00&@&dNLaGMo71kF27CnRz8z3g!7l z*~JBk$*DRDT$3-dNldn8(*~<(X44T%E=@@cE-6Y)%#ANmCzfMBvbtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+hE)*3*}%%n z3Q-fsDmQrpE4x}saS2?bf43_O>)17>PO#;mR!Y0YcHPKIkyCA=SOF1V+KNlw1cDk=-o zte}vYSE8j*o?nz*T#%TYs-wUK5uUt|O=7Yvn>IpAJe!UZL@v2BB{8_9C^a!RzC@uI Q%s|#>IMH8x@@qD30LpqR6aWAK diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4.cu index b2ca7d5c3b452036fcb9bc4fed74cb2574f5c4db..7f2d171b92d297310f955631132c2e2f16170c89 100644 GIT binary patch delta 114 zcmZ3&_nLRZRF=t8SfVEDuqsa$XXOXd7GQQSt00&@&dNLaGMo71kF27CnRz8z3g!7l z*~JBk$*DRDT$3-dNldn8(*~<(X44T%E=@@cE-6Y)%#ANmCzfMBvbtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+hE)*3*}%%n z3Q-fsDmQrpE4x}saS2?bf43_O>)17>PO#;mR!Y0YcHPKIkyCA=SOF1V+KNlw1cDk=-o zte}vYSE8j*o?nz*T#%TYs-wUK5uUt|O=7Yvn>IpAJe!UZL@v2BB{8_9C^a!RzC@uI Q%s|#>IMH8x@@qD30LpqR6aWAK diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu index a8815802196032953b034ab1ac5ee2e1a17fa052..c1f9f0cc5c25f8a81a0e2d45e1b253448873808a 100644 GIT binary patch delta 106 zcmdnX*TT199n0jkEK!r2S(PUnu=0cH3|3JfZ31RLVC9|snN>$HGp|HTp*+7RySN}R zIaNo2Yw|}{iOC^s+CW+H$#d9rCZA)I0JE9dB^bFT`blsX!R QS&zizqim9sAFwF_0BZ;<3jhEB diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu index 0ec64ab721458e73db8b8b0409fb6a9f41144661..ce2faaf9571981f2d69cd25242e65cd2d7680db8 100644 GIT binary patch delta 88 zcmdnZ*TlDB4a?-!EK!r4S(PX2vGN0H3ov^Lt00)Z$I3gonN56h0xR2OJ~sZz%51_w fzS!g-Hf!R QS&zizqim9sAFwF_0BZ;<3jhEB diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu index c5f51e5d04ad13b901ca9a077708d3e0ce05eec7..a8b5ef0f837d4126149b316305a3fc84fc1019c4 100644 GIT binary patch delta 107 zcmdnP*UYzJEz9IJEK!r2Sd}O1v+{%KbXHLyZ31TBXXTyziB(4^Gp|HTp*+7RySN}R zIaNo2OJTAgtN7$#HtoqmKzcTt&g8Rf5@0qHy96WGL_Z1cg8Tw5<a>0jBKKUnRz8z3g!7l z*~JBk$*DRDT$3NNNlf-;(*~=U%%&rjT$+*?TvC*pm>XZBPz+{3lo>K|O%`NV1^`ZA BBufAQ delta 175 zcmey(yM=GVQkKcfSRz>zfMBvdtEzBtQL>goN_=rqvW@~5NE9q?1y?Y+hgA^4Il#)x z3Q?2CDmVE6E4x}saS2?bfu diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu index 7af638ac4d80a681193f2bbff1e747f2edf75aa7..61e4ebae70b825148723f08262fcb650d96794c0 100644 GIT binary patch delta 88 zcmdnZ*TlDB4a?-!EK!r4S(PX2vGN0H3ov^Lt00)Z$I3gonN56h0xR2OJ~sZz%51_w fzS!g-Hf!R QS&zizqim9sAFwF_0BZ;<3jhEB diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu index 291f63ba2eaa85025a470116587748856df01737..a01dc2b983754110fcdb8090ee532c20e9e46e32 100644 GIT binary patch delta 119 zcmdnV*TA=770cw6EK!pkS(PX2u<`?G3lN*%DKoKHOCcpb2}tWGa80&nRh@i?m3ML@ zoA~57R<_AJZ2XfI*@S_7vB?2!+CbV_EV(o#F}S2CH8D57M4=eW0BNmflbrmYO@fhY LqMyWMJ$7yYSOz07 delta 186 zcmZqR+sU_K70cw+ERie2JmlUNY=Ej#O e6oVPadL$2JmlUNY=Ej#O e6oVPadL$!R QS&zizqim9sAFwF_0BZ;<3jhEB diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu index 427608a9091651483f07b0f61f2c180f008fc068..0742f4c76588461b33874bd716acf93238aacc14 100644 GIT binary patch delta 119 zcmdnV*TA=770cw6EK!pkS(PX2u<`?G3lN*%DKoKHOCcpb2}tWGa80&nRh@i?m3ML@ zoA~57R<_AJZ2XfI*@S_7vB?2!+CbV_EV(o#F}S2CH8D57M4=eW0BNmflbrmYO@fhY LqMyWMJ$7yYSOz07 delta 186 zcmZqR+sU_K70cw+ERie2JmlUNY=Ej#O e6oVPadL$2JmlUNY=Ej#O e6oVPadL$2JmlUNY=Ej#O e6oVPadL$t3YN($St3~!fM61<36xfy9L_2VXZN!T!r5u2$PM*jng(Q54O=6t3YN($St3~!fM61<36xfy9L_2VXZN!T!r5u2$PM*jng(Q54O=6BJ T$;toNBpA6S`bkXIW#zfM7Bss|k#uIyr(>6v3ImDv02`V`b;6(ox_7=>h6G&#EU2 zVP@u)XepHE7iAY0BqpaKtB{zi#-(q$o8pH@-xn7|cM{BRTmn Vo5Vzaaqfcr0xsp$yp+r|E&w%gElvOc diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu index e527fc27c39b5d3a010d746bdcb12b23a01c0bf6..d93d719192c7894b719e1f2a6ac989ba893baf66 100644 GIT binary patch delta 99 zcmdnM_nmjcLYB!3SfVDe8ckwVo*d083ZiDR3Qpd@YB1T5O>FWXRx!cMyb>*i^8BLg v;)2BFR2>Db$@*;0VA(!4-N^^pB*5&iY?6#z6a6H(3-SxNlvDFkGSj#KPCXvZ delta 152 zcmey)yMb@RLYB#kSRz>zfMBvVtLkJqR(=@A3dT^K+`%e{;Ot@LWd*5Mn8YeK`57y} z9GIh!nOCBvP@Z3uU0jfuoT{V11rY|S5}z!`rVY~~F*$)v2Vz(kn-oHHH=D#nfAPt0 G*th|wI3qRy diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu index 1fd9b4564196e2e588c47aa9bdaad4832512ed2e..031ad7e8eef592b4a1f77b2054fe92061ab6ddc7 100644 GIT binary patch delta 117 zcmdna_m6kO3YN*sS)wM}vno&4V&wt3YN($St3~!fM61<36xfy9L_2VXZN!T!r5u2$PM*jng(Q54O=6*tCI4#3xT-(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl6lVIeU L=qE8*i=7()SpFrU delta 184 zcmeyvyOnRlGM359St3~!fMBu#tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+msJqX5ppRm z0jaKnt5uks&nh?hH7mayL~mvuP(gWqQFd`bVsa{qGV#gEY}%|48}eBtCa1FLKn(6@ alR^?cz$P)#U!1!jzko|QH7_MIjSB!%Yb=xi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu index 3ce84e92a1c9118d3012452d6767238bf9373d1b..c5c705283324e9979b5e6be115b5f07da096fa3c 100644 GIT binary patch delta 123 zcmdnW_lI}GGM33pS)wM}vMNv3VC4tW7GU-~RzWa*gOzu3BdhFW4mN(F%)AmUh4TEO z?BasN*tCI4#3xT-(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl6lVIeU L=qE8*i=7()SpFrU delta 184 zcmeyvyOnRlGM359St3~!fMBu#tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+msJqX5ppRm z0jaKnt5uks&nh?hH7mayL~mvuP(gWqQFd`bVsa{qGV#gEY}%|48}eBtCa1FLKn(6@ alR^?cz$P)#U!1!jzko|QH7_MIjSB!%Yb=xi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu index c4fe56c7ecab33f7fe721c3530091e71f966ae31..01bd059754e7a4d685ef84a9c6581644b1920cc0 100644 GIT binary patch delta 117 zcmdna_m6kO3YN*sS)wM}vno&4V&wt3YN($St3~!fM61<36xfy9L_2VXZN!T!r5u2$PM*jng(Q54O=6*tCI4#3xT-(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl6lVIeU L=qE8*i=7()SpFrU delta 184 zcmeyvyOnRlGM359St3~!fMBu#tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+msJqX5ppRm z0jaKnt5uks&nh?hH7mayL~mvuP(gWqQFd`bVsa{qGV#gEY}%|48}eBtCa1FLKn(6@ alR^?cz$P)#U!1!jzko|QH7_MIjSB!%Yb=xi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu index 7e44eb402a47f08250f7cf58d38d862abb86bce6..40231a90ceb879b6664f1d604ab3b61438d9bf8e 100644 GIT binary patch delta 123 zcmdnW_lI}GGM33pS)wM}vMNv3VC4tW7GU-~RzWa*gOzu3BdhFW4mN(F%)AmUh4TEO z?BasN*tCI4#3xT-(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl6lVIeU L=qE8*i=7()SpFrU delta 184 zcmeyvyOnRlGM359St3~!fMBu#tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+msJqX5ppRm z0jaKnt5uks&nh?hH7mayL~mvuP(gWqQFd`bVsa{qGV#gEY}%|48}eBtCa1FLKn(6@ alR^?cz$P)#U!1!jzko|QH7_MIjSB!%Yb=xi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu index 26ca4d5c9a593fb3b833030c0d7419bf9c4924f9..11cd57221e0a3bf5e62785a717a6937688dd523d 100644 GIT binary patch delta 119 zcmdnV*TA=770cw6EK!pkS(PX2u<`?G3lN*%DKoKHOCcpb2}tWGa80&nRh@i?m3ML@ zoA~57R<_AJZ2XfI*@S_7vB?2!+CbV_EV(o#F}S2CH8D57M4=eW0BNmflbrmYO@fhY LqMyWMJ$7yYSOz07 delta 186 zcmZqR+sU_K70cw+ERie2JmlUNY=Ej#O e6oVPadL$t3YN($St3~!fM61<36xfy9L_2VXZN!T!r5u2$PM*jng(Q54O=6t3YN($St3~!fM61<36xfy9L_2VXZN!T!r5u2$PM*jng(Q54O=6BJ T$;toNBpA6S`bkXIW#zfM7Bss|k#uIyr(>6v3ImDv02`V`b;6(ox_7=>h6G&#EU2 zVP@u)XepHE7iAY0BqpaKtB{zi#-(q$o8pH@-xn7|cM{BRTmn Vo5Vzaaqfcr0xsp$yp+r|E&w%gElvOc diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu index 43d7beebf065c6e281409f51eb0396f4222fc6d3..cc8985f328c3b960ffd0ec9202b991df8bffeba3 100644 GIT binary patch delta 99 zcmdnM_nmjcLYB!3SfVDe8ckwVo*d083ZiDR3Qpd@YB1T5O>FWXRx!cMyb>*i^8BLg v;)2BFR2>Db$@*;0VA(!4-N^^pB*5&iY?6#z6a6H(3-SxNlvDFkGSj#KPCXvZ delta 152 zcmey)yMb@RLYB#kSRz>zfMBvVtLkJqR(=@A3dT^K+`%e{;Ot@LWd*5Mn8YeK`57y} z9GIh!nOCBvP@Z3uU0jfuoT{V11rY|S5}z!`rVY~~F*$)v2Vz(kn-oHHH=D#nfAPt0 G*th|wI3qRy diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu index 893c0afadeced249b3020d8c30193ee80d8728f7..882ea15c2dae70562b761894aa45e6706400b5cf 100644 GIT binary patch delta 117 zcmdna_m6kO3YN*sS)wM}vno&4V&wt3YN($St3~!fM61<36xfy9L_2VXZN!T!r5u2$PM*jng(Q54O=6*tCI4#3xT-(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl6lVIeU L=qE8*i=7()SpFrU delta 184 zcmeyvyOnRlGM359St3~!fMBu#tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+msJqX5ppRm z0jaKnt5uks&nh?hH7mayL~mvuP(gWqQFd`bVsa{qGV#gEY}%|48}eBtCa1FLKn(6@ alR^?cz$P)#U!1!jzko|QH7_MIjSB!%Yb=xi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu index b5bbead4aa8ea0bd6f7731292b25af5d677e961c..768771c2c9ccc58d2f08f32fe522ad2c3e441ecf 100644 GIT binary patch delta 123 zcmdnW_lI}GGM33pS)wM}vMNv3VC4tW7GU-~RzWa*gOzu3BdhFW4mN(F%)AmUh4TEO z?BasN*tCI4#3xT-(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl6lVIeU L=qE8*i=7()SpFrU delta 184 zcmeyvyOnRlGM359St3~!fMBu#tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+msJqX5ppRm z0jaKnt5uks&nh?hH7mayL~mvuP(gWqQFd`bVsa{qGV#gEY}%|48}eBtCa1FLKn(6@ alR^?cz$P)#U!1!jzko|QH7_MIjSB!%Yb=xi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu index 19d24977abb1f8de1737c340369054da834e26da..f09590a20eb54d6c4f19719ef46d5c8d21c60fa4 100644 GIT binary patch delta 117 zcmdna_m6kO3YN*sS)wM}vno&4V&wt3YN($St3~!fM61<36xfy9L_2VXZN!T!r5u2$PM*jng(Q54O=6*tCI4#3xT-(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl6lVIeU L=qE8*i=7()SpFrU delta 184 zcmeyvyOnRlGM359St3~!fMBu#tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+msJqX5ppRm z0jaKnt5uks&nh?hH7mayL~mvuP(gWqQFd`bVsa{qGV#gEY}%|48}eBtCa1FLKn(6@ alR^?cz$P)#U!1!jzko|QH7_MIjSB!%Yb=xi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu index 9d025a4e643a477267eaec1a5900e7457e977e5b..afbdbbbbc23d04ee9b20d4c2b0b28c6ad21d7452 100644 GIT binary patch delta 123 zcmdnW_lI}GGM33pS)wM}vMNv3VC4tW7GU-~RzWa*gOzu3BdhFW4mN(F%)AmUh4TEO z?BasN*tCI4#3xT-(-BWDO-T$cDN0SujW1Cs1~Z`QBqsl6lVIeU L=qE8*i=7()SpFrU delta 184 zcmeyvyOnRlGM359St3~!fMBu#tEzBtQL>goN_=rqvW@~5NE9q?1y?Y+msJqX5ppRm z0jaKnt5uks&nh?hH7mayL~mvuP(gWqQFd`bVsa{qGV#gEY}%|48}eBtCa1FLKn(6@ alR^?cz$P)#U!1!jzko|QH7_MIjSB!%Yb=xi diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32.cu index 167422bfc2063eb5b8f1c1184638cb5ca89fd268..8c2e425a341265f3dd8b830efe8f36144db849d2 100644 GIT binary patch delta 106 zcmX@hH-&G*K9uc-z{(G%D_BK=vd$>Nlu>6$|mem zT%x6r5}#6BqNBhCG6`sw02@^6em058)@<4c9kpyaN)ScKr74NQB}J);x$z|m#b5@) NfczfMBvOtLkJ2R(=@A3dT^Kyoyy2!MVrE%L-DjF!=yb13w#w z9GIt&nOCBvP@Z3uU0jfuoT{V11rY`*6QAtBrVZC5F}aCN2V&kDHYp_GyKE8@{lzEq Hv2z0e(A6de diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32.cu index 5cfa3b57f5d59d50398ed1600b12a96484f2df6a..99e633809aeb5ab8ba5ded6aca5ea015f460a4bc 100644 GIT binary patch delta 106 zcmX@hH-&G*K9uc-z{(G%D_BK=vd$>Nlu>6$|mem zT%x6r5}#6BqNBhCG6`sw02@^6em058)@<4c9kpyaN)ScKr74NQB}J);x$z|m#b5@) NfczfMBvOtLkJ2R(=@A3dT^Kyoyy2!MVrE%L-DjF!=yb13w#w z9GIt&nOCBvP@Z3uU0jfuoT{V11rY`*6QAtBrVZC5F}aCN2V&kDHYp_GyKE8@{lzEq Hv2z0e(A6de diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32.cu index 0454cb47dbd64de2c10e3f4696f6365c04d9cdae..f27777189457326d7a855b40b6671746d187038c 100644 GIT binary patch delta 88 zcmX@jH;He<9+t_wS)wKfvno%vW90|Z7GU-kRzWcRj+J-vWH#~11*~k7<=FTq8?y-m f`C^lk*tCJP^W-~h5@2~jb_qtViGC839oV@6-dq{~ delta 174 zcmbQlcbad*9+t^_St3~!fM9YRtLkJ+R(=F0n^hFXF@rHwCtqjfg>V$vBqm>A5 zF40lo0vQ1UnRz8z3g!7l*~JBk$*FLO$%<^w2yIntx=Ik`$)zcY!6ikhiMjD53dLXs QvL1=aSJ@;de_&Gr0P6@WI{*Lx diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu index 638946d6a6e9245bb5b37655d81c2ceae1654f3e..52bf3538c5d6e2d94c91c6af4b0b5276066f09ca 100644 GIT binary patch delta 102 zcmcb>H=S?8L6*q}SfVB?uo?le^5kk(Q82xeRdDhJR)fibY+{pT*u(@g^GdW7%JYk| wiwhEyQ*{)$Ci}BFgJsvT=}vygCIM!PvP&{@P4tuCF32z7QclfF$xPz{0CJWdlmGw# delta 156 zcmbQvcY$xiL6*sfSRz>zfMBvWtLkJsR(=@A3dT^Kyn} I^RROR07-)*T>t<8 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32.cu index a21d6ad33fa537d7af809277556e8e74a1ded9a6..6adce8136bf8e4dcdc57afebfc7c4e233853707c 100644 GIT binary patch delta 106 zcmX@hH-&G*K9uc-z{(G%D_BK=vd$>Nlu>6$|mem zT%x6r5}#6BqNBhCG6`sw02@^6em058)@<4c9kpyaN)ScKr74NQB}J);x$z|m#b5@) NfczfMBvOtLkJ2R(=@A3dT^Kyoyy2!MVrE%L-DjF!=yb13w#w z9GIt&nOCBvP@Z3uU0jfuoT{V11rY`*6QAtBrVZC5F}aCN2V&kDHYp_GyKE8@{lzEq Hv2z0e(A6de diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu index 2e5ab843038ecedffa2cb417ee68b0badf769c42..9127e532c075e8cd66d26124ecb87f02697b6ded 100644 GIT binary patch delta 117 zcmX@dHzfM9YXtLkJMR(=F0k5v@LF@rHwC*NY_g>VkANlsqC$|mem zT%x6r5}#6BqNBhCG6`swARAQc0XB)rHf-7m9d&FvN)ScKr74NQB}J);x$z|m#b5@) QfCFrX6aB>}bFp&+03ebowg3PC diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_32x16x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_32x16x64_2_nc32hw32.cu index be006ff82a849c626ac4b25e26b19ce2fc709715..1089e31f69d993a41a4def4d5d573a6b6d31e4cf 100644 GIT binary patch delta 119 zcmX@fH-T@%E|$qVS)wKfvMNuuVdV$X79cjiQ)XhZmO@H=5|GwW;F|2usyg`%EAQlq zY~qviSlK4au<=heWD^GR#U>}PX#;6zvE#0ny96WG LL_dkicI?~$r9mTc delta 186 zcmbQhcam?zE|$r=St3~!fMD_lR@KSoto#U0CaWl%BkGiySgfUx5}yR5briTjI>362 zAW9yv>d8XHGV@Bb6w339vWp86lT#5&AFxVHHfPg@C{JK>Re~r_E=@@cE-6Y)%#ANm eC!X`1%U!1!jzko|QH7_MIjSBz=?J^hu diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32.cu index 2f7c8265ec9a3712098b17a54daaadd5f1806c77..676de64e02abff8b75d0dba19cab6235b08091e4 100644 GIT binary patch delta 88 zcmX@jH;He<9+t_wS)wKfvno%vW90|Z7GU-kRzWcRj+J-vWH#~11*~k7<=FTq8?y-m f`C^lk*tCJP^W-~h5@2~jb_qtViGC839oV@6-dq{~ delta 174 zcmbQlcbad*9+t^_St3~!fM9YRtLkJ+R(=F0n^hFXF@rHwCtqjfg>V$vBqm>A5 zF40lo0vQ1UnRz8z3g!7l*~JBk$*FLO$%<^w2yIntx=Ik`$)zcY!6ikhiMjD53dLXs QvL1=aSJ@;de_&Gr0P6@WI{*Lx diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu index 4b05fa6ea4cb2a1d3feb7d2f569c594fb2c1c2e1..ac1ea8b0e1c0022f46ee45710b0a4fadc4619fe4 100644 GIT binary patch delta 102 zcmcb>H=S?8L6*q}SfVB?uo?le^5kk(Q82xeRdDhJR)fibY+{pT*u(@g^GdW7%JYk| wiwhEyQ*{)$Ci}BFgJsvT=}vygCIM!PvP&{@P4tuCF32z7QclfF$xPz{0CJWdlmGw# delta 156 zcmbQvcY$xiL6*sfSRz>zfMBvWtLkJsR(=@A3dT^Kyn} I^RROR07-)*T>t<8 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu index 0c54c5e9ab614be59b032d192128f229cb86c3a8..be1cd355cfa82565936ff006faf28f9f8d254f8b 100644 GIT binary patch delta 119 zcmX@fH-T@%E|$qVS)wKfvMNuuVdV$X79cjiQ)XhZmO@H=5|GwW;F|2usyg`%EAQlq zY~qviSlK4au<=heWD^GR#U>}PX#;6zvE#0ny96WG LL_dkicI?~$r9mTc delta 186 zcmbQhcam?zE|$r=St3~!fMD_lR@KSoto#U0CaWl%BkGiySgfUx5}yR5briTjI>362 zAW9yv>d8XHGV@Bb6w339vWp86lT#5&AFxVHHfPg@C{JK>Re~r_E=@@cE-6Y)%#ANm eC!X`1%U!1!jzko|QH7_MIjSBz=?J^hu diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu index 0662b46ac01d400c4146d9a9c2af44a7c54bb6b6..58b209be3baa95b5a9eed928f760e0b6e12c976b 100644 GIT binary patch delta 117 zcmX@dHzfM9YXtLkJMR(=F0k5v@LF@rHwC*NY_g>VkANlsqC$|mem zT%x6r5}#6BqNBhCG6`swARAQc0XB)rHf-7m9d&FvN)ScKr74NQB}J);x$z|m#b5@) QfCFrX6aB>}bFp&+03ebowg3PC diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32.cu index b84211621c6559ec496d40e8c5b89d846db6c6a2..0dd9d49a838edfc55dd387ff7a7a6185fd4f45a6 100644 GIT binary patch delta 119 zcmX@fH-T@%E|$qVS)wKfvMNuuVdV$X79cjiQ)XhZmO@H=5|GwW;F|2usyg`%EAQlq zY~qviSlK4au<=heWD^GR#U>}PX#;6zvE#0ny96WG LL_dkicI?~$r9mTc delta 186 zcmbQhcam?zE|$r=St3~!fMD_lR@KSoto#U0CaWl%BkGiySgfUx5}yR5briTjI>362 zAW9yv>d8XHGV@Bb6w339vWp86lT#5&AFxVHHfPg@C{JK>Re~r_E=@@cE-6Y)%#ANm eC!X`1%U!1!jzko|QH7_MIjSBz=?J^hu diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu index ae73171ec34cf70bf1734bf6934b7401207dfa87..7a9c681beb34818ecde569dc8dd67455a206b056 100644 GIT binary patch delta 117 zcmX@dHzfM9YXtLkJMR(=F0k5v@LF@rHwC*NY_g>VkANlsqC$|mem zT%x6r5}#6BqNBhCG6`swARAQc0XB)rHf-7m9d&FvN)ScKr74NQB}J);x$z|m#b5@) QfCFrX6aB>}bFp&+03ebowg3PC diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32.cu index 03f894cd237fabb3a67366dff8e79170bfd1c7a2..264516f36977a9e9c3449781721c004039465f34 100644 GIT binary patch delta 119 zcmX@fH-T@%E|$qVS)wKfvMNuuVdV$X79cjiQ)XhZmO@H=5|GwW;F|2usyg`%EAQlq zY~qviSlK4au<=heWD^GR#U>}PX#;6zvE#0ny96WG LL_dkicI?~$r9mTc delta 186 zcmbQhcam?zE|$r=St3~!fMD_lR@KSoto#U0CaWl%BkGiySgfUx5}yR5briTjI>362 zAW9yv>d8XHGV@Bb6w339vWp86lT#5&AFxVHHfPg@C{JK>Re~r_E=@@cE-6Y)%#ANm eC!X`1%U!1!jzko|QH7_MIjSBz=?J^hu diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu index b5379a2f684cbee05d287bbe022389d8ebd687d4..5c7d5a8194cfeaa231bff1b0c29ff7482ccc668a 100644 GIT binary patch delta 117 zcmX@dHzfM9YXtLkJMR(=F0k5v@LF@rHwC*NY_g>VkANlsqC$|mem zT%x6r5}#6BqNBhCG6`swARAQc0XB)rHf-7m9d&FvN)ScKr74NQB}J);x$z|m#b5@) QfCFrX6aB>}bFp&+03ebowg3PC diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32.cu index 5d6380f19d42120b39f14bf6d88b2f1790b64d7f..f43d76a616d64753f20545d85bdd41770b8e62b2 100644 GIT binary patch delta 117 zcmX@k*T=VE2g~H`EK!sFS(PVSvGN0H3ov^Ft00(u#mYOmpG|zS2%D&2W?qSwLV12s zc5y*sa;lC3*W_L{iOF$n+CUY~V#%c`iNPgBsfoGqB?`r021JSEWL|a&My`o|5|eG% FxdEiyBlrLS delta 170 zcmeC+ot(}pir~y=6-03Uva)km=_qi4^Z>!+Mpn7W ze_8qEAl%Hn5-o-D{G#mQg2d!h6lLO*P1&?rVVWc+m$T_WOkKz(g(Q55O=6zfM9YxtLkJcR(=F0hgB5DF@rHwC*NS@g>d$}PX#;6zvE#0ny96WG LL_dkicI?~$r9mTc delta 186 zcmbQhcam?zE|$r=St3~!fMD_lR@KSoto#U0CaWl%BkGiySgfUx5}yR5briTjI>362 zAW9yv>d8XHGV@Bb6w339vWp86lT#5&AFxVHHfPg@C{JK>Re~r_E=@@cE-6Y)%#ANm eC!X`1%U!1!jzko|QH7_MIjSBz=?J^hu diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu index b688db3c3c59f87eb06a6f73ab0ba6e8a8d671a7..61c208631534fd4343b281bc28bf414aba574c93 100644 GIT binary patch delta 117 zcmX@dHzfM9YXtLkJMR(=F0k5v@LF@rHwC*NY_g>VkANlsqC$|mem zT%x6r5}#6BqNBhCG6`swARAQc0XB)rHf-7m9d&FvN)ScKr74NQB}J);x$z|m#b5@) QfCFrX6aB>}bFp&+03ebowg3PC diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu index 63f144700496314d31d4c9af4dc586123cdeb599..94c98d8a0062124596de0db8b7dade48e3350273 100644 GIT binary patch delta 107 zcmX@bH<53{ZkEZrSfVBeu_{ltW#tFb7GU;fRzWcRmX&w%BsTHM`K)Y{W!d;A8?gxk y`C^k3*|dSQvsiLzN@8$HQEFmte2GFan4zPzfMD`QR@KQCto#U07ON362 zAW9yx>d8XHGV@Bb6w339vWp86lT#5&AF@hJwqVnSC{JW_Re~r_E=@@cE-6Y)%#ANm TC$|f=SJ)05$`YA6& diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_32x16x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_32x16x64_2_nc32hw32.cu index 08f6849bd735a375b47bb9fa32aac1d625117f39..259fa2b46fcdbfd7efacb80e1341b6290ba6b2cb 100644 GIT binary patch delta 123 zcmX@g*Tc7A8_VRaEK!qvS(PVSu<`?G3ov^ft00(u!OA;%BCG6V2{wMA%)AmUh4TEO z?BasNXRaH2+C|OG(CBC>QSx12jBnlR{f-9IjmsJqX5ppRm z0jaKnt5uj>&nh?hH!Hs!L~mvuP(gWqQFd`bVsa{qGV#gAY}%|48|qmlCYQ45Kn$MG aCWRz?flXqfzc_b6egT(qYF+ot(}pir~y=6-03Uva)km=_qi4^Z>!+Mpn7W ze_8qEAl%Hn5-o-D{G#mQg2d!h6lLO*P1&?rVVWc+m$T_WOkKz(g(Q55O=6zfM9YxtLkJcR(=F0hgB5DF@rHwC*NS@g>d$XRaH2+C|OG(CBC>QSx12jBnlR{f-9IjmsJqX5ppRm z0jaKnt5uj>&nh?hH!Hs!L~mvuP(gWqQFd`bVsa{qGV#gAY}%|48|qmlCYQ45Kn$MG aCWRz?flXqfzc_b6egT(qYFzfMD`QR@KQCto#U07ON362 zAW9yx>d8XHGV@Bb6w339vWp86lT#5&AF@hJwqVnSC{JW_Re~r_E=@@cE-6Y)%#ANm TC$|f=SJ)05$`YA6& diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32.cu index 01f8721206016d34b7a329ce40f5585bff814466..57a858e8e096ee453931c7422dd8d7805fe1b332 100644 GIT binary patch delta 119 zcmX@fH-T@%E|$qVS)wKfvMNuuVdV$X79cjiQ)XhZmO@H=5|GwW;F|2usyg`%EAQlq zY~qviSlK4au<=heWD^GR#U>}PX#;6zvE#0ny96WG LL_dkicI?~$r9mTc delta 186 zcmbQhcam?zE|$r=St3~!fMD_lR@KSoto#U0CaWl%BkGiySgfUx5}yR5briTjI>362 zAW9yv>d8XHGV@Bb6w339vWp86lT#5&AFxVHHfPg@C{JK>Re~r_E=@@cE-6Y)%#ANm eC!X`1%U!1!jzko|QH7_MIjSBz=?J^hu diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu index 78df98c313d7070dc8a86e12796cdde284a39a9a..24be928057cdf9eb340bcf475ce2875ee4360659 100644 GIT binary patch delta 117 zcmX@dHzfM9YXtLkJMR(=F0k5v@LF@rHwC*NY_g>VkANlsqC$|mem zT%x6r5}#6BqNBhCG6`swARAQc0XB)rHf-7m9d&FvN)ScKr74NQB}J);x$z|m#b5@) QfCFrX6aB>}bFp&+03ebowg3PC diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32.cu index 400a2ffe5a339cc6bb665ba37b02c9c5a3d37cb0..4861c0ba9caf76aaeb8bd496dcbd0fc9c284f858 100644 GIT binary patch delta 119 zcmX@fH-T@%E|$qVS)wKfvMNuuVdV$X79cjiQ)XhZmO@H=5|GwW;F|2usyg`%EAQlq zY~qviSlK4au<=heWD^GR#U>}PX#;6zvE#0ny96WG LL_dkicI?~$r9mTc delta 186 zcmbQhcam?zE|$r=St3~!fMD_lR@KSoto#U0CaWl%BkGiySgfUx5}yR5briTjI>362 zAW9yv>d8XHGV@Bb6w339vWp86lT#5&AFxVHHfPg@C{JK>Re~r_E=@@cE-6Y)%#ANm eC!X`1%U!1!jzko|QH7_MIjSBz=?J^hu diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu index f327f180018cd8fdc12bc91f092486d42e736563..5f5efcc6a15c1dc33cd3c3790bda0f3fe36bd4f3 100644 GIT binary patch delta 117 zcmX@dHzfM9YXtLkJMR(=F0k5v@LF@rHwC*NY_g>VkANlsqC$|mem zT%x6r5}#6BqNBhCG6`swARAQc0XB)rHf-7m9d&FvN)ScKr74NQB}J);x$z|m#b5@) QfCFrX6aB>}bFp&+03ebowg3PC diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32.cu index 825c44f31ce499c758e71ce4013ad8e473241b69..a71bf73ae94158fa7cb9c232a98d4bc8a7e3d76f 100644 GIT binary patch delta 117 zcmX@k*T=VE2g~H`EK!sFS(PVSvGN0H3ov^Ft00(u#mYOmpG|zS2%D&2W?qSwLV12s zc5y*sa;lC3*W_L{iOF$n+CUY~V#%c`iNPgBsfoGqB?`r021JSEWL|a&My`o|5|eG% FxdEiyBlrLS delta 170 zcmeC+ot(}pir~y=6-03Uva)km=_qi4^Z>!+Mpn7W ze_8qEAl%Hn5-o-D{G#mQg2d!h6lLO*P1&?rVVWc+m$T_WOkKz(g(Q55O=6zfM9YxtLkJcR(=F0hgB5DF@rHwC*NS@g>d$}PX#;6zvE#0ny96WG LL_dkicI?~$r9mTc delta 186 zcmbQhcam?zE|$r=St3~!fMD_lR@KSoto#U0CaWl%BkGiySgfUx5}yR5briTjI>362 zAW9yv>d8XHGV@Bb6w339vWp86lT#5&AFxVHHfPg@C{JK>Re~r_E=@@cE-6Y)%#ANm eC!X`1%U!1!jzko|QH7_MIjSBz=?J^hu diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu index e2ee3f2b70a3dffd25f0488e94ca3439cab6f739..bd99ad629e0bd2e994a3fcf127944958dc817230 100644 GIT binary patch delta 117 zcmX@dHzfM9YXtLkJMR(=F0k5v@LF@rHwC*NY_g>VkANlsqC$|mem zT%x6r5}#6BqNBhCG6`swARAQc0XB)rHf-7m9d&FvN)ScKr74NQB}J);x$z|m#b5@) QfCFrX6aB>}bFp&+03ebowg3PC diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu index 83644d8729dca58b2d8e5c51304eb3742c933ffa..1cde89557037b1b6ac954129e0218c72f6649b77 100644 GIT binary patch delta 107 zcmX@bH<53{ZkEZrSfVBeu_{ltW#tFb7GU;fRzWcRmX&w%BsTHM`K)Y{W!d;A8?gxk y`C^k3*|dSQvsiLzN@8$HQEFmte2GFan4zPzfMD`QR@KQCto#U07ON362 zAW9yx>d8XHGV@Bb6w339vWp86lT#5&AF@hJwqVnSC{JW_Re~r_E=@@cE-6Y)%#ANm TC$|f=SJ)05$`YA6& diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_32x16x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_32x16x64_2_nc32hw32.cu index 462f9a7572b6ec0c8a3bd4177aab183b4de5e0fb..e9c07ff1215a39e41275fc4576360f29caa6dcfd 100644 GIT binary patch delta 123 zcmX@g*Tc7A8_VRaEK!qvS(PVSu<`?G3ov^ft00(u!OA;%BCG6V2{wMA%)AmUh4TEO z?BasNXRaH2+C|OG(CBC>QSx12jBnlR{f-9IjmsJqX5ppRm z0jaKnt5uj>&nh?hH!Hs!L~mvuP(gWqQFd`bVsa{qGV#gAY}%|48|qmlCYQ45Kn$MG aCWRz?flXqfzc_b6egT(qYF+ot(}pir~y=6-03Uva)km=_qi4^Z>!+Mpn7W ze_8qEAl%Hn5-o-D{G#mQg2d!h6lLO*P1&?rVVWc+m$T_WOkKz(g(Q55O=6zfM9YxtLkJcR(=F0hgB5DF@rHwC*NS@g>d$XRaH2+C|OG(CBC>QSx12jBnlR{f-9IjmsJqX5ppRm z0jaKnt5uj>&nh?hH!Hs!L~mvuP(gWqQFd`bVsa{qGV#gAY}%|48|qmlCYQ45Kn$MG aCWRz?flXqfzc_b6egT(qYFzfMD`QR@KQCto#U07ON362 zAW9yx>d8XHGV@Bb6w339vWp86lT#5&AF@hJwqVnSC{JW_Re~r_E=@@cE-6Y)%#ANm TC$|f=SJ)05$`YA6& diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32.cu index fb1d5c21f911dcc97285177be8793ead9d1fd5e1..f1476e65620b34f4479327615fe060d330df6895 100644 GIT binary patch delta 129 zcmX@X*Uz_MC(Gm=EK!pKSd}MRv+@IJ3lN*%DKoKHOCcpb2}tWGa834ORh|5rm3Q(4 zHu1@#Y@&jhc_ms3<@rU~#RZAUsUQ`7Y!Z{>*|dQwoW+t$Qxbzqic%AE<4Y8Z!3>BJ T$;o`|5{z6E{Uj#avU39fNz^An delta 179 zcmeC@JHfYMC(GnrERie+ot(icir_3@6-03Uv9fbj=_qi4^Z<3;XVsI1 zFf;Q?v=qwoi?WLg5|dMrRY*)WW7CG15YOhS1W}$`nvxh?Qk0sQ8(*SO3}ztfk(_*) VO=6uc-z{(G%D_BK=vd$>Nlu>6$|mem zT%x6r5}#6BqNBhCG6`sw02@^6em058)@<4c9kpyaN)ScKr74NQB}J);x$z|m#b5@) Nfc*|dQwoW+t$Qxbzqic%AE<4Y8Z!3>BJ T$;o`|5{z6E{Uj#avU39fNz^An delta 179 zcmeC@JHfYMC(GnrERie+ot(icir_3@6-03Uv9fbj=_qi4^Z<3;XVsI1 zFf;Q?v=qwoi?WLg5|dMrRY*)WW7CG15YOhS1W}$`nvxh?Qk0sQ8(*SO3}ztfk(_*) VO=6uc-z{(G%D_BK=vd$>Nlu>6$|mem zT%x6r5}#6BqNBhCG6`sw02@^6em058)@<4c9kpyaN)ScKr74NQB}J);x$z|m#b5@) NfcJI1$RJImx9ERienRaH2+C|OG(CBC>QSx12jBnlR{f-9Ijk5v%P5ppRm z0jaKnt5ulXz$!QS4=cYML~mvuP(gWqQFd`bVsa{qGV#eKY}%|48yZ+8CYQ14Knz~M aCWRz?kxgQvzc_b6egT(qYFV$vBqm>A5 zF40lo0vQ1UnRz8z3g!7l*~JBk$*FLO$%<^w2yIntx=Ik`$)zcY!6ikhiMjD53dLXs QvL1=aSJ@;de_&Gr0P6@WI{*Lx diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32.cu index d18b49752da94f1826402518b427d27920512369..f6a8fea0b7b7739a55feb38fbb3a80d63783014d 100644 GIT binary patch delta 129 zcmX@X*Uz_MC(Gm=EK!pKSd}MRv+@IJ3lN*%DKoKHOCcpb2}tWGa834ORh|5rm3Q(4 zHu1@#Y@&jhc_ms3<@rU~#RZAUsUQ`7Y!Z{>*|dQwoW+t$Qxbzqic%AE<4Y8Z!3>BJ T$;o`|5{z6E{Uj#avU39fNz^An delta 179 zcmeC@JHfYMC(GnrERie+ot(icir_3@6-03Uv9fbj=_qi4^Z<3;XVsI1 zFf;Q?v=qwoi?WLg5|dMrRY*)WW7CG15YOhS1W}$`nvxh?Qk0sQ8(*SO3}ztfk(_*) VO=6uc-z{(G%D_BK=vd$>Nlu>6$|mem zT%x6r5}#6BqNBhCG6`sw02@^6em058)@<4c9kpyaN)ScKr74NQB}J);x$z|m#b5@) Nfc}PX#;6zvE#0ny96WG LL_dkicI?~$r9mTc delta 186 zcmbQhcam?zE|$r=St3~!fMD_lR@KSoto#U0CaWl%BkGiySgfUx5}yR5briTjI>362 zAW9yv>d8XHGV@Bb6w339vWp86lT#5&AFxVHHfPg@C{JK>Re~r_E=@@cE-6Y)%#ANm eC!X`1%U!1!jzko|QH7_MIjSBz=?J^hu diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_32x16x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_32x16x64_2_nc32hw32.cu index 71c6b1d51669a47f4baeb5238cd16b90b5235c42..e49d4672258e8642818a09b38ade176466272b0c 100644 GIT binary patch delta 114 zcmX@Y*Uh(KE6e09EK!qvSd}N6v+@IJ3ov^vt00(u&dNLaGn@EiK{ipr%)AmUh4TEO z?BasNVsX delta 174 zcmeC?JHoeNE6e0!RaH2+C|OG(CBC>QSx12jBnlR{f-9IjhgA^4xxmWH z3Q<$XDmVE8E4x}saS2?bfJI1$RJImx9ERienRaH2+C|OG(CBC>QSx12jBnlR{f-9Ijk5v%P5ppRm z0jaKnt5ulXz$!QS4=cYML~mvuP(gWqQFd`bVsa{qGV#eKY}%|48yZ+8CYQ14Knz~M aCWRz?kxgQvzc_b6egT(qYFV$vBqm>A5 zF40lo0vQ1UnRz8z3g!7l*~JBk$*FLO$%<^w2yIntx=Ik`$)zcY!6ikhiMjD53dLXs QvL1=aSJ@;de_&Gr0P6@WI{*Lx diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu index d0d5c250996d1c0c6e0f22b08e97b818d6d59f24..bf0f2d37ea388f3b12c01a0e01570af3e0045510 100644 GIT binary patch delta 114 zcmX@Y*Uh(KE6e09EK!qvSd}N6v+@IJ3ov^vt00(u&dNLaGn@EiK{ipr%)AmUh4TEO z?BasNVsX delta 174 zcmeC?JHoeNE6e0!RaH2+C|OG(CBC>QSx12jBnlR{f-9IjhgA^4xxmWH z3Q<$XDmVE8E4x}saS2?bf}PX#;6zvE#0ny96WG LL_dkicI?~$r9mTc delta 186 zcmbQhcam?zE|$r=St3~!fMD_lR@KSoto#U0CaWl%BkGiySgfUx5}yR5briTjI>362 zAW9yv>d8XHGV@Bb6w339vWp86lT#5&AFxVHHfPg@C{JK>Re~r_E=@@cE-6Y)%#ANm eC!X`1%U!1!jzko|QH7_MIjSBz=?J^hu diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32.cu index 9dd98512ef13c947195bf7d322aded837c09bcd6..2f7395fed3393b15a5f29e858b281f0932888513 100644 GIT binary patch delta 114 zcmX@Y*Uh(KE6e09EK!qvSd}N6v+@IJ3ov^vt00(u&dNLaGn@EiK{ipr%)AmUh4TEO z?BasNVsX delta 174 zcmeC?JHoeNE6e0!RaH2+C|OG(CBC>QSx12jBnlR{f-9IjhgA^4xxmWH z3Q<$XDmVE8E4x}saS2?bf}PX#;6zvE#0ny96WG LL_dkicI?~$r9mTc delta 186 zcmbQhcam?zE|$r=St3~!fMD_lR@KSoto#U0CaWl%BkGiySgfUx5}yR5briTjI>362 zAW9yv>d8XHGV@Bb6w339vWp86lT#5&AFxVHHfPg@C{JK>Re~r_E=@@cE-6Y)%#ANm eC!X`1%U!1!jzko|QH7_MIjSBz=?J^hu diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32.cu index 3abb44da0de020f994ce9c101afa6d9db9a00dc5..cfc62fa610f267b2e546e0957d284480610d44cc 100644 GIT binary patch delta 114 zcmX@Y*Uh(KE6e09EK!qvSd}N6v+@IJ3ov^vt00(u&dNLaGn@EiK{ipr%)AmUh4TEO z?BasNVsX delta 174 zcmeC?JHoeNE6e0!RaH2+C|OG(CBC>QSx12jBnlR{f-9IjhgA^4xxmWH z3Q<$XDmVE8E4x}saS2?bf}PX#;6zvE#0ny96WG LL_dkicI?~$r9mTc delta 186 zcmbQhcam?zE|$r=St3~!fMD_lR@KSoto#U0CaWl%BkGiySgfUx5}yR5briTjI>362 zAW9yv>d8XHGV@Bb6w339vWp86lT#5&AFxVHHfPg@C{JK>Re~r_E=@@cE-6Y)%#ANm eC!X`1%U!1!jzko|QH7_MIjSBz=?J^hu diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32.cu index 0f86dd24953f5ddd1a93e85e2c68c9db73f89ca4..1599f6d766e95b39299c176224225e0cb08dde77 100644 GIT binary patch delta 91 zcmX@a*U7hGGt1;nEK!q%SdD;Kd2%kRD41T&DmeKNtHESvHnGWkY+{0$c_ms3<@rU~ l#RZAUsX7W=lO5Td!Lke3bSGbClK`{X*(DjdCd;!c0|5L_8Rh^0 delta 156 zcmeC=JH)qPGt1;HERiesRduofD?f~51!JgAp2aGN;GARSWd*5Mn7jd~;Xf;f z9GIt&nOCBvP@Z3uU0jfuoT{V11rY`*6Q69rrVZC5F}a9M2V&kFHYp_GvuqL*{lzE$ HW8($@ww@-m diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu index cd2614163d7c6f047696e70b31b8bb7fc63ab9fd..6aefb1060b5276f11831a61826c315abe4087e6c 100644 GIT binary patch delta 117 zcmX@k*T=VE2g~H`EK!sFS(PVSvGN0H3ov^Ft00(u#mYOmpG|zS2%D&2W?qSwLV12s zc5y*sa;lC3*W_L{iOF$n+CUY~V#%c`iNPgBsfoGqB?`r021JSEWL|a&My`o|5|eG% FxdEiyBlrLS delta 170 zcmeC+ot(}pir~y=6-03Uva)km=_qi4^Z>!+Mpn7W ze_8qEAl%Hn5-o-D{G#mQg2d!h6lLO*P1&?rVVWc+m$T_WOkKz(g(Q55O=6VsX delta 174 zcmeC?JHoeNE6e0!RaH2+C|OG(CBC>QSx12jBnlR{f-9IjhgA^4xxmWH z3Q<$XDmVE8E4x}saS2?bf}PX#;6zvE#0ny96WG LL_dkicI?~$r9mTc delta 186 zcmbQhcam?zE|$r=St3~!fMD_lR@KSoto#U0CaWl%BkGiySgfUx5}yR5briTjI>362 zAW9yv>d8XHGV@Bb6w339vWp86lT#5&AFxVHHfPg@C{JK>Re~r_E=@@cE-6Y)%#ANm eC!X`1%U!1!jzko|QH7_MIjSBz=?J^hu diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu index fb7572d1a0e81de69d0273e7a47e0ab6c374af67..5119b75ff0c9ce9eb782ecd19ece9577555835ae 100644 GIT binary patch delta 123 zcmX@g*Tc7A8_VRaEK!qvS(PVSu<`?G3ov^ft00(u!OA;%BCG6V2{wMA%)AmUh4TEO z?BasNXRaH2+C|OG(CBC>QSx12jBnlR{f-9IjmsJqX5ppRm z0jaKnt5uj>&nh?hH!Hs!L~mvuP(gWqQFd`bVsa{qGV#gAY}%|48|qmlCYQ45Kn$MG aCWRz?flXqfzc_b6egT(qYF*(DjdCi+Qm7vvXkDW~S8WTtTe04T;CbN~PV delta 156 zcmZqYJHWSLBg^DXERiewRduo+D?f~51!JgAp1~@J;GALQWd*5Mm@L35H~AYY zzZ{sSkeOGarBI$} I|6$_>0541=3jhEB diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32.cu index 28add75563b74e78fa8f09506cd886a5a648c7cd..a86754dc5918d75b692fd301bf15219cb6ad3d05 100644 GIT binary patch delta 91 zcmX@a*U7hGGt1;nEK!q%SdD;Kd2%kRD41T&DmeKNtHESvHnGWkY+{0$c_ms3<@rU~ l#RZAUsX7W=lO5Td!Lke3bSGbClK`{X*(DjdCd;!c0|5L_8Rh^0 delta 156 zcmeC=JH)qPGt1;HERiesRduofD?f~51!JgAp2aGN;GARSWd*5Mn7jd~;Xf;f z9GIt&nOCBvP@Z3uU0jfuoT{V11rY`*6Q69rrVZC5F}a9M2V&kFHYp_GvuqL*{lzE$ HW8($@ww@-m diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu index 76492138b8851aa22e618687561f80b636a72970..07d3a81d545bb5a148ef4d0f9bbe1db506c46d46 100644 GIT binary patch delta 117 zcmX@k*T=VE2g~H`EK!sFS(PVSvGN0H3ov^Ft00(u#mYOmpG|zS2%D&2W?qSwLV12s zc5y*sa;lC3*W_L{iOF$n+CUY~V#%c`iNPgBsfoGqB?`r021JSEWL|a&My`o|5|eG% FxdEiyBlrLS delta 170 zcmeC+ot(}pir~y=6-03Uva)km=_qi4^Z>!+Mpn7W ze_8qEAl%Hn5-o-D{G#mQg2d!h6lLO*P1&?rVVWc+m$T_WOkKz(g(Q55O=6*(DjdCi+Qm7vvXkDW~S8WTtTe04T;CbN~PV delta 156 zcmZqYJHWSLBg^DXERiewRduo+D?f~51!JgAp1~@J;GALQWd*5Mm@L35H~AYY zzZ{sSkeOGarBI$} I|6$_>0541=3jhEB diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu index cb6f23adf1ff58437638c497a6f2b72ca6bdf147..d5fd88917a13070e55bbe7b627a531f15d7696bb 100644 GIT binary patch delta 123 zcmX@g*Tc7A8_VRaEK!qvS(PVSu<`?G3ov^ft00(u!OA;%BCG6V2{wMA%)AmUh4TEO z?BasNXRaH2+C|OG(CBC>QSx12jBnlR{f-9IjmsJqX5ppRm z0jaKnt5uj>&nh?hH!Hs!L~mvuP(gWqQFd`bVsa{qGV#gAY}%|48|qmlCYQ45Kn$MG aCWRz?flXqfzc_b6egT(qYFVsX delta 174 zcmeC?JHoeNE6e0!RaH2+C|OG(CBC>QSx12jBnlR{f-9IjhgA^4xxmWH z3Q<$XDmVE8E4x}saS2?bf}PX#;6zvE#0ny96WG LL_dkicI?~$r9mTc delta 186 zcmbQhcam?zE|$r=St3~!fMD_lR@KSoto#U0CaWl%BkGiySgfUx5}yR5briTjI>362 zAW9yv>d8XHGV@Bb6w339vWp86lT#5&AFxVHHfPg@C{JK>Re~r_E=@@cE-6Y)%#ANm eC!X`1%U!1!jzko|QH7_MIjSBz=?J^hu diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32.cu index 8eb24cd9afbf273c7efc27b81785d0638ea21320..4f7a3e28a02bd11967f2e82d2671cbadb9c59328 100644 GIT binary patch delta 114 zcmX@Y*Uh(KE6e09EK!qvSd}N6v+@IJ3ov^vt00(u&dNLaGn@EiK{ipr%)AmUh4TEO z?BasNVsX delta 174 zcmeC?JHoeNE6e0!RaH2+C|OG(CBC>QSx12jBnlR{f-9IjhgA^4xxmWH z3Q<$XDmVE8E4x}saS2?bf}PX#;6zvE#0ny96WG LL_dkicI?~$r9mTc delta 186 zcmbQhcam?zE|$r=St3~!fMD_lR@KSoto#U0CaWl%BkGiySgfUx5}yR5briTjI>362 zAW9yv>d8XHGV@Bb6w339vWp86lT#5&AFxVHHfPg@C{JK>Re~r_E=@@cE-6Y)%#ANm eC!X`1%U!1!jzko|QH7_MIjSBz=?J^hu diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32.cu index 15407bb8a5a246d95e0668c0dffa9e3c6832d54f..650b53b7e54303bdd3ef1d75730ee248c7815af2 100644 GIT binary patch delta 91 zcmX@a*U7hGGt1;nEK!q%SdD;Kd2%kRD41T&DmeKNtHESvHnGWkY+{0$c_ms3<@rU~ l#RZAUsX7W=lO5Td!Lke3bSGbClK`{X*(DjdCd;!c0|5L_8Rh^0 delta 156 zcmeC=JH)qPGt1;HERiesRduofD?f~51!JgAp2aGN;GARSWd*5Mn7jd~;Xf;f z9GIt&nOCBvP@Z3uU0jfuoT{V11rY`*6Q69rrVZC5F}a9M2V&kFHYp_GvuqL*{lzE$ HW8($@ww@-m diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu index 727839b622565e89b3e3a6d3a5d534c90fc0a670..a6c1709c2c960ad219225a4282541b290c9258fc 100644 GIT binary patch delta 117 zcmX@k*T=VE2g~H`EK!sFS(PVSvGN0H3ov^Ft00(u#mYOmpG|zS2%D&2W?qSwLV12s zc5y*sa;lC3*W_L{iOF$n+CUY~V#%c`iNPgBsfoGqB?`r021JSEWL|a&My`o|5|eG% FxdEiyBlrLS delta 170 zcmeC+ot(}pir~y=6-03Uva)km=_qi4^Z>!+Mpn7W ze_8qEAl%Hn5-o-D{G#mQg2d!h6lLO*P1&?rVVWc+m$T_WOkKz(g(Q55O=6VsX delta 174 zcmeC?JHoeNE6e0!RaH2+C|OG(CBC>QSx12jBnlR{f-9IjhgA^4xxmWH z3Q<$XDmVE8E4x}saS2?bf}PX#;6zvE#0ny96WG LL_dkicI?~$r9mTc delta 186 zcmbQhcam?zE|$r=St3~!fMD_lR@KSoto#U0CaWl%BkGiySgfUx5}yR5briTjI>362 zAW9yv>d8XHGV@Bb6w339vWp86lT#5&AFxVHHfPg@C{JK>Re~r_E=@@cE-6Y)%#ANm eC!X`1%U!1!jzko|QH7_MIjSBz=?J^hu diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu index 141f4e6c083ca6fcb2152b804cb7658b51f5ee24..b3bba802c7b25f85ca8ad7626e83fd881da7ba10 100644 GIT binary patch delta 123 zcmX@g*Tc7A8_VRaEK!qvS(PVSu<`?G3ov^ft00(u!OA;%BCG6V2{wMA%)AmUh4TEO z?BasNXRaH2+C|OG(CBC>QSx12jBnlR{f-9IjmsJqX5ppRm z0jaKnt5uj>&nh?hH!Hs!L~mvuP(gWqQFd`bVsa{qGV#gAY}%|48|qmlCYQ45Kn$MG aCWRz?flXqfzc_b6egT(qYF*(DjdCi+Qm7vvXkDW~S8WTtTe04T;CbN~PV delta 156 zcmZqYJHWSLBg^DXERiewRduo+D?f~51!JgAp1~@J;GALQWd*5Mm@L35H~AYY zzZ{sSkeOGarBI$} I|6$_>0541=3jhEB diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32.cu index dabb3d066a38f3493df8512c7bb93a9cc60f6d23..32455afbf1be5ce2912d57517a1885919cecb07d 100644 GIT binary patch delta 91 zcmX@a*U7hGGt1;nEK!q%SdD;Kd2%kRD41T&DmeKNtHESvHnGWkY+{0$c_ms3<@rU~ l#RZAUsX7W=lO5Td!Lke3bSGbClK`{X*(DjdCd;!c0|5L_8Rh^0 delta 156 zcmeC=JH)qPGt1;HERiesRduofD?f~51!JgAp2aGN;GARSWd*5Mn7jd~;Xf;f z9GIt&nOCBvP@Z3uU0jfuoT{V11rY`*6Q69rrVZC5F}a9M2V&kFHYp_GvuqL*{lzE$ HW8($@ww@-m diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu index 1d7ad4cc18831538f5114201f9dc1a38bac1b19d..169f71792d44b8e843fdacf044a8b75e905003d0 100644 GIT binary patch delta 117 zcmX@k*T=VE2g~H`EK!sFS(PVSvGN0H3ov^Ft00(u#mYOmpG|zS2%D&2W?qSwLV12s zc5y*sa;lC3*W_L{iOF$n+CUY~V#%c`iNPgBsfoGqB?`r021JSEWL|a&My`o|5|eG% FxdEiyBlrLS delta 170 zcmeC+ot(}pir~y=6-03Uva)km=_qi4^Z>!+Mpn7W ze_8qEAl%Hn5-o-D{G#mQg2d!h6lLO*P1&?rVVWc+m$T_WOkKz(g(Q55O=6*(DjdCi+Qm7vvXkDW~S8WTtTe04T;CbN~PV delta 156 zcmZqYJHWSLBg^DXERiewRduo+D?f~51!JgAp1~@J;GALQWd*5Mm@L35H~AYY zzZ{sSkeOGarBI$} I|6$_>0541=3jhEB diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu index be153e839b2a7ed9052cb6f4013c84cc55228b95..7c9302c97884a0faf45a9fb890ab0f1328f7fd30 100644 GIT binary patch delta 123 zcmX@g*Tc7A8_VRaEK!qvS(PVSu<`?G3ov^ft00(u!OA;%BCG6V2{wMA%)AmUh4TEO z?BasNXRaH2+C|OG(CBC>QSx12jBnlR{f-9IjmsJqX5ppRm z0jaKnt5uj>&nh?hH!Hs!L~mvuP(gWqQFd`bVsa{qGV#gAY}%|48|qmlCYQ45Kn$MG aCWRz?flXqfzc_b6egT(qYFL7eXRFzOy%hmLh1C zC~>?V&p9*4p4ntq&VkbG$ER_Y(Vu5m^m6=paz!bO-~-&5 z^n&*+bF2_SxSbdH_~2w7dQ3^?F_@RQM2uPni7p$#U*b@(^-T1-Zc^teasfNu<^4Qz z)-M;&Bsh35det|Z6G&sJ+ zA3eSuP^~FOz0gPK8_RTIQh7?FY0LPWD-2r}LP`QcxVVwIAfycC1d##e%CL~$?^A4T zVjv_lDi->Z{WdP$&c-XVdIVP?H|Y?Z?W&zKu7P^(X zN*NHVnbZgH@S&P_dFm8vj+rv~b$y5zT<9Rf`g=uQ8Kx__*(emDff&?G4Po;)XUdC? zG_CxXQ3Yw$ddzth3J#->c!`53tYSgHjgcG*vEW)0QkFwG)1J1harV2F1KjcC2H|g28E)0hbK*e2_WV=vVj2YG)jJ{hv`s zFL>WF#|jaI>pjB82PgB;XG$`U!92z!V$?E7G}#LN7=was=Az#55Aw*_ zpkNrxH7%mT=ZX+|dqmfcSu5PjJW8#4Ns<5{m#3mAz~DIx#NEylzdInf1!$Nwir`X2C zKuBg(Ec7}1WnB7C#S623qE_X$$%PQ+@)aRdo0ZiyLsDwkj(k?Uebo`vnUxvMR?Lbc zWH5zlIUGNr`g)kcyEiF3sCa-L@!Rz$tW;(hku=4JE&lsER2+idp`3=XoSrh;O literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_tn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..90615451a3f8a888d356c1676573a1ad9cd3df95 GIT binary patch literal 1810 zcmb_dSx*}=5PsiZF#-uuv}sCuh`JL2O$wZzO$D=TPSHo zlsI0`@y&dmnN4=(94O7ce;j8S{c(CpFUFrHmz2T?-ox!J>^~?R4pF+d*ReeRQd*fJ=sYKFAzw^vlO&wKEPoy`ND> zFL=)~#|jaI+u6d#2PgB;V@fiQ!E9p^F=`nknrsB$#-L#9ndo)Rq|R030(QL1`+4N7 zUoZ^jh89ubb43WfJEAMctQGEM9;H@0Ns<5{muI3Vz~DIx#K}NEym8dIp#)!$Nwer`X!W zK**(8U9exqrGM4RjEaTcLh&JYmFFfGLYT{!giLH!R@W3ssgXPKY4P?|M^I;0W;9(g zD~^%D6sqND^n~i`K??8Qr0}HTA$r7b*PpObnMFiG!_o=H3HdCwU@kMoOllVJvTBxF zo~x7reKnh!{reBqzKc_*SYynT$*<}gJm*3O8P?w_@JcXU$<0O~35|2QBnvk*=%C+u*rtI$y{%nckf3M=v zKE{w6?E{3h6&eS|fAj(&r%Dva*Z2T99jD-NB+vkgvlN~c;G_gNxd$*yp+tC&;Iq*3gz?ND0r<9CS@fwCdk z#H!YHd@s-45-FTJ@470YEO#L{Wgja~6zid8A$zf%+xN=;83_ z27mPEW=KPz81+&gqHS5F3)9+C8ckco$4p_=q7YI65W>Z^%mpD;D97j-V4(~P>7AZp z8&d{CF3tLa{WLE9Pg=FxCJ{oI%U6Uk^A9r>(y^Rgp|X;taWR?LcHq%Vbf zIT}Br`f8ZM+t(>Ps(6SV@!Rz$tW;)MBAa351mlE!l3FmAnPM(A3wTqJ;+Et(Wk6re zCQ^U@q270KVu}%Cu1tOvukV}-9c0*it-))-Y%RB2g(NgCh7qX&Z2lHZdC{S!rT?N- zL0Yw$a9)Rk%lJK(ZxDr5EC{$Zl0zYuTx&wAMyS?U{!Pu_9sJ%8$A4eK<2{TaH`)gX z8!I$Ui+^YYLQa(^kgv%Ba5_oB<2;}N6lWe|oAjDmy!ZJH9=in=6H*djz#nu_%DXgO;qaq-%gFN8#Vd}nV0Eky*C zDB5^EK4)f*XC|E;IR{G9pP$ERN`IYQ(aZ6d$rYtAf{$=_2YdfC4u@#n-EAQ+zE#K} z4ECY@>Gb;`7uq-hE9SJ+3ovuYVo=Qc&W?3#kux|?Q{a-}IUi&OHu~jLwu+2HxA!~h z=mqat=2$L*aNR91J~)|$9#fKe3}zch#HeME2-yg}4IpRhndr4)QfDf10bTF1eik|F z=M00nrFm5NUJ63*x9G|-YlVB0MX6P{)9C=9mggeR!Qeg%#x*)pFY`eC0&VngaD00- zpvSiZsx`%+7y1yPvB(xCB~Tho8;8#zVc7T}q@X8+iyN5MgxFAdX`+-wwTP*V&lPqoAPXU>!tEomC{ zFQW?5s`Z%jG8BA_K4Aq1kz2)rfEyz@ zTIz5BE90)cWtphbsV=1H`=T{cV;rw^+y(2*4E4a-vq8ZB2UxekBnBJ9#1itonCrfA v*R)9;8)wsi-IAmJIMQ*E0OkPRDjDcC^$LfS-^ZeR>MzFF&W_Mr&O3hrR(4o$ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_nt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..5bbe80eaf6c39fa9b4986f2c74e497d3438a6305 GIT binary patch literal 1802 zcmb_d+fExX5Pi>AjDmzH+B7A!6?I9ZutkWfG!^OP(Q?*K;^MU}UkHi#_|D!0+LjVp zqG;pw%sDf2?3r|S35wn&-B4i`@Hh`S1XQJ1JNu8<41$4d3`dQ?x zpEFG6hUQV>dnpLL+oCJStQGEM7Nu6*PNxHaTAqnK2ZQ@87}w}Xz03pk3$)RL!O_j( zfF9ips16j9Ug!ga#-du7lt5`TZ2~@mgkckdkb<5NF0N%J2q_XdMq+@uGAtx_eu}M4 z41}ac#av&opT;GBb-XmIfAlJ8>rx0|E?W{Zv1xI)rYK1jwq>8@Z(g+wRb@p*(!=n>DiueWqONTB$(`Gl3qEF$U|mQFBE$S0`@4uAmcATohD#lEi?5aAzb1rm{Vg0oXUK*w=x!EYxpr#m9k?Mf;&zvbQTGBM? zUqlt8RqHY5Wh(d>y~pbtL~a!e0pOq9@q0fUKlu)i_ArIa zXdfV~t&q@EVk={@SK zqLw=B!^*fTZ&@a)a;gJqn!ad_)ELJr9e2SxGebRa@N5w9{|VM@Fp0s&FtLPuDTcal w!ZmGD=f>GIV7KI`KhASpB!D%5w@LpF literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_tn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..0d90cfc002acd4c478a69c1de40bc5c59d3def1a GIT binary patch literal 1802 zcmb_d+fExX5Pi>AjDmzH+B7A!6?I9ZutkWfG!^OP(Q?*K;^MU}UkHi#_|D!0+Cm8` zQMB=T=A4;1_DniEat@THKR%7ql>R)uq!;7QlS@ir1Rvn`7WN)A4u@#n-EASyzgEa0 z3|>L|)9Lp?F0^q1R?KOs7hvX)#h{q?ogM4gB4==xrobgbKObZUHu~jbwu+2HxA!aR z=mqat=2$L*aNR91J~)|$9#fKeOlBKM#H?kI2-yg}4IpRhndr4)QfDf10bTF1eik|F z=M0m%p?OsJUJ63*w&=<+YlVBAMX6P{)9C=9mS-Z*!Qeg%#x*)pFY`eC0&VnQaCCDx zphq_Y8Z^bE7y1C9v8Wa%B~Thon}AOsVc5hVq@X8+i))z)LW)F=kr-gE3=7GfpJHnh z10k1Yb-{icm;BQ!H7e$M3&)4(O6Iy0LYT{zgiLH&+^s1}Qblgrr}?`#EkjjVQPFhC ztT;vu6F3|sa5Q?t^ZnZ`9S;&HzHL5Xr80|%dWNMFj1%%jYQbD;@|n~u;89f~H!_zM z19H`iO8omT<+>fGs<4VNQzpBr4)B}{9b{O4FN2qc=}K-k3N@%H234dwVEr>^%8QmX zjrtc+1!>iK%z2p#K1Lt$`Ua6(#e#rqBRS+^!L=r&XoTWfUH|&d-);Qe5692G!=pV+ zAv4+s2x}`e4vXhE0wJeLWFj x?wfE;o7A~+HVxPfzXy?ND0r<9B%zC>x?^ ztg2GS=jXY*$9LCgM=pTU;&Bsh4@6K0_Nl93I~u z4e9aikcL1p=u96Xv?#NMX$6!<)7Ie=NEo#~2r20a;o?T7f{-$h6C?&$D8oX0e@?Nr zDFPu^W_8Ja85jSjQx$EK2O-R*OG2hLDGO_cdem+^_F4Y+b;nSrRi-mrGAm9{z!;8( zF&vMdIQj0)mIn`GD8F4kVWl!zfntWG6O0q`S!%&tV)D7vEZ{xW3bz_p83S@Pi(2`I zFV(v1r_QkUF;^zNu5a&x3ms%wf2*8VhS^GPHVQRpD2BDC#$ofbV9JY*G_CrVg$mND z^@Q^(6nu<7VC@ExTg8Ha8zVX7BI8;UQg%W4tZTn1`1_4No8jcy1w7uv5K^OkfUveg z1r4uI213?4@T4WKxS;U)P`Oa9Xb{PP$}2QLtMQc@=msMDHS>Tm!n zEFvmpKNn-89ih2gM85$T*H%~n literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_nn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..a8dfa50978c2a394f677bcf0095cc951d3834dc1 GIT binary patch literal 1805 zcmb_d+fExX5Pi>AjDmzH+B7Alh`JrxEqjXs@ zAXmMp^ndtLuG?{{3ac11WwPt)3@^CQL5B5@GI(j2uHEP*1xE2N$l)ae82 ztfH1W?8C~qYj0U5s&c9WY5Klsjno*&D;;;iIx|B(aQ18v@c#+cZ7_+!#xSvjd@ts@ xZ^AWgQs>6mG+?*ns6UQ$TqJ-sfVWBpdQH8;A?4rTrF-fx#@Nn|&|JAj6gzwHcd$>qArOPwg^#`rXsyOTF%-@T)ejB3n397-`Sf$+fqVH z6m7hoIcH{$XPnNqoB^fjw-2K*F)%_85 z^n!OSb1WA@xUCH^J~)|$E>n_uOlA{E#H?kI2w4lh2_R>yspz(0QfDf10bAZ>y)1Ip z%NZtfOY^Aky%dDrZ_%Y=)(ZD3i&CqtPNxHaTAqtM2ZQ@87}w}Xz03pkbF|Te{_*Y6 zAw9nBQynNKJ=X^ajYYLEE`ic$+5~(834Ud?A|L9fH)};`_T(%%&Y}2B*CMZc2wq>8>Z(p|zRb@p*lLfQl z1T{?H=rDof;S-*P)0_SVjt2=8-!`AHQki)~J;TBY#tHc(wO}qa`BZ8a@Te-08=1?B z0lDf$CH}*ga@~$oRanKCDwADT2YA7S4l=C1mcdKIWGUBcg&Ncp{VGx&u>P4bAj6gzwHcd$>qArOPwg^#`rXsyOTF%-@T)ejB3n397-`Sf$TPQ&# ziZ)))oHH}WGfrn)&VbVN$EQ)6(w}En^m6ohd_^e?;RD>=!S18R!2qp0J1ykJ*9tj+ z{vNbHon8;*TpK4~$()vY4rT^f42pT**|v_&a|Y*W3S2Vu^Fd}{qhCB_tH?NPb$>-2 zz2IHT9Lq%zZfgUK4^C#G%amjuli36kF>4tlLe_$B0?65FD!Of$)R~H0z?OGeFN>V@ za)!y=(mX1BF9o6ZTXgA|wZgs0qSR`u)9C=9mggeR!Qeg%#x*)pFY`eC9BuTVe|&p% zNRMy(G-!%R&-DR9V^J-POQ1BGHUXbO!k~#kNI_2s7dJ8!gcOOKAThv985WXzKgCui z212gP@{;{FF8QZdYE;bi29A%>mCSW1gfN#a2pQY7=&cD#QblgrC;9ugEkjjVQPE_< ztT;gp6F53d;CT3qXW{g&zk%aH0>!t@C#+Ov9#PM*aDs6{zDO;YOHDqNngu+nO5{f7 zvSL84dQpl0@TFY0<5U$^F{a97*VO@DaG`??tM6s-(lA-d^;)3@HATOQR0phoW=wg} zlBQArGO8f0T8%g_Q^CjZBeri4xm7F(xG|DLF6LZoLW)5so>lwTJ%7LP_dJ}uxQB4-i&XXuK?5J_v-IDUqXEqkZ6Xlz_+EfCf;UCGeX3rv?A%1O90OMSxdGJuRry z2h>?bEp^z3rE%BZvP@LvR0q-weK8uTF^*R{?tpb>ihAJ1vqr%GCs>ccI0hTT*b?%s zxYT_Ut{Ia$H_j#jrzJ=I@jk~z0$2lht7M?p)GNH9{2n&lQ~xifzXy?ND0r<9B%zC>x?^ ztg2GSch7TokA2r@M=pTUzSv;&Bsh4@6K0_Nl7#`mq z9n$05Aq|0I(wRO$Xi-)R(+Vh!rft9{kT7av5K__;!o`hD1tDc3CrAviP=li^;aNJp9d6-x5JUOx@(C-I$qE!RES+GSkk3*J<`R?7rDg%osaCkvxT+YCt69{_ zKYXdy-8glHb&R<(>2-a27hLEd!}?nlyfVyIa2)HqlLoPC|H6djel+XI^Hy8eXwI=((=_Ce^qkslboW<~x{HG=V=>z_G3}t{9NIfa3lLypkO)YiU zhm~>H-m+BG<R)wrdN~C(`!m$3?JbB9uEF$9F5Srzu!V$eyxxr z7#>3Vll1!_m)bZ1Yv#1nOE3$_Vo=QcWY0Rb%o$vyDR9Z~oDVVs8~y4zTSdm9)B6>5 z^n&*+b1WA@xXun3ADqlWk15GK2D1w!V$?E7glq-h1(37NT=d#7sWTP1fR1-rKZ~68 zbB4j((L5@AF9o3wTXgN1wZgs0qSUIBBnbd&c_H!~4DPdFT%#lPG7r=*(MAWycZ1;x zJ-HiFttke*)B}XZB3qc2Kxs5>9R33dqs9jz1wA2L+{#Q4QUr2}!~hFrSm-{SQ*2{m zAS5*^=K6~LHm>_R2d~Zg30=utXF~{c*@}>T!t$rVY>R?Ldi z!EqN*$1a?VcQ|?cZkL0|h~GA!uu_?2L`lQS3C0QeBDG*HHThg>7Vx$zog1agi~+go zMWz4amvY_qQ)O8Bm@AXrRCjpEg$^=ozL(BR!)z_LTZJ0b6vN6>?Xdn?Fy%!{nnwMr zsDiX=GvT}p1s~&&SiwQ$R1&;%P22h-J;Whcs3jVW4{PQjp4qhSjw4hEOQD+sk z)Zqx$#@%?!GEt>dT}adSMQfzSI9}VdOoi-7+Rux^8C3^sU|odiBpQP6VxzNT5STU!iUVxcH9)n`uCwtbhMZw^GmI0Rx{d|x)*yxv!*=jNlo$mLj zqZhnunPY_r!gY4Q_~2w7x=cytF_~Q;5wn&-B4i`@E`WlqXQI0elR8(C3+Q;4_wvYD zuV9$WEiIzL_ev0YzeQJ$Su5O&JW8!PNs<7dmhVMTfWdthjB9nIUKWA+1=?u;__jYd zp(nQkssqKO7rKwoSXK*@3Mh@HZNR@EVc5nXq@*W=iyN5>Ldrx=kr-gE3=8SKpJHnh z10k7FvCx<7hjHm29k0ylAH6EtrW8V$%a?>qY*yZ_DN0g@-Lg-M*DtpWb!BBm(ZSGzbOBYwB}gq6xHBI+5IPB2c$N2vvKnJH#cvw%lci`>dwRSd}0 zENbx|zEtb&ICX_}jF~d|b$x&rT<9Rf`b!nOGE7%;vr(u)Loui$H36HSIa6M2NzM$)3<$&Xb?*Tvv7g literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_tn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..b7a7c1ee1db10b8d71ec26e568711ede93046ad1 GIT binary patch literal 1802 zcmb_d+fExX5PjdT7zGId+B8knBI=SzVT&rN(p03EN6T3|iHp~^d?6&_<2!qaKwD^o zN)&Cpo;hb`jy;oPPtJkT?E8mtmeC*QSM+lHX>vs=jNl#I-NC`5#^DgH`}-~A#g`g6 zgux-SKS{3#a-oeAuwqV2y#O1i-N(&ECVhX`uQMpu+cA{vejf9I^Ca9 zM=yBSGRF!LgzM~p@xjSFbeWROV=}uyB4#avM94<)T>u4J&qTKklR8(C3+Q;4_wvYD zuV9$WEiIzL_ev0YzeQJ$Su5PDJW8!PNs<7dmLEk?fWdthjN9r+y(|Lt3$)Sx@oj%_ zLQif7G-!%RFLWQFv8)y*6;K*Yw*miwgyA*@AtgN_T-?Z95K<;`io^hOWmrh>{S;f9 z7znvCt4sFFxb&}HnNhLOJ2*Z@S1~uG5W-x(BxGW<@@`F0k~(tBJ}ut9ZW-#z%8I5- zX2og$I0Y0jg_F?^p2u%?aYRP^w)upW$}A%48J12kPRM7e1#_7xW>T|&M^%fwmAR@I zkgHkL;y-+;*6lcTg>{UXGWm6VfEQfoAjA4w6}&P`S8}sas6j(9s3SE2o1ZyTUbLjy zs(%?(kXEh7oL8yfWAq-cZxDr5EC{$Ul0zXDTx&wgMkt^4^>6O{{l=gDaQf^!JUYM> za-)5Ku(m?uuy}qW5OS_WfohG9fYWgb9;X2fpg2q6CHc=v{<8=CvlPkzFOYgxQfCjS z^O{=fa0DykuDxZssLQDjWZU#*Yh=baUhB9I)|naVfrDp*fd5ahX@f}&Hin5M7U$JsTdFoAb)e-8(LG>*q;-QVvZFF)7F zF^mqO`-uhvkeN14z?wNNbp~b$X%Q6jKH9U6WjTY3BmpiNp7TMbU<+by=8oot!uLuL`mjaUj#(?*tF(|>^`a;OKrJsso`b=C7L03kq+aHM`V4J!czib; zozRoJ5etrSA@)LQWn-6^{Cx;?DPEX>yDvLt4wFUVpg0Ek7Gau zV>p@YaPs)gE(ehjzg<3Ir7~H8VuqCyj1%%nYQbD$@`cna;62p}w;ER&19CNsTKUH> z)w=7a&an2eP$s>pZ|{-|9c0*it(;ee`C4wb3N>gbMzyELVe_+O%8QOPt@>Am3eu|0 zl=CVSd`#YB?FNxs#e#raBRS+E<609^c0u{9YriS@hmGHx;q=)BJUPG+Qlou&yc6z^SuEz`qCBw85+hwg@vz$d_`W`_^6CCQWRd p%>#Bzj{4&;$3+a719+=spf}Vj98dlbi|(mE7h|J6p}AZ}zW_{qR%!qM literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_nn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..0a0119f2c4a9570d2789b7f65a44e8f442741ed8 GIT binary patch literal 1797 zcmb_d+fExX5Pi>AjDmy!ZJH*ih`JH#8Skt|G0oHUv?{{9byQR9J-f}9X8Ze%71DZ)5GUVw!%EF|~m6x)~> z2uY2KxxQq7j7#3;;FVcFqAQu}WC&p{TM;s~X;EA=)THv6oz;ByRSgFi1qM~8t1mlE!m0B>DntU!b3wT$R%#F}x!hl@$qLTmN zOSx|QsS>Pw%$3Qmt1Gp(`j^g7e$3+5|0(h%rpx4wZ98Z29>+Pw(7Gpa*LUXz3`~_~TR?Pqa literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_nt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..37c8eaa6f98f0154f59aef92ee634be315c91971 GIT binary patch literal 1794 zcmb_d+fExX5PjdT7zGId+B8j25p_wVutgPBX)4moqvfof#Kmh{z7P`e@twU1lokYq zDB5^EK4)f*XC}#xoCBrVkB{Rlqd(6s>Bac-VUoGT@ToIUi&WHu~j5wwjDXr~51F z=mqau=2#(uaGfV$d~h-kU8W@S7|c^35u=ttB4i`@Qvd~9&qQ|{CUveN7trx8@8yxR zUcoS!8(Kt#@0B3*Zi}uQvsSn_d6Zgpk|Y5@EkB8(0E7E17}x4Zy(|Lt3$)Sx(aqt2 z9^VY84isZv=sqH2nJi2yoHUxY{vHRyu=PMlNlpkC*D@D`lwq78FTh+G7Sg+Oimgoy zgk(m=LSL}o#-)!Qyfmx-JXO&)nGnKUz9eK~v$C?LC`j#g%RVjMz1=d@nUxt$m&}Tj z{!t2tgA|TOuXZqnK7P0Pgq6xHBFY(-PB2c$7pVnvnJH#cvw#;>Yuu__B@D>bENboV zzf|jOKXrn&kC`(0RegWwT<9Rf`g`TPGE7%;vr#BPLouj5H4dAfIa6M2Nz;md5mk^@ zt;d{Kq2Ocm0c$si!YUR7TpP)u5DTs~A!Qeo&${-Tg1_7NyBSWNTfw6}j377K2MB8` zG!BYqcL5=1N))Ko_y9N^r{HlA&;W|F6kd`3w4^`1M?XuUn9*|m<%3kF}Z47g-?&Ig%;jehZzttR8J)A8m_iS~n|#7bW#$p(3=1b1C*+&dg1O8TQ>j_Ni>ftlRjv{S`Enf$uGzY8vOkYV++a$XrGOSxVvl%S#L*Pa@O&CiS}FE*rU#lMUy zNUK&O&Z|)HG5mzJ8$@9h3j%J8fzXy?ND0r<9B%zC>s+9 zR#mCv^Yh%@+by=9cD#!uLuLdcQ^2j#(?*o3xNx^`a;OKrPQjo`b=C7L03kq+aHM`V4J!cyxO> zqQ|!*8Un?bGd)CPQ6>ws3MY-Gt-pT*VcdEkq$DSViyN5=Ldq~skQZR73=8r7ImI@n z2!vdj^(FgdT>PI-RkTeSgfN$`2$|WWEUP)HQM>Kf=lQ$09YdW~na+I0tT-7S#c()^ z;dt_D2V)rGx63E2R3%dm2SaY8;zEtpG8zL1&)yro*;R^lpQK(1y{3;*z?T6g`_ z3D!Oq%B0uzJefBJxa5ktxU^1zdlIeEaG*343e16Uh(?JY}1 zolJchZL^nMkr?B6E#fX%XBMaij+-q4UL0T30kb05BFroyU&~4ETW@WLG@)@e57;O< kijPAa7cpQ8;H{E@UQ@4dF!_C~x5xfejE#1L=5iVR20-ytwEzGB literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_nn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..7a3efade5e741fe4d5eb659bec2cc51d65376a82 GIT binary patch literal 1797 zcmb_d+fExX5Pi>AjDmy!ZJGwrh`Jw`>Gb;`m)bZ1Yv#1nOE3$_Vo=Qc&W?3#nKL*~Q{a-}IUi&OHu}|5wu+2HxA!~h z=mqat=2$L*aNTEMd~h-gJ*Fh{7|e4Z5u=ttB4jK0a{xKp%tfyalR8t83+Q^6^|Q!X zKW7-sEzP6C_fimgzeU%MSu5PTEK04qolXY;wLBMj4hHvGFs{*&dYK36muRDd;qCB< z9^W2PttrO5)B{AuB3YQ0IB7I({QVyYqs9Xv1vw#H+{jE2QiO4WyZ{SjSV->ADYh{& z5Rw`dbA8GF7?-@w!7HR^wjyL|)1tU$s7d9yWuN6A-nR@@az#?J6|>@G zFic>0l)&-$%?>6oz;ByRSgFi1qM~8t1mlE!m0B>DntU!b3wT$R%#F}x!hl@$qLTmN zOSx|QsS>Pw%$3Qmt1GuvL+a2t pn+0r?9L2{`j*A2^1@KnMK(DD+IG+4I*4tBmEyi|sgywS5`3viOR{8({ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_nt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..53e32eadb9801447cb5fc4761557d3d55fe780d9 GIT binary patch literal 1794 zcmb_d+fExX5PjdT7zGId+B6NK5p_wVutgPBX)4moqvfof#Kmh{z7P`e@twU1lokYq zDB5^EK4)f*XC}#xoCBrVkB{Rlqd(6s>Bac-Yuu__B@D>bENboV zzf|jOKXrn&kC`(0RegWwT<9Rf`g`TPGE7%;vr#BPLvdJpY8*B{bEdr5lBN~^BB~&* zT8}xeLcz!A1J-U3g;gvFxHghQAr@R~Ldq^EpLOjw1%J2kcQc$kw}MA|7(s5d4-nQ? zXdD#J?gB#2lqgWE@d0o;PQl|KpaB$TDZC>6X-R*2kA9Xy$^Y`evywTx$DG&9QilUr z8F%F^%SD|`eHm@PmtBz=<9IFNE?8$~s0L1)4FX;qU(*4T7;Fp^OUSo!nETdS+aXP8 qoJ|8ZN`d0z9LGfpm;!jKWT02nE1XS!AM5R*{}p4C9ih3LCw~A|lU42j literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_tn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..cf9dd81059ef95d26ce85f22e8222b156770768d GIT binary patch literal 1794 zcmb_d+fExX5PjdT7=eTUZJGwrh`JmDLwnJkPeoHUxY{+H5lwq7AFThM07Sj83imglx zgj|{BCHre!`s}I9s95MN4n9U#F*lhI!d$)}WNfptvL+}{h5m60C&M>8m_iS~n|#7bW#$p(3=1b1C*+&dg1O8TQ>j_Ni>ftlRjv{S`Enf$uGzY8vOkYV++a$XrGOSxVvl%SzFsy#Iho1Ym|UTjFyihmhZ zkXEfmoL8aXWB3VcH;BS276jZF$)ON)t~DWL7nIMs_M3vg-}t{7PG4HV!+ne(H`)gX zD=Rb(iWheQA?HdIsMY8YI31))W>#gmO rCN$0_0UM=2@o|phA_YtVyj3#LYw8uwCVznS_SFB1vB|E`T+WhzPa{?C literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_tt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..53b440968c820ba8ae9c314dae60b0eeb7fb1549 GIT binary patch literal 1791 zcmcIlZBN@U5dNNDaVio5l&&jc74vANk|D%2)`{-rqvghK>fzXy?ND0r<9B%zC>s+9 zR#mCv^Yh%@P1lmfLfl5JO_jOEEw17NWIJh^%>geaCAF5 zqQ|#~Gz5wr3{_xcEPvs%V=u2w^T=5i+w$SyppYqjuY|&+~V0JBB)~GM)K~S#dHP#V|UG z;dt_D2V)rGx63E2R3%dm2SaY8;zEtpG8zL1&)yro*;R^lpQK(1y{3;*z?T6g`_ z3D!Oq%B0uz;$02 zI+^-1+Ga1iA~DACTEtzj&MZ(395-78yg0t517=0AMVMJazLt~Rx8B+gX+q;{9tfQt#hMv zRWTsfW>M?^@TFR} zOPWUg%cz31YBl1#N(CRoPk4iaD6C>Zz>Sd{3Nhze6H@j<`K&u&bM5ap9=AmQ)z^5q ziz(zr`v75Og~oaD`d%R9Oo;;38tnt8qZB+21R6kbmcm=|pO*Zm5BR4klmXr#^{k}M z9#H2swbWrBmd0Is%W_edQy)mv_hoBj#yDQ?rGsaH6p{2pGqr~YD$O}2&Ra+drBi1Jv1 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_nt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..28652ab985ac3c6c74e56cff4b0ecfe2cf12683f GIT binary patch literal 1802 zcmb_dZBN@U5dNNDaRLc3l&+%;#Jsjr$q-^1>qPhR(Q;$A^>A#$CA}DZ9A8ojLwE+yF^&HF$@)#8JKH0X8%?k!+Sq5A(^z%XHV546=WUIkAY;}G_ z9lhWk%N#325N_)UF+Mn%hYnMcc_j0ckcg~hkQlNS{3!tiTTMl$9VT_IA{VgbUEa+j zXWfD!nHyR}h3}Oh^lpnT9kW)r*Ljp$Z6!$pfLfl3q5y;YEEw15NWClq^>eh*-r>>B zL608a^r#LLNze5@hQ_j57*~PPXxb!vBoYQq4nj(JLb$k=xgew@(nnx`nKCS-cYcbk zObmo%M#Vy3u%E`Ie|5Yx%YXE$U^k@@!d$)}WNfptwqtj(ep|Ncw0Zs(~htaD72$*<}IJm*3O8CGAbMG9C0c&lWfSJW%Kq5Lasx`+N>j7_$M=5m(&0_+-B Ab^rhX literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_tn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..04738c31fe97c871bc489ebb8947e79e7d2396af GIT binary patch literal 1802 zcmb_dZBN@U5dPj@aVipGC|ySxh*3gzUns5k@w+6AKpBjp zQB|qqyL;}r$9HaPN6vuK^v9=Bn$n+VSM+l9d3;4F4B-RZ-ND|Y#=!usySt6Zi?3DW z0D5nr`Dt~#Am`dR0ZZnz)N?R1$YM~;`__(iY@RbXPgCHMp`Q;j0~`I~DO&}`q22iv zb@YOFEORUuLAdr7F+Mn%g$`4ac_gzRR#VYwhDn{N$OW{$%eq!8jpbq!!GjCZ9^p0v=T* zaxHUNF(B4vQHlTXrCc}jR25b^rpjd3)d5~`p@R&o?`87RFj>m=T8Vy5wO3`@B;5SW znDU|_O|AZAR6$y`8gX7y!N>3;wr>!*RV)a&F_J?r=3Hw+ia{uzRr_yx{(j@{dFa2m zhlhJeAv4+s2rDZzUKTGO1VYY~$Wg7)K5#lpz~gN|11QcCcuoA%0{`>@{xpFi!7ETt z3+VI#bXGx29rj^q+_kqX6ID6Yfz(4^j7DmVb_1_kI9A`XOn=_lB523pW`9{tO2}LGSF-472Z((4L03V|1ZY2c7*0~*7^hF Cs#kUZ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_tt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..fe4c6356fb1d3169152a4d74560ad815b44a6afa GIT binary patch literal 1799 zcmcIlZBN@U5dNNDaRLbeO4m^aVqRORWC$^hb)tLuXt}Z5dN{V_7fLIB{4Pl&P&Nc; ztg2GSch7TokMG=MTh4&e?EA-2meC()SM+l9X?#T~4B+hWsh35devUTUJ378S z?9r3k9u0wF(sO-)&{$Rr;|eH^rcJ;nkT7Us5K__;!o`iu1tDc3eIy2$DZ@f~@2A+x z#6ZZESzfYV#-;zstD@c1K?rmCf{?Mz%Ep?Y9(CB3eNr5~Z5isa%5o+PWwO}qY#Z+n*@SJLe8;z@q0l7Ad zTKR`B)w&(0uCR_VRVKf#Pw#>Y9b{O2tAbaC$x^P@O7xpldv&Buz|GH$DKA>mH0obQ z6{J^ z!paJbH^s}RfRHmK3RG*f51fut@OTx_0E)8|UX%Z{AB|H)Al4!NxGQgnTV;bl-$) u#$=NlXOn=_QlS2Lnd2e_tO2}LGSF-46<$w%51a0>KNn+@ZK1iGCBFenJ637{ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_nn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..4f43a3cca487cdc8f3bd0ae4d6fb90e7708054f9 GIT binary patch literal 1805 zcmb_dZBN@U5dPj@aVio5l&+%##Jsjr$q-^1>qPhR(Q;$A^>A#MGQOr1hVT*Y?_uws#=!usySt6Z%WqZW z0DAk-{It4VkaKOEfF*NU>N%JhWHBh_eQU=$HqRMcq$zO8(9Z{%fsKCgoUH=m(C++> zI(or7mN}M-AY6Nk7$2O>LWe2IJd)WaBqD1WB!;X7-zFeutEuQT!=%nsW!)@t z*3B7`xubbh_+AP^AGYYyF>8f;mqn>nyVYs|pq3XR&%xk63&zztQZMsB{TywycYJbp zazsz>dQ@wQr04n&Lt{}bjLSf2G_4c95D9}i2O$MKAza+bOb}8K=_4?}Oc@rE2S3GD zCI&)MqhhYF*dODPw{g5S%O`Lp@}?R>n9CM~jBQ$6uL-JBWp3Cf`SJUPp(?H@YO-Kf z^ijqHPL2{d9lqdMIDP1C;dq!p@onM>E0vi?lr$`yV4RSzQVZr%lTW2)0Z*&axmLQY z7!Yf-sPupQQm&hMstT(dQ)RN7>I^Tr&_RaPk1}~_m@MUbtwg`3+N&~c5^jEGOnK3e zrdIzdsvxaejW{o<;A8j+Z*UN~RV)a&HIhRv=3Hw+ie4z5RR?UY{lmu7mgv9w8V~o7 zLT0oN5LQ-boENX}1wzi1$Wg7)0dP7>z~exm0TgEmye0lwfq(W0f0{s%;0>s!1$6od zI;)_i4hOI_?#5e|iK?9HKHB7U}G3tLcSMs x-Ph^rHra6FY!a|ra?~G3IxZ5x8o*m61HGYM;gItCc*3gzUns5k@w+6AK-m~& zqpDKJch7TokMG>pj+_Cd>9-G~G^O9quIT0HLWe2IJSMXZBx2SwNQA5f-v*Gg)l_tvFsU;Yxq!BJSvQNE zb#sQv+|oQMd@lu|_gi%7n6<*a&Z5++-D?Z_S3lJuZ~w{`Hx;D?WPn$n9CM~jBQ%<)&wP~!Zz%a{OC=?P*qk`G+8h! z`lw+7$A<}=44?2UoZj}fa6Cw$_%`{3mCDQ`>KPVJFiyxPsReVX$){4YfJaq{T+3Wm z49K-vRN_B;Dc8+7RfScIsWRDhb$}OK=pe)DYZ<&WOqO!JR-#|0+N&aM0&ae0OnK3e zrdIzlsvxaejW{n;!N>4Dwr>!*RV)a&F_J?r=3Hw+ia{uzRr_yx{(j^4dFVg8hlhKZ zLT0oN5LQ-byeytS2!xy|k)v9pec*JIfXCZ_22h+O@RIze1^?*-{%Hb5fEP$TEvVB6 z)LBI>b=Zfcao66mOjPAm2T~7xF&e2cj#oPFf^}w!df>&gM!^3kxEX_S3^s*3gzUns5k@w+6AKpBjp zQB|qqyXU#P$9HaPN6vuK^v9=Bn$n+VSM+l9d3;4F4B-RZ-ND|Y#=!usySokK#n%cs zfZiKuep=lw$hkI7z>+yF^&HF$vKSQezO`c=o97J9(-gR5=;wpXz(&7#%2ttaXm@@^ z9lhWk%N)x^5U#xi#s?>}&|yk4kI8HUiI}wv5+Q5Bw*lmAH5HvEOzKQUE}-pQ*3BYk z-JD@Ew=|Cm-%CO0{T5w1X0329IT+k$!MIvS>SZ3NpQDZTj*f4S z59!Hmj|NRK>A5~YXe_FQaS4=0(>mZYNEp;H2r1|Z;o?SSf{-GSJ`w}Wlwl#c_fu?T zVj$$oEHBw_~JPW7yy)7IM5-7e+K4GOY^N4zeg%gYu@I}|7pR0`hb6$KoQ^-Qcnx& z^Z|8NQA-{6VQJj8w=5G?In{yGLtl(WYK-HRj=Nx;nW7$e@vIT>{|Ro!U>t*uVQdNc zR$S`74p)!KCO6I|0jDKL{qa7>MFLm@c&lWf*VHS#q5K512S@8y AoB#j- literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_tt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..8148180027c58984abd63f093b505a83225bbd1a GIT binary patch literal 1799 zcmcIlZBN@U5dNNDaVio5l&+%##Jsjr$q-^1>qPhR(Q;$A^>A#bUlnqfd zR#mCvyXU#P$G&T{Bj-SA^8MpDN$8LBYkD>QG`Xe}M(`f)?_uw+#^DgHySpvq<(CRM zg#JFXKT)p-a-oeAuwqV2y#Opq3XR%fR413&u4%QZKVW{Q_;Ye|&Ox zazsz>`ZNTJNiXyvLi3_pn3OQRMl*{9j@+m@j!t0-r>WL6ANz!*-B zVmKW=<5@Vp>u=$B7(?-G^9d`JS>z~YSUSNtA)lod%q1q9NzDSDQ>AdDaal1SSG}l| zfBaIe+i|K2s~9t7(wpk^F1gS_hV{2Hcxjlfk~MXCeVKXazMXi3wkf0e5s zty+&cFH^zC=mXx}ATq015O8ZGhfFNE)`S#YP&})IE&#m`Ogaevq${17>WR|ka|*3Cy%JpidyP$ z04w8eyk)7V%Bc>d>G+~G5@Q^%ble5&%nbFwsk1@Azb9C?!6Xkh4--qs*J7gkCS21d sb#9zZ19nS>`r|OiMGRO2c&lWfH`FT}PktXS-BW)q#zs3rb2*QG1Flb2kN^Mx literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_nn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..a98ac691b97e16b052ae35a3ec482468168b7b09 GIT binary patch literal 1805 zcmb_dZBN@U5dNNDaVipGC|yU{Cg!!3N`?^ASSPxdkCq#|t%qYpV)WI!TfMpqA&ND8S%83&u4%QZI`@{Q_x&M#Vy3vfsv~FLUt9tRBHt$lGiPVJ=@1GO<}%UQ<-1j@+_Oi+68YhB~=2sp*ng zF*xq0fI6mdGTPzf`?otBJVHeLw(*3O$}A#E8kSBlPRJLj1#_7xW>T|&w^i%hC|zX? zh_zkR`agWB*6lcThINdYGWm6VhZkJvAjA556}&P`S8}saV$e|S*O9gXw?A{Hyl6?& zsDBw%kXEh7oL5xvG5UxV97JIi3j%J8HD%ZGGiRCb=(K*%nbFw*|R~1|0lR@gGmfFhKVKQTRGQ# w6Rv5KEjP}l0lTF@{c)t@A_dF=yj3#LYw8saDgO$K?y0{RW0O6hxtu3|09V*pZ~y=R literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_nt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..69004b21d037d8e5dfb9b3d3613d3f2421d4f34b GIT binary patch literal 1802 zcmb_dZBN@U5dNNDaRLbeO4m`giFs|Mk|D%2)`{-rqvghK>*3gzUns5k@w+6AK-t*J zMpdPb@1E!G9{aAzuABj-+1K}@ETiAfuIT0H!}y9)7{Xh)yMz7T8V3Wk?(MaZ7oTe6 z0D1?|{v_Ql$hkI7z>+yF^&HF$@)#8JKH0U7%?k$SSq5A(^z%XHV546=W~<3K>~y|I z9lhWk%N#325N>A+j1Nxcp~IA99+TMy5;1ETBtq7LZv!aUYAQNynAEw7T)>WZc{h)o zbqj{c+|nW{e6Iwd_gi%7n6<*a$fMM1CrJ_j)bd;u1sL3C!MH|8>SYnApQDW)_im4R zC-mgDM|Gf>^jseyG?vxExB^O}X%p}-NEkFR2r20a;o?T-f{-$iJ`w}Wlwl#g_fu?T zVjv_lDi->Z{V*>5qvMrX{-ak#yD5bb=JEw0W1E$CYl4#0VO#b|@%m-UP*+w~G+8h! z`p3N#P{b5YhFf^Pd9{V(5i;Vp%_po>W*$+`uyBHLLOx0@n9EEtm6`=Ss#@em=Bi>q zuFawr|KUruZpW!BtYb`-$*=1Jyx>9y8CGAa;FV#rlEgNA!kYysMhEZI313x}5qznx-#XBQwVFTE{)G&P-7c96W0T{C|R*HW?CfVWBpdQH8;3FQy)(mnS7Vr;T2G?%mFC%X(+WB>pF literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_tn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..289263685879187c2688b69107cc5894b2816391 GIT binary patch literal 1802 zcmb_dZBN@U5dNNDaRLbeO4m`giFs|Mk|D%2)`{-rqvghK>*3gzUns5k@w+6AKpAXB zqpDKJch7TokA2ra;}XNuw+h4JqI&`JO;(QPj;{S;f7 z7znvC%S-mlxb&}HnNhLOTR1*OS21r&A%wYnLCDx<<=vW~Bz5GLeNw!A-7?gbl@(1E z%!>YTF9j4ag_GeHp6}jl;dq3M_-*qEE0vi?)H5ucV4RT8QVZrXQ%t310gtK{xskc5 z7?5kTsKtNyQmxx@>I&-^Q)Tk&`T#Gu&_RaPw<>sLm@MUbtwg^`wO2>l1l;`0nDU|} zO{4y0R6$y`8gX8wf{)<`yuLvcR0m8}(jl<&kjX=nm5(TO?Is{HfDR`U)G=SnPg_q<%E%{F$@J~}H1H3@$SxKEe zpw4S*sly>Gjl1@i<)SX9K9HvA%ht$@alF=X53Dm&)B^|48Ug>G;HC}6G1wT!mXNRI yQ1?x^rcE}vaW)CqEd}b2^Bfl`U=84{l7U`RuW&;71H5!k{l6HS>r7WXjWC7)-jd6e6&n#z^h|hekld@$M2F5waQkF zY8^ts@%f&6?(yA`?8!M$ntl5)&NBM_?22BFKTfVFg%P}iyE{1eqj5My>;8Vr^Wt;u zIfVWTXn&Gk59C4{Ct$^#mU;nZ4tWfUd7tcA#});H^DF}{8J_b&=3t{=K4z=IICQ!{ zqK;nhu4RrDA_&*n!N&(D^U!5VGLK|-af!%U28k{k!FO>e*m@?qZ8xcN6}fKACE$Nk%* z{s}#~?b8q_l3wT|bd6=UFsU4+(XAF$gK)3E|>K=7Nxt$N+%>=E|^;-k(!! zZDJtg%B(KgPvg@6bgF{g=0ON^`I3-{&C0@>q8@eFmVH{hdDSx1X_e_rm&}U6aX$rA zFol!R4kzEf-r?X8BI37=C#+Ov5mC&rbb@h0K1nT@%SqJodnd#v3c3aeNUaAPEgLM*t}gp^%SKI_`w7X1Cj@69lHasiJHkV0;>4-nQ? zXq*&J?*c;3lqgWG@gZYHXC-v@06MRsr4EO% zGVa=2mWw)_`a+tHFIyus#_?LmeX!2VP!F6s8+7=0g4;Hj#9(8XSVF#(6WurAnl{;T q<7^tRTME=4hdC}%z#PC^B?G;tUg3E1FRqPhR(Q;$A^>A#MGQOr1hVT*Y?_uws#=!usySt6Z%WqZW z0DAk-{It4VkaKOEfF*NU>N%JhWHBh_eQU=$HqRMcq$zO8(9Z{%fsKCgoUH=m(C++> zI(or7mN}M-AY6Nk7$2O>LWe2IJd)WaBqD1WB!;X7-zFeutEuQT!=%nsW!)@t z*3B7`xubbh_+AP^AGYYyF>8f;mqn>nyVYs|pq3XR&%xk63&zztQZMsB{TyxdsCU;p zKB1>~J*qWD(sO-?p|PkI#$}*1n$`(lh=f6%gOCEA5H4@qSMKTZ5$Ig#BUQ%SgFiBqNHKr1mlE!m0B>DntUoX3wTTew4{e!(=JfYbE+M)n1iplW_AhW6Fz$ zG`0FyQ3Yw$YQ%X-1s}stc!Pt;tztpIt&tpZG3QznQuIRctU6$G?H@LtwnYEc*Lb*x z6f&cIfUvSc2W4uI290v-nf4WKwn;4Sgb3jDK2_|pW61aCk+Euhm! z&{+j7bvS^faW~$wOjPAm2U7Qa(Hf~Sj#oPFf^}w!df@C?W5E9t+_b?s1{=fJ67s#6 y>%LA`x54lB51O(s7Xh)&Slr8R!l53Wt>6$4mFzUyQM>9ih3Lwf+K%Dp-O5 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_nt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..46110113e259af9e8f6d2c05581a69771414aea4 GIT binary patch literal 1802 zcmb_dZBN@U5dPj@aVipGC|ySxh*3gzUns5k@w+6AK-m~& zqpDKJch7TokMG>pj+_Cd>9-G~G^O9quIT0HLWe2IJSMXZBx2SwNQA5f-v*Gg)l_tvFsU;Yxq!BJSvQNE zb#sQv+|oQMd@lu|_gi%7n6<*a&Z5++-DM_~m#@Qs`wB)Ei-siYT0BZnml??QndWAQXe}zr=*#C>MtsSAcoV9)d?NV2E literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_tn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..2fde5da8ba81fcada85377f258d91a70659bd8d5 GIT binary patch literal 1802 zcmb_dZBN@U5dPj@aVipGC|ySxh*3gzUns5k@w+6AKpBjp zQB|qqyXU#P$9HaPN6vuK^v9=Bn$n+VSM+l9d3;4F4B-RZ-ND|Y#=!usySokK#n%cs zfZiKuep=lw$hkI7z>+yF^&HF$vKSQezO`c=o97J9(-gR5=;wpXz(&7#%2ttaXm@@^ z9lhWk%N)x^5U#xi#s?>}&|yk4kI8HUiI}wv5+Q5Bw*lmAH5HvEOzKQUE}-pQ*3BYk z-JD@Ew=|Cm-%CO0{T5w1X0329IT+k$!MIvS>SZ3NpQDW)_HKJe z$Mod3M}wxA^jsexG#1suxCBb0X&vwxBn;{pgcS6IaB(9uK}eBEABh2G%CL~!`zf|E zF%WWPmY3|eamhcuQlnz7w{U!nu4LYnLI`u&f{?LIi{6@`Bvs^weUcx&YZ$7^ii#!+ zW<~$7mjH^Gz{&6#&jWNie!q=l0tfhQ@(C-InMc$!ESzARkS|gT=2DYSrDg$-suH=D zxvUtFYqO}tfA~_an{lcNs~A&dvg_&qFSyV_hSm2ncxjj{<$A3|zfQGRMcM@1{LGm0 zq9IMK{$*4_TD2N+UZ#SN;YV!WAabi%5O8B8hg{6L)`S#;P&}*l-}LwMP5E=_mn@w*d{HI7{F)`A-Y}(+B+11d0H!ka}8B zrw^#JidyQh4@={&y=9rG%Bc>d9{OT5Qezyible5&%oO#&i)W31|4(o;2IClP3}Z{k zx8hRwb+~#=Ho0*&2{W}w1E)u{Rz*{8)y{2B_4dvfp(>?Y7Vr*+iXf9{1Kjq?A Ab^rhX literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_tt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..095eeaafb9f2e62f8bc9ceed0821d4d57f97566f GIT binary patch literal 1799 zcmcIlZBN@U5dNNDaVio5l&+%;#Jsjr$q-^1>qPhR(Q;$A^>A#bUlnqfd zR#mCvyXU#P$G&T{Bj-SA^8MpDN$8LBYkD>QG`Xe}M(`f)?_uw+#^DgHySpvq<(CRM zg#JFXKT)p-a-oeAuwqV2y#Opq3XR%fR413&u4%QZKVW{Q_~eHsGAq!;=Sp?Og)OiG|Mnl=HSLBgrZbn@?D&%pylI!_o=H3HdCwU@kG)OllVJoGOJIjmwGwx#~rw z{NtB$-HuaLSjCtrlipOPcgckgGOWLq!Arw*B{v%-22HB{DpDP={+To7MN67S{i|FB zY1Mknd6^16Mj!C*29a6Cf`D5iIb>qNwI-zKg5p`-{rbW`Z2Z{_gBM@m(H^Fd8tnsw zwG|pC#ml>ZkaHz6RBL7M#?F*e!}n#+0g8%%RnY5)KL literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_nn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..a6cd1554cfdd44ae6bb05d7424949102ec9d6e6e GIT binary patch literal 1797 zcmb_dZBN@U5dNNDaVio5l&+&_67$+hB}0g5tP|bKN6U@f*2A$azt9))<9A6Kfx?(T zP_=2|^F8<6N%jmE3YkD>QGP$M{M(`2t?%?2`#^DgH`}-}=%Wt*k z5c-GE{v^E~$b~jez=}C7^#aTs@)#8JKH0O5EeZw~Sq5A(+~{v3ybZDyj|c9S|+kqhW}m-q6> zS+8J7W=)Hz@VOF%-Y?OWW7Z1yE{{^HPLd=5sO5zy3NX0Nf^m(G)XO4Jzd##3>90@v z^mN^)T2th_(8uT*%Vc3v`AMT`6Yu|yFl-_aQi2o0#jVT*Atj9g;sVT-VIjTWr`X2C zKuBg(Ec6xoV_f<+8?Vjk5nP45ONJ2U@+Bb?o0Y{iMNR6+E&H_i@V;fJlPi;&E}0dB zlYR>2?{xH58;?^czim8Wr80|%iiV{Vj1%%zYQbD)ikZ|b;91o&H$qnl17huNYWW{N zRQK&Tb%J$_nKJoJeT0`>=pe)9M-{v>OjmNdRbtRk?bnfZ0e63Mro3oL(};f+RghL~ z#++AF@G|;@1=pI8vJuK_UH-e8zh8LV4}+H$@#p{<JkA0dKyjAB8^WKJ@MjO;XDO8UuP2_B$k_wryhfHf9Kp)C z8*f=I>SXG}Xj;DPip&_tYZ3RsIx|BxaOiB&;nn46F*ex~n#+0e7d)a?y8r+H literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_nt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..334ee1d8aad4e95b00923fe1965ab06289dd1366 GIT binary patch literal 1794 zcmb_dZBN@U5dNNDaVio5l&+&_67$+hB|``_)`{-rqvghK>*3gzUns5k@w+6AK-oYr zsM<8~`JQ|3@!ciak#nFl`|)|4W%TFi6}=o^Os*(}5qyNZJJ|cDaX3Wl?rzKT{A=ww zg#JFXKS{3#a-oeAuwqV2y#OzU}b-K5S{@0aMxF>8f;mq)2pCrJ_j)bdOe1sGgs!MH|8>SYnAU!aX1^>2^* z^!T<l>Cf?JIFl-_aQi2o0#f{7bAtj9g;sVT-VIjTWr`X!W zKuBg(Ec7M&ZCv`~#w)Y>&s`PlHWNaa%a?>qY*tp*6a}fnw(Qg5!~2$@&aBL6x@1-i zj`}H-zvIzcZ9Gh&{I>CgmC7t4${ChUFiyx9sReVHDP~f$fCp7;+^Ae742ZS8skMLj zP~ErV)CtxxX3FH(_4%E1p@R(T?^W>1FkQ*bMu|Z~wO>ct2HgJ5new6~O(XtgR6$y` z9&=t%!OQ3q)@~4mRV)a&F_J?e7F=sW$}T9cb?t8p{(j-_W*EG*f=7GEAUE0v2x}`e z4vH6d0U@VK6sXns05~0|;BgSp0E)8|-VpwzggR3)|nZqffHwg4zK37?SM%PHin5Mf0(h%rpx4wZoK1co>+P}kim}O#&|J=wKd01HuK)l5 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_tn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..572103fd2e3c608291e60a2b81b2cd0adb367518 GIT binary patch literal 1794 zcmb_dZBN@U5dNNDaVio5l&+&_67$+hB|~V_SSPxdkCq#|t%qYtC$9I=xPtJkT?APaUmeJqmSM+lHb#g^1jNl{O-NC`5#^DgH`}-}=FW+m= zA@mQS{YiQ~kPB^`fE9CE>IIlNzU}b-K5S{@0aMxF>8f;mq)2pCrJ_j)bc_U1sGgs!MH|8>SYnAU!aYi^lwl4 z^z^n*gQm!Np^wosmdV1T@{>l>Cf@UoFl-_aQi2o0#f{7bAtj9g;sVT-VIjTWr`X!W zK**I@U9!K%rO)oljEaTcVdGuRcqX+TqO*MwY{me zfA~<{x8u|a)-h(v3kAPTEk5O8B8he9m4)`XN@P+sfW-xmD+!vD=McxeTX4v;}^v=0#0 zR%jd)FYW?D&Xp)otML(VI!?jkAfN#hXDPfP{8?rGsaH6g{2|ucQ|}dHlRcrioG1SPpk!6A literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_tt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..ca174a9a4430668c4814b0b3f3eebf6ff9b7a596 GIT binary patch literal 1791 zcmcIlZBN@U5dNNDaVio5l&+&_67$+hB}0g5tP|bKN6U@f*2A$azt9))<9A6KfwD1y zU{#wYKHqcCJ-)jndvXqxX5T-KvyA>Yzou8?Pm^m(VFd5t?hX$AY8(#Hy1(D@y!=vo z4xxVt?N8F{fm~?g1gx0TQZK;FA&)^Z?~^_2*rH%?k!8Ro!+k!;9BlN<$80qihfeor z)X@vxwal?X1mQYQ@bSUPJan0o%p;knxI|Lf`5fLdONq5y;IEEw15NWClq^$WDolm7ao zPfyo<8UjVm3w?~9u}l^wm7g@4Hu3)L2*V}N z41`>p)fM|?T>77ORj|7>2w^T?5;Cz_Syoe2qYm4$Pm6bNTZTHVGM(v?Sur^2r%?V* zN3XTbcgckgGHkw8!7IabCAV881`XAI9cdSE_cv$Ci; zqYqfPK@?W8AmG+W4ux27tqCbxpuE;e2Ng%bbe#Iq7Pdw`tR$Wn(RSQ&TYEz3on zOnn$lvzJ|w8RK{@;yzesW~c^^n=LxLnBT4gCNbC;CYF$|P_J+>`9rL?$6hMNCVN72IZu8A4*yi6 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_nn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..42d2171a0d9ddd6954c0fbaa862f85c73128d9f2 GIT binary patch literal 1797 zcmb_dZBN@U5dNNDaRLbeO4m^sVqRORWC$^hb)tLuXt}Z5dN{V_7fLIB{4Pl&P#6;k zsw#DSzUQ8Me0NQ@4C#nvk*)%4c2vo0`Ahc-#;DmlpAG7a8P6`v75O zg~nO&;zl6kOo;-u8tnt8qZB;O0vbSZmckptpO)~a58$UMl=!bFo|VYi1LV9$mOAXi z(zt7HSuW~i>dRqPhR(Q;$A^>A#GWzrMl3t8Hk1r{OA$)+_TiE@laWFvZ&Q8no{A=ww zfZiKuf0Aw&eh*gWk>2 zAw9n7Q5`69p6h+|jAgPguKc9Yw2Ak)BMh1dgp}ZfaB(ejK}boXkGKFcWmrh>&MCGs zF%Xg&6$^dAejArQdhpUL|MOG@yUBzQ=JEw0W1E$gH9k_ts52`wnk<+V z{exZ#M~5jK4`1uxehTHcjVG*BW*$+_uyBHLLcT~Xn9EEtm6`>-s9NJj8)GM4#{tedKL;owrCfh=DIZOTkB$ZX% literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_tn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..faa46fb34898b049964e205b33f05a0db60032af GIT binary patch literal 1794 zcmb_dZBN@U5dPj@aVipGC|yTkh*3gzUns5k@w+6AKpCJS zsH)WQ`JQ|3@!hqxBWFNq`s>RmP3iBmD|$KlI=-S5hVTjQ?qKgx<6wZ+-Q9-g#rMi{ z0KIq6{It4VkaKOEfF*NU>N%JhWHBh_eQU=$HqRNHrzvpB@SG1a0~`I~DO&}`q22it zb@YOFEORUuLAdr7K0Y{^g$`4ac_g!qOGMT(NOV~XzKui9R#VYwx=Ed>$OW{$%eqYDFM;Eu1Wtx;b?_j8;@iX%Rw^@(C}&tW!8jq`q!!GjCZ9^p0$x<5ajkNhFd)`u zQEC71rCc}TR0&owrpjd3)%{&?p@R&opJnjUFj>m=T8Vy5wO2*j1l;`0nDU|_O)dUq zR6$y`8gX7y!N>43)@~5FRV)a&F_J?r=3Hw+iY_RgRqbyI{(j^CX6V1Pf`@y^AT!zr z2rDZz4vH6d0U>8f40$zHioe!+Pxk6=PdFLUTE5{R15(Ronmo literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_tt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..b92ea109eceab67e59eda3bad72eb706d15a7fe7 GIT binary patch literal 1791 zcmcIlZBN@U5dNNDaRLbeO4m^sVqRORWC$^hb)tLuXt}Z5dN{UaJCs)Z_+8>gplnPa zSXHHt&-dJOkMFL@wwwW_+4qm5ETccpuIT0H)A))~7{YtFyMx`o8V3Wk?(DQYFTO<2 z0rd8u{YkoAkaKOEfF*NU>N%Jh)_1%HY|!B$hzX}d|CtC9=Y@-FY@ zC1>4&A(>lRlnUQ#LFoM!T{>p1aBuQbYPFRl2>@z&E{XyS?z3QAqa*dQ2-MHfMh|

mgCjiyb!e>=jUi9kpNP6!t_G8cqYH2R1OFjIzw^!}V; zD^mtSuFUe1{W327Pp4|wO&WwSmoEqz+pH?9391ppw(OJQ-P@KSrd6dgSuiX5hrJYz zk5V`pzShBm6sm6v(w=T<9Rf>RTPWHcXasy;h>%Q0>J?n}D028B<=gq-n&zELD(Jtwx;J zRPZtUfQ1`GVHFDkZj9tmh&k7qkg5f$XDs|pwcl_2*$MsUmhW&E8RSO$0AXc?#wqda z79iwIi2}76?E|Ny6g*A=8bEQD!Yjg`R`90};HN26_%A1(Rmj-`$his^EHVN1$ l1&WVD92Y5I3gE4hfnHOua4`8jthdMhRE$lwh30aW{057YRL}qb literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_nn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..c894ca7c8d93d52344cf68b2c27b66a984fdaf95 GIT binary patch literal 1805 zcmb_dZBN@U5dNNDaVipGC|yU#5cAqfB}0g5tP|bKN6U@f*2A$azffB7<9A6Kfx-}= zQB|qqyL;}r$G&T_Bj-SA_Ve>N%jmDuOL{T>GP$G_M(`1CZ(;AB#^DgHySuH(^KW(J z5c+SR{YiQ~kPB^`fE9CE>IIlNLzgMZJd$}%NJQ2$NDSEs{+xh zS+8J7=7ttg;d>6ku?l1>+hWsh35det|Z6ba->n zKc>eweX2D@(hGfnp|PwMCRLy`nl=fah=gI2gOC!Q5H7A|E(j@!3=kM#t_%z5ou6WB z69XZcQL)e$?2mEj>o{JT)dRQ+d0P!3%;if$CN?Xt*A!K$Gq>#1;={X^p)Rg0YPw`r z42}*{Kpj&!9zEgtzMn$r{iNdoBI37=C#+Ov5mC~xbb@h0zDg~a%SQLMYCYz>qJodnC%nNy6jrey;MzzIg;;Q{2`PJ_eAXSXz4mt-4_jjJ;%hwG zLkhXkK0sJop>bZkycY;LRiZ$(#{0nOI0cUbfd){VrSO{gCnf&LJ^Wb;WrA0ro|Vwq zJ?OlKmOAXi%D5|USuW~w>H}%|zHE)m7{_ZJcfmR{Lp^ZzY%t*e32xh95`&FlVhQ

qPhR(Q;$A^>A#$CA}DZ9A8ojLwEuhc{h)o zbqj{c+|VK_e6IwdcUyGnn6<*a%A?e3D@hUn)bdOe1sL3C!MH|8>SYnApQDW)9o`)D zj_L7DkLo}%>A5~YXe_IRaRrn{(J{WLE9tK+3v{-ak#yD5bb=JEw0W1E$|H9<-0ur2$fc>B6#s4FWgnk<+V z{iDMaP{b6DhmUx^>7`J5Kk9gZjQDNy2`iPEN7OScoM4=gPf`o!GE+>YW&w|?7P*nR zsu+-Kv#7O`Wtc4GdaXpiNwrr;+63JE%$V|` zB~7FLMN~mrwHk3=rGk&)du-nz3aeNUaBU=qLd?0=gp`9&KI``1^!(k%@AJ@qb`KAC zF@@Y{A0Vu((0Exqe-H>cRiZ$(M*G0&C2IrxMoZ?xp6iLI4uS0kM}t)QotI(TO|X%qF&(*<@d1Z9{PVVHrW=M%USXZCTLfT literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_tn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..68b69eab2c282cdd3e2e7183ae8ae75ac468d7fa GIT binary patch literal 1802 zcmb_dZBN@U5dPj@aVio5l&+&=h*3gzUns5k@w+6AKpBjp zQB|qqyXU#P$9HaPN6vuK^v9=Bn$n+VSM+l9d3;4F4B-RZ-ND|Y#=!usySokK#n%cs zfZjedKdo*TO5nu<;nCUvGF7tr=D>t>O& zZq6{7Tbf6O@1-F0ev2+0vsSn_S(I9}TdfuVYI!d591QNWU|g*u^)e6C&(TJYk8Tfp zC-mgDM}wxA^jsezG#1suxCBb0X&vwxBn;{pgcS6IaB(9uK}eBEABh2G%CL~!`zf|E zF%WWPmY3|eamhcuQlnz7w{U!nu4LYnLI`u&f{?LIi{6@`Bvs^weUiU_+b~p>6%|bu z%!>Z;Q35Dp0w=>~Jm2*aD7>F_JVZwPHu;2=%FH9`85T}3PRJLj1#_v%r&6AgP@E<3n*65)|LFt%X#z!nS4cfA zsM813Sw$^%IDn;b*WR*BROM6$QV)GG8mTdkS32&3b!Lis;Kj2>!2c(>8G~^QHioe! zkl6I BSBwAv literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_tt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..b1ff3810618b28577fb0e01dd210816394610440 GIT binary patch literal 1799 zcmcIlZBN@U5dNNDaVipGC|yU#5cAqfB}0g5tP|bKN6U@f*2A$a+o81L$L|t10%b#h z#;PiHeD^$e_toc#iSSd0HI}7Elg^lG@3R6A3?&fi9tw3PY4&+G8cqYi42e!V6F@c>7Acq zYf}b7F3swK{W327PhK_cwhls=%a?>qY*t;YDe4i!w(Qg5-J6ynmQ|HAT{0^MM~5k( zf+-x29`StJPoeUD)bRir@!RGTRw}b7QOvM(f^kAVOD&koOfi$11w5yyaHDZuF(B7= z5tYCHQm@-_VudlrOqu*Dp58eZI>@m8RtK*Q)0Nz8lo&Lr_G6@N!0peRDKA>mH0ocJ zDoCr=W6tYT@G<&;cQ=T_Di#D>8_A&%3$8UGRTosxc=xv#{%+&XW*9vC0+04Eh1_T# zAgry>I4Pdr1%#X`QJ`Amec*JQg2z!n11Qc?cuD?~ivQ#u|15O4|Q z9rj^m+?BU17qOgpAWg?tt&thyc+_zhtTQvz1EdR)7Ei literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_nn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..d967f2f53542145ddb133582238d5c8b9835c6fc GIT binary patch literal 1805 zcmb_dZBN@U5dNNDaRLbeO4rc=VqRORWC$^hb)tLuXt}Z5dN{V_7fLIB{4Pl&P#6L< zsw#DSch5cd*mq5~+hWsh35devUSJe0bYC zI;JPLJ*qWD(sO-)p|PwM##Nv+nl=faiG)FugOC!Q5H4!M;^=+LP#0GgHCZq# z`p1VUppGe=44?7*&`Y88e%A3Ih4S0R6ILoSk0@zaIKem}U!@kzWu}-)%>tfQt#hMv zRWTseW>M?^@TFR}^VAjAIi||w*Yz1*aG`??s~=VJ$}m~V^;(I3L$z0D+9cfk%$V|` zB~7FLWmG|0wHk3=QNhRX6W-t;3aeNUaAPEgLd?0=gp|EdKI;zHT>JZt$1Tx+^)(*u zB8A*&A0Vu(&^Rw%-wTACDN&$WqkZ6Xl!C{BKm#bwQg}=J(-Qym0sb_FGQk^A&r0a* z0d!tNOC9!MY23B9EEjb-^?@{fU$#bOjN`SAJ7Areq8>PV))?^r1UGFkj={z-wuF2y y=elpwHEput#@Qrbw-l&9j&xk4fHi=(N(OpOy}}{o_wdp^^%rAovMn^1v*a)C7g(qO literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_nt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..0da5a24bc6b639e7cec0e73f3fcad1f2827819d6 GIT binary patch literal 1802 zcmb_dZBN@U5dNNDaRLc3l&+%##Jsjr$q-^1>qPhR(Q;$A^>A#$CA}DZ9A8ojLwEa;}XNuw+h4JqI&`JO;(QPqwXN^Mb)ymI0Rx{d|x)*ytAz*=jNlTb&*)l_uaFsXADxqvP2@@^hE z>lO@?xuHc=_+AM@@3!dDF>8f;okywFR+1zDsO6a`3NX0Of^m(G)XO4JKSvuqI=JZ_ z9?|2Q9@T+j(sR9!&{$Rr;|eH^rcJ;{kT7Us5K__;!o{`B1tDc3eIy2$DZ@f~=cm}p z#6U=9R4nub`)OSISI0}U{70{fc2f!=%;gJ0#x^T^Yl4#0VO#b|armZXs4FWgnk<+V z{iA~vP{b6DhmUx^?WIt9KkB%jLiug;2`iPEN7OScoM4=gPf`o!GE+>YW&w|?7P*nR zsu+-Kv#7O`Wtc4GdaXpiNwrr;+63JE%$V|` zB~7FLMN~mrwHk3=rGk&)du-nz3aeNUaBU=qLd?0=gp`9&KI``1^!(k%@AJ@qb`KAC zF@@Y{A0Vu((0Exqe-H>cRiZ$(Mti{NC2IrxMoZ?xp6iLI4uS0kM}t)QotI(TO|X%qF&(**3gzUns5k@w+6AKpBjp zQB|qqyXU#P$9HaPN6vuK^v9=Bn$n+VSM+l9d3;4F4B-RZ-ND|Y#=!usySokK#n%cs zfZiKuep=lw$hkI7z>+yF^&HF$vKSQezO`c=o97J9(-gR5=;wpXz(&7#%2ttaXm@@^ z9lhWk%N)x^5U#xi#s?>}&|yk4kI8HUiI}wv5+Q5Bw*lmAH5HvEOzKQUE}-pQ*3BYk z-JD@Ew=|Cm-%CO0{T5w1X0329IT+k$!MIvS>SZ3NpQDW)AKvzk zj_Jv5j|NRK>A5~YXe_FQaS4=0(>mZYNEp;H2r1|Z;o?SSf{-GSJ`w}Wlwl#c_fu?T zVj$$oEHBw_Z;VFD;(0w=>~Jm2>cD7>F_JV>DUHu;2=%FH9`85T}3PRJLj1#_v%r&6AgP@E<3n*65)|LFt%X#z!nS4cfA zsM813Sw$^%*oUQY*WR*BROM6$QV)GG8mTdkS32&3b!Lis;Kj2>!2c(>8G~^QHioe! zgplk@x zSXHHt@1E!G9{aAzwwwW_+4qm5ETccpuIT0H)A))~7{YtFyMx`o8V3Wk?(DRX7hfWB z0KGkEf0Aw&^g4;`i?^O($YASGrkgG9($@aF&uwwj7g8zyzGN-ki_yS$s1 zoOKI^$=uSSRQO&CLhrZe(lKj=dy|(^tF0tS08q zJ36K(w>=sH#iZx@0HI}7EsSfRG@3R6pFqN(i9tw3PY4$`G8cqYiS&^eV5STU>Ajy~ zD^mtSuFUe1{W327PhK_crVc`w%NK-3EPr^=Y9b{O2tAp2u$x^P@O7xpldoj``;O1w>lou^&8uc$r z6{J4C#nvk*)%4c2vo0`Ahc-#;DmlpAG7a8P6`v75O zg~nO&;zl6kOo;-u8tnt8qZB;O0vbSZmckptpO)~a58$UMl=!bFo|VYi1LV9$mOAXi z(zt7HSuW~i>dRqPhR(Q;$A^>A#GWzrMl3t8Hk1r{OA$)+_TiE@laWFvZ&Q8no{A=ww zfZiKuf0Aw&eh*qr;nn z9zDM4Q5`69p6h+|jAgPguKc9Yw2Ak)BMh1dgp}ZfaB(ejK}boXkGKFcWmrh>&MCGs zF%Xg&6$^dAejArQdhpUL|MOG@yUBzQ=JEw0W1E$gH9k_ts52`wnk<+V z{iDMa4tgmZ4`1uxehTHcjVG*BW*$+_uyBHLLcT~Xn9EEtm6`>-s9NJj8)GM4#{tedKL;owrCfh=DIZOTkB_>te literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_tn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..64fd370fe25e0d4e9c975b7264012ae54c0e61a6 GIT binary patch literal 1794 zcmb_dZBN@U5dPj@aVipGC|yTkh*3gzUns5k@w+6AKpCJS zsH)WQ`JQ|3@!hqxBWFNq`s>RmP3iBmD|$KlI=-S5hVTjQ?qKgx<6wZ+-Q9-g#rMi{ z0KIq6{It4VkaKOEfF*NU>N%JhWHBh_eQU=$HqRNHrzvpB@SG1a0~`I~DO&}`q22it zb@YOFEORUuLAdr7K0Y{^g$`4ac_g!qOGMT(NOV~XzKui9R#VYwx=Ed>$OW{$%eqZ;Q38j(1Wtx;b?_j8;@iX%Rw^@(C}&tW!8jq`q!!GjCZ9^p0$x<5ajkNhFd)`u zQEC71rCc}TR0&owrpjd3)%{&?p@R&opJnjUFj>m=T8Vy5wO2*j1l;`0nDU|_O)dUq zR6$y`8gX7y!N>43)@~5FRV)a&F_J?r=3Hw+iY_RgRqbyI{(j^CX6V1Pf`@y^AT!zr z2rDZz4vH6d0U>8f40$zHioe!+Pxk6=PdFLUTE5{R17gRonmo literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_tt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..91c8529c3170b17b8a1f44fe6163f09780b72960 GIT binary patch literal 1791 zcmcIlZBN@U5dNNDaRLbeO4m^sVqRORWC$^hb)tLuXt}Z5dN{UaJCs)Z_+8>gplnPa zSXHHt&-dJOkMFL@wwwW_+4qm5ETccpuIT0H)A))~7{YtFyMx`o8V3Wk?(DQYFTO<2 z0rd8u{YkoAkaKOEfF*NU>N%Jh)_1%HY|!B$hzX}d|CtC9=Y@-FY@ zC1>4&A(>lRlnUQ#LFoM!T{>p1aBuQbYPFRl2>@z&E{XyS?z3QAqa*dQ2-MHfMvsqf z4}0|Fwnsyt$a$_0(6g+Pg>mgCjiyb!e>=jUi9kpNP6!t_G8cqYH2R1OFjIzw^!}V; zD^mtSuFUe1{W327Pp4|wO&WwSmoEqz+pH?9391ppw(OJQ-P@KSrd6dgSuiX5$44m~ z_EI<*zShBm6sm6v(w=T<9Rf>RTPWHcXasy;h>%Q0>J?n}D028B<=gq-n&zELD(Jtwx;J zRPZtUfQ1`GVHFDkZj9tmh&k7qkg5f$XDs|pwcl_2*$MsUmhW&E8RSO$0AXc?#wqda z79iwIi2}76?E|Ny6g*A=8bEQD!Yjg`R`90};HN26_%A1(Rmj-`$his^EHVN1$ l1&WVD92Y5I3gE4hfnHOua4`8jthdMhRE$lwh30aW{0599RL}qb literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_nn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..552e3070f4df6df38360d779e45cf058400643d5 GIT binary patch literal 1797 zcmb_dZBN@U5dPj@aVio5l&+&=h*3gzUns5k@w+6AKw)e} zP_=2|^F8<6L%Z`U z>gWaUSmszRf^h9Ee0*>+3mv8;^GIeJmx!!okm#}&d>eeh*43)X&vw1jxeYr5K@2>!o`iu1R({DKH>t*lwl#c->2Bh z#6U=DRLu1y`)yqEIvcOd@)2B#yh(--=CTDLW1AMmH9<|P$PN1>fB&{&sFEv^nk<+V z{o|tqir>lbKO5imw%mA_K=Ezj2`iPEM^rQ{oM4=gFH#HUQj<@mW&zKtlDQVTOc)Sr zb5qIx@S(hK#;FplVoa6EuB#)w;6eu(R^Q9urD3v^>$MX7nrg3#vM=akUa;sPnaAPEgT+F%FgcOZXT&wcm)cpOz<9_Hrw}^*($RIP? z2M8-GG|q}=Hv%DNO5~{3=m0n!CE#%u&;W|F1YQyTw17W-06$Hjz<)XMv_MWDAZHb_ z)ZqY@#$9{MGEpT{9Y)>qMOUQ8I9`dk3)Yz_s)0jijSes7x9Na!3^sN&L#mHB}egbl;ceTm;!jKWT4m7D;!ULAM5R@SBtT&9ih3Lwf+DkI9Ct= literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_nt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..af861cdd5a577516e0954f026bed5d9efd8a1e89 GIT binary patch literal 1794 zcmb_dZBN@U5dPj@aVio5l&+&=h*3gzUns5k@w+6AK-pkr ztV$EdclX@g7oRK3 z0rd8v`D=B%Am`dR0ZZnz)N?R1$YM~;`__(iY@RbXPgCHMp`H&i0~`I~F2Ko8^WOWKu7^j2p2ap6ND6D^br?erVI?RXJn9CM~jBQ#})&vQu+&1Ks{O#+8pvtVsXtH2d z^pB4cDE=qIzbL-xZK-&eKyfwkgq6z7BhncbPB2c$C#eNTBt|6ik+Ky;h=MhuW(=Z5(bMGp4*~P*anC z8C8&0twx-eq2O)!9&0y<+$t6X+!)Cr7jv#PAw?Gy_p0_c1%JQsdo%Q(TfxIUj36`G z2M8-GG!BYqcL5=1O617Z=m0n!CE#%o&;W|F1pXuZX#s!w0DhW4f&cQr(*iktfSgsx zQilUr8h7n2%S4q-br^NO7hRDW<9J2lE?8%#$OcZFH5$A)zD);=W3VxdEg@fuVeV^h ub%$(1<7^VJQF0_7=Qu7Bz!bn+B?G;tUg2!=`&e&}eOHWa?Fh}~tn~|zE>;8p literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_tn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..8a51ef227fa1b9de8a29ab7aced7426db7254d6f GIT binary patch literal 1794 zcmb_dZBN@U5dPj@aVio5l&+&=h*3gzUns5k@w+6AKpC(i zR;7vKyL;~L@!h4hBWFNq`s33mP3h0GD|$KlJiek7hVTLI?qKgx;b4H$-Q9-e#n;Mm z0KI)^{#xBG$hkI7z>+yF^&HF$vKSQezO`c=o97J9(-gR5sON*sz(&7#N>+h!Xm@@^ z9lhWk%N)x^5U#z2jSo&{p~IA99)sD&Bx2MuNHkdszKuc7R#VYwnn|6h$OW{$%eqdWb!o`iu1R+HjeZ&QrDZ@f?ucz3` z#6ZZESzfZ=#wE{crAEbEZ=v`YT#3BNgb?Pk1tDXb7L_$YLMqP<`6Pe;wjro8D>9la zm=*owqXdfo$?zYF?|NG*9wty+O*~=`_YYsnb<&J;YfVVe1;xFp{Y}B&Z~Wa1{g+noa1SHM zjP?P-$_kBx;>BG+$e9v3ay2>tPDcrN90W9g;w*vx2!C3@pFV(}CQ#tNKJc_aP9Gp= z6|&Uf0G7sGd&@FWB~u+n-S0(Lq{cX2k+=)inJKb?6K9PEua0li0pl2K3}Z{kw_=$4 w+FRWro6tC$1Z*3gz?ND0r<9CT0fwHj` zu~luF_+WvL^WszV z9KhiVXn&Gk59C}MCt%5(mU<3m26-72^FG zC1<^YA(>lRlnS3~LFoMwT{>p1a4+*xYSl@S1OT->7exUE*I6*G(UE#t1nTE#qsK?L z$4B(!_K=1^k@H+1pl4Yn3**{P8cmyce|CgH6M>KloDeQmV5STU>HR*% zR;CPuT$$x1`(a%ApLW%-n=}YvE?*EbwpmqH6I3IHZP_Qqn^!GEOsh&~vS3#9kB?HQ zeka4fYUOp!hl$tn<)Il zhx)!9CngwUOqI#6m=T8Vx`^)N=-1l;`1nDU|}O(Xtgse-g> zHR8Ocf|ubtEZiUpt5^_lVQtR4q_lW8rVA{eI!sPUt_ie206;AUE0v2rDZz zPKhVC03l~e6sXl`A2=PQ;BgAj0E)8|{vrHn1%LVgewsps|9s+Eg`7P=&LgtaVIP*p zU3<%N5tE6B(KLJ26`3)PM-g|yIx|HzaNMlX;o1B)9WX9~EyLIn^0}JizKPa!$c7qc olYos+P|Zim}O#&|J=vpY{$^_y7O^ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_nn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..c4b8ac1da049da83760f3dab7c28aa7d3f154bc3 GIT binary patch literal 1788 zcmb_dZBN@U5dNNDaVio5l&+&_67$+hB}0g5T_?JikCq#|t%qYtC$9LCcPcDGc?C0lkmeF75S9CJ|GQFY{M(`2t?%?2`#^DgH`}-}=%Wt*k z5c-GE{v^E~$fY(;z?wNN^%Bei@)#8JKH0O5Eei%0Sq5A(Jm-VV!A8G&%2tDM=yZQa z9lhXP%N#325U%qKA0M2|LzgMZJd$~iOGMT(NOai>{v3ybZRVoec9S|+kqhW}m-q6> zS+8J7=9U&w;d3Pjy?;g5j#(?*yF5y*I!TfMpq3Y+D8S%*7L03jq+S+*`X$=v>Fr4$ zmp;{+BITt%M#os@3e(C>8cmyU|Mi1mgHK2aO$Zk^G8crDBnAizuuz7D^!}V;8xsQ| znNhLO6ZXfr^lc7ane`*M3VD|cAbKg3dd>L10}WKU==7s+2k_EjbT literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_nt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..d2d55fdf672e53878737c6c58cc39687a03c4fcf GIT binary patch literal 1785 zcmb_dZBN@U5dNNDaVio5l&+&_67$+hB|``_)`{-rqvghK>*3gzUns5k@w+6AK-oYr zs4CaV`JQ|3@!d7qk#nFl`|)|4W%TFi6}=o^Os*(}5qyNZJJ|cDaX3Wl?rzKT{A=ww zg#JFXKS{3#a-oeAuwqV2y#OzU}b-K5S{?_bfCW7Z1yE{{^HPLd=5sO6a`3NZMd1>+hWsh35det|Z6e0$W# zrB8LBNO_?T(J_{}!lZJOM$;zTlYTI4@ChlQ3E|>K=7Nxt!~kId=E|^;-k(!!ZDJrK zGb$GPlKnO=ee&RyS^ej!3U-?aAuQQ})L~opY4PEG%TQ-lW;9(gD+b3Y z9Q9K;9=*}I!xYMI8%|iM%p#(gVd(_pgnW@&FqfHPCN&FqOSQs{##Oq2Sldml{KJQ8 z-;PtKSI3wslV8`jcg}?lGOWK>!7IabB{v%-1`XAI9cdeI`!i?Ci3iAPTEk5O8B8he9m4)`XNDP_A{|Z;So@#ovuEcwzaD_K-ksv=0#0R%o0N zFYN$APL(K7sqq1DI!?jk2%rHJXDPfL`bmj?@&J97LJ9wB+F1#lJpj&YV5!3atc<(% zmgS;OrM`%!)60IyjB&hHa2KpIGt>en%?2G_OK)5JlNf9a6HCaqa)$dRSkoL^W}Hm} n_DF%stC$9LCcPtJkT?APaUmeJqmSM+lHb#g^1jNl{O-NC`5#^DgH`}-}=FW+m= zA@mQS{YiQ~kPB^`fE9CE>IIlNzU}b-K5S{?_bfCW7Z1yE{{^HPLd=5sO5zy3NZMd1>+hWsh35det|Z6dVA8x zrB8#VNO_@;(J_{}!lZJOM$;zTvwkpa@ChlQ3E|>K=7Nxt!~kId=E|^;-k(!!ZDJtg z%B(KgU*pnePi02MLho?!F}MnOn+PGyRzQm?@KA*SGhD3ms%w|Ez*nhUrRfHcAW{s{K0BHsJPW&XgA|Wg6)(qYBch z^_cUD3NE8hShqnGR$=|-`~8dm8)5Lm@*N!@f!t^xAgry> zI3r%#0fd|@QJ_-eBj9wLg2xd+11Qc?csum768-D}`YeSK{?)Xz5;%JRoY%loha*@S zckM09MV(505lyF;{g4^sc&*?*SZ8La1x}g`I=q(Nw)iJ8*cc|3kRRm?_f4>-IkwC= pn+EKW0+q+{jhhrO1MpVKK(DD+IF$S$mfBN)C&ngwLUTD!{sFM6R2l#P literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_tt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..00c1c9c9ceb2879363fd64e9912e66fdddf12fc9 GIT binary patch literal 1782 zcmcIlZBN@U5dNNDaVio5l&+&_67$+hB}0g5T_?JikCq#|t%qYYzoL`zr|A`?FoO4RcLxW5H4cYp-QRC{UVf=P zhtNNS_LKB_AeY)W0c+;8)Jrf6$YW5<`()2Lwk#N2WEpVDaGwt{2OItBF+4_&4t^GN0?E)iMFAkk$j_){DTwwa4=+fC|RMJ}M@UEa$h zXT5?UnOj;!h2NDR^nQu19kW)rH+hs=b&@0jKrJssQGmg97L03jq+S+*`X$=v>Fr4$ zhdvE~BITt%M#os@3e(C>8cmyU|MY`lgHK2aO$Zk^G8crDBnAizuuz7D^nRaW8xsQ| zS7tq7zl=-&)2<43mjxlrTK9sne@t z%$3Qn>!Z8mLI)W(->Tr1VYZgrtrCNVYQK)O3%L7PFy%!{nMV3aR6$y`8FOAy!C~|P zi#CYDDi#FX7|Ed!ORhB`W%-w5UG%$3zhC&X3kJ`u-O&LO$c^>^!o~`X7sPYRpOAAU z3RG%*1e}gj@Yw$}fZ{BL*F!%m(a#>B&r>MjUrakIfwKp|c?~ReID)lt*WR*R)Tz`5 z(X@Hl51BEJ*9z`~b!LuQ;5D;FhnLdZ75_8_8^hER^0mCceG{x{jvX`3W&wMoK;`l7 h#z_j80eGuqpx4wZypjANmfB{ zqQ-K3zPslhKfY#LasiYUKR->1g8n)_qoDEkU~!#A>pLE4kUG*(p^XNoWTfY` zW<@EX=>G8F5?^|7Ii$u=ETb~}7%bKG!?Xz{t)U%+|BeNt4hJDM1VXsDkfk7`20N?q zeEqRTbBKun7Rs`i-+C!_Wm6=iuqtVD%KliNKhMT9ySf*yiM+0s5a!DzAyZe>f}A0b zZRQR8Ouu`*VQ5QjN?bB04w2&=n&Dvl$ksQ*9Nxdp;n9uz7!iM)@r0AgRtXu7OD`BF z7Yc|Uze~~x)Q!O= zh#JfB`R<;3{P>z}$puhae0x7B3i|!%l%7mJPERR?F}#KAYuNdtaWq2f_V$M7hcB(? z2!_vK^T`GSkd?7sz>0YzO$BxVWr~W0kZn27D$U@yD1b|b`$Ci@xD=Lm+1g+ndi@{C zq#$_TG0(J!!uKBF6QY-8>@y`csiy<|UVi}d$!(gedAEr$xX$|cl{BtZAbvOvAArQjFxhw@CHP~5= z=hN3UngdJ>uuzu8{KiYMOPeAgg;hzL6ZX^k{I45N?dm^wHDTAK62g4BBxLG}T8%R# zu}!;SpXt{xHVkc@O?^w|!~qhVLo@7;@7eltn8TY_Io!K(4siqTHi#(55P$|otWsFM}9l9g>TY2q%dtUn8mm7*=VJ=pFB*UB!e^}J<&8=SXu^HJ%!_be!aXRYBMNg2DzAo}rxY`irqH K+Y*M$MfMAK)MhIH literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..3ac6a2bd120bcfae0e0d46eaa2b24761d3c8be45 GIT binary patch literal 1805 zcmcIlZBN@U5dNNDaVio5l&+(D5%XxJk|D%4)`@D{N6U@f*28b|3kAfF-z8}T>R@yd zM2+S6e0R@1etgZg3JeId#cTndZFY)vo@o$jw> zQV_iBm}go<;X51ngy>}%yG%(I5X>en2~o=^(d9<)O&m14o{H|en>3|LTtFxIvR5X~ zdYU1aE2@*i@0t*Lw?vnoIVb$9GD)L4S(X7HmnTANu(-~m^(~JyNF8aIqm2fqWTY3g zVnr#T=zjm;3SW9~)u+)=EMso=F<7eVhjATBT0>h1{~ZekEe=8|2!wEPDN8{}1$I*5 z`SN{*<`5GD%#>v@zw=V;+NMazxm}*I-`3~<+*MeWwAo=N#l}0dMSUr`7MLf%f<(B8VWWZRfNyGmAk9t1DarhB+7KL_-MFE#qa?oPVjUl8` zY&AAIU5WE<;cwv{K7+J|I|!<@CPawWP8huB&!JdCj+M~D#L*sbI?5s7)us^?XF0rV z#-l3Z(Y=h*9IBKrQeIIdEAAyLn`F|&Jy=?Q5gaQ;Q*v{#?G3J!U0CaR!~QlnZ>Pu^ zcA6V>cnQcWS&vh6DUKZ>RnO3&1<=ZVg_g5PMD=StHEiiP$pNc^w)+Kz4JhoQoFDs( Mu`b&ZhRa#@2XJI&D*ylh literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..abac3e7ebd5dcc02ec06c2858ed0b87d0fa3544f GIT binary patch literal 1802 zcmcIlZBN@U5dNNDaVipGC|yVQBIeObB|~U0tP|C?kCq#|sfXX>7Yc|Uze~~x)D6KT z8a0;V^W8o7`0+K{kqe--`0;U46!hoGIX#_=_k}1+a49SwvbDiD^!mS& zNkQj#WE^$fWcB-KTMlY(i+-9`0rRS>TnQJLm-5UOIZp+YOu2! z&zJ9OG)I^iV4*CF`JIzXr3>a%YX{o>e(agsvUDB#jc4q8;)7(!~f)?+KuwKDG( z{#NbLGw5o(hoDMpLWFqjguzSx9C9V(L49@iOeqN`*w1m216BoH>kA6IPk4cHe(3+jx@<=nE*IG! D*B)gb literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..672d2f430a4051a034dd41c67595cb2f908db6f8 GIT binary patch literal 1800 zcmcIlYfsxS6#brGaaAM)C|yU{Cg#yfB}0g9tP|C?kCq#|sfSyqjkd?7sz=C-rO$BxaWr~W0kZn27D$U@uD1b|b>q3+zxD@73#hPLqI^Cbi zq#$_LG0(J!!gn^v3DL_kcA1haAel{460(+2V#=-Hnon=g+h7!Y&?_TUK5*O9=DjoRG0AYCBF) z#RhpTKGAPquLYV`mo3hj6Gy0W4hQ`l4u^kP`R>hz3-@!V|LgUHlgd^J1&(ts7$@Y5 zG=llU>Zvp=;$}8Fw@NQN2CTIjH2Ob$S&oPJ9DYEbMWLNyQNWdz9JHvoF@)5Pt)Gog zSLVE*cx>FG|M%8#2RW72gb4A*34_P{85~QzyM*ffITEQc2bJgxzc z9{^5ss1aYHxuV7@9$=LXmNan>7S>+|$4b#O+-z!lerr<~)_UGZzYWgYDJq7y%q=Fo z0OOUT$0@oL$BvNt!qB0`ZymoP%h@EN@ilH4-sU*S0h@yM;tL9IpRkKge(LLDUA83* Hm$U3Q&X;9} literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..876263f4fa18605ebc492ea84791bd47a5de4de6 GIT binary patch literal 1797 zcmcIlYfsxS6#brGaaAM)C|yU{Cg#yfB}0g9tP|C?kCq#|sfSc-YR zsH({EeVu!Je3Na-8Bkh$emg1(`sL(;o{!#*FDQi}yoS3w*!ih(FhJ|}_FClG$0l+B z{asl9WxXEA%2+R8!Mu^C0y~2;Ma4qMwwz~`W^h^*z$HV!5M>E2h52K)CK-oL_iHjK z2;OzfGcBUS7jq9CLOKnNFCvJ`~WP$xB- z_n%f+j<7DkOj#E5`zXb3Y>I>wRwZrD**EL+Upijc#b0_Y)2=!tg!yt#$k-LN7AHty zQ+CZh(J!B`8Ja$q{mq#ZM@VrF2mKrlhrjWB^mfdeZ%|}WXs1{daAhS2Eh=sdA+=oVXCu;; zGVeEjRPE8f3v0N8m`ZCxgm~kG!8`sRB`#qTy z1n)ZLnHEv_&IU0ddRfLUQ<4P)vq?xo)G|s8xfOhq0L^ZuqPrd@O{o$W&ZI^}NeI2)q6^QQ6aGb+q*0wL%K(teQ=v6j+-K4HmPZ<-jx-678rmZG+f*=UQ4mrCAcTu6SqegGsFNDa z$1f`^M_3nNrYwv3eUxH1Hbp`%?BbmLus;8-S7BAsW&_8^(k(NuIwgepa!$zD6}1*8 zNMTcX%|6kuU#=ONK9~K?nG;7yaSjLl91e$n@O<-XL&yCb>i>E^;iR%vLU!Za3&sig zB#mIcuzD&Di+GX^$t}&xjsbJ677h6iUzY13K8Np6WKn3RSQKz&B?m1kZVVx{TprH-1&^(Z368xPzEVYeIy0Zk}=j{|3!x3|f z0ndPVrRZ^rF2%7Uq@ET!wDnuXufTFPi70%HM~0&u7dc>4&`!OeaP)*-RPs}w7VEMt JVYr-SKLIk7WP1Pr literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..7f0ab8521a25254df5de85654207d356d2b7b42f GIT binary patch literal 1794 zcmcIl?@!w>5dEIN;#4FAC|yU{Cg#yfB}0g9tP|C?kCq#|sfT~bb|@hJ_???Zpl)cL zwpLZ-_zHR+MBzId#D8M{nL7Ld#)DK)Z|QDVxy;F~09b~hE>^)zWpRdWHI;LBcF zbJo)g$=p(1D|}xHLLaB-!ZYWDe^J)bs7{t;0I21;&>AexvuJ&*BMnkV8Y+y@;FOH? znkH405{e%7k8h9q^!T<KJ*@PPjZuM%>mgcc@_4uI294grreji5NoVXJ_r3Gnm@;53JX zxQ*tDgjGDjDjO_m;sGqIzYdO-qG`C<)b`{iQy12H-blX-&f6&}hWE=oCTxN6%F*LG zx;l;>A?d2np~Y_yJB6oBMg_b>?fg2 BV{ZTe literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..1d8d9ff13c862b6278ab0a453c7b738198ea11e9 GIT binary patch literal 1800 zcmcIlYfsxS6#brGaaAM)C|yU#5c6oIk|DGQ>qNEfqva-U>fx9Cpn&-CyKWkR%FsFu zsw#4PU*{el-(*{229y@xKa7fk{y4p$=c7;K3rb-K@8IqZcK&D_4A8p0y%zcLYZEzu z{w}QlvR)5Ft&QWbU``8NgPB1YgJj-kTh_5!F*qvMxkXV%Q67u@{B752KQMouH}(>p#t?a+Nifu1bRsm zD^hYv_xnfJhX?fNx=*#H*hQ`PF;`6O!+4oV7)@IQ|C$O0Eeb*s075E$B}z_6f;vfP zK7U(bImWsGGig}J@1rEUF)W?nT*2y^9}kg+Y294E+P zQ+drkQEy+b8Jbp?EzX(c$H;OHhX*+v4gcbqoZj>|aNN%!{nztxE2OC-5*+4^vx<-} zLUZN{qozW$fQQ-e+%moF7%WbNC*07PzvK1rAq6RG@fWX-!B{YP`?e6w3}Z`38W_sA^;^ZSz*cM$Q1}Xu45v9Ra=@mb9eiHm^zpl>qNEfqva-U>fx9Cpn&-CyKWkRy0LW+ zsw#4PU*{el-(*{229y?`-;9caemT9M=c9My3rb-Kui)+uc7AFc4A8p0y%zcQV-q=m z{w}QlvR)5Ft&QWbU``8NgPB1YgJj-kTh_5!F*qvp#t?a+Nifu1bRsm zD^hYv_xnfJhX?fNx=(eW*hQ`PF;`6O!+4oV7)@IQf0_ygEeb*s075E$B}z_6f;vfP z-hWzQImWsGGig}J@1rEUF)e zx>Dx-#*eB!{&!&wcMwx)v=0z&EZ2C)|ASl!Ih9;-9Y%Xl(NPW_Z#4}dE0)8P7CcD> zPaYJU=8!5rMRP@}RXnIwHnoHfd$2I>(py&Yrs3vL+sm6oT^Qr4hWc%=&PAqW}N^ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..8a53c72a24a2716afd5398c4ef5df137b53fb5e0 GIT binary patch literal 1797 zcmcIlYfsxS6#brGaaAM)C|yU#5c6oIk|DGQ>qNEfqvghK>fx9Cpn&-CyJ;GMI@mgG zR8{2ozRo>9zR9-a3@9zWz8e(<{dRgm&qp7|7nH&f-oV`*>^y254A8p0y%zcYa}zm$ z{w}QlvR)5lWvmylVBSbmft^8_qGBOrTh6meGdL>>;F6(Vh_VEi!u%;)lZ->B`#qTy z1n)ZLnHEv_&IU0ddRfLUQ<4P)vq?xo)G|s8xfOhq0L^ZuqPrd@O{o$W&ZI^}NeI2)q6^QQ6aGb+q*0wL%K(teGodwD+-K4HmPZ<-jxI?j*u^>fVSWBvufnRN%?6H-rCVlRbxH{H<(!bQD{3uH zkiw?&nth^Qzg#mkeJ=Z(GbfIb;v5bSayT0P!LxRH)!)EzKZp9io=-TbY?YARIQN2a zLOw|&m@llJO2ZWT&qPx{==8$dWg^ATNGIo+9?(VTv^FMi;5dVNG;d; z*@$$d%=?XBReSvJ!W!-%rqY@aA>KG)@Q(inxe{`!gcc@__JGq-4gqg9ji5No;aLlw z)CEr-6rAQzSA34qNEfqvghK>fx7chXUfq@49IO>W0>7 zYgI*#@9W&-$_1=&~N8g^m6oJd_^e?;SD@Iz}|0-g8^E1ch@4{e{LcN z(BFskU)Jk^tc>*n7R(!IDzGys>!?@=*^cw9(hM$&0=Q)87ose|)nWdetx3kA)BRqX zItbo%%rh;b@SP1}LiDnXU8W=p2xgOz8d1wAG2~wGO#(E#n~Lswm^7uTxqwdaWv{F` z>uH8yZmF&nzAp)(k6U!%nRCLwC~Ij{C(AMbvz|3|bU~BmjhPaU)AXNP;>^ zXg+>fVL8RR05fG-%pao^yR&s9-mI}%2qW}8|PjyPRJ)|1oMT} zQ)yVlV{901IbL=Qm}|9Yn1A}RTo3U%e2WT;LOaExfEz10Xi;%v2uWH^&qkpuSw3$3 zD%sP2SJiM2F_qSY2=UGdgBSciXqAw2CA2VcbO4-=atL^(X#~Yt4qGjFmI|IdDLBm` zRoq5%MXFUisZ}<$q=^Txu>LwYR*I(K=1|+4n?zk$>v=={E;w(e$QaI-dkokD;+3Mu zb#!$cJ3`W^(4np0Dt-l)vq?nZYdkWXt4*?Fhr=Ec*!z C{bQE^ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..898008db3452d9f4d0b96896b6ec7496978e60c4 GIT binary patch literal 1800 zcmcIlYfsxS6#brGaaAM)C|y^yA?DFaB}0g9tP|C?kCq#|sfSwC|+$Hz{#BWFNq@#E8|DCp0#D|$KlJiek7hVTLI?_uw+!NCBdySp2i7vGx9 z0UYkb<}d5@Kvu?j0So4hG!@txlqo6}Lbl^Pt2Bf2q5v)#t_x9?;8K`B7i)@f=yZQ2 zlY-z~$2`*_3g6ixCqysH*kwwxfMm8wNyu78i79u2Z)qWz;~|L~X|FAu3P6uYQQA8Vz$eHgDQNo#1!;9pC@pk+ZwO@I(CZe%G4si{tC zHebH4xtt&_z)V>d^M@?OZf%N$6jmi|F4=GE^Osq8WfxD%ttzjZC4~8MPRQ65wH+s@ zVuQR9pXm2*Hv&zos}|?XiIe_O4k&XD$HOgF9=+RUAy&lyWmBc#4CbZGHg$FIq9Hi>9_ja!DdISz8brl7s}g2LM;?4y&P`?^?{?Fhr= GEc*ijYGs)K literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..c4dec94beeab06cc9417edb979ae3a971b0f63da GIT binary patch literal 1797 zcmcIlYfsxS6#brGaaAM)C|y^yA?DFaB}0g9tP|C?kCq#|sfSSqegGsFNDa zhtF#)Cs-F?rYwv3LzH5-Hbp`TtCBXC?7Q{(Zym4f;xD~cY1f?+!hAU=WbBGsixZ@< zDZ62x=+`ed3{9V_{^rbylm1Z-$Z-zG!!10IUTx!u8S%fFPdKS;m5|*y_kwXkK1w5) zFRY$Q!y=w!Lvl;=s$;-h>qSHUD7DHa9XSjj<)iW@^nE!XoV1R@u~&CLX}T`s?6WDVm0xLv1f_E$YHr&l~D@!FfAH#&E>kVZbvW zUMqT>qDyh?2&t!q4sHEb@oTW0O(F_kC>%XuAC>&nr^UK#M;I<= F*$-{JWQ_m- literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..ef795a299f93127e50f7038c808bf3e89f5774c5 GIT binary patch literal 1797 zcmcIlYfsxS6#brGaaAM)C|y^yA?DFaB}0g9tP|C?kCq#|sfSR{@DA?pVed)fV1U-$-HphLFHPhC z4)SqegGsFNDa zr>|=)Cs-F?rYwv3LzH5-Hbp|N?BbIBv_Ai%S7BAsW(&us(ycPDJ0*nqa!$zD6}1*8 zNMTcX!#>e(UvC(iK3DzCnG+}dqa2Xq9FB)ucpkmk#t}2(e>0zOQrRjYyK(LXY8pXtmcxq{Jgp0! zJ}NlPp|1E6%@uX6;!&-#sU=N3fQ9wf!Ld>_4L66{Ufx>Ng|(hH)bE1xc8ZMQh`Gan z7eKsL^f*PA;@A;VPYWH|`mN&EU^$ya6u!nI!%>cl9Iz>9r(RGvdcr;``MFPvb=i(E HT+XszW=v#_ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..f8826de64ced4ef7e8ee4434ffaf9bec79b194f4 GIT binary patch literal 1794 zcmcIlYirvu6#edBaY>=H#aWiJl}>8{saHz3br}uYJ_^NB9IbkxN7I)6_+7^_4cQ7A zTS73huI@eO9v#Wqo}2-t#kUWmqM+X|Zs_&s5r16djC1uU31(o|q)P^PF@2-%+VtkMiFivqZ0xGqFlf=glkQmiS)q0{}5 zObUW`9rH|!D12v&oDjV%W0xt(0+QJ#B_V4WC8j(IzD58Z6GUXnm_A4N^xMDvZ(Kl#KM2 z)~YBa6z!kf_m5BM>HRT{kzyB>>0_-_w-4iGC20+98T@T27_=-1sR@xkQ=+WW@JQ1LinAPc3V2=vp05E;bEpw_ z(Ogku6>C^!gC$Kof`#?B!Ld>_4L6(Gp4{5hg|(hH((i-wc8ZGO{ql$jJ7BzW^f*PA z;@A;VUllsE_^sntWI3BeG`_|y!+RVDIbc)JUU@;`y%P@6$uE6ZtjqR<;c}M!0@V0p AfdBvi literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..76338c38427c13f86e938f966b9b50fb77435e93 GIT binary patch literal 1792 zcmcIlYfsxS6#brGaaAM)C|%d0A?DFaB}0g9tP|C?kCq#|sfSYhdwV1E?0b_r zguyOs{<2;VWM!-uuw>pyQ-PgBnWADLWLwU&N;5bu3gD9Ax)5avE``NYv8EV@PWN{* zDG1(m%rh;b@SSJmgy>}%yG%(Ikj!&Z60(+2V#=-H&q>hiW+u9uY0{J`aRHs+%U+o{ z>uH8$uBlE6-&car`zgBg%sJuTl}Q@a$+8RpwLBGCgT;9kt#5UtLF!0Dg)thOl9680 zS{0>)qWy#G{Q*6^9#CT_Hc^>AmP&Q^Fj*Cn*3g!~e^!EF%Yl#@o)9jsWGM)#iB4-S zUw^Eb93d>gTv-I>wRwZrD*&plk*I9UBmygP=DzAGbg!yto$ix-38KJRTX0!^!{78lHkqy9k-`-2<~N3XUphd%x{>j@{7trF@R7hW(<$Tw*O^M%zj zX;{RqY-DbQUNsC@YdvV>fB3Q*kMKGAgeHqZJH?`aD=Rr@QE_7ksqI=n8@)@TPQmDYp^@x}>*NBre>CFDd2EleEm0jJ{}0-k9aL2;JDTjCwpc*hU$W;xV= zZ_rv%LlqC8$_7fBxCcw?FN0&HXxeQyv^~1Dp9^a}Z*<=V=j{yj!t3Q06JDS2+RT#_ zU5XP&NPSi4(89N7UvuSb8qxC__X}@v9OQsKL3`l^h4)U_MH4^uRk1GH5{Aop_7}Js BW8?q; literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..291301bd41faf263c4e684c1ef9478f025869237 GIT binary patch literal 1789 zcmcIlYfsxS6#brGaaAM)C|%d0A?DFaB|~Tr)`@D{N6U@f)Wa|NK>_jOchfWibz@=! zsw#4PeeXH<_}Iy|YhdwV1E!3Jbs@?UTndYaVofm)o$ilh zQV_iBm}go<;X9AW3DL_kcA1haAeqOcBxEh4#FQJsACsWj^-Od()1)a?;sQFsm%TD^ z*3%5hTv44AzOMwKcT;rfnRCLwDw8y-lVuqIYI!QO28;77THoqOgVd3R3S%@lB_q9{ zwJJ&pMf(R=`vZD-HJ~O^Y@#xKES2i+VX`VDt)VS}f2;(5or2c^|xS79Q6-!*dOF@IC{2)IrQe_g5fo=Ryd>Unjdy$xZfL{vtS5il*IWL)&v(`?;{z^G5e=aNf>PFT7iBFyZ+LugyG3 z(WN+Xgw*$h4lR6Z_BB_|rV%}_alh~i$3YI*6STKoPN0lbI1JJ@?P*zaR>cXuQ6)3-Xa z550ZZ{AHaE$eFQTz=C-r%?#`m$`lm~A=`1D%`}7aq5v)#t_x9?;8K`B6|0MJXtjSO zlY-!F$2`*_3g6lyCqysH*k($yfMm8wNyu78i7B^&Zj~R zU%s!IoFFW~R9P1D`z*z7Y>I?j+QkL?ZGHaFRfSban=KYTR&G^!-6$c+`&{)mXHJ}Sk8(Kd<#0TBv4uHw@xNJ5IH~L`p}2AG1>=N#l}0dM zSUr)3Mcl_)<3{CG!+^EcgIfECFRSqYpMwwRuqd=sEDE@`l7kjAZVVxnSF303(6ue^ zC;qnV$+K-W*h5OCH6cR0al+vFe!f=;Ia5Ll6Nd-D=`e?Y2bxAuoaOMEc&8QK=>xn; z4i(@lv{qD5#RI6ahLR>8z{2{g;8-c@cIypo&TZxA!dlO3-FLxxJ3+nhZn?#TmnXb7 z^C(4^;>ZzF-4i-A@QvBmTsa#@^t{IX!YdpHIbcuF+;&0Xr4#ni#7})vtjl(U;c}Y& E0VD}x*Z=?k literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..722a3e564e52673e4f534ce16e7032fa33058eda GIT binary patch literal 1786 zcmcIlYfsxS6#brGaaAM)C|%d0A?DFaB|~Tr)`@D{N6U@f)Wa|NK?~x?@1|)4>W0Jw zR8{2o`rdQy@v)Qb$T?72{P;L73i|Wxl3t8IPcA8i5xj@HJJ|bYa5%*1?(SCRr*BQ> z5C;3O{mXhikd?7sz>;|*O$BxiWr~W0knK3nD$U@$D1b|b>q3+zxD*zT#hPLqI^AE% zq#$_LG0(J!!grpK6QY-8>@p=;Kr&BBNyu78i7B^&KP5r4o0;ftr%6+)#07MMFMDO; ztfv{0Sy7!7zOMzL_fvG~nRCLwDU&p+lVuqIYI!cS28;77THoqOgVd3R3S%@lB_q9} zwJJ&pMf*pq!vQ^B4QPxMo2X15OQpJdn5+v)YiLX0KP$noUcPY4&+vJ`~WM5i^E zFW)yzP7oGgt}KiBeU@T3Hbp`%?ec>CwmyHQRqL{wE(u}2To5vGMQy_=s@K47#i#n+ z+pR#;


9bK<0bl*8d5hvU(UEzF^h|LuChNoA{q%EpBkj1%%z8o_*F^-LNTaT6PX zTZz{V1J>FM8sQ(ltj8mKjy|BhqR>vUDB#*k4q8;)7(!~J*3ZVC8&BR({O#A1XM1Y2 zhm=ZdLWFqZgu%o8e4`R_ri2zIjt_v-aSj1bGmW4)%i%TgPHVi=2Y9m_YQR@$t*D`j z2T)}LB~3hlrS(_Au~IbcHXGU=+S<>BwVpS+?}GDohI-+}a*GKsPk3YINs2DTi6f-G vBy?!uTeEMtayE_Vd5!yp_cso5z@DJJ>Vm=>C+wq%AN!tIm+c6{YhdwV1E?0b_r zg#BIE{AIl!$jVqRV9C6ZrUE;MGDXEg$hMqkm1b~S6u>3Jbs@?UTndY)Vofm)o$l{s zQV_iBm}go<;XBXB3DL_kcA1haAerZ+BxEh4#FSgXpOc{3%}jJR)1)a?;sQFsm%TD^ z*3%5hTvMGCzOMwK_fvG~nRCLwE0Z*;lVuqIYI!QO28;77THoqOgVd3R3S%@lB_qA0 zwJJ&pMf-#6!2vzI-lxV;Y@#xKES2i+VX`VDt)VS}|EvVVmIEO*JRw|M$x;wf6P?yv zzW!J2=nEFkclg5Gfq*% z26-bs)gRt(1e#V?EiRZ7NBuz#gM%CnN3XUphd%x{>j@{7trF@R7hW(<$Tw*O^M%zj zX;{RqY-DbQUNsC@YdvV>fB3Q*kMKGAgeHqZJH?`aD=Rr@QE_7ksqI=n8@)@TPQmDYp^@x}>*NBre>CFDd2EleEm0jJ{}0-k9aL2;JDTjCwpc*hU$W;xV= zZ_rv%LlqC8$_7fBxCcw?FN0&HXxeQyv^~1Dp9^a}Z*<=V=j{yj!t3Q06JDS2+RT#_ zU5XP&NPSi4(89N7UvuSb8qxC__X}@v9OQsKL3`l^h4)U_MH4^uRk1GH5{Aop_7^n> BWDx)W literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..d7217f76583f6b35ad40d5613583db7b1dda4b3c GIT binary patch literal 1789 zcmcIlYfsxS6#brGaaAM)C|y^?D(2BjB|~Tr)`@D{N6U@f)Wa|NK>_jOchfWibz@=! zsw#4PeeXH<_}Iy|YhdwV1E!ZLR`OA7ekd?7sz>;|*O$BxiWr~W0kZn27D$U@uD1b|b>q3+zxD*x-#hPLqI^7@1 zq#$_LG0(J!!gn5#6QY-8>@p=;Kr)X>Nyu78i77XNKPEx5>zU|orb$z(#07MMFMDO; ztfv{0xuQBLd|wGd@22R|Gv|bVRVHavC(AMb)bdnl4HoBFw7%7m2B{+r6~<_AN=AA? zYgLpIiuMOrg9CbawNFi?*hFReSSr=s!(>%RT0>g`|5yozEeAqsctW_il%*h~COWOT zeEPa(a)ht|b7fh~@3IuTwkZ-)Se3LnV?V9W|GMzpF8_1Ys_eQ^LYOZXgiKsfJ8_Bv zHn1DTkiEIO-2_7#!qqIC{2)IrQe_g5fo=Ryd>Unjdy$xZfL{vtS5il*IWL)&v(`?;{z^G5e=aNf>PFT7iBFyZ+LugyG3 z(WN+Xgw*$h4lR6Z_BB_|rV%}_alh~i$3YI*6STKoPN0lbI1JJ@?P*zaR>cXuQ6)3-Xa z4~P4(`O7*TkTYYwfCckLni<$Blqo6}Lbl^Pn`s8;MFCtgTosvk-zGt`n~7*|rb$z(#09j1FFR%8 ztfLu{SyG)8zOMwK_fvG?nRCLwDU&p+m1P+KYI!cS28;77THokMgVd3R8OCUEN=AA` zD^-*digtU;-Vr@s9@1ziHZe0@ES0M6VYDhFt)UHp|EvW4h65oLJRw|M%Tf?h5gk`t zzIu_HyVQ<#0TBv4uHw@xNJ5IH~L`p}2AG1>=N#l}0dM zSUr)3Mcl_)<3{CG!+^EcgIfECFRSqYpMwwRuqd=sEDE@`l7kjAZVVxnSF303(6ue^ zC;qnV$+K-W*h5OCH6cR0al+vFe!f=;Ia5Ll6Nd-D=`e?Y2bxAuoaOMEc&8QK=>xn; z4i(@lv{qD5#RI6ahLR>8z{2{g;8-c@cIypo&TZxA!dlO3-FLxxJ3+nhZn?#TmnXb7 z^C(4^;>ZzF-4i-A@QvBmTsa#@^t{IX!YdpHIbcuF+;&0Xr4#ni#7})vtjl(U;c}Y& E0ieob1poj5 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..4dce804558dab8dcabcc69bd98aee63005fbff85 GIT binary patch literal 1786 zcmcIlYfsxS6#brGaaAM)C|y^?D(2BjB|~Tr)`@D{N6U@f)Wa|NK?~x?@1|)4>W0Jw zR8{2o`rdQy@v)Qb$T?72{P;L73i|Wxl3t8IPcA8i5xj@HJJ|bYa5%*1?(SCRr*BQ> z5DxcY`_=*M%rca49Svi#5eKbh^Ki zNkQsvke@cR8H#5=QPLrloi3{iiU-rtx zSx+-0v!XgFd|wMf@2BX}Gv|bVQzmItC(AMb)bdAmP&Q^Fj*Ip*3g!~e^!EF%Yl#@o)9jsWhn@$iB4-S zU%qdcoFFW~Tv-I?j+T{iNZGHYqtJY;VT@u24xgccXirR)#RIh>Eicj^s zw_Aax$#ruJ=EO;Vki+08hvU(UEzF^h|LuChNoA{q%EpBkj1%%z8o_*F^-LNTaT6PX zTZz{V1J>FM8sQ(ltj8mKjy|BhqR>vUDB#*k4q8;)7(!~J*3ZVC8&BR({O#A1XM1Y2 zhm=ZdLWFqZgu%o8e4`R_ri2zIjt_v-aSj1bGmW4)%i%TgPHVi=2Y9m_YQR@$t*D`j z2T)}LB~3hlrS(_Au~IbcHXGU=+S<>BwVpS+?}GDohI-+}a*GKsPk3YINs2DTi6f-G vBy?!uTeEMtayE_Vd5!yp_cso5z@DJJ>Vm=>C+wq%AN!tIm+c6{)hkxn`}eQfYRc}$5BzxpC{+^Z1j12PALrGJ>1^H)?bZ-0a`aV*CIcCYa$2G z--h*H*6V?+jP(K*%o}McurnxAR4jyS!+BO|2B$>ZI^}NeI2$q6^QQ6aGz^q*0wL%K(teQ=v6j+-K4HmPZ<-jx@G> zQA#M<-#@t8>(j%lJ~f777nRw?T&Zp!#>-678rmZG*HkcQQ4mrCAcTucSqegGsFNDa zm+vbqM_3nNrYwv3U6f+iHbp`TtCBWn?6>v#%Q&9f#e;Oq%&TSzVZNLbGIm8R#|g67 zR9>@B^#0p5L(}TA#W{202wBcyub;!=@Cna@cRB2$^OKId=!gIHe8Nd(tAqr{xfhHR z@>Lqad|~xe8W!;|8=hOHmmLG@4qb9LwpWDpw6PuPO&K9(n=0mRNNRsYQ@&i zMyD%r-fcV-?$Q5iYq*7&N^3%dcoI0$ywyL2;JDs}?-2 z3m)GqIL)E1_!`X>b*;#Xifn?w}8#v{XNj*A?yDQE{@P&j?UHY)kCuZwlr JhA>>tvOfVRWtji~ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..fee41926eef49969c2139fb8c68eec0e7d35273c GIT binary patch literal 1797 zcmcIlYfsxS6#brGaaAM)C|ySxh)hkxn`}eQfYRd2+fh-_uP5j9Z1jG7PALrGHQe69)-R2N0a`aV*COA2Y9a^F z--h*H*6V?+jP(K*%o}McurnxAR4jyS!+BO|2B$>ZI^}NeI2$q6^QQ6aGb+q*0wL%K(teQ=v6j+-K4HmPZ<-jx@G> zQA#M<-#@t8>(j%lJ~feI7nRw?T&Zp!#>-678rmZG(^N2MQ4mrCAcTucSqegGsFNDa zhtDf4M_3nNrYwv3U6f+iHbp`TtCBWn?7Q{(ZynF=;xE0HX;+;R!hAU=WbBGsixZ@< zDZ6H$=>3;#hNjPDe{<%<5mKDPUO$J!;Uk_0uX5N$=SLlP(GUOY`Gk|oRtedSb1xVt zLg2K@gwo%CseOj!` KHiY4Fmi+*1dSs0N literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..f0c20a9cb997ed7235293ae7202d8c010e9d9ca7 GIT binary patch literal 1797 zcmcIlYfsxS6#brGaaAM)C|ySxh+Z{h9^b{;kMdT8C=UW@$jrH<@D zcNf-wS*HVXYOEJ9XWmFN1v`NmH{A_=R#|+xX+^X4UaTP9ch@NjRvP=q*t_J zMJb_Z_wZI?j+W7_hX?^}jufnRN%?6H-rCVlRbxH{H<&2P_D=ICH zkixq1nth}XU#}VJK9~K?m=h;RaSjLF9FF_XcpkmUVIQ5Jb=*fk{IBN|PAWT1$Zni@ z!8jqGr4h^*R*$7&5l^xvxuJR4F<`FMq9*_0%W~bvXa60FEDG%uivq5#W1q>Z7y#m>cU#jYwEYbc{@hN zaKzkVzzZN=DSDWqOL6E3siuVvP5nmkE3ll6A_`yQk>M!EMGn{$G*d4q96ezdmHgDF M#ky=u7%nH-FJ%g3jQ{`u literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..af24f798c28562ebe60ce56463492603823cdba0 GIT binary patch literal 1794 zcmcIlYfsxS6#brGaaAM)C|ySxhB`y-hY z1n)ZLnHEv_&I@8f^s;7yb)=y}8x2m$NUvzk zic&(+{^8NhL7yJq^l6L~yQs`Q=1O(@FkWVo*3cHg-==~=i-M3E03lpl%Tf?hL!H!U zK7CzbIl;OBGi6!KAEFeywJ8#EX%`plr}g=Uyq0NKZ4$zKIVWW7idu*h|LggLlgd^Jsf}|l7$@YjG=llU z>Zvp=;xRT1w;V4!2F$fuG|WGKS+0lp9KJ(^MWLNyQNXp89JHvoF@)4wt)GoTSF(KA z_+7Fm|E{Xx4q_^;2@&G069zB%f6yu+XG&;c;%E;z9pw=4O4A66vm7>B@U$*?`l#SE zhq~e>nk(vB#iLqfQ%jn-2Mg=3f@7s<8g34?y}7lh3u`@ZsNV+X?Gzcq`ErK=8$i5L z^f*PA;@A;Vj|v^y`mN$uU^$ya6u!nI!#R$N9Iz>9M_y1kcfu|z`Kj-Ub=j6MT+Xsz D)3#%Q literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..abb555d9562d25bc4654f9ea18576d3eaa80d03c GIT binary patch literal 1800 zcmcIlYfsxS6#brGaaAP5P`Zu|5c6oIk|DGQ>qNEfqvghK>fx7chXUfq@49IODnsiu zsH({EeVu!Je3Na-8Bkh$|1c^F`s4J1o{v6_FDQi}yo0+t*!iPzFhJ|}_FCk}uTA6t z`mbR9m-TueD`UNY1@lIl3hWHZIw}@Iw&gslG=sCE04^E&g(ypKb(lY8Ym#y3bbr>S z4uW?b^Gu59*(+<# zdYU1aYpQF7?@L1H{T5w#=A7`a%UT-M$+8RpxjYkEgT;Lot#5gxLF!0Dg*Ff`3f~gBArL2>>BnT**=plAul! zn$O=>SdOtSz)V>d^ZO{pZfqS1DXglsIcLAD&!5Nf!Y&@ATV`H0O9=DjoRG0Ak{l<< zVpDm|KG6qn)(lOn%NFO%iDP6rhr|6Gj)s5nOipk68#wOekpAoWgpUNBC` z7ik3Zh1FAOSj59@cy5_qb_|$nwP^T%__ACN@i}~tI*UR(#iD>KD>-OUabpNcicQZ( zrz>&ZZ#)+6@&9XUxPzEVYeIy0 zPaYJU=8!7BL~})|RXnIwHnpUQyRfkSGB{R>rs3vL+xweDU0CaRL;W^5Z>Pu@PMKQ_ zcmc#KMUU&~>Ns|Uq=BJBTfbHO3M^-nh{D%+WH`-nkpngb?cfUvr%!l=N`C6=VqLZ+ I441R)H)Z-|zW@LL literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..9b74fccd9e336312d7c20de54ff6d0bdb4a63e49 GIT binary patch literal 1797 zcmcIlYfsxS6#brGaaAP5P`Zu|5c6oIk|DGQ>qNEfqvghK>fx7chXUfq@49IO>c-YR zsH({EeVu!Je3Na-8Bkh$elscx`sMV3o{!#*FDQi}yn?$s*!ih(FhJ|}_FClIk4@wN z`p;nfm-TueD`UNY1@lIl3hWHZIw}@Iw&gslG=sCE04^E&g(ypKb(lY9Ym#y3bidZ7 z4uW?b^Gu59*(+<# zdYU1aYpQF7?@L1H{T5w#=A7`)%UT-M$+8RpxjYkEgT;Lot#5gxLF!0Dg*Ff>BnT**=plAul! zn)jbpSdOtSz)V>d^ZO{pZfqS1DXglsIcMLj&wuH7VHbbtwM@I}lo002IU!?LBrQ&m z!lvw+eWDLutQnd z57G$c3#+Hnu!twwklfO|>=-cDYSEDY@MXCk;&b>KMHYp2ibVleR&vmy;>Hk?@K)0ZinAP^wBSi9 zc=Dj&G>262DVi%%t>QtgvZ*Ca+=Yepm%*`8Gz~Y0+Fsry>cU#j8|t^gc{@eMaKzkV zz!M-|DSBK-SI4mqNEfqvghK>fx7chXUfq@49IO>R{`% zQB{%S`#Sgd_$J$uGoZBi`fgMd^xNqLJs*7-Ur-7|cmsELu=A*KFhJ|}_FClo&rRe2 z`Y&Mpm-TueD`UNY1@lIl3hWHZIw}@Iw&gslG=sCE04^E&g(ypKb(lY8Ym#y3bidc8 z4uW?b^Gu59*(+<# zdYU1aYpQF7?@L1H{T5w#=A7^^%UT-M$+8RpxjYkEgT;Lot#5gxLF!0Dg*Fg1=1#gBArL2>>BnT**=plAul! znvY*rSdOtSz)V>d^ZO{pZfqS1xv-0K_QU%8w_b%+wKf|#K9+8odDST)%$IXQ#;!blk>VT<_j5QJ{=qXjz3y+|xR*ovujdm^DqGdaZk&6; zI3b^;5zH4>Po-fIPqHDorFq#gV6N4oA^+jaay`W7@GXig3hfk&0y# LWn02 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..d0c41a874dc5fd22151068908ab37a2793e9c0c6 GIT binary patch literal 1794 zcmcIlYfsxS6#brGaaAP5P`Zu|5c6oIk|DGQ>qNEfqvghK>fx7chXUfq@49IO>W0>7 zYgI*#@9W&-$_1=&~N8g^m6oJd_^e?;SD@Iz}|0-g8^E1ch@4{e{LcN z(0>8zzpU2-SsCjEESNXaRA6UN)={w#vK{AHr5Ril1#rpGFGN{_tHb;`Ta%1Kr~AD& zbr8Jkm}go<;X50|gy>}%yG%(I5X>eaHKLYLV#vMVn*?ZfHx=FWFlkCva{-;;%U)S? z*3%5Z+)`aDd|whmAGhejGv|bVS=Q31PL^c=$mNC58Z7R!Xno5g4N^xMDzwqyl#KM6 zCRUUZiuR9=Z;ub@$!(v;NU@8`9AK_GwGZQECTR_A5&Uf`7_=w|NdO4p;zpK&kOXy- z(0u%|!g7jr0cOgwm_J4-c4zBI$dz4OvLDvxTk=|_UA0LF^W~h7u`7}gC&*t@cFjJ~ zN3YflO_R&!=FEvx-mI}%2qW}8|PjyPRJ)|1oMT} zQ)yVlV{901IbL=Qm}|9Yn1A}RTo3U%e2WT;LOaExfEz10Xi;%v2uWH^&qkpuSw3$3 zD%sP2SJiM2F_qSY2=UGdgBSciXqAw2CA2Vcv=5w)atL^(X#~Yt4qGjFmI|IdDLBm` zRoq5%MXFUisZ}<$q>1~mu>LwYR*I(K=1|+4n?zk$>v=={E;w(e$QaI-dkokD;+3Mu zb#!$cJ3`W^(4np0Dt-l)vq?nZYdkWX_=*M%rca4F26iZ#VJbh^Ki zNkQ@p=;Kr-8;BxEh4#FSgXw@J|KW-7XyY0{J`aRHs+%U+o{ z>uH8$mQ*K&?<+y*{S;kz=A7`a$|Q~IWLXA)TAmB7!QwoN*0(y+Aa$gn!Wa!s$w;qg zt%_1Y(WCzIuzyUCmwjps#V#syh_zDPK8#nDq&2i<@UNv{(6S(;CO`-m*Rm9Z)Kn)m zn=jwjTuu-dV5TgK`F)mRH#S8=3agSf7wot7`SUEiw2MdOR+ZPy62g2rCuHo3+Kv-c zu|eL5PxRZ@8-b?PRf}`x#K}=V2b4L7qQD~=F6mV@N2Q4aY3?a2+>u2NB zwK?x69vk=M|GhQbLr$eNAws-y!r(D~2FDU|ri2zIjt+p+Q4RqQHjSV-%i%=b`Sz^0(R_=3XQC+wq>pZdC3m+c6{ Hc-SP zsH({EeVu!Je3R|S8Bkh$c{?f!`t|IRUX0$4FDZo~yoS3w*!!h%FhJ|>?ndOhPfg?i z`uni?%X&SKm9bvHf_WoN1$G8yii(Ah?KsaW&EUK!fJ=scA<7b53iHQoO)?Ih?zd!8 z5WMS{XIezzJ6ptr=w%tZOi30H%r+qjQOhVXJS znjx4a)k)#|iV%9gMHikqC;YQANuxShmH{A_=R#|+xX+^XEsr!v9cif0MuSr_(koiC zqLfhdsJ}eyAJgMypPERqi^?2gu2i=V<5ebU4Q&zpX(|}BCZ^ECnGo)Jcuz z!{;@Y6RZm`QReSR9!W!-&rqY@aA>KG)@Q(inxe{`wgcc@_4uI294gqg9ji5No;Ykag)&)-= z6rAQzSA2@*in>R{@$ zQB{%S`#Sgd_$J$tGoZBi_I^|p^!wQ*y%>EQUs4J~cnf!Tu=l8OFhJ|>?ndN?FHPhC z`uni?%X&SKm9bvHf_WoN1$G8yii(Ah?KsaW&EUK!fJ=scA<7b53iGFIO)?Ih?vG?r z5WMS{XIezzJ6ptr=w%tZOi30H%r+qjQOhVXJS znjx4a)k)#|iV%9gMHikqC;W>tNuxShmH{A_=R#|+xX+^XEsr!v9cif0MuSr_(koiC zqLfhdsJ}eyAJgMypGHHmi^?2gu2i=V<5ebU4Q&zpZ7LYFCZ^ECnGo)Jcuz z)7LeY6RZm`Q=N# zmPRmNSUr`7MLfxdQtgvZ*CaJb;DuSHZDTGz~Y0+FssT)P=R4H`MQf^LC1i;fT4# zfM-CwR`fVUm*UtFQcnvV+WM{H*I+rDL=?WpBg0XSiyW{iXs2FKIC{c9D*35Ti*?zK IFkH^EUo-Y(djJ3c literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..e4f10562115c394988e83ae907ca09e00f199fc7 GIT binary patch literal 1794 zcmcIlYirvu6#edBaY>=H#aY&L8=ck!Qm>Rfx{QWxABAEmj#jxI4-zGt`yQ%1Irb$z(#07MMFMDO; ztfv{0SyG)8zOMwKk5hEvnRCLwD3dg*lVuqIYI!NN28;77THoqOgVd3R3S%@lB_q9| zwJJ&pMNj(60 z>z5mWrpZ-vbLPa^Nk0b^IET~W7AxPp+G61`R>c2iJ>jIXRYGaw+zZAD`7DiKzOZ^K z4U4#qjl!+QtBwI{tp|qNEfqvghK>fx9Cpn&-CyJ;GM%FsFu zsw#4PU*{el-(*{I29y@xKa7fk{y4p$=c7;K3rb-K@8IqZcK&D_4A8p0y%zcLYZEzu z{wrAjWxXEA%2+R8!Mu^C0y~2;Ma4qMwwz~`W^h&%z$HV!5M>E2h51vqCK-oL_h&LG z2;OzfGcBUI>wRwZrD*)Qw!=W)ESi%02}nODsc!hAU=WbBGsjuT|D zsk~;N=z}+FhNjhJi*x40@qRxCq&bJ9;a@!W(CP5)CXPAm;eS1!a8lVSA;EF(1>=N# zkw!3ISUr`7MLf)g=a%VZ$AGz3i-!M)FU$21pTqa4vnaGvEDE@?l7ki%H-?Z}vGudj z=}MgU8;^y1{Que-?jWYpnh+u0IAQRbKZ9ZkIaNXn6Gywi=_rSQ7n?>qNEfqvghK>fx9Cpn&-CyJ;GMy0LW+ zsw#4PU*{el-(*{I29y?`-;9caemT9M=c9My3rb-Kui)+uc7AFc4A8p0y%zcQV-q=m z{xewrWxXEA%2+R8!Mu^C0y~2;Ma4qMwwz~`W^h&%z$HV!5M>E2h52K)CK-oL_iHjK z2;OzfGcBUS7jq9CLOKnNFCvJ`~WP$xB- z_n%f+jI>wRwZrD**EL+Upijc#b0_Y)2=!tg!yt#$k-LN7AHty zQ+CZh(FZTq3{9WQ{^rbyprH-1#@@xKdexPzEVYeIy0Zk}=j{|3!x3|f z0Z)K4&`!OeaP)*{sN}~!E!JgQ J!f-juz5{KtWQ_m- literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..f9d2760ef159af2afdd91d08e2e7315965bbb3ff GIT binary patch literal 1797 zcmcIlYfsxS6#brGaaAP5P`ZvX5c6oIk|DGQ>qNEfqvghK>fx9Cpn&-CyJ;GMI@mgG zR8{2ozRo>9zR9-a3@9zWz8e(<{dRgm&qp7|7nH&f-oV`*>^y254A8p0y%zcYa}zm$ z{tHE2h51vqCK-oL_j@ua z2;OzfGcBUI?j*u^>fVSWBvufnRN%?6H-rCVlRbxH{H<(!bQD{3uH zkiw?&nth@VUac9LK9~K?nG?tR{Tz_v9FB&6@Z3YE!`GWQ=CFtV^?brfWvhhj#<>@a z6Y@zK!F*x$R2mlXBpZ@jnwK2|=2|Tp@*ln|*F$^`-=fH(&`z-^;L1u4T2$N^LTb6z z&qkyxW!`W6s@mg!7uIkGF_qSY2=T@VgLnKt$d!;&CA2Vcv+|$4b#O+#G6qd23M@)_UGhzYWgYDKds5 z<`x5<0r5)F;}l(rV@F6mEp%w>w~AkZ#HXz^0&`dO_jn2`^B|Pkmaf L%eI8!a+duBWqNEfqvghK>fx9Cpn&-CyJ;GMx}kO2 zT2+zb`#Sgd_$J$tGoZBi`fgMd^xOFry&QcQUr`D}cmod)u=iWzV1U-$-L=T~pPR@5 z^k2aGFYEO{R>pb(3+9b971$Y+DJm91w&OgjG=qzx04^E&g(ypKDa@aUYR)S zX@+2KsZI*tmxR#AExPc`IpJTHNgCD3vJ3#ZybxN0#eEj7Z+WCa>PSO{HX59gkzUi9 z6{UothyC0B(J?)_?b8@3c2SuF%$4f)VZ6*Ft)VS~zfA>$76lmfdeZ&6`UXs1{daAPG0Eh=sdA+=WPXQR-SEFU+1 zmF(%it7^E1m`ZCxgm~wK!3+K$v`Wai5?Yuz+6PWYIRw1YG=kzRhpiSos|%hzDLBoc zuDFfnin> zpuY>7pQ6_TIXBh|STb*g`|5*tJEeAp}JRw|M$yyMSiB2+? zuRqpIjt~}LrYwu)eU@T3Hbp`ztCBY7?2q;3>nyym%SYu_mDjxz!hF3TWbCSJ#tCZJ zAaBGc`osH;K-21~#RYTX=%8Oh{v8fqYvFzg`MX(9IH~MBp}uk91>=N#lSVLKSv{48 zMcm3p=2qxc!+^EcgGT;`FRSqopTkdRvM97uEDE@?l7kj=ZVVyWuKC%RbnVRhiN~%z zda<*HJ4mUvCPaugP8dAmFSjcpCrW5x;%E;z9hDI9Ow$O8vl8AC?>OTfKfs%okOAMI zwJJkZ51{G>N}9L_OY1L#W3_17Z8o$$y4lZ_wVpS+Z-eu8ihAMoa*GMCPk3$Saf&X* zu_Gj36*{!=t=ZRHIh#cEyvF^)TO0=^U{BCqctPR46L!(WPyMS{S8NHx<*fJ%h6-cS literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..a088516a475edf709fd99faf554b0fbc8611e199 GIT binary patch literal 1789 zcmcIl+fExX5PjdT7zGId+B7Ao5_L(WutlgBnu@6Hqvfof#KkxHLICmc9cQxwrKO?; zqGaXuc;=irK6Z*NIRi?oZ|_G{MZce%)3edX@j0b1gtu^e3p;-d4h9(A-rmUk@TJKd zKz|oDKSi$xa&D{_uw>pyGY30^Iz`1oD7KtubIstis(?#|>q3+@xD*x-#hPLqI^7@1 zq#$_LG0(J!!gn5#6QY-O>@p=;Kr)X>Nyu78i77XNKPEx5>#68&rb$z)#07MMuX}ak ztfv{0xuQBLd|wGd@22R|Gv|bVRVQgwrzi>l)bdnl4HoBFw7%7m2B{+rbBxj8l#KL( zW>u6DiXQZ@4*K-)s!vU%*u>oIW2uz8hw-YAw1&0>{;?7aS`LI{ctW_il(irv6P;u( zpT4e{93d>gOj#DoyDY`7ZHk0cRwZrD*iY-rzb-tt%l}-pD!Xo!5a#OzA!Ao%Cr(hn z26iJp(QjUF1e!iq{VkXiM+f~9^6zl?TnqP0$luL+!bxT43B`>IFBm7}vowPF%Ic{! zEaE;k8n-I18V0Pj9yHqDe_4%(_#D1Nhee^CVo|`Ql^nE~b7Kg}Ud_+Op=(>-P5f@z zqbJ*HxPz2xYeIy0?S#Se{dBJqa-xJ5CXV)i(@_Zl4>XOSI4j{L@s2az@jbk02^sJO zTB|Ztbq}g;prna=u(bXnI97|M-DX4EbDRBKS?hVD`!+alr>Ga+EjO6({DjwL9;fJ1 z96LhtJ)uJj-&$Z5I??I$;-0{Lo*Db;Xu2T+WJL`DS9q literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..ca317b8b7e4d912b8c8c928ed3c269a0d8b19f40 GIT binary patch literal 1789 zcmcIl+fExX5PjdT7=eTUZJH8PiMk|G*do*mO-0o9(Q?*K;^LcpA%OV!jTQm9e^ zQL^%SJaf(*A3MdioC2lQkB`HuqCZbB==t#T=z>xhzu#Mm z>uQE%uBlE6-g`|5*w8EeAp}JRw|M$yyMSiH-UQ%?1k}E4Qq?YLpP>>p3AKS7j%T zQNRXyEk4%o-mV3jK9~K?nG?r{y%O^8Xz)@C4@$`2^?JfdWoHS+jdL#;C*-R%g89no zi8L(YJ~kS+DlZ!bthE|6+CO|*jtBT0d_adqp`Bt;z?GF8w3u;Y2+3Z}&&HuETi#Fn zZQ0{z+iI|jlxk~2gm~kG!SnrmuM%>qgcc?a_kq)42>}l@ji5Ly;WhD2GTzApyh#Ze z@D*CCGF0^ds&1g9iTkjy{xUdLi>BRXL)&wk{ajh=d87LdIBzGY7v3$mnDFw1S7sii z=u#XxLh?PKLkr)UeZ`ftaYWB++%LSsaZm#G1nq4X6ka-E4^8~kUy60bwlG{yi$C%9 BV#WXf literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..2b90bf0e3f93ea2d6e2878ce9cb6a7d2b64501f1 GIT binary patch literal 1786 zcmcIl+fExX5PjdT7zGId+B7Ao5_L(WutlgBnu@6Hqvfof#KkxHLIUFBJI-bWN{d7V zM9IqQ@yt1MeC!lEat4%EKR%ACivB#iq!**l<4a0m2=C$U4)*>T91JkJyStV7>06UI zfc`#ge~MlYsvke@cR8H&fBwPLrlqi3{iiU-#<7 zSx+-0v!XgFd|wMf@2BX}Gv|bVQzvOurzi>l)bd zW>u6DiXQb>M}2y{>eCo0HZeDcSSsc2VZ1IRt)VS}|EvUqmIEOfo)9jsWi1HFL?@Zc zm+u=UCkP8LQNT)i@ri!- zb}P^{xo&R3oH#k^mymzQ!MNH%JIHul_j@_yoPzn(nXQ^P%^ zR9h1w#2Y6J9`5HGm5?(fv@mgW0Gy6W2zZ)l1jShiuZee>@lGG$O-smtuh3eRp{fT^ zbps_$JbqNEfqvghK>fx9Cpn&-CyJ;GM%8;0V zs)`(6-+RtIK6bJVIRi?IpPxrXL4TcG(DTuk@dc$YgpY7{2V4IP4h9(A++52%``%;@ zpuY|4zpU2-SsCjEESNXaRA6UNrl?p5*@pA1(hN?E0=Q(jE<{;^OJV+0tSQE!)BT-H z3W9eX^Gu5n^PB=eksuXZkUG*(VT=Z+WTcn0 zRz)eH=wAQ&V4ogd_o*=yo2bk#mP&Q^FkTjt*3g!~e^!D)%Yl#@o)9jsWGM)#iB4)R zUw^Ea93d>gOj#E5`z*z7Y>I>wRwZrD*&plk*I9UB7mvy=nwDL0!^#S7U#@~qrH9(2m3i34qt2GZVvT-y`FGV*(#yFaqb1GvoYz)nfDWq zU3>ImXAQTIQfWf&&5fo=Ryd~aojd%P2Z<<35 z_y(;NHB|8cs%)U7i94{c{xUdLil*IWL))WU`?;{z^G5egaNbT)FT7rEG2!(IugpA7 z(WN+cgw$7s4lR6Z_7zvoCJ{Zaalh~u$3YI*6SNmzPQfUbHc^>fES2i+VZ1CPt)VS}f2;(9mIEO*JRw|M%2E(g6P?st zK7Cy=IYL-~nX)YAcUg*E+Y|{YtV-IPv7grGe_eQP7yr3xS$5SZA-B__%2o-*jdL#;C*-po0<1rD)o1HncsrwVw-XJ#Td11n2D(^}@U51{0p2@XE~N z6kUpAM@W57=+MHqW?ymTY!cD)8utsYa2(`-Jwbch1%;PR*hUjS^hvQU+YpA!S@sKd CHe&Mt literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..617de1e696dbc8922a16f2f3ebc094e816601b37 GIT binary patch literal 1789 zcmcIl+fExX5PjdT7$p({v}sD9h`JW}o0yqBES0M6VYDnHt)UHp|EvW4h65oLJRw|M$x;wf5gk`t zzI9iFXx1eTv0i3 zi~`okYw@vu_jWB%_qptE&YU>j@8xiKki*g7r55hxQ2p2I2`81EB@{Q#yuhIzS z3#%v6u!#FuYuu>3Y#6ZCYEWzc@MSq3;B)W+9TtUlibVleR&vl{#*HDQ@@n<09lEmR z{lwpvJ$|;W2HQxfv?fG|H%=Hl-_Q3dA*V`cVd8KXI34B?@IccDinAPE6Yr$LJ9&UN z$)N&#h1QA+s(1iZ)=<*KU07Iu85}D`-EO_1&AF}oTv+RQt@{=@Zzrf1-YvJ7@bZLL zW*(*JQXDx#s(V6*2EH--iYsU1h@RKDUwDP%AP4LTn%gcYymZ12n)sqNEfqvghK>fx7chXUfq@49IO>W0Jw zR8{2o`rdQy@v)P2YhYilF()3+vb z2!kEi{AK+<$jVqRV9C6ZrUE;MvW|*{kaeABm1b~O6u>3Jbs@?UTpbpV#hPLqI=x@D zse|A>$2`*_3g3A`PKaKXvB#8T0m(cirAF2=N=&&G{3!{V-ONO9GfkRO)m%U)__ANt zob@$BGS^hs3g1_P(EBO6^vpTo-;}j9s*`0I0BU(Av<8dwELz{{NQ2aoh6-aeI3**! zq)8Q}grfU{>%#+jbUmOkQf#6!dswPY-NR&6NLoW%0{>YFhAjs|507*Ig3Ae7PWG;)-O$DXQ1NZp5eh z-P?^o)8wkT1#{wfe~`oBK@LZwms+@&L;7#l6HY2y)u?P-c)>U!U!@Vu7go=tVG%d6 z5xA9j)i7YK^`H^{;mc|~!sqA%+A9j}6pI3`tmL3Y#f>2(88tl{d#*isKk>I;kDu+S z(Kb>ltqBq0jS~hB_w$WP$f*)qm^j`APRBU}Jk2zM;w*>P#5+lNClByuIV8YWXst+4 z#RI6afs!Wf!qWQ7;8-b|cAE`t4{h>uVXfzl?pxrzouOWMvD{+9%M)Ikc~VDL$B82( wT@pI9@U7X`TsfOY^t{IX!uuNsIbcuFUUfm?jT3gz#E*SXtjoH>a5>NZ0Oej`=Kufz literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..7eb8fe18ebb34620e1a33823059449a4db301c82 GIT binary patch literal 1800 zcmcIlYfsxS6#brGaaAM)C|yU#5c6oIk|D%4)`@D{N6U@f)Wa|NK>_jOchfWil_8i0 zRTVkDuXB%&Z?X+J14@e@pGHMNe;%LHv(e}AIi)a!4{&=6TYohU258;fT#G#Y)auAk8@(44?3P*UzE$e$sIlGva?epKwyyDj~sf?git7 ze3eEpUsyerhDAKghUb>)WygTIR*Q!J`!CD&5TC=3sIw@vQ!EO&w334s6*q>ETCw%B z(dkN@cN-6dd-(s_8g3z`(wYz!09N5fESxaP@LuPss)eg zf=BlXPIIU$zD9FJU8}fPt88jX6L(-?{Y7xB6ivg;p|o4Q zFyIvsuM|B_(WN+cgwz8=hqivJ_!U^rCJ}|N@yKwR<01!a3fjRJ6i%P8jY@v(>tbED JAq<>JPWv>7L literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..fd4f44152a35b7c2d892092ccb12bd25432f22eb GIT binary patch literal 1797 zcmcIlYfsxS6#brGaaAM)C|yU#5c6oIk|D%4)`@D{N6U@f)Wa|NK>_jOchfWibz`sx zRTVkDuXB%&Z?X+J14@f8??y#IzaF2{v(fwUIi)a!H*k9kTfa08258;fT#G#Y)I<)T zzYXiZtk(ls8S4csm^ac?U}sRKs8|TuhV!h_3{HvyxMb)TqAbCsFn`F_B;(NOeoH0= z!Ml!mrbQIK^MsfXy)0vwDaitYc}hq^)G|s8xe@#+0h(P;MRz?+no=b$pc8!AD-&lu z%@E8L)k)#|k`Q{gMHikqC;W>tNuxShmH{A_CqiqmxX+^XEsr!v9cif0MuSr_(hFL% zqLff{fA4Cye?Ske`qV^-+zZAD z`6!KGzOZ^K4U2e^4aqId%Z>qatriXW_g|LlAwGw1QDjkQr&ttlX(b0ODsBuRwOs3G zBhr;J?>2r`?cu)*Yq*7&N^3%dcRQFUT4hs9nz#cC>o0<1rDz&%4z<0!wWtegJ#VPr1n2D(8N(5C zg8|Qgc%|rZiY~>mBcz@dI<)m$#jn6}Hi;;FjYo!~92YrYQ_xPmpm6krZB+6@pBC$~ K4Pm&PWj_F?%VeVf literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..1b5412a8bc484e7389987d8f9a1ab02135451c0a GIT binary patch literal 1797 zcmcIlYfsxS6#brGaaAP5P`ZweA?DFaB|~Tr)`@D{N6U@f)Wa|NK>_jOchfWibuc<@ zR8{2ozRo>9zR9-a3@9zWy&n|?{eE&z&qg1|=aj+_-oouI>^x{34A8p0y%zc5OA|SO z{tHE2h52K)CK-oL_eU}* z2;OzfGcBUXZ=X%Q+!qSJYaZ zAcalkHTy)rd9`L}`ds!mXHFa)?B{?S=WsZD!t-@MhuZr|$34u5|Mh&rNoA{q?8dnl zj1%%%8o_*F^;8-b@gy6PTbh?01Lj&S8uIVIEZ0MP4&R~3qR>vUDB#jc4q8;)7(!~f z*3U+yD`nno{I1%ge;3wp2Qih_gb4B434?e1KggAk6D71TakLAZj&cZit7!zqSq{%y z@VG8`e6Qd%hq~f(G*{HMihH%nrj|5u7Z%oE1jkCzG~66&dwFY77uI^-P`?e%+bJ@J zBjyGJo&oVn(c=_diepDeJuP%->$i$uf#qxxQTQ5<3`aRGa=@mboq9px=m{@S$&Y+iQ{MUz*4P z^mk$Xm-TueD`UNY1@lIl3hWHZIw}@Iw&gslG=sCE04^E&g(ypKb(lY8Ym#y3bbr*Q z4uW?b^Gu5n^P1oM)R8d1wAG2~A0mjq~bI~CpaFlkCva{-;;%U)S? z*3%5Z+)!OBd|whmAGYYiGv|bVRo2p|PL^c=$mN;P8Z7R!Xno5g4N^xMDzwqyl#KL> zCRUUZiXI-^?Dvo8(M_MmNU@8`>|?GvwGZQECTR_A5&Uf`7_=w|NdO4p;#!u1kOXy- z(0uy3!g7pt0cOgwm_I}*c5CZM$faFeu%FiF8}eGFUA0LF^W~h7u`7}gC&*t@cFjJ~ zZ(pw&nkJXc&6yL&hX*+z!8sfapYeRt&mnm~>$r~@@xPu=IH_z^Beikv1>=N#mPRmN zSUr`7MLfoa;g;iN$AGz3i-!5fFU$21pTqa4uqd=sEDE@`l7ki%H-?a;)%0u>x{~F? z#_y6n{&!UkcMwx)O^6V0oiKR8|ASTuIaNXn6GwZ%=_rSQSDHppoaM05f+wlq$)keP z98$$iG*_fr#iLqfQ%jn-2Mg=3f@7s<8g34?y}3!$g|(hH)Nh0Hc8ZMQe7VDb4Io}A zdR#|W$FU9M_y1kcfu|z`Kj-Ub=j6MT+Xsz D4+LYE literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..7b1690088f88b3487c0b7b73a6350e2bee35ffd1 GIT binary patch literal 1800 zcmcIlYfsxS6#brGaaAM)C|ySfh)hkxn`}eOfYRc}$5BzxpC{+^Z1j12PALrGJ>1^H)?bZ-0a`aV*CIcCYa$2G z--h*H*6V?&wQ(F4%xR%(Ff%A)kj(pR!#Y+g2B$>p#t?a+Nifu1bRUe zD^hYv5B9G5`v>&!s!z41*hQ^(F;`6O!+4oV7)@IQ|C$O0Eeb*s075E$DN0UAf;vfP zzI*tWXpLE>KA^q3$aVw;$BN80uj=-cDYSHk2|7E!z;&b=`br!g?k_8TzMpU49U1?27QfzuQ zI$eqLZsVbFkN#gWUGj|`_dE^@%8pdEZ(;`H&`sN~1KF2-gX JTvuY2{Q+msWxoIb literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..653bcd12f7001449488642cf959fdd05b43e6a7f GIT binary patch literal 1797 zcmcIlYfsxS6#brGaaAM)C|ySfh)hkxn`}eQfYRd2+fh-_uP5j9Z1jG7PALrGHQe69)-R2N0a`aV*COA2Y9a^F z--h*H*6V?+jP(K*%o}McurnxAR4jyS!+BO|2B$>ZI^}NeI2$q6^QQ6aGb+q*0wL%K(teQ=v6j+-K4HmPZ<-jx@G> zQA#L!uy@tpKcI(KeQF}bE-JH&xl-LejF*|DHMB+Wr>S7jq9CLOKnNF?vJ`~WP$xB- z51&_9j<7DkOj#E5yC}u3ZHj~xRwZrD*mvvm-#VV##b0_Y)2=!tg!yt#$k-LN7AHty zQ+CZh(fcpg3{9WQ{^rbyql3L1kmDQgP~ySb#OLr0iYyB46pI2bt>mCZ#f>4PmTUcN zM7mPu-Nw(VJ^FWH4Yv?eX-$X_ubnV>$Nz&|2{}!09N5fVY}PP@LuPtObwj zg2(p?PIIU$K1Xv!U8}fPt88jX6L(-?{Y7xB6ivg;p|+Q|7Ik5*=MD9n;Jlq8V>n`N zFyI*wuM|B_(WN+cgw)eQhqivJ_!U^rCJ}|N@yKwL<01!a3fid`6po&-jY@v#(_&q= JAq<7~WWU~MO literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..3e62fcdf418ef1f4df1827b5fcbd3953ca837824 GIT binary patch literal 1797 zcmcIlYfsxS6#brGaaAM)C|ySfhR@!* zsH({EeVu!Je3Na-2~b*mdp{@&`u*&ZUJO1CFDZpSyoI|v*m>00>!EdfdoA+AmpZZs z-CbD!Wt|Slsj*(boOvV76zl}bDk>I2w&gsVY6j;;0bDZl3sIKfsxW)XRwv`oYX7KA z6$EcP=9v~z_|^t7A$nQHHdB%X1hYvdT3#u!H?@L1H{T7{j=A7`a%1Rp5%CZaqxjYwIgT;Lot#5dwLF!1u6m2v(B_q9} zi4~=UqDKdd?%@$VUUX?R6uX$3eauy*_F=fpB(0$hg1=1#y#@s#2>>BnT+31rlAw+f znonO>SWd7mz(iRV^ZO{pZfq3^xwP{O_S5?Ok6wjUl{OnVK9+8odDST)%$GAlhOS6j z93h2uoK$vNA-i$r z1>=N#mPRmNSUr}8MLfxxsr#pJ(yd66&x!?-Ee)V&E-v^F0A#urhXfow_{`s zN6ak-ya3{rqK8#MB!^ZG92Z&$N`&zX6glnqbKa5lArpt MSeI=H!{sFV1&n!$Nd0GAB?LX;)AI?SK4HOV-1x<6`D z2f@3Jd8S1azVm{Z5WOs8mnq2tf_X_uji_am7;-20O9C{zor>;ym^7uTxqwdaWv{F` z>uH8yZm6ymzAp)(4_kEMnRCLwDr;#}C(AMbvz|3|bU~BmjhPaV<+hNP;>^ zXg+;iVL8FN05fG-%pal@yR~&BKWLSZGbOYzakK}Vj&cZirD+7kSq>X5c$x~HJ}NlP zAywQ&b499EJgQYTwWNuAu(19rI97_L;pR}=o0~*kSnGL1{Wdsnr^p!2mpcsD0OFOR z$8~gd96LhNsL-LU-zt6uma|Dj;cGlHoa4C20h@w$qNEfqvghK>fx9Cpn&-CyJ;GM%8;0V zs)`(6-+RtIK6bJVIRi?IpPxrXL4TcG(DTuk@dc$YgpY7{2V4IP4h9(A++52%``%;@ zpuY|4zpU2-SsCjEESNXaRA6UNrl?p5*@pA1(hN?E0=Q(jE<{;^OJV+0tSQE!)BT-H z3W9eX^Gu5n^PB=eksuXZkUG*(VT=Z+WTcn0 zRz)eH=)wN=UY{Ob_o*=yo2bk#mP&Q^FkTjt*3g!~e^!D)%Yl#@o)9jsWGM)#iB4)R zUw^Ea93d>gOj#E5`z*z7Y>I>wRwZrD*&plk*I9UB7mvy=nwDL0!^#S7U#@~ql5h%_WC&-4qt2GZVvT-y`FGV*(#yFaqb1GvoYz)nfDWq zU3>ImXAQTIQfWf&&5fo=Ryd~aojd%P2Z<<35 z_y(;NHB|8cs%)U7i94{c{xUdLil*IWL))WU`?;{z^G5egaNbT)FT7rEG2!(IugpA7 z(WN+cgw$7s4lR6Z_7zvoCJ{Zaalh~u$3YI*6SNmzP>QfUbHc^>fES2i+VZ1CPt)VS}f2;(9mIEO*JRw|M%2E(g6P?st zK7Cy=IYL-~nX)YAcUg*E+Y|{YtV-IPv7grGe_eQP7yr3xS$5SZA-B__%2o-*jdL#;C*-po0<1rD)o1HncsrwVw-XJ#Td11n2D(^}@U51{0p2@XE~N z6kUpAM@W57=+MHqW?ymTY!cD)8utsYa2(`-Jwbch1%;PR*hUjS^hvQU+YpA!S@sKd C$71sU literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..7138361db92a43bbbc98c265178fd8e5e17fc700 GIT binary patch literal 1789 zcmcIl+fExX5PjdT7$p({v}sD9h`JW}o0yqBES0M6VYDnHt)UHp|EvW4h65oLJRw|M$x;wf5gk`t zzI9iFXx1eTv0i3 zi~`okYw@vu_jWB%_qptE&YUuhIzS z3#%v6u!#FuYuu>3Y#6ZCYEWzc@MSq3;B)W+9TtUlibVleR&vl{#*HDQ@@n<09lEmR z{lwpvJ$|;W2HQxfv?fG|H%=Hl-_Q3dA*V`cVd8KXI34B?@IccDinAPE6Yr$LJ9&UN z$)N&#h1QA+s(1iZ)=<*KU07Iu85}D`-EO_1&AF}oTv+RQt@{=@Zzrf1-YvJ7@bZLL zW*(*JQXDx#s(V6*2EH--iYsU1h@RKDUwDP%AP4LTn%gcYymZ12n)sqNEfqvghK>fx7chXUfq@49IO>W0Jw zR8{2o`rdQy@v)P2YhYilF()3+vb z2!kEi{AK+<$jVqRV9C6ZrUE;MvW|*{kaeABm1b~O6u>3Jbs@?UTpbpV#hPLqI=x@D zse|A>$2`*_3g3A`PKaKXvB#8T0m(cirAF2=N=&&G{3!{V-ONO9GfkRO)m%U)__ANt zob@$BGS^hs3g1_P(EBO6^vpTo-;}j9s*`0I0BU(Av<8dwELz{{NQ2aoh6-aeI3**! zq)8Q}grbKB*ZTu{bUmOkQf#6!dswPY-NR&6NLoW%0{>YFhAjs|507*Ig3Ae7PWG;)-O$DXQ1NZp5eh z-P?^o)8wkT1#{x~@F0i%K@LZwms+@&L;7#l6HY2y)u?P-c)>U!U!@Vu7go=tVG%d6 z5xA9j)i7YK^`H^{;mc|~!sqA%+A9j}6pI3`tmL3Y#f>2(88tl{d#*isKk>I;kDu+S z(Kb>ltqBq0jS~hB_w$WP$f*)qm^j`APRBU}Jk2zM;w*>P#5+lNClByuIV8YWXst+4 z#RI6afs!Wf!qWQ7;8-b|cAE`t4{h>uVXfzl?pxrzouOWMvD{+9%M)Ikc~VDL$B82( wT@pI9@U7X`TsfOY^t{IX!uuNsIbcuFUUfm?jT3gz#E*SXtjoH>a5>NZ0Ojvt=Kufz literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..2dc0c5717fc6aa2767715d4e48d7d1dd6c1406e0 GIT binary patch literal 1792 zcmcIl+fExX5PjdT7=eT++BAhyL|qaoY!T{(rXp(lXgO;qaq&&Q5I}r<$Jwkvp{S{V zC|P+uo;hcZkDX#uPJq(t$Hzfc(Vr*h^lb2Xcupzw;XT~m!uDT-y&gulwpKDfeQPp% zu=fg9KSj3-a%!v>FlXLKGX*<=Iz`1oC^nsEQ_bMCs(?#|>q3+@xD;lO#hPLqHafqO zNkQ#Gh_gLQa&>!o*WR$o}cj2 z%)=C2ibF?8zAAKR;ajsWxpFp&=y{F%g||2kO2D3=z3_sH^VqLK*440GQ E4{pa~CjbBd literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..984256e66b120804e12eef1c4d0fbb6f03adba17 GIT binary patch literal 1789 zcmcIl+fExX5PjdT7=eT++BAhyL|qaoY!T{(rXp(lXgO;qaq&&Q5I}r<$JwkvX=&3^ zqGaXuc;=irK6Z*tIRQ$mFK-7`MZcb$)3d?*;W?$yhu3g>3){a8_IeoI+FHqc_o>P3 z!QL}i{S@6U$f>bjz?^v_%@phe>J$|Vq1beuO*Mnlssb(*JIIHO_Qcpi3`{WzV6nE zv#w@H=8Ed1@O>c&y_=$Q&zuwfd7Y$D8%0q7pq8gXYp^)aqV=tgG)Nt3m|~0ur(~oT zG^?VNQ1oE`>R_K9UhPp6DK;@RyI3ma?qRqnB(0$>fxoN-y_N$Z8J-X>E@dqU$wWt) z%ZJZPCPxShFj1Dp@-9oUYnvh=l~qZbGxpv3^0y1m?ff5CEy^w%C4~8UM##`r*@+_* zuz_8PkMye-D}koZMSnBq#L>Zi3Hf)}|HHzUduuM-Eg^qb>j@{7ohB4F&b(lpkdM*` z<}0hm(y)m8*l66Uyl5D()^gBjfB$7M?&Gun1|1fKc8Wy-msWDnV#eO!v80{ zH1jY;m*UV7lJ5x}TKLxNORk)aB6?oqe&H34gA%YOXm7iq@X`s-(8Le@rC3*N3d7~3 F_yOVPV;cYf literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..f5b8e9b16fe3047c6beeb0641c0541e86fef166a GIT binary patch literal 1789 zcmcIl+fExX5PjdT7=eT++BAhyL|qaoY!T{(rXp(lXgO;qaq&&Q5I}r<$JwkvDYU65 zQL^%SJaf(*A3MdSoB*ZOxA%jpqTf%>>Dl1p@SIZU!&|t$h3yA}y&gulwpKDfd}%U! zu=fI1KSj3-a%!v>FlXLKGX*<=Iz`1oC^nsEQ_bMCs(?#|>q3+@xD;lO#hPLqHab6& zNkQ%XicL(-E|yBUdl)VXNo#0J;2$eNujN2Uh9`uJOIZs-GSN}y z^6Bf6$q~W=Oq6A@yvtJT+NMazxt*V}pVpUuTvb_>v{_@}L**8gmyHs_d_5y%=&J0* z5enEKuf#|C&8w9_)90eU8FS+3V84X?JM8~u;p@FM7w(pjzpM3xlgdsLiW_HMFiyy4 zX$13?)njQ`#C>cuZdG113|MP9XtclovKaUA*?)%)i$XiaqJT>)IcPEE#t@RdnxBnB zm$tl{_}#KcPqtNm8!6S+gb4B434`bR>0Tw|LE`Q9cR4b zdwAm#GT<|`R%NK_9#q{xNfUQqZv91YtQJkX&4#w;Hv74<*7HX9EpXnBQ7^n(ZZP5h z6JDBmn4(K@=m^R8gbpoyYxX5q&PEYEuW`Td3dcbS*b}t3T~K)GgcoSy$No~RD>jAU Ha#H*P+IM3c literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..ed003053e39e820896258bfa06498174ddeab438 GIT binary patch literal 1786 zcmcIl+fExX5PjdT7zGJYv}p>Zh`JT(xRrK zM9IqQ@yt1MeC!lkatf4I-`)?aihe)6py$JnqYFx50B_;$4tD++?DsLcy}g$C;Y*X* zhy53@{waDrkTYYwfCckLni<$B)F~svk-y}h^n~CVIr%6+*#07MMuX}ak ztfv{0xu!ZPd|wJe@2BX(Gv|bVStn^!rzi>l)bdPd4HoBFw7%7m2B{+rGmO#Tl#KL} zW>u6DiXI+ZA0E)7>wOv{#U^HE4@;%oJ&cxxq&2i9@Q;XH!V>p3AKS7jTHQN0FsEk4$7 zUabY1CYR04nG?r{2PNd+(cmu&U+-_YaIb{?U9TscRCbn7**N!taY8;zBbcwOo=C$Y zZek;FEAg^nz*?(8BmBdc<#>S4!8^296xt~k1zcIlL5mqThLCL3{A}#G^5p%*?|wag zvZn?+NU63aM2I&|7(CohH!2~gN@!u?a2GfomJskX(+G;Q5}p(9B;%btz?+ni0iU6@ zDnnHdpy~!nnz#!K>o0?2wP@OHHncso+0T`=o;SL0gY$NRdf~-#iwXaq@XE}i6kUoV yM@YUTbZFsQv#+>vHje0djr)c7Hx5d`o}j(zg2Edoyg(B__J3ksu_X+b)8ZFV0AddS literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..51d3e8816c9fcccca94437a23bf81373854c9150 GIT binary patch literal 1783 zcmcIl+fExX5PjdT7zGId+B7Ao5_L(WutlgBnu@6Hqvfof#KkxHLICmc9cQxwg(6Xr zu2z=U;|*%^d6u>J$|Vq1bVr%{7Dbssb(J&u*fLxvnt-<0ti`KV1(jaxDVU9K$oRX1V z(aefcLeb-!qdpFOY7E61=H?J1rCdCWS8=2@w8j6oKrm=CAS8nm!o{_$1tB@jNzUZU z_jMj8hzc-Mmc{bHOR-y)ZT8uA~1tmZ=;haXU2QD~=F6mV@N2QB8@7(%jC^VkSm xBP90-9a{8O=j&8Cn?#hm#>2wqjgu0vBxw6wP}p$7KI-?mpNVzFjxb!#ia%9fVm$x= literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..b0c00d7cd01b32726d81fcd0e0e29009d81046c1 GIT binary patch literal 1780 zcmcIlZBN@U5dNNDaVio5l&+&_67y)Kk|8uU)`@D{N6U@f*28b|3kAfF-??c7>c+$d ziz3VM`R=)U{Pr5Z=Pw9qjzpI2fRHdwaw4!{^3x z0R3Iqe2QKVi%BxmyR z%Q}xELjTd(LkGoc3*L4!Ye7zuK?5eE83DVcV zZrCUK&Fc+AQ|GF_1#{x)u!Mtt35UZM+P7aqemBDjCzYKiq&6KV*IDCidib6ZZqJS$aIcPEG#t@Qqn#V?+Yf0WO{3_R@ z=W1%WgBjJSr**YcULFUTJI-*&58$RHWW1Lz ztjb8$1Ejh^k|yrK()!EbSS^}jo5O5xY*un*t>+Et+u*#NB3sx~Zqebz^IhwBoT5u{ y>`!7{u_X+bv*IVwtzj_$ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..7ef5c6f2a35d1b7f551f6fbf61a41c0b22c15702 GIT binary patch literal 1780 zcmcIl+fExX5PjdT7zGJYv}sCECF+t$VT({NG!;?XN6T3|iHmRYg#hB?JI-bWN});x zU9Bvy$1`W<_;|M1ku#vQ`u<^5RrJT{1w9{q8edQfLwE;wcd+-UaWFvZ?(T-?$FGg& z0Q#?B^C@~gkaJ_bfF<)rnmO1R)F~aSNZlpDrJIz>?cAeU!CYp}S^qV+A0G)Nt3n4^sbr(~p; zG_#_VQ1s~fu#ZEZMnkcNxjDc{DHjjpRUByzZSnst5DeN32+81taB(GTK}b$>k~8`I zZJoz4q5{m6WwE^XQtZa2NXUg4EYPg3P)z*Xv@x}>*SNqvICFE2IEleEk1E-@B0$ybrL2*{Xe}p^9a3>GorX^&& zmoKc!NYw+Rxv)`^ zOL6Q7$qhn>7QNN^I#tdl5#_G&u&{IEqy#Jp+9nqicAM}D_50MH#JXZf7%peUZ^}Jk AF#rGn literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1.cu new file mode 100644 index 0000000000000000000000000000000000000000..494c8af12750f1495290a6d14132cab609268477 GIT binary patch literal 1777 zcmcIl+fExX5PjdT7zGId+B7Ao5_L(WutlgBnu@6Hqvfof#KkxHLI~pHJI-bWN{d7V ztX7uS_($homzz>;|*%^d6u>J$|Vq1bVr%{7Dbssb(p0RH+T#CTAQ-e65R$f{>i%BxmyF z`zDVQLX&dle4%}ZCFFNIoN!Xvc|u;}!VAU;`6`WIzOs5M4U2e!4ZSVF z>v92OZ6*!v4?ouPA&$cjD6J^8Q!EO&wvvMub8ZYFS)_Svq`A@K{lec`J$bI6hI^P% zZB2*}S56qb*UuIyA!kZxVdCfjI31M`@E+3$in9`45$-g@oj!n@mXPsYzOX7IRS%Hr z21%ND088tyf@8I4ifs r^j7DaR5_bOl)J{m!nTc*60js_J6urMYQjG1_p#rIb;XV_T+WI=N%UYP literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn.cu deleted file mode 100644 index 3828423399ba519e605671a5151787c2cc95985f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1648 zcma)6TW{Jh6n@XIIHFS3R-vqPn>rVeAgiiQ?U>58K3XO=;E~vtFJ++q_}NJal(s7o zGRZmq&gDCo13m$alV4x^NrHc0-Q&Cdx8Xg;(1S1V^aMw*D!W~;Jv`h&-h3~SU1%Ri ztrkp~V47G)VZ00#U1%0_2sjxgMF_4oM}erAd~5QG$V>_W2P+6 z5xVwgq3WEqFJdT;i{9GR1JUYT;CVZS&mUvhz;1-EkxMO?;$8Hacj|w&%PYk*IdyEP zwu5Dr6d_xrgwQP)jOaw>V=hSW)G4?wtW|HtUu`&Kx^1$|pRz}uvXX~R1pcYORY78S zDI7c(R+I3uVDZ{kVWPf9bUH(5sFE;sBi}Ibrskz?lz;kA`Pi*s40TOn*yTBf4SN;7 z_FAS}vNsM5QRaL@nXue0OFxBTwBV~%#yY*qMB7tnnO_EeX~3i86mmADLLd}IAe`m? zs2Y1C*bE1*be&Ly-ci9v;5wDQQ@}126Mp_7Xc4-~Snh-Bp8&=E7>u7;Y$2nhiswDx zAOL-hYT!7AvWfxuy0au;k}XWSjp4$cz(VS|Q6y#CVU(#ORI(f5Po&heOzaSpmSdld zUjQo)`%e_>rgv63J5&h$2piF??X27XnQKZ$){l4Y$?@anUt;iqf>9Z_c#cix8`~UT QMsKZDB~^651m%_$56*UP`vPFm%nu@gL(Q?*K;>K%RzK{m-@wIn17Xp+- zk+Yfcnai0oyT`|XarWy=FU#=ni(7ot`#QMA7|!7{JU+s~i_UI0c=z|WkXPR;WEa}+ zl2!{QOfo|pqcB>;KREYJ5NmNkoZ)|Q;RT*CBQYWVK65~aybDth8mUZ zX+tIjfy*od%1O9q9nV2~JAKVp1*6bt-g)UQqfJc=DVSrXu>;12GIQ4?f)g8x*#+W5 z)!gv_d0=!GKtUEG*4&11nF~)DG_1*6xu>MH=Bzc=Cv9wb;WmtTEEI92bw%^s5UrUx z$~~8&kt9Ag0oP=qyz4=CNi0I5G$C}W<~QVzGG|Y&_IJ7Zb7~l&eWCK%a7r}W#7tOG zAaohd($zU`pQcbAr{~L5cf{y7fhX-0K7B}G4ZD(IMm~33$v4r*A!+!oQeJ2-)Woxa z-Yk~YR)k#H5<=HpGGa1SjJPB*Q>Wy*j8;FDKdZp8)6I}o|CHVHoYg#N!QHnXL5o5# z(MceD>QMD~Qh09cKQ-2S@qB#gGFBO4b&Z&OiqJr3aR-M%KIvKkm2X&ld|&6-o@WZH z38k>jdJ1bU0}GCYeGYpM_li>}*n~=nP*ia|t5-m+&9z~zZ(RF2k;n6cvWZAguEMav zEi{v1g&}$ox)7{j()A93;$8|itQ~eBD5?8+3)qjqKqNKlXt}Z5dUb5e7rIvb_}NK%1qM;& zCg=D&m+xE-_!uxwe|_zxDgJ$RkMDZl2KN|47rwyL6CAy$>~w)x zWRerOPE(+qgnQQU479h?*L0;Z3iZZ=m)EYMO-0W(>ymsYi3R} z&!wo>YCa|bw`8Hb>p^ElEJC6XA#|_icjS*UmpfPcFI`QQ@VVT%{c~yFlU z+lEb8o+ES}&O+5bYh5Hz92eb{w+CYMo51r{0-rx7uz}skP$r)_uH>8Ov*0xR)-JC! z7i!|!KyL@jDk(y)NC}}^E*UYY%12z1=&4=MU0AE$iodntnCZ63GJnb*dB#c}+BFDI z1+Hq6z;j`LS6B_h^O_|qTZM`BHDajMxfHD3txU86g^q=N5cUQjnLNggw07NZ9SAyjssNM-s+)KcQvBeGq zB~?6c0s9dcYP1H95-6(}kuN(-BPQL#WZM`n-3iQ_$56*UP`vPFm%nu@gL(Q?*K;>K%RzK{m-@wIn1SD+9@ z&Su7EE@#f{9v=h7*{?6XEW^JqZt+d;>);k+IET;h_y`BjI=kKA-QV9qUVX2SU1+~c zS}m9`$qaFf!e|ly;M_Ywti=U!hX2Kd7kI{u#JteL9_d8{@rhK1!JHV$On!7MAbKxn2hBbLB_ms5OoVCXKq>U{v+=da4g(9xBu4tYcqBS!| zx#vIycRlDXiA6}1CWLO){D%Bd=4|U~|GBG~626dIw|`CzBeXA6J{wMn zW}C1HD++`z!&$mI$L-S;%H#BWneC1k{VMRJox-ONDXd{vGK|URjw|^p`Zy#F-&M*B z&4rqHHqe{Jvf7G}D_cV7noCAZriu}lBxdTAbeGZUr}AeNICi=jvg)6*d!DnJ2Q9e! z_9Mtq2qroSgijr+o=^%;js0z7y&6v^m@Z?LAy(Ij$)^YnbQafe7!;JQMNs*M#mDz` zj@@~tu$oW`yR4_M<}$G0SXk+>`f#r}g@R3}ln6y3$Fq71)bd;#))L3HuM>GZKPa1s z1m!9W8{9%O8MYXr7oiKm3MO6e5Gd}YV8iBN2ZEBikJo_x2n@uofTI*D>Lc>`WLd;y z8<>0(!=*cfxiT}WNzOLQs9Hy;(nG92Q_9e)v3<}+jY2nJjUNK+KUFMs>z(#)pb?5f yD(o`OQo&oPDH*!3p+z7kY)p7aAruNW3K7q+6+xT!-C6R|i%~0`>@i9CIQa*S0vKHY diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn.cu deleted file mode 100644 index 0fe5a161cc69870a9562465b0f0a14d3232c6175..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1651 zcma)6TTk0C6n@XII06YVl+sbQiJ7)in<2ym>qNKlXt}Z5dUb5e7rIvb_}NK%9Sow% zP0sOmF5kHv@-bkX{ru9)GW_f69^du84(>6AE_{ZkCpdmq+35uB(a|3A=39mAK!^0 z-0=WeGWr%kK^7y{*oSeM3r`swSd%w%Pf2sbS#zw(Xzt_PhJu?UG$gixl0FXSEht<2@l?f!GO@6~+g>W`^mg!YBXXTvGc zY#TFSMS;+DI7?OgtaXt>d0cc?t{#ZduL93oDSY~n!UlFDLz#T;xRS4;&w|tNTf4l{ zT&Rg>1HBzAtEC9JG9`p=xn#s-su*!eqNjFAcWJGDEC1GpW2W0CtNbZ@Tg@1+pU12>8o0_evkgjZ%Cf3)8$)^YnbQTA17fYW6HuHgNT{{+`dtCAnxz1JS6mSd8WLNCqQv81slc| zI}nuA@w^7?M_{PY8aPU!s$xVw?<|X$YzvcbW4LrDFjr<~HObj_7**;BRqV$2Go=iz z5<3EI)F@;VcEB>g{u9Nj>Aln54KzaE<5ILMJFE8JaZSn4h4C%|IbqzwO9~-Suv&13 YXV{9+*!JCJ^3qn-N+*X*Qa(=p0zU#P-~a#s diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn_splitk_parallel.cu deleted file mode 100644 index 964ef5252061492b51c82d6a1e8694452388ee73..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1590 zcma)6Yfl~@29e}4yg{jEZFp?#RN zS}>uK8scaHqec9KbMH8@77OAG`-6q&ct(xHOlWS8^&*1UL@GmJPK;!70%{C3D%sP9 zOay_eEQ5lPaL+oHgZ6g%lCKI@K%;r@rMI+bYGR0>j+(|BFgBE#yC&g`*ig({AU;&h z91D;~TD%1y$YMmByD%nm?h6VHYw}j^3({J1)*9>6HnvQ-4I>_NMOj z&m?aoiH}Xd4OuAfdeB`Gix4kO2xW?UA#cfVWzM&5_n*6cr{-H%e@qP}MW3sDHk=Yo zH!%|`1VUHgEM1+G_E`$$adxpxbx(|b6?ocC;p6)h*05_C#^iIylzbI^5|W1RD&?hS zTunS3=*?njZAHkHEg^KnBqb(OV#FkgnK~uiWwiRK{8sl_6Hwh{>l24RjXQa2OPnu0>G!hT_A! zI>+ujQ&>$Xg*-3>(}+QyI1xq8Fh{P6d^&cLW9QrC`J6VF$b*bsw(*`w(caRMbc0^U1P^ z$u=b#~8#M~ugf)H)u>VxC)U9{gyMab1 z3Q5>yoTY-dQWs?C!iI`KPS}|6kU}UFY!o7%VatOy?csUy(u+|mo$OIruyOJilA0J@ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt.cu deleted file mode 100644 index 374a8d73a7653ce40ec5f278fe50034efe708a36..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1654 zcma)6TTk0C6n@XII06YVl+sbQiJ7)in<2ym>qNKlXt}Z5dUb5e7rIvb_}NK%p@Y$= za+7oXoy&JFhkOheXFtF6vJC&ay2p3DuY-Gxp$nhk=?RWsRCYQ+dvvsey!lokJJ5QU zG@CGCk{RL{h0!8@;M_Ywti=U!hX28Z7kI{u#JteLp6W#e@rhK1!JHV$wYW&)3m2Sn}C^kfu^9^Ovakr@al#1b;FPDO~yS0f{pwO|f62j_$N5v@=Y(k|( zC=NnAt1VKu_eQW85nSuqu?XGcl8?xBu0p4PTWBW37J#Tl=t{661l2nMihC*8FuB-) zprnrHEnq(aLygwJQ3_QRBl2ZuS;S;pn0y<yj*07cCY5 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt_splitk_parallel.cu deleted file mode 100644 index d5795b04cf8078899a2c0186865c485e166a4001..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1593 zcma)6YflaqhXh$Hsth_VaTu%kZzuJAB*wGPuJSF5nY9J;A|?&Tcn&_xHDu*WW5+7uttO zs|6D(sUeOQFj~YvIQNbdYq22Cus>LMj%UZob#fU%*>+%*Yj#D-#af%s50 zb1Xm}X|W4Hkj02Lw_!}?+!quY*5s|+7o@f3tTooBZETrv8%8|lin!9cqIqtJ*3=y5 zo=M(F5+9p@8?sQ|^`N^X79n1m5XuzyLf(?!%A7yD-QV%{kEx-g=yR3NhEt;HCTv25 zKSFSj8;FDKdZp8)6I}o{{?+uIjwonoVl;xgCaSn zypurq)S>DjrSPw@|J+z_$9jl&YF z%2gOPxP_)NEHXqdLYJHhDqZgg3fxPrVeAgiiQ?U>58ezZ(%z{9aE+bIL}$InheptM~< zD1P8EtA}NX8s9jXf~d(@b5H2u`dgvkxTq zthwVJvSM@}Kt`5h*4%}0nTnh;XjqfB(wvgkMzhvLpSQ8)ncFhriBQB9)>`I;AzCwY zn&w=JMik|t2)HFnmAgK4*TNzs3KK&2YH>&YD0BJj=wP3tKWBy!IuI(Ik7h)(ZOoKq z8A8|oS-3i9?TZ-B+c8}9)|noN(eL6uf82%LNZ+D7bzI4J(PuG)Z=Lc=bD^d=8|rPY ztg<5Hii!}r<&qJTsBFw737I;D)J3%Ft@x<}heEeQmgQ6S$WvCz(22l*Dsa`07+x9& z&yCe0yzE!Jj#Y%Hj}ep35E|+vEZxY@OT1}$=^JIAK2$k&8yG{~lNfe&j$xx-g|D-g z<(BG=!#tEV-%utUcgr$Q;TSFWY9&~wSA}SM2_5ryfxk20QE>_xn^Gwe3L6lf<@Hg` z^~SK-4P5y;p$WaCLXV(zs{EvYTWTi#`a|#{bR}5kN%c>F;(iR)uPk;TD5?0o2OI>T z?@)YW)7M zJnTO#)n?ZvG_(PZX>c+~GO4!Vk7RzKq_w NQMJ<10h5$ZqJI~eDs%t< diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn_splitk_parallel.cu deleted file mode 100644 index 74b75ebd8d53cf72267f8c5c72d51f4833242cb7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1586 zcma)6YfsxS6#brGaaAP5P)bMHCT3csHbaOF)`@Q8qvghK>(#L>Kj@11@w1ckL4nbz zN|Sqi?&I8Zv&YAParWy=FU#=ni(7ot`#QMA7|!7{JU+s~i_UI0c=z|WkXPR;WEa}+ zl2!{QOfo|pqcB>;7o2-1h_$#N&hWpu@B+`6k(d`+*dx7&AU=`GFqjh~xtxF+Lyb!I zv>}s%z-5*JCHR@4WPu(WWMb6wEQx*a2fhnYn8c!HEsU>;mzj zYVLS|JTSTopdgD8Yi`51%!Q{68rI~k+*8t8bJiN`lQy=za2rNE7K*six}te*h}O&; z<(^B?ND?2LfNQc)-u0lnBo-l2nh?5G^BeL&XzEJsWI3=2GVkWF8 z5V{Qa($zU`pQdoqPT}-?ndy!g{U+|yhi%xE3?uTn<4V4XK29P2Rw*wu7i!|!KyTK{ zYAZsn>)%D+|M*yv`+s(s4tdCqDYwBYXBk03=MnCK)B zt~yjbofMuM`%jJaT0Ea#x{OtZSRErKpCUBSSzN(kP*1uRK;;`2AK%wGw#S*mYCU?6q{9HmfEACWI7%OWP*z~q}4F5Mx_ zm6=&ha<*AU)jC3z9%B8OQifKI?SnRI6uJp3{19OOsbZ;H@3eOVjZhR)VV7~13VxEB plA#L=S_E>!#)O9yLZM)z5b+FK5wvOFoh7e*6t&XH9+Q-hlYhbO7V!W8 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu deleted file mode 100644 index 230297e8e11c84a1f5ac40878efeda3dce43a208..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1650 zcma)6TTdE66n^JdoM2+qEK;#;>
VHO59uBW-=03_Al%nVp%;MGEo9cV>6Fs#QXk z<(%`K%XcmZd<+<;zrOa;6#u@u$9KJNgL{mj3t!;r365S=b~-_Oc({kW`CcMB&^oR) zn=oOL8R8g)(IWoAnRkL%i*w=(|BG`k@QfLWd9H;$)r$z?6R8Y?IWdyU38*ntsM>)x zWRerOPE(+q1Uu_^2HM-{Yr4`Ig?i(`OK%x%Xktjt95eNIU~EV;a}6Rmu_2lFKzzuW zJ02iQM&AR-$zsGB`!Fsu;VFZlR)T2#JE#f;&9JR)oQ} S$CtI&j;dC=cEBX%(#L>Kj@11@w1ck2@Ikt zP44x%k8{t>9v=h7*{?6XEW^JqZt+d;>);k+IET;h_y`BjI=kKA-QV9qUVX2SU1+~c zS}m9`$qaFf!e|j+aPFNT*5ZOV!~f#K3p`^+VqR!rkMtsf_(UqhU`~wWasp}$H7eQD zhD-_qmstjslW@*Do`d#w`jW2-MxoKX^U_;Jo0=F>Fvm>e4Hz5B%w3ZRPHZUVEf627 z=8gx*1EX&N6l5`C&0QFmx$u-h!6x#3Ce06GFFYenb8!bGCJ~|J>0`317&q(?6$%5!x3jpADx( zvrX896$L_<;a<8r$L-S;PTDD)o-dQ#5u;zlefqEqyOLo_K6hNnSJB5Q#NR6Ah2}y{ zJR9iET3KyH$dw%-bj>9rCR4?TOA<46O18^r^;7w`3LG2V3|X~L**(u$O@kKPeftsA zCOonv>L zDXb=x!Y=D6tho#6!d#h|)g)(|WmK&rROunspDAT%)!060qeh{du*DAn_Ma-2y7f+bH_!-0Ar*ER vXQ|+=)RYWeSkNMn6E-G1q!0=P8-<8x*ovS{`|d1x>AR?vPWG6je4P9P=1dqT diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu deleted file mode 100644 index 04f80d335c10f9eafb47b31cb54c0871d70c8701..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1650 zcma)6+fExX5PjdT7=eT++K>|3ikc)+vPFmrnu@gL(Q?*K;>K%RzK|y3<7@A3u7yGr zZ8kG==5pq8$j5+j_VY_G%kZzOdwkdXI=IIey6_pEp5XXdWv3IgM@M_en{O4e1Fd&S zvk4O>nIVo*7%k!-oO>sTwYVV8@IScl0?(L{m={{uQ@w~FK9R~Wm=hzpoPZibjY%5%y!4jQh9-s-%rSHD28<19=B_~mCpIMW7Kjg7 zbH@W@$>>`E1zC((V;{z4E<9y$U`^i4JtfTzXU(xbZ(+*|wa7z}-yB>5_#3Ce05ki>~zL0n1w=$PI*8S(K@6~+A^vBdNLi z)cL#y>_=ee(Hb~Pq3U8pJ|8TLm~0D^Z)3Q0CoorLW;My#b{bXb2vzLH{4=EttqMB= zZPX}K6HdS~!2XlPYUsVw-VHQD-{VcRD?h98-w930(1qqNKl(Q;$A_3GG`A9O|h_}NMN91Nl= zP44x%k8{t>9v=h7+0W0tEW^Jp@9=H!%is=UxPVXa^aKabI=kKA-QV9qUVp2QU1%RB ztrkp}WQI6KVYG-ZIQLEvYjHuG;eT-91)eb@F)y^R$9fS#d?J-$FegTGIRQ0>8kOv6 zLnZ}*t1JV`NjPU6&p~@TeaTk^qtIyHd+9BsO-&3bm}92#28<15=B`NuCpHxG7Kjg3 zbH@YZkLc5(H~R82<;1%&xTW? z*(PSfiUOgla4%h*llEB(r|lHZE|!_@iP5j(KEB_DUCS^fpF6JPtLT#y;%}AmQgfju zo(=S7t*o{pnW_c3@kVnHacuRJSa|~U=u1OLQ%)@UVQ{=b*>F-h2z@Si9B8$lubl} zautRRZlReBOAOJA(4}Anldg9J6!%iFVezm7K}p@mYruX424YvhQ3@6H5&3+wEMl?^ zOumWX(jCEEnVHojXPaeIts_+FA=aNMWoXsdK4_yxp_{P9j{)|dDwewSPJ1`d2t^?k xb{S`>;H}h@3|&~zB9Id{COo7N3I!X5h-cV}piO&tp1ky3)Ji9NOj15h{sQad7$*P# diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu deleted file mode 100644 index 70fd338f09b70d2dda029a294e7f39ca5ccf62a1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1653 zcma)6TTk0C6n@XII06YVl+sbQiJ7)in<2ym>qNKlXt}Z5dUb5e7rIvb_}NK%p@Y#V z%H);+^=)z}sdV=E@m7Pw|9v$r8kHPs zLnZ}*>nsDxNjPU6&p~@TeN9&xqj1o8@X}jG8=4qWFvrZn9vB=_Sf-FX?u?yoe7oIXWuqJQjo|5KBBDUMutN9+;Jt}M4zP)@3qS-&4rqHHqhI# zvRaCeD>FjqmPgqPOpuky7v924C(S*1_eBhOhyLpytn>^Q-lUOi=#ITU6XDKU+IR$$M?05-2$etu1N~JJg2Z> zug0&vR_RvkjbcAkIp0tw9e0b$PpKHr`En^(yIY%R1qvMt8zF2CcvPH1!6sBngyI~; zXSGD??%oJC6M}19I~JjPT=EgQ&Q<6Xa0|_3SOO5W2we$QgrIsSKyfbx8wM9U5R}yM zyant>V5rd=I7*?aVnn{|EQ^?I3zKhSxO68lS7v54$=P-oRq6;;?8f*rr3|eSI|6Og zC}b11z%s!86UD0Oz0=+eG(z9wTC^)WtM-59nv$Ul(_I8|!nlQx6hfe2wcrlVuoa=P T?Yqn5wY{p9P7ax*e4P9R{Y@<3 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt_splitk_parallel.cu deleted file mode 100644 index a54b66d81682c87e874227cbab51e5bbeb5d95b8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1592 zcma)6YfsxS6#brGaaAM)D5axp6EiJRn<2yo>qNKl(Q;$A_3GG`A9O|h_}NMNpo7t< zN|Sqi?&I8Zv&YAParX0bFU#<+%R7AA`!cx07%t!wJUzj|i_UI0c=z|Wkk{WTWEa|p zNvj1DCYd3QQ5Y@a3(mb0#9CYsXZRmnc!6ikNX!c@?6F=%5T8h87|e;0TuwlZp++Tp z+K@>>;3~_2auUv2$8*r$PG9p?!6-DE_g;F-Xj2nI3g(z;?0~VM%-l7J;KYVvc7ga% zHFrEf9vR&QP>{ulHMe10=E7434QujN?kQ=lIctseX&YN!xD6v73q@RMUC}%@L~CY_ zbI+w{B#Dnrzzta_?|RT(5{r;1O$cR5_(I;2-^!dnJKf*$^pB}wg!YBXXTvGcY!fzN zMS;*&xR7H7t3V##OOD1AK!1ou4NdM&mC9tP4r0$@wZBOsku-S&jxz4 zR#saPa%D#d-Ehf>$y71olEh4%lJGKG{Z#&~0>?%-Lssom_P}#i)1U=+U%v-I3c*At zfpFEK>KUc*ud)B!STDzVhIeXS4lrHDDnqP}5tC058t5!8;xK3^UCW^I4T}%&>KxnS zOkp*l6t-DUVa;V=!LhK^Vfo=faS8>SP$?0LQjYiPJ5URBZCHyO*S=2V@#3ItA`+CV zFl=xO&1BePh+c#)1uK|zy(6Hwmx2wuhaCt?>OS5A_9HM5y8@0VxC)U9{gyMab13aPNm vI7TZp6Z9Ef diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu deleted file mode 100644 index bed908a162008352e0403c19eea24855eb0eb498..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1647 zcma)+ZExBz5P-kuSDdI+wN)r9-KNe3B*?0&Q#+=ztsgBD8}LYM%P(c1{`lERfRwf? z2!(Lx=ibi;d;%CJzrOa91pmIe$9Mg2!+VUO2Vdan365S>cDr7Cc(_B}d@so^w2z}! z3#LpkO)R4@UWPw7bv7f$;G9^^|Ki+bc+RxIJXe`HRm%YJsgRn%f@s0T6yyXdDmqY_ zOmhO)NdlA;f6iE*f^uf|maa5Lq0xMB!Wl-JifEEE%XDK8#`rK(+awt$#s{+x;(XNH za*wPS-3Q3Ya?F~$7#C^gD1(O4X)ASwhvE+hfBWGS8PLw8+Rgt9_}(7jyTkv~#jK07+t=jhLwW`qthna)QuqF9}ovOGuV z+CK|b=d67Z!{?7NT=dqS9*9=&asfGS*RdPnTjWy9rFfSjMI zDMGf$2%%dp7}1H$$6S!$sZ*4?uvWblKO4g#(Ynbpeaaqr%E~fyBJiIIuKE$fOX1+T zuv&zd{fgJN3KN@SM5i-^hAIh5H}dlmZ(CmKM){`?m5<#9#<1y047+uXVOzb5-*_$4 zEvq*U^HAn|Lz%GLE=xa!Vzl6^RmM8K%0%0j&@z7)_&Wn0C8vJe(4N9${t&bXU1coyLG@37;(iRquPnBZQBuY89ykb~ z@6iSv$53`Lkgq#S0+ZC3w9as0PhcVS+$fT=dKhKu2$j_h@h4JhS|)Y~O3Sg&#_#{i zWB-X_J@n2hXNL-*AK^kYYdh=lf99H!k@ef1`{MX@^Di;@K*6YtTRg`m^Mh@UFQd0^ NR3%k(zy#%!=pQR(DtQ0^ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn_splitk_parallel.cu deleted file mode 100644 index 91c76b87e3edaff71dda946d22f2f2fc087b82c1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1586 zcma)6ZBN@U5dNNDaVipGD5axp6EiJRn<2yo>qNKl(Q;!q_3GG`?a&qR<7X#n8w!j@ zRhr!SxtHgj%O0Ns#>uZQgCxPfFK+S8;OpoXV>pM;@c0M^FFN~u@7>?uLSB8ZkbUUB zi#i?1m}G`HMq#oHFE}lnAlBlXIK%(qybyTKjKnIH2u`doW*11| ztLBb-$OEIh0CKXNu=X~L%TyGULCc!7lNOY8Hk@^)`lO33&t1car$P}|TGuo$4AGjI zqqN{sw4$g8O~5r-s=^JRzakbPQJN6CRf`+)N13xHM~Ax{{W&v?(4kQ2d^{tXH8B~> zbA&Gay>#`CyQeXH`VhnE`6|;LG5SqhfSh!juq)|DRHTk8`6l`}hVWabywF^ztY9PE ztd-SPgk0GXLf2d}ViJ{4xFjJ{ucW$+RzH=0>%gJWX2`02%IJFQ!-h-Gf@6N2{o2F5;uLb0Q7I7$A`bWJ3s6gQW7xOS5A4g=5=y9Q2TsHhLfmy;y{lQb}C6T_uD zgoQG5t4Ye5WmK&rROunqpD1N$)!060qb9x^zrqh5_Ma+Nx-Fb8+(;u7gp}J=oRxx~ rq^4x-{DS759KSLCAqHP4*dRna$5wc4+IMHsYad0ebhO7L<rVeAgiiQ?U>58ezZ(%z{9aE+bIL}$Ingz1lq1e zD1^Iv?&Z0c13m$alV4x^NrHc0-Q&Cdx8Xg;(1S1V^aMvQD!W~;Jv`h)-h3~SU1%Ri ztrkp~WQI6KVZ01~aGE5|hX2J`F7TWgiFu}lJ=Mzq;!~*%g9R~?%PFV{RH*1c z8#2uZTqg-oPQ0CUJO!QG*=xGe8HGmkA(y#jw5f?98FS1u-hr_`&D1rC;Kcf5-UG>f z*4%LqSuy$^Kt`5h*4&42nTnh;XjqfB(wvgkmb2DGpSQ8)nX3u$L@444>4xToAzCwY zn&w=JMik|t2)HFnmAgK4*Tf&YD08`EI(W%6Q6kUej`h!(VT2BZO6Q{) z(X0-evMfXB+CK|b=d67Z!{?7NT=dq#9*EIz;sWHnU5DLB-=#ctT*)`lXEB8DCghdo zLQQiv)b(6hB}K>;6(MxXB_k$L*_cZbB6SM33vbn1@w0Ip3az^=%cty-r>vx*6M_F! z;Ho7tJQogjh1D)RZ&|$dRk+xg5tGgk8tNqM-N=tjyzP3a8)ct9R5A7&7{jI~G3@gk z!UJzaS9omQYjG% zOAwyr1*t}RE7-0Fu5_Kygx*oXN8mbDep0|KHIx1VK+qy|C0OQ@>Yo6`{TQs@TI@hj zQswg&a1emLM;qWchO&zR`Esx%V3Haptz)=!C$LavZZ%0+J&m$-gi3Zp{)tkCmW3UH zHfrpv@h4#AVgJcuJ@mQLxf^PPeuSH7*M8RHzZ060k@MS~d2;-^`Ii`cqF}Y)4$rX_ Vez5KFW%Sxn)k;SPOj16H{sEbyE3W_m diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt_splitk_parallel.cu deleted file mode 100644 index 161b9e5589680bebc567641aceb957a8f5e9299a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1589 zcma)6ZBN@U5dPj@aVipGD5axp6EiJRn<2yo>qNKl(Q;$A_3GG`U+9YX@w1b(O$!X7 zDoyVE+{<&%Wsgq)(pC`<_5s>KcYqs-aX(ZO>^Qzd*Zw@&|@8Aj+psBAu( z5zXqbDa&(&F2lWWb&orz34HpH!0Gua*&Q+ZRa}Iebn37x8K&ej$CZ2)eVjo2ty5lT zF4WYsp|01;Dl0;+=m?=}E*UYY%Ew%in5kQ^T|}#%iobQ>*l0au**<0WJYy9NT5$L6 zM^K{>OmrI%t}0YDpah;82iwMKGoB7GS;Zn?J)s14Sx;cYWnjUvu+d@j;a+hHIh#@`5sEsF_wpl9sdHo4C>&S5ZshU&plBiz zl&LUma7)c(SYn7?gf0ZjnRNX_ptzra4U2~z2ui9xUIPvyFc7;2juR-UkI3hfr4f_X zFj*bLr8|U$GIOg*#_DC1ts_+GA=aNNWoX&hK4_!Hp_{P94*~X{DptDnPJ1`h2t^^~ xb`@u(;H}h@j9ggIJdhJMCOjk%3I!X5i09afpiTSktnt!!Q7hfpW0LYo;~)6v7%Tt) diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu deleted file mode 100644 index 6eae3c183fccc5e700f4c75dc4c67fa8ab2fdd0a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1650 zcma)6+fExX5PjdT7=eT++K>|3ikc)+vPFmrnu@gL(Q?*K;>K%RzK|y3<7@A3u7yGr zZ8kG==5pq8$j5+j_VY_G%kZzOdwkdXI=IIey6_pEp5XXdWv3IgM@M_en{O4e1Fd&S zvk4O>nIVo*7%k!-oO>sTwYVV8@IScl0?(L{m={{uQ@w~FK9R~Wm=hzpoPZibjY%5%y!4jQh9-s-%rSHD28<19=B_~mCpIMW7Kjg7 zbH@W@$>>`E1zC((V;{z4E<9y$U`^i4JtfTzXU(xbZ(+*|wa7z}-yB>5_#3Ce05ki>~zL0n1w=$PI*8S(K@6~+A^vBdNLi5Da7wOYb*1NE2*{TWY%2(-PZAMH!MQEV2xOc-aGU=x4m2Oykd|$`dZ(s`Ro}{qP za|#>wYWzBCRc^)JD9%Gw^9^Ovakr@Al#1b;FPDO~yS0l}pwO{!4#L@hN5v@=Y(k|( zC@w*KRu`lm?Tuiw9=O)EV-vc^B_EONT!l#ix6n+63jk4z(3N0CNUC=N6!%iFVQaAi zK}ns@YruX4h90efqZFzxM&$FsvWUsHF!?ryOLqcuWoA~BoNcF3m5xxwZp=SZ%FwE? zBhW^TLN(z8ECcL6S*(WMJMG;-BlJDqM7#2{8vmWplnhyj*jq%FJAg2u^G$<}DB( zs^*Rd$Rnd~0pw&kX02TqmznUCLDQP7oq0;y8_wDjecHj6=dNbN6QPJJtZSMVhG@;q zapt)c%|^q=Cg6rFm3MvUt%yZP6efgHC44S#$!}%Ow@we9JAJ1XTStG)3?pSK(f`x+k5p1U|k`;Ot_R>7E$R~4!nPy+uN2iwMKGd2TUjU-vcDnhJ}5tGdj8tOD|;V1|w*=V5jjq(rg zsvNuHOkh2s1a?_ZV8dl#!LhK>Ve{cZaSAz`QYjINI*#}9BT%VxW7sGhSH5oK@#3Ip zA`+CTFl=y3&16_&h+c#)16E-G1BoGP(8->AR?vZtO8h`K0j|{(l%O diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu deleted file mode 100644 index 3107bc36bc19887c93761ff52dc198d14e3a2a51..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1653 zcma)6TW{Jh6n^Jd9JNx_RiUhOn>rVeAgiiQ?U>58K3XO=;NjSoFJ++q_}NK-l(j2S zl*>8(&gDCoLp}kF)1P1ZX^MYc-Q&Cd*Wo?J(1XwL^aRH*D!bjFJv!P!-h3;OU1+~+ zv|2D_k{RL{h4C`};LJNgti?HThX2927kJK$#5~u+p6X=;@u^gX!GajcX6BkiaAHF+`#^k% znmZmKD@OMLUsc6zYq=^r!02ptHO%||n$SsgZI zd5+L^cowS8S^FY^PahJv=&hYS5ToD3MaX%(4!e<|P(E{9$v4qw3B-Hj@=9}|rk)LT zJyupp5pqRF2;FkYh)GpG=8{BDor3VfTJ=@DHil!Ob(3ZKls)o{l{9o35UvVbRV9Ic zg@fn9Y84hcf3I%B16NS^6myqXl2B1ncxF6YW5uV__qN%>j>!Q^?trN{LXM zgZM0$NY&k2!FED$rR&5Z^o|QYBG;J;odRyDnG8z+q86bm!SWDP{{$%RCt$Y+5pE1lvRw#mz|{%lh!a<9mAzNfrT=2t4YS{VU(#ORI(f6Pn9yXOza4> zQR9$J*aE8n`%e_>ruR;JH`EAyk89Dc?X27XnQKZ$E=+eG$O+>XJ`xClg4KdMJjYgq U#p-=?M;AboP3|yT8AMy#7`qd(b&- zwA(PHk{aS@4&!Bf!I^iQSc?U5hW)|9b3CU;VkR`V$9fq-Y$}zZupmYN9;I%f%de4oJC#VXl7G5Sqhgq(Kjuxl9xV#!UTVhG)YGA^ z*GelZLayitp&KSCF{u(`CP~cHEeJ27RZqp=I&f^X9{8m%v^uj$on8+-j21dKqQw2$gz>^`}aiyliYAv{B>GO<3f|0Q*lBE8Tjhy&GzTqL74L w#aSu%S?ZjOTv$*M$O#)09uf$Jf{jANb8LCgrae4wy!L6-N;meX%-N*z7bcGxVgLXD diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu deleted file mode 100644 index ce92e1493134d6574866f856862e5a3c3b2ad203..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1647 zcma)6VQ<58ezZ(%z{9aE+bIL}$Ingzq_kZ@ zD1P8EtA}NX8s9jXf~d(@b5H2u`dgvkxTq zthwVJvSM@}Kt`5h*4%}0nTnh;XjqfB(wvgkRiBQB9){V>yL$qe* zG|jmbjVQ`P5pYYEDtCS8u7yQN6efi3)#8r)QRed5(ZN1Pf6fdebRbkZAI*qnbiD6}53ET6JRp0ZMgP6Yl_fvbMR@X|PV zZmbsJWxwKetRlqb7%}M#p`lK~(vAGQ#M_pazESr1LzQE}F>KYV@SCh< zxutsJFb`$THr=3ip)M8Rso9iC$={9xPT%jm5e NRVy7GFiH6&`UfV`DtG_@ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn_splitk_parallel.cu deleted file mode 100644 index 5c51f78158b051aa0c244814fb66476d212cfaa6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1586 zcma)6ZBN@U5dNNDaVipGD5ZmKh?y3t%@AUPb)wt&Xt}Z5dN{V_7rG*T{OlxcLxIt# zN|QT3_ww9x+2IqwIQ#XbpJn*>`3=7Ae;wXn3}^5e9v@-vMQ5)Uyt}&_$jk2)vIj@+ zl1>MvOfo|pqcC2?7o2-1h_$#N&hWpu@B-g4BQYMF=BzW(AC9o)g=-k`L@44)>x$;NAzCwY zkb5pgD@lB80sIEBNblN3(QmYHse(Ocn;KQ>{PGK|RQjw|^#IwTFhRmyYCg_?Rc)XiF1 zZAHkH9U*kZB_k$N#h6PHGj&U<%V_me`L_xj8*PTH+NbQE=d7kd3+}%C2vQV+iEaYn zszcS&N#VJ1@YGnZ#q;T<%UETI)iGl78A3yy#T6U{^`vV7RK8L1>3yAJbDSxxCX~V^ z>nW_c3@kVn);X*_+$&C@U{fk3LQ%x=UVQ;-X|4@xdE?sGjXa+1l}$u~autRRZlReB z8w}Bl(79j*ldiuH6!%lGVe7C1K}p@mTfjjC24YvhaS9dn5&3emEMl?-CU0W6bo($@ z=FV!8vt}7p>j+hPi1lYm8Co^A3)-l0=q9Z2LxBCKiluJ7)7}j=LQzPCUB+1|_(^I? pMlLL95y%M}6CP3sg@TPj#CO<=piTSkG6 zD1^Iv?&Z0c13m_flV4wZNrHc0Kj8b`x4{F((1kDX{0v90DmxvoJv`h&-hMBU9XLOZ zS}m9`$qccK!e|lx;M7?`@*HQx8vYk&PT(0c67x)p{8TRjh)<+44Ccg0E+?SIP@$p& zZO9}eaFZlJIq`P6FPv6p&&L}jRk50Or(WWMbWXv+t*aMUMG*jCof|J}Qvk%1i zthwbLvSf50Kt>iL*4%}0nF>c4G;))+Qb$Q^!&z&rKc3^9XSOE9W1)yGq-&bzhG@;q zY3jHXjVN-V2)H8)W&Lw%7@>Wk(%EoI zG^@iVEXxqO@y|llKD&tF?EEr@%kC=JBQbg}+{LFl>{j|Nxzutc-$(nP{<{u&t+`MW z#|FBdE32dk*`gwZ?zm*cBq|$mNkXJ{!FJ)TdMkd`jzgh!mu2~sJ@J&4G_)h|p9);H zB!-v5{25FLlH0^M@+NZUbXj_aug0o@3as zSK-%D%W_NhMqwVxnr|tSmfJ-cr%(*%e7O{?-K|`ldkQV{=fIy0cv74~#wJusgu)Vp zXL&)Y(cTC)>wzm>J2attRPYhFPL-b&unWzkzW@-l2we-7`J{R$KyfdI+;1(mASkKw zc@NkRK;NS^a1=w?#ejT0SQ0Qv4U^U}T-pKVANm)IOvUG$>c0>M&Qihg=9fCG$ zmjvp7%Eg_ zPa87H30$QqP)@=*>v#s*+v#h*G8lzs>)uOm8Et7|NX{HH%^ff{l$p5}5uDgi%q|cg zs^*Rd$Rne>0CKVzvDP+>%S?F6plMCk&O9aUHD~Ry{&0dV&t1)k$3hWTSXVU94bhsJ zqs((DnvI5!O~4IVDDQgET@s6sC`<_5srfDWqs;ln(f&(EQzd*ZH%|YY8b)YesBAWz z63yzc3CnYYuEM==b&gLHI6gT`;Ot_V?4B6C6Ylh59d<3llzis6l5e9!((qfQywqH% ziDv^{ua#9+gj~@PLN{D8Vp5flxFj)Cr(nB?Ry`GetH80*ddRYU${u*eDjKxl?%R)` zMj@E!G$346sA@n7JU8|?jn!s6A7HYKRfJd_BPN?7G|*|>!eJ0lverQ98|I(hS2?!F znZRm732d{Tz?#dzf@5K$!{)<-;uLZ=p;96gbsX>IN1#&Y+OSqQu6&)y_=cAb_E_1g3b?cq>ZlDp0Ldxwj v&Qie}sVNz{u%LM$Cu~f3NFWpnHVP5XuoXd@_T72oweO-SUb^pO}V diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu deleted file mode 100644 index 28b32c915377c15b0e7d1ccae995b96e6d76c131..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1650 zcma)6ZExBz5dNNDaiUVyRiUhGt2!5uAgiiQ?U>58ezZ(%z{9aE$1Vf)$Ingz1lFz~ z6vEv-_wwA!5uX6Y$+k_%=)q@reuk45mEEq_9v|-^Z@-nuF0|i8 ztrkp~WQJHqVZ01~aO$ied5$w;4gZ5PC-9sZiFu|)ex{cJ#HUgj1`A>&ms3y^s8G?7 zHe{L+xJeSAoOrw3@)WeoXRql>XA~OEM<-p*Xj2nIGG>`+9DvDvnyGCP!Ab6uIRxT- z*4%OrSuuJDAS25$Ywp9iOogKi8o5bZsiUN|<*YT)AKEzQnXL)&L?~hl>4xToAzCwY zmO3s)BZ^!o0`ACCIopTsnplKHAwnoo!e#QF{8r{_$9nLR^@CdMnEsdEyZBg#-Admjms+mm+h`xue>WkoH5Y2? z*ihGVWt9{mTU3P59hZ!lL}g_1tohu&H3>`){0J>0~6?PopyJE18VS-;(xC&#awe~G~-3b_{C;yKQR VAME_yRrK0X)k;T4Oj16H{sNk(E3Nasp}$6{@kP z4Vef6*J%nlBjKENECcQB^fg}@EQe<6!Aoyx-qOSnK^-;C9WXYOnYk9>jMz}jE)XB8 zW{w5O6U}!42(lQ_);5gEjQgBI)0(WE`JA-ZoVCaL{VBFgxSA1*xEx8cj8%kK9U~^2A~eux+`?fHP_ouQ=^KiV@2VWz z<4j;Rp#-*BPhibuV8PL_(P8u9QL!8Zo#av?6m=Z$zGSbwUN$;-y}K^rv+-GnWE3b6lFvDB@1+Pi^9C<;l~ wWt^pgH&W+h=)!`EKu*}0@Q^?#6l@eCo?**_Htpd>58K3XO=;NjSoFJ++q_}NK-l(j2S zl*>8(&gDCo13m$a)1P1ZX^MYc-Q&Cd*Wo?J(1XwL^aMvQD!bjFJv`h&-h3;OT{wH! zXt!a?Bs0V@3gcz`!I^i0Sc`Mw4F7|3FYufhiFvMtJ<-bu;!~*%g9R~?%PFV{RH(** zHe{L;xK2}`oP=}M@eH)Lv)6Q`F$&GrgO}bi+S0_3oH=Hidtht`GjlB>II$s^eIPzW z%^eSr6{GtAa)S%!aI+~S-5m*FkOa1Ni~=?V5;boP3|ySuxAy!uuldvLs; zbUHAlk{aS@0pmq{!MS&wSc?U5hW)|9b9_&Y#7t;z5A`B~*iatdk!H7eQB zhD-&4%PfO}k#NpBmV@?o_L{E>RzRzL=cTu_Xlr7KppKf>78o1K%w3ysMrtQ{uo;(T*HVbToG4VS2WKJ(VCjW z+%w5rN#bJ@a7`ATvLl49nWV&IN{pE#F;llByo^>qm4B~=Mk)Mj96UGH%dwu}t(unuOqa3B5UXRvKUFMs>z(#)s1b@n5_TD9 uso-a+3o>$HK}8@ZY)p7aAruNW3K8F9%Y!!U{#o+cr%@}N>`+;-N%9vWIT>L9 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu deleted file mode 100644 index 225cdf3b09b4dda07f262e43c5a9790e0df3b427..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1646 zcmbVMZExBz5dNNDaiUVyR-vq|Hf=5-K~_zj+A)=F{b-rkfQMsSeklX>$Ini}OKH2J zAr!)$pS$OtyYm4b1IF3U&%G?ezpn1^ZSU*g4rA!TCwP2>qZgf>PVgQc?jWzfRmcvU zA1AFAOqgVbI7VT#h<|YIogmiYf;hwf;KB<$V@6_LXkkzFB7*otD#Kt-jO20xY78|h zInaho3IboU3@9hzoOL`0?d|k6Uloi(qj~S8w~RJ5F{EIQnZ_O%8))XPNdzZ0kl6>~ z18eShfGio^2T+j3h&6X%T;{@41`TWSR_-ZjZ8U3*^~E{1yl`7aJQj+$(z=#;Ziv>* zoaUZO(MS>>i+~%lP~P>Rvl12|QJN6CQ}bK$TbauzNBjF6{V_F+(7sUlY&a#FZDS^^ zC=mJ*o~5gO_92Cf^AtXISBdV4(fgm`=0Z(88|ZDSthOTL z%7PHO;gS)Psba(>iJ97^(q**zt^8RBj&*K_tg5H%f#vYWzBDRc@u; zD2_u_^EGACakr@Ql#bz?FPDO~yLE{3AfaR7DhO8wJSa|~U=u1OLU94&v${R%vECRq ztAT4@J2s(vRO%76&Q+Kca0|_3*nWs!gsucD0;%2!P~1zwhKlR*82t>hZ!5yAqE5cyg(wvW4qXvHes+>RC@`u; zRhr!Eb06oPnpLT@c0P(FFL#3;N9EXLSB8VkX<-A zNZM_fFvTqKjKXLUUvOS{L7c;qc+3A_SqMC1R$(rUa7ShlL42aLWiTgJaWw%wh8mSL zjU^LF;4;gAauUut&vP(^o4)3&f>CI+?g~{nMq7qhBAI8lu>;11GIQS|f)f{t*#%OB zs=4O@^1$dW07(`j*4l<~m5YKhXgHg<^MaE0nzQ!UoStCErQa~(vCzbq))mcjON?Rm zC@;7YjU*{z6L3uyy6`>dE{R1*lqQ63_56nX*7oel(f%$+e@rbSv@di%8%~K~o0tic z5~0g*FI}DE4=J3Ur10^4ndpvK^B=G)6(&^Vo@@0sI)&r#Tcx}(Tb6oVx~^XbQ!IFD*sl2W1pKLtL`bg=Q*ocFoOH9--8l`V4{;ixav^# za8h_~>_0WuTk(8&=`vOsVs(tze2UP(WN`zBK|JYt{*`Yi-@mJKY)>6USp3VxEBlA#a# lDFZoSW5PoUp-^y9hWj)tV$yvPBgYG!<#fqvfof#EsXsd?8K5$JgH7T!Dfb zMVrmcoVlF29PlwJH!bz7Fm%hAw=9$45ANR@vzU?cw1b^7>na?7;bP z(rUqkNoI&+6h@2q2j|`iVl6I+GyD%OyudSNB<6({_Eaw-h)<+44Ccg0E+?SIP@|Fq zZOEh`@FmNDauV#U<2h(=r!VO$XA~OEdoR6Zw5f?91#`?a-hi$y71olEg^ulI+r3{Z{_09mhJiT~^go_P}#ivCvK+d}?srkrbW^ z`@6z=6`po1UHK|qtj&nYrw9#n7T0bVCMMl9z0wVf_wVW$`~6E{-IEmdc}`)&UX5Qz zt;(&~8^v*`YQCmSI_?%#oKi8I^W{>ocDHtM9w>AyT!U~m;6ZT;1)ESQ5sEtypVa}W zCwn8=3k3qfkw_ z0LuXTM;5E0_fC5^&`aRm|rl+pp(w3!yE%@AUPb)wt&Xt}Z5di6_w&=v9HXD8_s7-&(I zCinW>$GPWbmyZDB{O9Lxp5tE^xA>;}rGJYtoWm!0e1!dHot;kf?(J8ma1p#9&S6Qs<$tgY0#BJ$m`fwvk(nnDA1iGc%!pN7jX{r~Mm2Vg zC1Xk8GS7i>63;o$3oyY=Uh-AJDC{)vf(nk&rXiL{=9%4j1IEQN3*RJy6Bmnl3nav< zx#tn`!01~5lFWy!xeep05P>q-akgj`fs)pWv)0I*o?yqNUo+y7(8QP4CCxKSjA8bu z2waJsMkAyq;F`>J@LlLEh($=0CWLPF?1uc-_H5&5@42J77D1|w(?2Gb5!w^Fm<}ey zuyxp&Nr}*9yqB)_@rMjfPcry;zQ}b)toaYvm5M_Oh38tmj?UmX{+1~(3>SJFSl_Id z%4#b@zHA7gYpxivxt2q&NXpbM$u6VSPvze-aO!hCWYs-o_q<>=3r2AN^?MYf5KOci z5U)B^J)aDo8he|@dNH2PFI&VaLoAOGTTBq@n>;PyAnGSuNucr#;xe+}SzPG2_;9Z|1t`^B5KneoHbthEE;VZyG&6&YWxK}f*0@r diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu deleted file mode 100644 index e3ba197ff404d56cbf8547c35f3ea3f2affc05bb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1649 zcmbVM+fExX5PjdT7zGId+K>Wj)tV$yvPFmrnu@gL(Q?*K;>K%RzK|y3<7@A3u7!dc zMVrmcoVlF29PlwK@w@f@_b)0cFWGYXC7gO}bi+SJ65f;nayZ@}1)X6~9qaAHF;Z-Mxb zHFrEfmW;jyP>{ulHTPj$=E7434QujN?kQ<)IBSjd#W}XTa9ct=7K*r1x~6$t*SdCWLiecTBXXUqFe%^`n#phgAZii1608VG^-h4|UJ5oWEp{L%sq=Xa z*pI-_qcw1pLe<5Hd_GtfG1(R--^Ot1PGGLg%xaRe?KG;=5vtgY`DaQQS`~H(+Ne>e zCR~7Jfc+zj)zEvVy&GtRzQvPhSAJIGzZ060p$p4h1aiW`aaAM)D5V3mX)`TSo1uvf)`@Q8qvghK>(wv$L080&pPi)7!9a_u zG`ZL3KF&QidwdKS=RZF7@*MxXyu~-Y&x2cx;Q~Iu<0BkA>Fjo+cYl8edG)nIcH#7} z(Qd1@$10wwJ=XYH{$JH?JmzhT5IS@IB}*iA6}1CWLY=f>bx;m$v6yr~6NxzSZ-squ-~N5!x5Jm<^}I zuuaT_Nr}*>crRU@lXn@Moo4Xs#7L4Hj%eN>-A(-ei zAYOH-dOjKaZR~Fw>%~~lZ#j`{8LJGjI!0_UMQC91w1mT`pKL9G$~Tm6-_$vFrS$bx5aq2uDiz2+1on^2_?N-|FO>Kjmtb8T1)9M`^1;_>33Y$6d<=s0Zf z3&T|0VMtztE(McJ`Q8yw+{?hly~7>^C3PP!0s9FUiCqCl8C29KpnCRDZ6urB!45V5}a+ZsHO@MA$#7Sn4)-6a2s+l!PSRGR{)L sTd64-`naDmk`p&3K4cIJ1($?~XV{6TO?PqNKlXt}Z5dUb5e7rIvb_}NK%p@UH^ zigG!}-?@C}a=^!carWbLFU#=Ht9yLc`!cx47`pHYo}S?7MP;WGw1>;5y5IauUv2$8*r$PG8ei#wawJ4_On!7MAbKxn2hBbLB_ms3YoVCXK;v8FExGfrQJKf*&^!KS@g!YBXXTvGcY#TOV zMS;+Dc$TX6*@qM^&Qtie3#bhuky7v9P`{ZS!GYzBhOjILOX$Q)!@1)Df}($KNr^1 zuqoQTnwJ$zSGGzM>tn>^Q-lUOi<37DO_OfwUg?I#`**dE-TI}lu1N~JJg2Z>ug0&v zR_Rvkjbb}gIp0tw9e0b$PpKHr`En^(yIY$$4-`5U7D8AY@TfS2f=#HD2*ojo&uWL% z&Akz91_al-b}T~osN^GZovY9(;1-(6umd1!5xNqr2toBufZ|>XHq0${ASkKhc?;N& zz)+(#aFjw-#fW^_Sr#$b7AD`uaOqB9uFTA8lC$kFs?-sx*p2aLN*P)ub_m+2QOG7N zfn|XGBZ^hid#Ak{XoSASt!P(vR_*`HH6=qAhPw#lgmDWWDTF}5YQY_zVJkvo+vCgR PwY931P7au)e4P9NMKUa@ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt_splitk_parallel.cu deleted file mode 100644 index 511cd5573f9d5c5bb1bec1c39488d74422989699..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1591 zcmbVMYfsxS6#d>`aaAM)D5V3mX)`TSo1uvf)`@Q8qvghK>(wv$L080&pPi%+IvCZW zDoyV7xsP+t%^n{E#`%wry*$T1FK_Wp@AKdmW4M41@c0M^&pNx^=-uDnLSB8XkX<-E zY_!`jVTxJe8HLdzz2G8vL7c;qc*}od83dj&t1y>FxMQ;eg~ zYVLW2JTSToK$69XwYFhg6(UduO=pXC5h!V|Icty2*(r8h`VAu<3r&1!UC}(Z#299e zi@=p=HX0!{0oP=qgYQ9iNi0I5G$E905u~~yzqCDna=O3c>F-m^2<;19%!X59*d}bk zq(tabyqB)d$-4~BPBVCavCMTxtoaYvm5Or;h38tmj?UmD{#Ge34HtS6*uZR-%4#b@ zzHA7gYpxivxt1fYNXpbH`7Wc?Pvze#aO!h2WYs-o_q<>=3r2AN)lw7@K(>u`DM#kWr)==Vv8w41Cysc97YLcYZX+!p?v$M&apks3|14$ zV4L*})?7vwJc~OWcOULGry$vcDuqx|a=KTafm)wy!&>9G_H`1E7YAh%iJ(HqVS`^7 zrs5hy@*;F8m}JWLj)3A`1}?51_8=&!`*;o5Pryj*3OLH3qCO#?PnIW4zJVz=F2L_=eBUoadON6BA^U@Aud diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn_splitk_parallel.cu deleted file mode 100644 index 07296250401ae1350e793015e0e797a4e5da420f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1585 zcmbVMYfsxS6#brGaaAM)D5V27w3!yE%@AUPb)wt&Xt}Z5dUb5e54s|L{OlxsP+(Mx zsx-OR=RVFoH#>X;7-v5}ce4!ty12zR-7o!HjNu$U!Q&(Bz3A+8f_Ha!19|nWLU!PI zKWVjK%p^0!F$%+Ze8IVQf>?_S;tcv#^@+sSLbDj0=E^Uh0e8EtA}NWmO4jV&-Xl$pCG5uDgi%r+1o zs^*Rd$OEI>017f6vgRg?%UpQMpkYnk$~`5m6=$uHJ~_sg7jDgnM?w)-T9-7>4AGjI z!`yQz8cE_~6L3xD%DXOf7Q`YXN)tl2YIZ|@D|7bbXm6XNKPH9|+7l|D4kkpib^(KsTk(8&=^|DcVtI_1e1cG4XK@1uK|JYd{*`Y~e0W#q*qmkx%L%2h$$APa zE&~gWg>4R75BG{wDA<@viBQyVyjLH9TA3@uO5M2jwIh$`du0=mpj?GvgPUt6!vaI} zB6J~G!KCXR0L9%DY*;$%Ku}Wm@fNTbfq~d1aF{|xeMG*TEQ^?I4U?~9xO4|FQ)X&4 z$=P}tRqF^Iq z(PlFbyu-KMul+lWp#z`c@evN6RkqtfdvLIWy#7`p+i-T2 zG@CGHk{RL{h2cE@!MS&WSc?nd4F7`*FYuHZiFu)gJ<;}hMHFjZK=E743d)DO5+*8tAbJiT`_h;Dh!fgogNGRe;>5Ar=AzCwY zl6x-2UXu7&1l*9h@~#W*C9w#JQiRZ*n%$D$%3N%j_MS7%l<nKjL@D?`E)QL znr*_ytSAt=3eQs2Iz3O}{6h+#I?Gu1#OVKkUCYoUpF6JP>*y3t!*><(Qgflkp7r%+ zs;rhG0(!=*chnKDzWNzOLYs7gnuVmIcWDP?F?*a2vxhM}5p z0Tu!Fk1Uo$@16FpuMzqlPoiD=S&si!Xi5eyEO!ye3F{VKQV5BH)q*=b#a4vDwnrDq POE*<3o$ND7`6&4d^=B)6 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt_splitk_parallel.cu deleted file mode 100644 index c7774d6445374273224a8707b960eab0d91b5742..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1588 zcmbVMYfsxS6#d>`aaAP5P)Y~e&}LesHbaOF)`@Q8qvghK>*1IDpey3X&rZ@OFwmkZ zP44x%k8{t>4j%)?`OnY2JjcJzuJKjx%itPgIE7E}@Bn*HI=kKI-QC?nUVN*NT{wEv zXt!a)6tl!L3Zr>?!A0p3&C;B$_GIH||EZ(77D1|w(?6z`5!x5JxE)T3 zVe7C7lM-Q)|A(-ei zAYOH-dOjIEHug7-^q)l2r6_OHu$+= zD()~OFG6R6Nv3>nA1Ll+;Nsq44}y}qkC%Y`1dPNkfujs6>J#$mWO>5mYnWmk!s3r65mJIKdM;hHh2^Kz#x=_B;6v;LctrU qDH-~>pE8mYHzq!05DNvDgotmk6H%M)%}L|A&!W+$vBMPQ>;3~_2auV#U<2h(=r_bpsXA~OEdoR6Zw5f?91#`?aUV*V8&D=GK;KYVxUIXzV zYwmb}EE#6+%bAzCwY zl6x*iBT0NL0&d7cdDnx^idck1DMBby!WZ(E{8Hv($GZQN^_`mUn0}ucMrdECd^VgC z&9*TURul+bg=eX1pPr|1{yv3|-BqA_V)TE&u4QPF&mC9tWpoOs;kyobsku-S&jxxs zRaQ$8a%DjX-Ehf>$y71olEg^ulI+r3{Z{_09mhJiT~^go_P}#ivCvK+d}?srkre(G z_IHK#Dr`Eo>Os2lRk~Q45tC058t5#p-7ri{x@mf)8x|kl)-m?`m%_RyDeUu{!iK#X zzm8g!Td_Ba<51OnO__AuEvh)BVmRl^rC{xD?cyv@=vcT0;cCEx;uH!tp;96gcOX8i z15!`+MzGlqT)^66k%#AI8Td>g~1JBGP3Gpk9?w$rFeN2p>q=AS8LXjRxDXro4< zns5P@0rrn9RzvTd_HLjN`W8>3UHMs!|4wL1hAu335y%Pa7G6>aiGtOFJ3PZygu%9N RE|TYNs#ZEVV3P82@(23=D}4X} diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn_splitk_parallel.cu deleted file mode 100644 index fccd72b0645add80e87fe857072483ad0f0787eb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1588 zcmbVMYfsxS6#d>`aaAM)D5ZmKXfrKRn<2yo>qNKl(Q;$A_3%r6&=v9HXD8`%FwmkZ zP44x%k8{t>4j%)?`HxS%JjXxJZ}4^R^WX+!ID?Py_y~JXI=kKI-QC?nUVg2RT{zxv zwA(OYido_rh0#2{;39ZIoWqiM%YS1T1ioWdVJ?kuhi0BYe4?~vFe6rRH32<_8r9e_ zmP{mpi#!L)Nj&E~FTeyhea=?}qtI;K1{EBmEki7k%ro131;)iP3*RDw6Bmnl4J5>> zx#tn`!02lLlFUb}wGHE{5P>piI$N}hKuLSWS$k~W9%IL)Uo+yd(8QP4CCxKSjA8b$ z2waI~qY+XQa7E@i_#Si@#3Ce06GFKbL8@!=OWV_p)BUGT-{{%K(eG2s2<;19+zqG1 zuyxFYNr})!yqB)d(Mbj;?=tvswn%hKtoaYvrHVreh38tmjLzUF{+21v4HtS6*ubop z%4#b@zHA7gE3O!^xt1fYNXpbH$u6VSPvze-aO!hCWYs-o_q<>=3r2ANnCRDZ6urB!3QV5}a+ZsHO@MA$#7Sm-u*6a2s+l!PSRBF;j= s8>uN7`naDmk`p&3K4cIJ1($?~@30e5n{NNK@!V(8Xw%qXit=&e4`w|V`Tzg` diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu deleted file mode 100644 index e5c3e2d2ea69dda538040c88a9f090678958a0e1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1652 zcmbVMTTk0C6n@XII4TliD5ZmKXfti4HbaOB)`@Q8(Q;$A_3GG`FLbT=@w1ckLIWVK65~aybDth8mR| zXhS9ifvYS7%1Jn99nV2~JAFx48Kcl>-h1gSqfJc=DVSrXu?NP6Fmu-=f)g8p*$3i7 z)ZFm^Su(m0pdgD8Ywp6h%!Q{68rI~k+*8upaMl{@cW2o0!fgrhSSaF3>6+%bAzCwY zl6x*iBT0Nr0&d7cdDnx^idck1DMBby!WZ(E{8HxP$?5){r@v1PBeXA6J{wMnX4|j{ zD++|J!n0JhPtQ|0f1kp~?#kCaG5SAX*D}<}=Z-7+Iy!~ZaIalnYA)2ovw_|YmDN&& zT$vC;H(WAeGF6PYB+*m59|`|eoDn~&X-HU+TGg3S)kCdun@xHfCt4X6l_AJL@16yd{#T8 zZtjg>Ga$IuwPO*wMs*CS0k_ahh8+M=i_oQDMF^^Q3>5cLuwibo13^h0&uhSb z1cn-|fuj_vDn{h<&a#NfwlMiNhD&!0b7f{$lbmgbQKgPh#cqs0Q_9dPu|v>CjY2kI z2`mHbA5pBD-aGBxKqK@mZbiGYvugh*t|=M1Fx*8TCyZP8NFf9YRtxU%3|kQz+rGI- PURtYK>EwV(%E!qcYqu=3 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt_splitk_parallel.cu deleted file mode 100644 index b2fa630980baf4fb37d268e0ff2cf73b6967bc71..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1591 zcmbVMYfsxS6#d>`aaAM)D5ZmKXfrKRn<2yo>qNKl(Q;$A_3%r6&=v9HXD8`{4o0=8 zN|Sqi?&I8Zv%|-LasJ~|FVFGM^Ba8K`#iY87|!4$JU+tSv(9cedUtm>ke6R8WEYP2 z8|^ksm|~WAMqxBhFSrO^5a+NY-tymA27&LGRhUa7+@YB#5T7V*8O(@PTunfap++@! zj3pCE;3Cg~auUxu&kHcYO<(d=!6-Caw?PHRXv+{wB=gKRx4^hqX5m{zaN=Sy+dx9B zntL804~%XDkYqk$txXtLg$R^E)7he31WMW~&e~)1_82=Z{hASvg(kkVE@_@wVhpo~ zMc_&_8;y{ffGaZB!S|rMAQmA}nh?sh2vS{>U)r8NIo;p#^!KS{g!Y9l?uJuh*g9;& zq(tZ<-b+{K=p=)acNu&*TjaVW*8B(TQpGui!gH-&M`v&pf6J8Th6_CjY+%+)WwjL{ zUp9o$6<3VdT+0zxBxUN9e3#Mcr}A$ZIQ6+6vg)3)dtR`b1tYls@-6C72qroWh*uq| z9#IB=8~aa<^=_<2c%x_K{IW%?GQ{#2vBeajfyvVz4x@y!l?p1~P`-at=h&QP2FnR$ zu*rG`D=s4op2eMxyASu8Q;=*zl|m>fIo+$zK&{V}VWn|g`#Ooov%Rv3L{OpQu))s_ zQ*n(Uc@a7nOfuzr2S9Nz0~c2hdk~b=eY^(jCtxIY2^?imQJ;{{C(9EiU&9pZ7_R&Q z%(T68h7@eQjH-2nDm|q7bFD3{8rub9^(b}|_xK^g{!zt3x51m>2L_=eBrxeHo25jUA>aA2cDr7Cc(_B}d@so^v_C|x z7EGCBhB!uHybOPEnma+P#Tjvi|HWA@@SGWmd8UOu(aQkwsZ@r+f*8r=6x0MNDmu`H zOfv%4NdlA;f6h9dg3j&iEnR7hLZkVR%iJ>B)WndCIc6GrFxH2ex+W2vSRc$jNbaNN zj(cRq=srM3mSfi3#kfpGP8l?;Nn2@7No!NH)fo<+G!MeUAQ|8Aj+psB}J>5zV%lDa$g1 zuKlx6b=v0_F`Tty_}p82dLTx>%Uyhm;nbrW>_++)<*DOJzDu8a*nex6SDFho&Dl_I z$I2=xLaxXNp<6B)F^S5?T$13aQCjeRzL z|5qOSPZaB+&z;WQP$TpsT!?mUXFdMUTvIY~e!DYY9KUYwA|Qjy*jq#2VD_Aes+>RC@>mT zX>za6eVlu4cK8S|&VGIAW*Po{d53SiU;B3$!v%bXrzhBZ)!FF;@9ypf^7?y)?7;DU z(rUq&NoI&+6o&Kof^+W#u@)D^8U7a+Uf?M+67xa}d#L9T#K%$@1~XzLmt#;Ps8Pv| zHe_57xXLo1oP=}M@f@_blec_TFba+4y_eoH+SJ65f;nayTVQM`Gj~lQII*FaZ6H2W z%^eSrM@F{+6l6YR%}p4Wx$u-h!&RQdVa*Qo6+?o-Ogd(oAE@_?_qBS#z zx#vnP7EWoCsaNiOo(Rdm@z8~ zgs#H9bhVF9&r&!!PT|wVBGWxF`d!@F#}tkNbOpPXVMIQ6T*-IQM*$pu%aoUz3pMtv zuh(m3wG|;(c7)IkmyDQ96+Hd0hbdIlN960tvWUsnF!?%$ zOLqV>Wu{h>oUNBpwT@7whgg56l%Z8)yP%C4hHk=^W?3MqE6Fs#QXk z<(%`K%XcmZd<+<;zrOa;6#u@u$9KJNgL{mj3t!;r365S=b~-_Oc({kW`CcMB(E3no zHetdfGsH0pqec9KGw%ek7U#qn{uk$7;2AR#^IQviq8AawCsG*(b7CZy6HsHQP_+YX z$RsClou)uJ33k@;479h?*L0;b3iZZ=m)EYMO-0W(>ymsYi3R| z&!wo>YCaYLw`8Hb>p^ElEJC6XA#|_icjS*Umpi8YmrPS7d@gsae@+b}v@cXP8%~L4 z+pr1CbA+zLvrx5L=NAc_wG#N;T?Km}M!$)>_>{nDfNo$nGIYsjjw|^l`ZR#UcOCLd zbD<`l4fJ-dtdb(+ii!}r<&qJTs(i#HiILg`+l9C4t@v3xj)iW!EX$|tk!P%=p?3crq8mRqtnit|v`d_$RZ+%3vDg`6n@}kc zic1im%Oty{T(jCKGnVHojW7}zzr6W|b8}m<pmR(XRZg#(yU?B|{gsI}hZ9bqg;Eghat=!5yAq WE5cyg50|yqj;dC=cEBX%wA|Qjy*jq#2VD_Aes+>Rfk9NI z$-O@JaqhX<;UmB}`}L)pW%&2y9lq^;?cZSx7w{RLo?!1qXQvaqySrP+>+cn^1IPPG zs|8~wnIVo*7|!Dh&b<@FT3irk_+MOjfv3z!%nL2-p`J$&A4_Ez%!rX(jzNu}MkPDi zka0oaD$9U!63$u2bI{&SUh`GKC^VY)UV6)DQxii9=9p=`17ky(xoZ-^i4DcP2jWB3 z-0=W;Wb{3Ng3O1kxeeno7oIX`Sd+JMPf2UVS!<+Ejp8Qt_z(7u?UIMgwUOu-I71boNpZMy>v8F!WVMm^v{W5g!Y8Wr-KR6 zY#lacMS;*&xR@#$F#C&wv#x>zK;Cq}=CJNuZzQGl*s*D_4W=Z-7+Ci*CV!*7}L zQgflkp7r&5t*o{pL>_AXb_wg367lDD;C2*KRMSVoRoGgo& zYz>pIW4Lq&FjHn~HObj}8CB~DReFf^XG$4bHMR@dsA1?PZ1H1&{ilkBZoSjq^)*6K yNQGU*StxiTH6;TV7PJWDgpCOgDTG47Mj_%UwjyZL?w=>GeHXRT$qtj0kCK1%$rvjD diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu deleted file mode 100644 index 7880c3cb34566e30bc90fc25b89235434f38b601..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1650 zcma)6TTdE66n^JdoM2+qEK;$Wb_5dG8e^lSk+wcghMfVX%+5^aB8B+lJF~l7t5rgl z<(%`K%XcmZd<+<;Kfm)x zWRerONmHPl1Uu_^2HM-{Yr4`Ig?i)BOK%x%Xktjt95eNIU~EV;a}6Rmu_2lFKzzuW zJ02iQM&AR-$zsGB`!Fsu;VFZ%$CZ2&eHOssyAFA+ zxlj|&26{VJR!I?ZMMVhRamk2DRX*a9#7OOe?ZR92R{X3T$3nMVmgQ6S#4}dX(5^xF zRN$&53H&SU?+UA3*tBfbgkmnz`7?1 z?DL$!hP?{Ej#`#mvNwwJP}Y1)nRMJO$~c8$IOofyVC`<@q7^7~ES!UIHsDEd3OSol zDG`cG5TE4*sYZJv*sKSxbnV!L?oq)<nD)FN~(SRRtz^ z>_AXbAAzAqYv3q>vWpSqNKl(Q;$A_3GG`A9O|h_}NMN91Nl= zP44x%k8{t>9v=h7+0W0tEW^Jp@9=H!%is=UxPVXa^aKabI=kKA-QV9qUVp2QU1+~c zS}m9`$qaFf!e|j+aPFNT*5ZOV!~fvI3p`^+VqR!rkMtsf_(UqhU`~wWasp}$H7eQD zhD-_qS6K#>lW@*Do`d#w`jW2-MxoKX_tINNo0=F>Fvm>e4Hz5B%w3ZRPHZUVEf627 z=8gx*BcpEt6l5`C&0QFmx$u-h!6x#3Ce06GE91zL2-%w=(Bjr~A*HzEkt9qd%sG5!x3jpADx( zvrWu|6$L_9;a<8r?bEXqPTDDayjW(sCq}=DJNuBrae%I2*D_4W=Z-7+D*8Bp!*7-H zQgfjuo(=S7t*o{p?~= z$|fQ~xeCJux6n+6C5Grl=u)tPN!L3BihC*8uz1*kprr2OHDEsi1Fap= z2@@unA&yZPE#e=Xc_)aqI492VKREXS&zO;z=UUiPy@(K>NM#tziIH4RK#ifIY6seo zNlxGTbM@kOf6v=LriKyP7b=?#r$n=DY{K#! zp_}k5MD5naWdi4|1U_|F#vX~$?{b$P6F3X#2D_D^P(E{9$#?0q0Eh3|%AoZM#v1j!>E1=zpq|p=Dr) zpp6=ZXu=j)2JAmxteW0C?cG2l^gXUcyVA32{}-ky8M-jtc}PwexA2lca1^W-+~FCv WA~d!=zN)>oSGCf$112dS*Zu+w%`D^q diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt_splitk_parallel.cu deleted file mode 100644 index a3a9ba6c2adb21e5b47ed8e001fe0b0589ac75fa..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1592 zcma)6+fLg+5PkPojDmzHa!3gUwM&AOxCqfgQ<1hjt~T~2*=BFr3uzD^-(B0ekV1(S zIrhw+xtuxUJvIi6v!9=PS%!aI-r?Kcm%$yzZ~>p-=?M;AbauPJyT8AMy#7`pyU>1@ zv|2Esk{aS@0i#8H!MS&wSc?U5hW)|9b3CI)VkR`VM|u%KY$BDRFegSbIRQ0>8kOv6 zLnea2RhB`)NH}L5%RzfPea%+|E1=Q5_tINhG&M0qP)AK;2aFA6=B`ONBQ_MX3&e-2 znPUO+NQ+$nf-FX~xea47=f0rOuqJQiz96kNXRWb5X=BTT+c4rWSHzXp70q))w5H}L z_e}CelK9vJ+>nLxt_R&Eu?X?fgixlq7xI?;R_6TK>HdzVe@qP}MW3sDHk=YoH(?Vh z1VUHgUb;H%)3X##+9`azSSGtCM!$(W`;fwMfUaTJG7QS+jw$&j`Z$2YZ=Gp;6{ z4)kWNw6-GT%8n4aVUiM)DKTP_#7v!%@G@HcRQ|03$3{0pR_zz`f#tNOL38H5eh-4= zobpZr;i^N`GfLrKWB<9aUXJw)@6@~;V7iP|hFBdVCZ8fS&{VbD;zmO(+fM%6k(l^$aKnNp^x8ruhL)F^Zl7Wpy2{!_(Lx87;*1{$F# zBw?3vmI{8Bx*$Ur7E}as!p4M$6hfh3qY&{7TOPD&-<>C~eHyjW$sUyj8z+AOB5@gE diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu deleted file mode 100644 index 7e4b278b4811f5a512309bae6c3bcaec0e90f2f4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1647 zcma)+ZExBz5P-kuSDdI+wN)q^U8~InB*?0&Q#+=ztsgBD8}LYM%P(c1{`lERfRwf? z2!(Lx=ibi;d;%CJzrOa91pmIe$9Mg2!+VUO2Vdan365S>cDr7Cc(_B}d@so^w2z}! z3#LpkO)R4@UWPw7bv7f$;G9^^|Ki+bc+RxIJXe`HRm%YJsgRn%f@s0T6yyXdDmqY_ zOmhO)NdlA;f6iE*f^uf|maa5Lq0xMB!Wl-JifEEE%XDK8#`rK(+awt$#s{+x;(XNH za*wPS-3Q3Ya?F~$7#C^gD1(O4X)ASwhvE+hfBWGS8PLw8+Rgt9_}(7jyTkv~#jK07+t=jhLwW`qthna)QuqF9}ovOGuV z+CK|br+t1A!^NiQ3(VXb;Ael~_fqIHvH`jkELl$B-ZMBqOaT=gS{m%_nw zVYLV^`xUQk6(%;vh)!n+4OJ4BZsg}B-nP8djq>vkm5<#9#<1y047+uXVOzb5-*_$4 zEvq*U^HAn|Lz%GLE=xa!Vzl6^RmM8K%0%0j&@z7)_&Wn0C8vJe(4N9${t&bXU1coyLG@37;(iRquPnBZQBuY89ykb~ z@6iSv$53`Lkgq#S0+ZC3w9as0PhcVS+$fT=dKhKu2$j_h@h4JhS|)Y~O3Sg&#_#{i zWB-X_J@n2hXNL-*AK^kYYdh=lf99H!k@ef1`{MX@^Di;@K*6YtTRg`m^Mh@UFQd0^ NR3%k(zy#%!=pQp&DtiC` diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn_splitk_parallel.cu deleted file mode 100644 index 05437d7de46595df005f5fce04882b693eae9f6b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1586 zcma)6ZBN@U5dNNDaVipGD5YB&XfrKRn<2yo>qNKl(Q;!q_3GG`?a&qR<7X#n8w!j@ zRhr!SxtHgj%O0Ns#>uZQgCxPfFK+S8;OpoXV>pM;@c0M^FFN~u@7>?uLSB8ZkbUUB zi#i?1m}G`HMq#oHFE}lnAlBlXIK%(qybyTKjKnIH2u`doW*11| ztLBb-$OEIh0CKXNu=X~L%TyGULCc!7lNOY8Hk@^)`nZcN&t1car$P}|TGuo$4AGjI zqqN{sw4$g8O~5r-s=^JRzakbPQJN6CRf`+)N13xHM~Ax{{W&v?(4kQ2d^{tXH8B~> zbA&Gay>#`uC#Nx-evIMM`6|;LG5SqhfE;(5uq)|DRHTk8`6l{94B@vL|MocJFQ!-h-Gf@6N2{o2F5;uLb0Q7I7$A`bWJ3s6gQW7xOS5A4g=5=y9Q2TsHhLfmy;y{lQb}C6T_uD zgoQG5t4Ye5WmK&rROunqpD1N$)!060qb9x^zrqh5_Ma+Nx-Fb8+(;u7gp}J=oRxx~ rq^4x-{DS759KSLCAqHP4*dRna$5wc4+IMHsYad0ebhO7L<58ezZ(%z{9aE+bIL}$Ingz1lq1e zD1^Iv?&Z0c13m$alV4x^NrHc0-Q&Cdx8Xg;(1S1V^aMvQD!W~;Jv`h)-h3~SU1%Ri ztrkp~WQI6KVZ01~aGE5|hX2J`F7TWgiFu}lJ=Mzq;!~*%g9R~?%PFV{RH*1c z8#2uZTqg-oPQ0CUJO!QG*=xGe8HGmkA(y#jw5f?98FS1u-hr_`&D1rC;Kcf5-UG>f z*4%LqSuy$^Kt`5h*4&42nTnh;XjqfB(wvgkmb2DGpS7{&nX3u$L@444>4xToAzCwY zn&w=JMik|t2)HFnmAgK4*Tf&YD08`EI(W%6Q6kUej`h!(VT2BZO6Q{) z(X0-evMfXB+CK|br+t1A!^NiUJzaS9omQYjG% zOAwyr1*t}RE7-0Fu5_Kygx*oXN8mbDep0|KHIx1VK+qy|C0OQ@>Yo6`{TQs@TI@hj zQswg&a1emLM;qWchO&zR`Esx%V3Haptz)=!C$LavZZ%0+J&m$-gi3Zp{)tkCmW3UH zHfrpv@h4#AVgJcuJ@mQLxf^PPeuSH7*M8RHzZ060k@MS~d2;-^`Ii`cqF}Y)4$rX_ Vez5KFW%Sxn)k;SPOj16H{sEzxE3p6o diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt_splitk_parallel.cu deleted file mode 100644 index 55eac3eb75786e1ff729ec14a029288894a0e7d3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1589 zcma)6ZBN@U5dPj@aVipGD5YB&XfrKRn<2yo>qNKl(Q;$A_3GG`U+9YX@w1b(O$!X7 zDoyVE+{<&%Wsgq)(pC`<_5s>KcYqs-aX(ZO>^Qzd*Zw@&|@8Aj+psBAu( z5zXqbDa&(&F2lWWbvq}g37mdR;M4gk*&Q+ZRa}G|cj~Y!8K&ej$CZ2){UL$)Tc^Cx zT&SsMLtU?xRaS&t(Gfz|Try%(m5;e3F;ll-yNFgj6@Tl%vC(?SvVF?#dB!RlwBYXB zkDx{&nCLbjTve!QKnXlG4z`WeW;`8WvWiuNSRW%Mn;|sRY23n55KywwKn?J)s14Sx;cYWnjUvu+d@j;a+hHIh#@`5sEsF_wpl9sdHo4C>&S5ZshU&plBiz zl&LUma7)c(SYn7?gf0ZjnRNX_ptzra4U2~z2ui9xUIPvyFc7;2juR-UkI3hfr4f_X zFj*bLr8|U$GIOg*#_DC1ts_+GA=aNNWoX&hK4_!Hp_{P94*~X{DptDnPJ1`h2t^^~ xb`@u(;H}h@j9ggIJdhJMCOjk%3I!X5i09afpiTSktnt!!Q7hfpW0LYo;~xNX7%l(+ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu deleted file mode 100644 index 0227b52148b0c738c5ef2bdc979bf8ae595c2017..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1650 zcma)6ZExBz5dO}uIJHvMRiSKjtu_~sAgiiQ?U>58ezZ(%z{9aEzm$Rc<7X!U0&CYI z6vEv-_wwA!A)f%o>CZ3yG{wKJ?(tp!>+l|9=)q@rdV=F;mECU89v$r=Z@!htF0|h@ zS}m9|$qaFf!gv|~;LJNgti?HThX2927kJK$#5~u+p6X=;@u^gX!Gajc?K|4jKV?l!Aox$ZE9jj&KxrbZ@}1)X6BkiaAHF;Z-Mxb zHFrEfR*b#{kdx(@HTPj$X2Mej2i9b*%u~|Za@LyYvo^LocQqlN2t`~W-O#)+L~CYF zGtZ?sXf%8*0&dAtdDn;TnplKHAwnos!sqgi{8r|2$9nLb^}Sl`nEsdKXx(L5K4p(QVlR)T2#JE#f;&9N WR)oQ}?=BlJ9aXJ#U zyZ6qyk8{u6JvIT1)1ROFX^MYc-r?K+m*E}8Z~>p-=?M;=b@qC}yT8AKy#7`qd(b&- zwA(PHk{aS@4&!Bf!I^iQSc?U5hW)|9b3CU;VkR`V$9fq-Y$}zZupmYjMz}jTOdAE z%^VAmN1DF{Ajon|Te~nOGwyQ=O>44t=5x~CaMqsalMc2_xSA19xFW8wu4!HvqBS+g znP-wW8x0?ufE%(@-u0okA{HTDm=H=8_d?#1-^!eCogO@Q`c5sjj{cY#O7a0$*?cr3 zn$|H>Dg;7T;a<49ozt@f&ORjY@nV(fo*4ZqE<#Q^b=b8GQ}UT(O1_GIpFsSrQ(kJu z)zs6WuGdN{D?+a52%#G$DKV)MVvi-Kc^2YqZJLBGxzm- zP$TD*cN-9{DpWO~1pYM+wvE+hYzDX*NwSJngjgRVCYvEN)M?zpQ4mnF(Lm`NiPLvg zj@@x4u%1u?yR0X$;WDt`XxQkm`S74v4uVc|DG`b~j`#8-P^oic*eD!VzHa34;-F|E z5|pVhY;a3WWmsZ}UW6_=6;!(Z5#+d^fDMa>9q^n~eY^%7L|`Cx4IC#>QXi4eCrcwH ztzoh{hDmn>3uWe3lZ@8OC|gIU)I+R4Rm$XLWBZ_u8i#Je7C#2qf2vsN);sOpP$LwD yB+lX^=)q@re1xNCmECU89vuqJQio|4voX6t`FTcu?UG$gixl0FXS!xt<1%f(}Opj{xLI*(1B3-d^97P?ZT$4 zC=j{|&r;QCpPi?0{xOA5y|uG@V)U!H2sv%TSQ6ym*cd8xTjQ_qHa zH&#|l5prcl2;FeWh{;qj=8{BDos#g#s zL3~zAr0(vmU^^kW)^%bLdPgN6k?UNAP64;nOok-@QH#)}U_}V3e+(4&Q?Oxhu>(O# z9nWjPK?H^xZGhtxswzh0^Ukt}$#yXLE{02Y3=3uER+F6VhEb)CP{nSHKU2!kDzQV* zMvX%@VGFDR>_1Vgo8CL^-B2U+J+4K&wzF>kC$1?OxiH;DASaAl_(&lH3RVm5@Elta U8r#0RNM72jTIu9~Ny;b5Ul|`P=Kufz diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt_splitk_parallel.cu deleted file mode 100644 index 3b268760120ec72e08baeb6a55edc121004c388b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1592 zcma)6ZBN@U5dPj@aVio5l+vvXw3!yE%@AUPb)wt&Xt}Z5dUb5eFLXux_}NL?h7Lxf zDoyVE+{<&%Wsgq)ac4W2IVuym3$NZK7sgKr@Yi$sHtZ| zU9Xi@R)k#95kfayGGbDdkGUitn=ZGlYgZjf*%68cH@YD1D>+^j(!> zdz=ZZCzQZ8>j`YQ3@kVnmO3myJSa{fXHzOALQ%@`UVaBEfo=>Nk>kqOjXYi)6iq~e zG8KjmZmF3JdkoQw(4}BGldgXR6!#ObVfU~DK}preTfjjC24dI1aRMdv5&3emG-A>k zCaYt(bVsmIW^OgfSiOw0b%aVi#QIaE3@sbm2W`|ibQ2c&F~I&)#Y(r{Y43&_p(v!> xuHvi|{46ykBNrAl59EZ62@eT`LcvBM;yJb=Xwx2^H(vWRYNZ={Oj15+`~@xy8Dsze diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu deleted file mode 100644 index ccb3a6fb2b97063ec110cdbf3f35c0befcfe8ae3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1647 zcma)6VQ<rVeAgiiQ?U>58ezZ(%z{9aE+bIL}$Ingzq_kZ@ zD1P8EtA}NX8s9jXf~d(@b5H2u`dgvkxTq zthwVJvSM@}Kt`5h*4%}0nTnh;XjqfB(wvgkRqh2&YD0BJj=wP3tKWBy!IuI(Ik7h)(I%dkU z454fPEL@%Qb_{3jix@6?>r4;C=)G{CKh|M4(zhs29ar*Q^qEiUznhd-nhQ0}*-+QH zvdW5(D=I?hmP|JvFlimbr8|LzGIOg*%IY-A(h(}v4g3?O3@r;g1Z~vVSL648 zD&!9LO;TVXxDMpqNKl(Q;!q_3GG`?a&qR<7X#n8w!j@ zRhr!SxtHgj%O0Ns#>uZQgCxPfFK+S8;OpoXV>pM;@c0M^FFN~u@7>?uLSB8ZkbUUB zi#i?1m}G`HMq#oHFE}lnAlBlXIK%(qybyTKjKnIH2u`doW*11| ztLBb-$OEIh0CKXNu=X~L%TyGULCc!7lNOY8Hk@^)`lO33&t1car$P}|TGuo$4AGjI zqqN{sw4$g8O~5r-s=^JRzakbPQJN6CRf`+)N13xHM~Ax{{W&v?(4kQ2d^{tXH8B~> zbA&Gay>#_Xx-lGgPh&VeUuC)@M(>3C^q~p6l72)*>bR0`qK|!2|68ZL&|IjjU?bhE zmDN^+T-gyq*IY7U5|vN5Bq39;q`Hh&Kb3#$z@gD*$f|wH?s>{;8noc<+Yc{AA(-ez z;IBGVJ)Iby8;4Jg^;$fiUc8D`hFBjXCY>QP(n(msu~$#L5kTb|=O5qKIkv|c!+Jt7 zY_lH2hD*@H4)^K{P)l=T*vK2#zFy$*{Ge zq~Bl&UW6_L%b9e8L!fvNgY{d79SBP5KHdTj1JDz@22NtAs1L}OlO+L@G%#rs!=*cf zg)(!iNy?gKRIMXa=^@mgC}n8X*gj~ZCcYcL!VezypDI?mEu1dgNFx-4l-pIDm4ct7 qrey5=g65tazcKzH245)HAVfUJR(NgNcW2RSA4RQnw8tdn)94?;vKH|G diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu deleted file mode 100644 index 296e163de22dd04e5c23ad2bf79cab37d616e187..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1650 zcma)6ZExBz5dNNDacZTitwLGpHgzr_K~`0r+A)=F{b-rkfJb6meklX>$Ingz1lq13 z6vEv-_wwA!As++A$*-@yB*DM0AMkze+u#9X=)xCxeum>$m7R{)9v$rL_V$IBSm8MGG6A+nNxMGf8YAUDG_*L@B1v zQpZJh5JfH&0e55}o$WzqMJz&DAwuXu&hN<|sjs$7`>&ZMGIP1uvi>>MjL?22)7fxJ z6syA~EYA_T@y|llzG%g8-nxw8vbzfQNVM7u_xWQTb}M|BTxz)#@1oCrQ2$+ryjDDu z6UPRso-3=Q2-%_{gzmUtL?<#IaX~_)cENVxt$Hhd){aA=b(dxNls)m3l{B;?@Sh4? zwIqg@!v40f+J%=bi&wr17i%-3(+Sob7`U7lmu zuvg*NQOj~m_C{eI%9?K}6PDXW8K+PT=X|-$Si4)fXn6`P^XI^y4S15ALe3^s2!z5C zglBm{s?pvEHtT^aT{|?PdtC4lxK5>?6tD}$gueg~vTQbmVMP(F_S0fn0@ssI20 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt_splitk_parallel.cu deleted file mode 100644 index 7d9dae19af858ca5afe6248f06f6c2b8b9d179fc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1589 zcma)6ZBN@U5dPj@aVipGD5V3oiJ2Cu%@AUPb)wt&Xt}Z5dUb5eFLXux_}NL?rUgc$ zDoyVE+{<&%Wsi>mjvp7%Eg_ zPa87H30$QqP)@=*>v#s*+v#h*G8lzs>)uOm8Et7|NX{HH%^ff{l$p5}5uDgi%q|cg zs^*Rd$Rne>0CKVzvDP+>%S?F6plMCk&O9aUHD~RyK0U#f=dNbNW1)yEtSg%5hG@;q zQRcZ6%|^q=Cg6rFly^PoE{R1*6efi3)cltGQRaN(X#b_7sS-Yy8>fFx4I{KKR5lw< ziDq@!gylIxSK(f`I;STI9G{#eaCWgwc2A7n3HRwk9d<3llzis6l5e7qL(=eDrM%Q! zsEKC-U9Xi@R)k#95kfayGGbDdkGLc;Q>S3Nh*munf2+W;(R#?Teaaqq#wr@L;O^Ux zphh8>=rkZ)Rj6t}2|PFUH;vV1JRe}Pj8%kK9U~^2A~eux+`?fHP_ouQ=^N%B-&Z-d z$CLIgW?o&Hlb1?6m=Z$n>3WAiaW4TI77sfRlvI7Z1?)#)Aa(^DB~VfykuN7pBPOk3 zvO0!KcL;N3W>%Ao)ypVbN2t_8tUp!C(6X_8&_<0yH(`q(1MELlEOqOh_HLjNibBfm wGR{)L8>uN7y0D;mASY~0ct{`=3N{K6&#)CioA%v#rVeAgiiQ?U>58ezZ(%z{9aE+bIL}$Ingz1lFz~ z6vEv-_wwA!5uX6Y$+k_%=)q@reuk45mEEq_9v|-^Z@-nuF0|i8 ztrkp~WQI6KVZ01~aGE5|hX27?F7TWgiFu}lJ=4no;!~*%g9R~?%PFV{RH*1k z8#2uZ+$0H5PQ0CUJO!QG*=xGe8HGmkF_*bzw5f?98FS1u4!~HSX6l+maAJKjhd^?l zHFw-YR*W73$jEZcn)@&=Q;|~!4QtX?np4u+a@LyYi#E1Ab2TBJ2t`~W-O#)+L~CZw z(ws}th@w0c0e57na@U9MnplKHAwnooBG2SK`K`>=j`iRr>j$;iG5s+!jL?Bl>3lRJ zn$E`}Cm>yOq96dFr^5Z=%nAQ2*V8yw+T( zY0ieao-3=Q2)UvngzmUx#3U*kb4fy^PQiBJt$HhdHjYD~b(dxNls)m3l{9oB@Sh4? zwIqgrg@avTwF}#pt(y?9eHAV?X2hg3goZi^dpGhU6K}g->PFee_f?Gj2F9@INeufu z$FOCu!f&FM<(BM?!#tEV-%=(Wcgr$Pp%^XrY9&~wSGj0=3LW$3z@H6xQk+7@rc_FV z!V-jMc|oet-U_zsfh%1nG@*A=@DaF9m7f%FOU}^r23~oaX$v@w-!4P zlvMe=1snvR@6iT0j-l*gK)xI-37DjYN$VIc-6<@TnOjX#R!^fW9ifulkbk0-p=Dvm zpp6>)YWxXUdDwrlSPy;fbnb>4q3_`)+O?nc`0s?KWaRvIXPz9tZvG_(pD0)@xWjX7 Wg&%DD?kalisA{F7BPJ=IM1KK`Gb^e9 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn_splitk_parallel.cu deleted file mode 100644 index c6be5d7b5c7725e43924fb9408db1dd805010f79..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1589 zcma)6ZBN@U5dPj@aVio5l+pp)#7v9SW(cvtI?-)>wA|Qjy*jq#7rG*T{Olxc(+)mB)e zrwy6p1g_H*C@0~Zbvy&@?esNY8H_@+_28wqjJ7l}BxjD9<_;Je%FJAg2u^G$W*3ML zRddG!x8>jm(oxWG|jiW!Nh7sBqDw_?b zM6)_(!txxU>u@hzowL&fPEOAgIKNzGdLTycg!}lu4!ez(#)pb?5f y%Iz}FQo$RkDH*!3pm`uCY)p7aAQTEV3K7q+6+xT!@S^eBcTp?d*kh9NapN!RsTd~! diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu deleted file mode 100644 index 9bb8ea8a2e35dceaa177de902b3c844ae34f2814..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1653 zcma)6TTk0C6n@XII06YVl+sbQiJ7)in<2ym>qNKlXt}Z5dUb5e7rIvb_}NK%p@Y#V z%H);+^=)z}sdV=E@m7Pw|9v$r8kHPs zLnZ}*>nsDxNjPU6&p~@TeN9&xqj1o8@X}jG8=4qWFvrZn9vB=_Sf-FX?u?yoe7oIXWuqJQjo|5K4b7x=7)oyK?qGjNS|P>BBbcMutN9+;Jt}M4ttx;a@e0*Q~*ezfR>zbsn%X11F z_Gs*CS0k_ahh9v+|i_n!|MF^^Q0u=XBuwihq13^h0 z&s)HL1cn-|fuj_vDn{hX&a#NfwlMiNhD&z>b7f{$lbmgbQKgPh#cqs0Q_9dPu_Mq% zjY2kI3oHZdKT)ij-aGBxKqK@$u0^}Dvugimt|=M1Fx^EUCyZP8NFf9YRtxU%3|kQz T+rGO@UfZi$>Ew_}%E!rH{SqwT diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt_splitk_parallel.cu deleted file mode 100644 index d5f9afb33809e9427f47e314f2d2fb428a611b2d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1592 zcma)6ZBN@U5dPj@aVio5l+sbQiJ2Cu%@AUPb)wt&Xt}Z5dUb5eFLXux_}NL?h7Lxf zDoyVE+{<&%Wsgq)mbj}huyI3W=Cr0mt`}n>NyOv>4K66~jH_;~{Y51*EUTQAX)U%P@&J!IKFWe+@K6%AT&_w{=aq!3JW z8xXE4R5han{xuGs8>{8m%Reu73m+_Y<&T_pk#&N!7<&z(E8CV%NZN0wwhk`Es%}V$vEW zt7EuyN3c+4ZZ*kRy^OMTgi1Zc`ctJ0EgRbhZPYk)6BhX~!2VOkO1Iu=?}i$oD5Tu3 w;;a<>EHxz~7Zx-Rt<8 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu deleted file mode 100644 index 18047dc247afd53e9bf23c57273dd9ddc77eb3a3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1647 zcma)+|4$k*6u^JKzv2ZGcDr7Cc(_B}d@so^v_C|x z7EGCBhB!uHybOPEnma+P#Tjvi|HWA@@SGWmd8UOu(aQkwsZ@r+f*8r=6x0MNDmu`H zOfv%4NdlA;f6h9dg3j&iEnR7hLZkVR%iJ>B)WndCIc6GrFxH2ex+W2vSRc$jNbaNN zj(cRq=srM3mSfi3#kfpGP8l?;Nn2@7No!lP)fo<+G!MeUAQ|8Aj+psB}J>5zXq%lw}!0 z*Zx_kI%n+|+UFNBeD1A1JrJYcTk*3o91^XYEYqj#k*BOILni|Nso<&~F}xHG zo(ro*c-gObZL2V`IYvx6Lujazuyi9oFY&hJrEZj+f2@4$HZX=wPh!}ua}3++Rs6zCgu(`dXL)^8 zbG;R8cLP_tPG~~!sHjJ%b*lWNfLm%N{rW@DB6KBK=7Z`V1I7IqtY2B|Ku}V}^By<| zpzqNJ9LG?0F_5o2O9GSBn6%Dt>5gHc%-m{{vU(V0>Ijw94e=*R8CoWG2->Kz&&Kcn z%47eDVmR$N!mYN=D9acjk-Z*Ui7g-~$D#1$TIkt?+|wKU_v{ O-KbjW=zvMeC(%D9&nkET diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn_splitk_parallel.cu deleted file mode 100644 index 7a66c1632b513fc2a87f2470cc87e705c293ac59..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1586 zcma)6ZBN@U5dNNDacU(5D5awew3!yE%@AUPb)wt&Xt}Z5dUb5ecIb-u@w1b(4FyJ{ zDoyVE+{<&%rO79Naq{a+KS}WK%R7AA|2n+G7%t#5JUzkwtIl4}d-wLXkk{W!WDhzA zQM(OOCYd3QQ5Y}73r=$I12u`doW*12A ztLBb-$Rne>05Y;1v(`3@%T(l)LBpD~o#vFZH=MO6`lN#`&s@!jCqfZdSl2W!4AGjI zqcrDIG@>XEO~4IVs@(OVw;~oHQJ4_AQ;S>jN15|yM+dtc{W&v?(1B3td^97P)iG0+ zWe8pQd*SMybYke7p2hI#VwLHh82v8p*h9`f)?wGuk0?(aSMq&y49EVrPI;-hP}7_Z zb-h+rSrKwYM+n_;$%si*Hs+FqOx=R&B3kuS{H+6rM(ZKV_9=VdDXVDEg1c`&ycC6C zq8ov~s!-K*Vt8pBJU3Qr@p5|cDpnCD2O=R%P&AB&5dCrZ(RAhfyayeqKQCIs{F9Q zEj5#VgCTekx)dy9()AC4;(iR)Zyk0ZD5?5*4>$-wPwW~vj-jMJAYV_G1WZ!Hq;(9J z?hqEr%&jIVtCvx>j!>zGP=BJ7p=D!xpp6>)Zu|;Adf0!eSm`!*I(I{jP!LjPS8-Mf sewLb&k@E|hd2;;5_=gyLpwP_E|qqjbaTIs0CB;}LnAJ8ur^#A|> diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu deleted file mode 100644 index 86899145251d59b31bd0027a459b066ac3986d4a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1650 zcma)6ZExBz5dNNDaiUVyRiUhOtu_~sAgiiQ?U>58ezZ(%z{9aE$1Vf)$Ingz1lq1e zD1^Iv?&Z0c13m$alV4x^NrHc0-Q&Cdx8Xg;(1S1V^aMvQD!W~;Jv`h)-h3~SU1)!Z zS}m9|$qccK!gv|};M7?`@*HQx8vYk&PT)B+67x)p{6sGUh)<<53>L&lE~lU-P@$p& zZOAkuaGfMTIq`P6F&tB7&&L}jR4^FzA(WWMbWXv+tcn2o;X{NSG1Sh#q<~&YD08`EI(W%6QNm?%$NJ~YFhU1HrSs8@ zXjX?!S(YJm?Vp9JbJmWbeSQ(c=iWNl12Otd+^L6Le5%84r0KkZ@SCV*xg~q!Fb`$THUSAu0ess1rg+>asmTZ=6S zN~(O`0uBPu_h)YWxXUdDwrlSP#9k+S#E-=tsDT`P$EV{C7fAGO~WVGf$3RH~$iYPZV-3xW#jv W3qRQThs)@-qpFpT4w$5T68!^}3@fex diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt_splitk_parallel.cu deleted file mode 100644 index 78c0283e0865719001348d449c53abc1c2bb6f14..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1589 zcma)6ZBN@U5dPj@aVio5l+pnMZKg$PGlbY+o#-|`T5jyNULD)=3tbUEes+?!X@Nmh zrOBP2dwK4;?C>#Qoc{XKOH=&&@($njz7Fm%h70%%PfxJ-qO;o#-re0T}t6z*U+8Fj>YbLadGvlT8sC=rnHOFbF7FYoPQE^V1Ji zj_q+Ku$oW;+pH(B<}$G0SlH;W`S74Pg`7>Oln6x~$9wq^sMNVOtQC$cUnlZ-u~#$^ z3CdI$Hn@dmGAuDfFG81sQ5{PZdkudZ)b`XoRAW ya=VPPRPaV>N`@{hXdcK38xtN92!(=;Lc}v{MbM_*KX1JDUDQf9c9^7m-1rCdju58ezZ(%z{9aE+bIL}$Ingz1lF!a zD1^Iv?&Z0c13m$alb>JuNrHb}Kj8cR*Wm-k(1XwL{0v7gD!W~;Jv`h)-hL~QU1%Ri ztrkp~WQI6KVZ01~aGE5|hX27?F7TWgiFu}lJ=Mzq;!~*%g9R~?%PFV{RH*1c z8#2uZ+$0H5PQ0CUJO!QG*=xGe8HGmkF_*bzw5f?98FS1u-hr_`&D1rC;Kcf5-UG>f z*4%LqSuy$^Kt`5h*4&42nTnh;XjqfB(wvgkmb2DGpSQ8)nX3u$L@444>4xToAzCwY zn&w=JMik|t2)H9lmAgK4*TfQP)JfR8ksq0O+x1d6$}T=sG4>l6!=@)O z?DHJMmc0tUiCUIhvNsO%P}Y1)nRMJO%Q%H%wBW0iV4Ys&qU|Yk%%1~)HsDEd3K^SH zDG>@w5T4})sYZJ%*scezbe+(I-ci9v;5t=)Qot=Wll}rg&?0m#Smu-Jp8&=E7_8q~ z>_AXbFOjJ7l}BxjD9<{L0Jl$p5}5uDgi%v&Hn zRLva^kVi(}0?5g7%v!rJE;HdNgQhiEJM)yZH=MO6`lN#`&t1)kCqfZdSl2W!4AGjI zqs((DnvI5!O~4IVD)0KxTM>(pC`<^YO88vflHbalZ=D`Iclu5(wvPUo8Aj+psBAu( z5zXqDDa&(&uEM==bx%48bWYC__;|6(bWe^1P_ofL=^N#z z@2ecU<4j;Zp#*kWPhi7kV8OAl(P8u9L2(K>n^GweiaL(>@*_~Gb7R;j99O<>SdIzBUI`k)}Jb6XxZ34XrsoVo3O=?0rsCNR=V|0dpFbw zMIq&O6=$X3t<;o^Tv*UNkP|j0JR}eb1sjEk=h%v%P5bV=@zQruE8W;*lJZI8FZw1J AD*ylh diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu deleted file mode 100644 index ecd87aa35bc9b3b21bb70450f4480ea16b1f39a2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1653 zcma)6TW{Jh6n@XIIHFS3R-vqPtu_~sAgiiQ?U>58K3XO=;NjSoW0!&Y<7X!UQr50S zQ7-5BJD2ZV4)_ExPJVvrCkg&_{ebWLUxyDELk~W~^D`X1s_b^X_V92AdHbzIcAL&lE~lU-P@$p& zZOAkuaFZlJIq~Om%Tv%UpS`6kjZtVcADwhLqfJc=$(UuPu?HsiVWzf81Sh!<2 zQFF^ZWX0${fQ&51tho#0G8K+8XyhhsrH+!;mb2DGpSN+&Gg}kliBQBA(hbcEL$qe* zG<95xMijY_1l*COa<&iMHL(bZLWEGFgv;bT`K`>=i_?QWPyd)1M(9AObUvC9&FZiz z%QA#+{IgJX&f77xFD_&F)LT1yBu2lBJM)msk9F9s^o4S%vx*6M?@fa8;EU z{uK^h3afG0R_#G8iju`^TZM_uF=EmgLPMQ|(Hr@$iMNF>b))R!L*-+)fH7=p62mUf zF>KkZ@EfmXx+Qz#&<|zKx0FfC?XvV!C`JpuS_#(aRVLb=Ld*O{@S6jk6sM4}DU}kT zFbCmTE|IFcw}S12;7ZpCMd%$Bd<3pjU*Mem}sQw91+>asmgNrQ) zN~(C?0}cYv*JuMA$52)=AYXTu1WZ!Hq;(9J_5>En%yUgrRu7|09iful5PzbSp=Dx+ zpp6>)Z2T5jdDwrVSU0`1+S#E-=zCa;`P$C9{a?7IWMutxXPz8CZvG<%A1LHnaEs?S W7rwFcp-=?M;AboP3|yT8AMy#7`qd(e5; zXt!ZXB{js+9LCG|f-~@{B*EQe<6-b-(3-qOSnK^-;C9WXYOnYk9>jMz}jE)XB8 zW{w5OBh7aK2(ld0);5gEjQgBI)0(WE`JA*joV6$Vq=PLJu4cp&u81qFYnm5^Xid#g z=9%QpM#IM@;D#)fcYWxsh((AOCWKPOy^y!$w=(C?P7iiG{bOb*$p>6z^U;iGT8B-k z5C~m`d*SMybQ0*Co+a?{VwLQk82u*hI6%%m)M3{$49aJYDfu=!f#dL7r@YjRtEs0$ zU9Xi^R)k#95kfaiQesji#!Ql!sap_UM5~^Pzjff)Xgy@veoh})Mk^XLXYT9wAV|(B z?=~P@Rj6u43H)muJU3R$v6tn=ZGlYgZjf*%68cH@YD19Sw`o7As zJNk>kqOjXYi)6iq~e zG8KjmZmFpZdkoQw&?TpWO4mPx9QPBjVfU~Do|CGNw}68P48*R1;{;0TBl6{BX~d*8 zOjgG*=?-C`%-m{{(RvwW>j;&4i1nvRnY?UlAGA^9&`nt6#{m0J6)W9(r@b3$grbmy xUBy``_*v?lj9gey5y%M}6CM%>g@TPj#B*$U(58KN-gxcPsFiN)QJJ$z<1ZpM8DRhb diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu deleted file mode 100644 index ebb5a2f3abc168d6e1bf6e3f1d5938f8fd5a03aa..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1646 zcmbVMS#KIK5PsiZF(OiwL`!IiDzy$E!8WQ=C8Y{&^Jtm10T-`r`3ME&$JgFnj*z55 zMav#D{^t1RIN}q)IQ#jzpJn*h)g8X=e;wXo3_bV+kB@NjqO#i!+T-Ir&ms3y^s8Pw0 zHe^~5_>yHnISJ>i<2h(=XRqlhV-y8zvSz5R}yMyagOYV5rdsI8LFe zVnn{|EQ^?I2b1q&xO8W*P-bp5$=PlgRq6;;?8f*rr3|eSI|gmkIAjyXe-&W=h+^IJ z-f8cK8lmrTAKJB@b^AYYP07fG;VuF>Vcfz?3L#LiT5yNw*ox5D_VhA&ZJ}zVlOrZ6 HpCo?)QfVqV diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn_splitk_parallel.cu deleted file mode 100644 index a678b28cbaa9cafa05a668d850588ccc0b67fed1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1585 zcmbVMYfsxS6#brGaaAM)D5V={(q>wuHbaOF)`@Q8qvghK>eaC=Kj@11@w1ckL4i@N zYLoO{pZhrX-0blQV4VH@+|M%n>*5yQ^uG*mF@|&a1doq!@S?NV3*PpNec1z@qC%+ju`zPuqzoR`A);v|WRp8j?X2`mG%I4m%K(G=01U97JFsb_E=#P*Wd~FDJ_)CfmT|n;0(L5zLjDSxs`b zSw`JDLbV=Z{h3mR){X6hHfkKY2^;(nVE?FMsax-~cSDU(6jEuIah3{xlA4l{3;S6H ja>B-hhZI7gV51Q63|kSjX%Ekm*WQU*>12;d$|uQR3fmU8 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu deleted file mode 100644 index f330b6d73cc9eb10cb2d133068565f883618118e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1649 zcmbVMS#KIK5PsiZF(OiwL`!IiDzy$E!8WQ=C8Y{&^Jtm10T-`r`3ME&$JgFn&LmA# zwCpkCZ;o${BR&C)v!9>)S%!aI-QnB**Wn$;(1TC#_y{M@D!bjFJw84_UVp2QU1*;s ztrkp~WQI6KVZ4kVIQLEvYjHuG;eT-91)eh_F)y^R=Xx1Id@7Y;upmZqIR!O=8kHPr zL#73RFIfhZlW@*Do`d#w_L8nLMxoKX_tINNo0=F>Fvm>e4Hz53%w3ZRPHYI~Ef61~ z=8gx*iqW?K3bGut<{^yBTzJZ$VNKr3JteIzXRV38Xk*I@wz)|>AFyi~isW<0m3$TbK81K~TwZA|)YP+~-VK%2 zQiNQY5JERxGGa1SjJYJyQ>P@mv{t{BkB#A&=dQ^rd&(Yo&MFo<34~t_t}BwlQ{iA= zSdYTfilu8?rHRckV)7Y6L!HI38-5PswXPG3&^sylh+O9?bPBkoW-=@Qh+2fM1S>*N{WGArpMniTiya6`>UdrQ4k9qr zXagLlP*pJ^pLdo;Otyo`cQIVLGgv4yx0>W^H;gKEgerDp{FzdQR*4;hHfkKQ2^(M) zVE>3>-SpmR?}i$o?{OvCwVieQ@42RAKa zmwV^j$GPY39v=h7`OnY2JjcH-Zt+d;%itDcIEPR0_y`BjI=kKI-QV9qUVW>OT{t;x zwA(OYido_rh0!AY!A0piI$N}hKuLSeS$k|wPq5?CZy51dXyQxjisrc`#xQ$a z1g=E0(FmytxF!o7d=I)yVi6Lh387m(zahW1J=;3kf9`0mMUZOi^pB}!g!Y9lX2U5l zY!fzNQX+I2ucfPVdXhocY?jJu zD?+|(2%&4P7_qsQBd$ow)G5g>qt#F4Z5256xf!zRp0ayhu$l!Uxc~Y+icttAIt_@Q zI#fNM44xYM+s1k^p3W~@#wtUsjuBf-5gM30E#WZgCtFLP@(tyOcXf{4X=bpRPzJlK zXRzinvfx=<=(zZBuQ>(DCR8bel8n=}dI!|vTpQK`$F;AMcsxHSn@9u|Iu0BB!Y~zg z7?KyE3&A8)zIOx^_cCyC@303!N!`b5z;M1& diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu deleted file mode 100644 index db6e22b935df5481a92cf228bd0af5c2ee891582..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1649 zcmbVMS#KIK5PsiZF(OiwL`!IiDzy$E!8VFgC8Y{&^Jtm10T-`r`3ME&$JgFn&L&M% zwCpkCZ;o${BR&C)vmc-PS%!aJ-Q&Cdm*G9e(1TC#^aLl*D!bjFJw84_-h8c)U1*;s ztrkp~WQI6KVZ4kVIQLEvYjHuG;lFX=1)eh_F)y^R=Xx1Id@7Y;upmZqIR!O=8kHPr zL#73R>nsDxNjPU6&p~@Tdr4OrqtIwRcuqJQio|4v)j8t`FTcu?UG$gixl0FXSEhrOf5t>A`cS@6}@O==Ygngbswt=c5_X zY!@?SMS;+DxRt<`VkV`Dhxxofh@p0Y=tvxx!iCw{Wm8 ztVdy6v2_j7wXM>`<`^;g456XU;@FKs$E4e$SGrO0;a%Qz}LazFG;^>D4CMfkMZ^HV9h-9u=ohuql-ip*REaUM-Ni zv$ulnbl_UoiACt0lzc?4a}_!T+)^_c763#oLRW$nA*lWtP~1(wv$L080&pPi)7!9c6p zB)!+?KF&Qi`+N)-7e7AsiUR+Fjo+_uya$`RQwo?850$ z)^5XuDQ1ah6h@2m2baMM;vAO5TmBo%An=S?g}F4sotQ-e@rlxw!JJsd)dchy8dSD# zESX3GS49DolX%W~UV;g3`kb#0Mq#gY7gTVJwhXaEGSBSZD=;pWS^5?coVZxbYak(3 z%{`Bh2S#55kYq7ptz8&br3jS4p0j1U43xCjoVCa1>=ZjL{e}^bg(kkTu4tZHVhpn< zW#CHeWm!l~z%^Ou;Cs+r5{rOr47CDq8bY-By89pPM1;?kT(HC2LqPg8MJuq8No>qLV@V zG@+XLw@fhp1w4x@hZwFGM4P`-Q9Yu)!}3 zQ*nnOc@er4Ofuzr$3Sr}2N(Aadk~Z~eY^zhCtxIY1svs2Q=gDeCo2-B*ua#V7_R&= z%(b04LrS(;M%_9>wH{Lah1QnVjU9lodK9~fOZ*UF|EOZA+u%*`1A|Z!l61>BO9gMG qrex^je#%Hr+?e=~Lo5_r5+a^qC!#jp(RudVYtd+v?K4IBIQs)LoEPi> diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu deleted file mode 100644 index 161e1337315dd4ac59f5b824d5e3d6e3dc370a54..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1652 zcmbVMS#KIK5PsiZF(OiwL`!IiDzy$E!8VFgC8Y{&^Jtm10T-`r`3ME&$JgFnj?kn* zMav#D{^t1RIN}q)IQ#LrpJn*x)jhuJe;M9m3_bV+Pfu|2qO#i!+T-Ir&ms3y^s8Pw0 zHe^~5xXv=5oP=}M@f@_bv)6Q$F$#_5gO}bi+SJ65f;nay2ViUnGj~lQII$s^Lm)mx z%^eSr6{Cj$3bGut=01$eTzJZ$VNKr3JteIzXRV38Xk*I@wuJ~) z?OrX)ilu8?rHRckV)7Y6L!HIR8-=Dxw{@>{qvFH6+Q)wVQrOfag?*k=*s@pSH(slB zEB3~*9jcsfD3gx6W#y+-j23*g60Fm!O|%1rj)jE~76&{kPN85^DkVa34C1}oA$4*N{WGArpMni@iya6`>UiD)4k9qr zXagLlP*pJ^Uv`#7Otyo`cQIVLGgv4yx0>W^H;gKEgerDp{FzdQR*4;hHfkKQ2}@uV zVE>3>-SpmR?}i$oZ*eQywVieQKXXmV$c5o90y$ya!b=JvP_SBXhv(Re(Af6$GI?#S NYNeASCMlmJe*iVWETsSd diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt_splitk_parallel.cu deleted file mode 100644 index 9269ac054465c4a44cf407ae57d5db1ae43e2374..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1591 zcmbVMYfsxS6#brGaaAM)D5V={(q>wuHbZC|tP|bFN6U@f)~jFggRY1lKRZbubTH7W zHc9XGxsP+t%^sfs#>J11{i47>FK+O4|MTz$V>pKo@bCZ!Pda07;f(*4~A2Rf<3vw45zFWuT<9;jA+;rzhBP>9>q{A~f-pbxrfa5@VP> zE(2Ghm1Q9{0as+HgYQFcMJz(1G9grG5u~~%zqCDjbb9d0)8A*75jqgMoR4P2ux;3s zNr}*9yjHI6=}8Xh_x^m9>y}vaAFxj<&MB0hYxN@fZ4T+XPI+Ot(9^(%X1i3@SP}A7 zLkL}Q#fUAm9CJldrf$V|6|H%yZtK9Q&+U+P_mti7k~J(C!TpzSQIA3}(aj)!no!M% za`@Xgcx-HTV>7}Vy{P7wuVPgp*2jn~X9x{Vk@j#DC6sSeQ2R#m-J2%I?lf~)Pbi07 z)^pf!8Cmcw?sVLJxYL}1WK*gXLP^Q#TE7MweQpdJjpN4GO+20-R81s;N*#v{ercGB zYYfSY(1l=$Ini}OKH2J zAr!)$pS$OtyYm4b1IF3UFTE_ozbF6a+4_3@9hzoOL`0?d|j>Uloi(qj~40w~RJ5F{EIQnZ_O%8))XPNdzZ0kl6>~ z18eShfGio^2T+j3h&6X%T;{@41`TWSR_-ZjZ8U3*_1P)5yl`7aJQj+$(z=#;Ziv>* zoaCNM(MS>>i-2pgP~P>Rvl12|QJN6CRr4G2Tbc7mNBjF6{V_F+(7sUlY&a#FZDS^^ zC=j{~&(hUCJ5Ax^`xHKPSBdV3(fQWxL2G)!6sBngyI6kXLWnjW4$qK zRs+|*c5FiTsMI5BovSb@;1-(6u>BCd2wezP1X8_YptzTU4I7Ic2uf-`uL1iJ7<#k@ zj#8+)7?IB>%OWP*!sOc+F5NNAm6=&ha<)yQDjlIp-N-*v%FwE?L(oQzLN#IimjU(< z7OSE6PJ1`d2z`(Dpx{_ Ka=;|zfSZH5pVtP|bFN6U@f)~jP%e$W;1<7X%7g94*k zRHeziKKF6%xoPq-V4VH@+{-fj>-+{^_r45nForYu1P>3e|E#my4c@)IE#&353fYC@ zgQVSt36sna$0&>z@dfAJ31TfSh%@{TF1)}qW+di?7WPOlB8X3O!#!#b@ zrZ!|!5V*)Ppqzwr*6|#)x6_w=RWJ&T)~%P`GTPF_kb*g88arTYC^L60A~><3m|Y+~ zRLva^kb6dV0Tg60Vy$f$m$~qiLBpE7oqI~!YtGtZeR7N~FWiO^kA)(xw618L8=^Hc zN4e)xG?K)}Cg6%Jly^PoE{R1*lqQ63)cl(KR_654(f%$+e@qP{v@cXX8%~L4o0thJ z3WP4gy>xX>j#K#XE`^V0%S5-t=>LFS$}l0HJFeub=(j1v-zw#~=0Z(88|ckaS#3qg zl?@?u#U&#qQ^kl&5;Jv5rpsvcQ~9?F9Q)i1S#?j@9nV?Kf)?C;{T`Gk1QVSE!c~W= zhm*onWB;+S-ioKgOP8_A5UXRvGZ+b^4l1m!9W8{9%O85S6# z7ol^(3MO6e5Gd}YV8ha32ZEBikJo_x2n@uofTI*D>Lc>`WLd;y8<>0(!=*cfxiT}W zNzOLQs9Hy;(nG92Q_9e)u|3d6jY2nJgYN_EA5|=M>z(#)pb?5fD(o`OQo)Z>Q!;d6 mKZ`(4*qHE;LMRk$6e6BsD}pxd!D;f+H&H8{G?}D)ocskY{uaOh diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu deleted file mode 100644 index cf467004508bd41551eadc14db74497fd20d4eff..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1649 zcmbVM+in^$5PjdT7!fH-qb0OSTeS`#!8VFgC8Y{&^Jtm10T-`r`4S4s$JgFnu1T7x z2rbLZoVlF29Plw>;4;gAauV#U<2h(=r_bpsXA~OEJ1@Ouw5f?91#`?aUV*V8&D=GK;KYVxUIXzV zYwmb}EE#)qF*nblfefIHh7Z=gXyF?QZSjG*IYRxCY^Bz`f!W3O1oqA{2KZKC1&# zPxeNz*$!Om+OY}Uqmqxvb*{psfLmxL!vTP(Md(7XA|%y228w$r*s!$NfuN+$=Oti2 z0z;41z)=cS7bEiNU|GauTbO(s!=*chxiT}WNzS&@s7gnuVmIcWDP?F?*db`6MxmN; z0hR&wk1SS0@16E;pb`2WPoiD`aaAM)D5V>0(`H(vHbaOF)`@Q8qvghK>*1IDpey3X&rZ@OFwmkZ zP44x%k8{t>4j%)?`OnY2JjcJzuJKjx%itPgIE7E}@Bn*HI=kKI-QC?nUVN*NT{zlr zwA(OYido_rh0#2{;39ZIoWqiM%l}{*1iocfVJ?kuhi0BYe4?~vFe6rRH32<_8r9e_ zmP{mp^E?O2Nj&E~FTeyhea=?}qtI;K1Qi^kEki7k%ro131;)iP3*RDw6Bmnl4J5>> zx#tmb&**CalFUb}wGHE{5P>piI$N}hKuLSWS$k}bkFev?uNm=JXyQxjlIEEu#xQ$W z1g=E0(FmytxFmBOd=I({Vi6Lh388B}yCT1}J=r+gf9hziMUZOa^pB}!g!Y9lZiiE1 z*g9;&q(taE-b+{K_$Y%9?=tv!y2y1ytoaYvg^EK8h38tmh<=+v`YltQ87}lBuz^`G zmDN^+eAy5}ms~Mob1g?)k(8-Zl3hltpUS^w;MC`O$f|qF?s&m!7L4Hj>-Q)|A(-ei zAYOH-dOjIEHug7-^q)l2r6_OHu$+= zD()~OFG6R6Nv3@704VNd;Nsq44}y}qkC%Y`1dPNkfujs6>J#$mWO>5mYnWmk!<9dP znYOpikbs3r65mJIKdM;hHh2^Kz#x=_B;6v;LctrU qDH-~>pE8mYHzq!05DNvDgotmk6H%LP|D^HUXVGZW*kOwDapNyiCl~er diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu deleted file mode 100644 index 9fea507421b3a7c564298daa21c6271d8bc15882..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1649 zcmbVM+in^$5PjdT7!fH-q9wFRTeS`#!8VFgC8Y{&^Jtm10T-`r`4S4s$JgFnu1%V# z2rbLZoVlF29Plww@f@_b)8}-RGYXC7y_eoH+SJ65f;nayufW)lX6~9qaAHF;uYvfG zHFrEfmW;jzP>{ulHTPj$=E7434QujN?kQ<)IBSjd`5Csna9ct=7K*r1x~6$$y71olEg^ulI+r3{Z{_09mhJiT~^go_P}#ivCvK+d}?srkre(G z_IHK#Dr`Eo>Os2lRk~Q45tC058t5#p-7ri{x@mf)8y4^1)G_w^m%_RyDeUu{!iK#X zzm8g!Td_Ba<51OnO__AuEvh)BVmRl^rC{xD?cyv@=vcT0;cCEx;uH!tp;96gcOX8i z15!`+MzGlqT)^66k%#AI8Td>g~1JAt_}Gpk9?w$rFeN2p>q=AS8LXjRxDXro4< zns5P@0rrn9RzvTd_HLjN`W8>3UHMs!|4wL1hAu335y%Pa7G6>aiGtOFJ3PZygu%AQ R7s+!sRV$qwFiH71`2*{XD|r9_ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn_splitk_parallel.cu deleted file mode 100644 index 86c8a6e01b3b0e9435d875061afbfd22f01e3b16..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1588 zcmbVMX-^w55dEHCF$xj_v>`3DRcjKYWQ!0jG!^Ob(Q?*K;>MSJkOuMNYwvE(LP1?c zo81||Io`b4Jw67E^Y0&fd5(Wv+~S+wr@<}8a1I~f;Qp3#>8Bw37DYZt~KiOIWm2W8DzN&NVPBVkm zgfiG=J%crukp<7;ADnw9h_$#N&hX#3@B+`6k(d`+*i*fTAU=`GFqjh~xtxF+Lybxf zv>}s%z*Uw3;v&3 zYVLS|EE(MgP>{ulHFsfL=E7434QujN?kQ<)IBSjd`5Csna9ct=7K*r1x~6$F-m+2<;1%&xTW?**0v# ziUOgl@GMpB^RpB_yi4I@cjfDz82ulxYZ>a~bH|l@75z4ac&}YvYA)2ovw_|YmDN&& zT$vC;H(WAeGF6PYB+*m59|`|eoDn~&X-HU+TGg3S)kCdun@xHfCt4X6l_AJL@16yd{#T8 zZtjg>Ga$IuwPO*wMs*CS0k_ahh8+M=i_oQDMF^^Q0u=XBuwibo13^h0&uhSb z1cn-|fuj_vDn{h<&a#NfwlMiNhD&z>b7f{$lbmgbQKgPh#cqs0Q_9dPu|v>CjY2kI z2`mHbA5pBD-aGBxKqK@mZbiGYvugh*t|=M1Fx*8TCyZP8NFf9YRtxU%3|kQz+a6ye PFRfLrbaKEX<>TZJTtzIe diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt_splitk_parallel.cu deleted file mode 100644 index b452d8ed0c9e204adbae54f2e0b55243f8c08cc5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1591 zcmbVMX-^w55dEHCF$xj_v>`3DRcjKYWQ!0jG!^Ob(Q?*K;>MSJkOuMNYwvCjQYcYZ z(PnqXZ;m%_c8`w%P81}0B?IE)g?)+(rcL;3bqonw2N8LTFh z!8Yp|thtOVcoug$?mpaUPC>E>RSKb`Z_Jdc_wf?2pMa6r6>yY6MSVg(oh(n7d;?Q#Vz}~0 zFxPhG3@O-V8CB~DReDJE=UQ7@HMS4N>QU?_?(uzu{iBMdZi6?$4-7&{NYX9iEEW7H rH6=qI_ftl4;>N^>3}T_+k`VC>I}x?%4$m6TeHo25jXkC)A2cDr7Cc(_B}d@so^w2z}! z3#LpkO)R4@UWPw7bv7f$;G9^^|Ki+bc+RxIJXe`HRm%YJsgRn%f@s0T6yyXdDmqY_ zOmhO)NdlA;f6iE*f^uf|maa5Lq0xMB!Wl-JifEEE%XDK8#`rK(+awt$#s{+x;(XNH za*wPS-3Q3Ya?F~$7#C^gD1(O4X)ASwhvE+hfBWGS8PLw8+Rgt9_}(7jyTkv~#jK07+t=jhLwW`qthna)QuqS!VwWqFR! zwSN|>&gYLYoV71vxah4tJrJ$l<<8qN1a^bn2;U-?S}w)A^qF_+zqQLN#WOi|Y^b(l zWt9{mTV#aLEfhh`WVsa456V)!qScWyu_Q9m%35@=|kmXw}CO#J&9qr&M|DNSMjyi zGTpLz<1i0p&Nq|^%k8rCQz%9YzFK9h)2mFheF-h|cY(h%;8Ai4Ih#@;5DFU*p5^sX z&GkmG*$rIjI-v=@qoN+6)~WQ90(PmG@aqpji_lfZavxOx1SsyuVEoEr3mGL`q@Ei^QnnpNnL0vcbwm7#l$w@_9fHzw?6dLv zzw+3BqF4{Tv&z|_Lg+`h5Y5`odi(#L>Kj@11@w1ckL4nbz zN|Sqi?&I8Zv&YAParWy=FU#=ni(7ot`#QMA7|!7{JU+s~i_UI0c=z|WkXPR;WEa}+ zl2!{QOfo|pqcB>;7o2-1h_$#N&hWpu@B+`6k(d`+*dx7&AU=`GFqjh~xtxF+Lyb!I zv>}s%z-5*JCHR@4WPu(WWMb6wEQx*a2fhnYn8c!HEsU>;mzj zYVLS|JTSTopdgD8Yi`51%!Q{68rI~k+*8t8bJiN`lQy=za2rNE7K*six}te*h}O&; z<(^B?ND?2LfNQc)-u0lnBo-l2nh?5G^BeL&XzEJsWI3=2GVkWF8 z5V{Qa($)F&A%)}iX$q(3%S?B~=r?gE?Gz$*4ZD(IL_T+1$v4r*A!+!nQeJ2-)Woxa z-mI0?R)k#H5kl8oGGa1SjJPB*Q>Uc5j8;FDf2+W;(an%m`;^`DoYgdF!QHnXL5e~! z(Mcd&b*OqeDLgm!pBn46cs{*!8LJGjIz~)BMQEV2xPrr=o^&mM$~P=NzOQp^k28hU zgi_dMJ%u%wfd$9HI)}A~d&MafY(k|(D2h1Vt1mz;&9z}IZ(RF2k;n6cvWZAguEMav zEi{v1gCTkmx)7{j()A93;$8|iY#nwWD5?8+3)qjqKVHO59uBW-=03_Al%nVp%;MGEo9cV>6Fs#QXk z<(%`K%XcmZd<+<;zrOa;6#u@u$9KJNgL{mj3t!;r365S=b~-_Oc({kW`CcMB&^oR) zn=oOL8R8g)(IWoAnRkL%i*w=(|BG`k@QfLWd9H;$)r$z?6R8Y?IWdyU38*ntsM>)x zWRerOPE(+q1Uu_^2HM-{Yr4`Ig?i(`OK%x%Xktjt95eNIU~EV;a}6Rmu_2lFKzzuW zJ02iQM&AR-$zsGB`!Fsu;VFZn+ zLQOmy=;6(MxXB_k$P`G`vrBee^*3vbn1@w0Xu3*B~EmQUFu&sa%Ay9VJ? zfvc7z@Lbs66;`|Oyk*JCSK(r9MoczEXrR-$cf&9;$)@Y2ZkT`iP{r79U;^u&B(TqO z0vq-!{5on`Zpq##&O=%A4Q0}Cwj7o-~PjbO7LxYD&_6S_wQACc=!g-HRo&`gF408xw3m0)>Ds&@hu_Y$ySYq0}C zNtMrAz%AoZKqL|j!?;N%s*Ai(6X>Y z&_<0yHQ@v-1MELptcKn@?cG2l^dsIxyYjOd|DDj33|-jnJdhLCExaTU5(TRTcX)=a V2!m~pFKe$IRjqXGfJw^7wSSp_E3W_m diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt_splitk_parallel.cu deleted file mode 100644 index 71c9cab8696620bdf5fcef03d6aabf0e296c78af..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1589 zcma)6YfsxS6#brGaaAP5P)bMHCT3csHbaOF)`@Q8qvghK>(#L>Kj@11@w1ck2@Ikt zP44x%k8{t>9v=h7*{?6XEW^JqZt+d;>);k+IET;h_y`BjI=kKA-QV9qUVX2SU1+~c zS}m9`$qaFf!e|j+aPFNT*5ZOV!~f#K3p`^+VqR!rkMtsf_(UqhU`~wWasp}$H7eQD zhD-_qmstjslW@*Do`d#w`jW2-MxoKX^U_;Jo0=F>Fvm>e4Hz5B%w3ZRPHZUVEf627 z=8gx*1EX&N6l5`C&0QFmx$u-h!6x#3Ce06GFFYenb8!bGCJ~|J>0`317&q(?6$%5!x3jpADx( zvrX896$L_<;a<8rpFX5;+&)d=^n97@ju`za?xdYU#I9jiGEB+mjw|^p`Zy#Fzg5Z$ z&4rqHHqe{3vf7G}D?38ynoCAZriu}lBxdTAY?smMr}A$VI5xT&vTC2Qd!DnJ1}(Vz z_9LiK2qroSgsTo!4=9DF#{Rak-i)UMOqa3B5UXRvGZKPa1s z1m!9W8{9%O8I~BL7oiKm3MO6e5Gd}YV8h~J2ZEBikJo_x2n@uofTI*D>Lc>`WLd;y z8<>0(!=*cfxiT}WNzOLQs9Hy;(nG92Q_9e)v3<}+jY2nJiys2)KUFMs>z(#)pb?5f yD(o`OQo&oPDH*!3phX}jY)p7aAruNW3K7q+6+xT!-C6R|cTp>y>@i9CIQa+o#TYCA diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn.cu deleted file mode 100644 index 919aad63bf936ea876d2811240d66030b16fee49..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1650 zcma)6+fExX5PjdT7=eT++K>|3ikc)+vPFmrnu@gL(Q?*K;>K%RzK|y3<7@A3u7yGr zZ8kG==5pq8$j5+j_VY_G%kZzOdwkdXI=IIey6_pEp5XXdWv3IgM@M_en{O4e1Fd&S zvk4O>nIVo*7%k!-oO>sTwYVV8@IScl0?(L{m={{uQ@w~FK9R~Wm=hzpoPZibjY%5%y!4jQh9-s-%rSHD28<19=B_~mCpIMW7Kjg7 zbH@W@$>>`E1zC((V;{z4E<9y$U`^i4JtfTzXU(xbZ(+*|wa7z}-yB>5_#3Ce05ki>~zL0n1w=$PI*8S(K@6~+A^vBdNLi;`rtLzjH+xRS4;&qC1fU5C8V zT&Rg>1HGLqtEC9JvLb|Txn#s-su*!eVx)G-cImBtD}UCGW1-tFtMVy(K3Enp*%l_>#&GFQV6M!}YLc_BZWmlQ&xV71^5 Y&#)C?uZMvM4@bMH8@77OAG`-6q&ct(xHOlWS8^&*1UL@GmJPK;!70%{C3D%sP9 zOay_eEQ5lPaLziGgZ6g%lCKI@K%;r@rMI+bYGR0>j+(|BFgBE#yC&g`*ig({AU;&h z91D;~TD%1y$YMmByD%nm?h6VHYw}j^3({J1)*9>6HnvQ-4I>_NMOj z&m?aoiH}Xd4OuAfdeB`Gix4kO2xW?UA#cfVWzM%w_n$j`r{-Hne@qP}MW3sDHk=Yo zH!%|`1VUHgUb;FT-=}cWK1<>3Vwvfl82u{lw4Fl4u3^_QOv&etDfue;BqR;LRmw}v zxSDu6(3`c=+KP}XJ3{D&NlHwn#E3}}Gj&R~%V_me`L_xj8{G_9wO`N&meZOB&6)f9 zJ*bg$$~y^!s}5BUD20EG{cU5t8S4QqN0KgMl_6Hgh{>l24RjW_a2N!Xt~F5khT_A! zI>+ufQ&>$Xg@oC+#k?+6OqOTmW4!wz^s>ONir_9HM5y8@0 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt.cu deleted file mode 100644 index f4928b1e3674bda3f884c208e1fdd2db7bac51a0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1653 zcma)6TTk0C6n@XII06YVl+sbQiJ7)in<2ym>qNKlXt}Z5dUb5e7rIvb_}NK%p@Y#V z%H);+^=)z}sdV=E@m7Pw|9v$r8kHPs zLnZ}*>nsDxNjPU6&p~@TeN9&xqj1o8@X}jG8=4qWFvrZn9vB=_Sf-FX?u?yoe7oIXWuqJQjo|5K8Ov*0w`YnNA=3pMd< zptoaXwG<&&W`xi!myDQ96(cT5^wcg1FRj&IHrV6gn0*Lf9Pes5pg!O{kOz z#W{%2YKhd{y%B6C1lPKDEJF9VLl3)qjqP@^?)ltNX-hu7BSftCf~+z=}us-%*<+%v+Xdd)Df!Kjqztn8CoTF z1lp)k$R=!oWq|!BidEBlr@b3!guchMXjgVt?f=X*B|{gcy9nfjaSIp-=?M;AbauPJyT8AMy#7`pyU;#N zS}mARNeywdfYBnp;M_Y-ti^&j!~S65Ii67?F%z2GW4(wVHj&Cum=hzJoPZibjY{^k zArnF1D$Af?B%HI3<)FQtzUHff70_tjd+9ALnwl6QsH3K_1IC6jbJrxC5gUrx1>!^1 z%&`D@q{S`(K^7z0+=elkb6-$sSd+JMUy#vLFI}CF?^8HwpQUhiu}pSPjD8b$+D;*2*RX3D2IX_dlzbC?5|W1BD&?hSTunS3 z=*?PbZAHkH9U*kXBqb(OV#FkgnK~umWwiRK{96T%jc$gl+Art>%V|x6=FENl9t6ob z<(&k=Rfno)l)}Hp{&Qo!9P1h0sd+iTbQ!A*u{uUfK1FDtv$%-EprLdvgUUA)AKukD zw#S*mYCcDr7Cc(_B}d@so^w2z}! z3#LpkO)R4@UWPw7bv7f$;G9^^|Ki+bc+RxIJXe`HRm%YJsgRn%f@s0T6yyXdDmqY_ zOmhO)NdlA;f6iE*f^uf|maa5Lq0xMB!Wl-JifEEE%XDK8#`rK(+awt$#s{+x;(XNH za*wPS-3Q3Ya?F~$7#C^gD1(O4X)ASwhvE+hfBWGS8PLw8+Rgt9_}(7jyTkv~#jK07+t=jhLwW`qthna)QuqS!VwWqFR! zwSN|>&c&w~+UFNBeD1A1JrJ$l<<8nM1a^bn2;U-?S}w)A^v4*&w|04@cqXTg4b^t6 ztdb&Ri;NJu<$@8N$b8HN37$GdsS9h>Tk%sH4vB7?EYqj#k*BOILni|Nso<&~F}xHG zo(ro*c-gObZL2U*A0s-QAv9D;Sh|s)mw40iQa8%aKU6+;8yG{~lNfgE9K)u16<>QT z(=Dqv4)ajvd_$SA+%8K$g<`bet5wE2y~;$}m(VhQ7x+5^9wn!cvndqbX%QW!qttsUuWYH^iSvscD(mAt)`!J{!OP zE06sriuKSttDGGwgnonz(X8#P$N!mYN=DXickYYh*Ui7g-~$DtGH&r4o6HZkIlheE Ox>1!>(E$^bPojS_GAerj diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn_splitk_parallel.cu deleted file mode 100644 index c081e366b1522cb0b4fd5fc78ffd5bb4f96a3735..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1586 zcma)6YfsxS6#brGaaAP5P)fHl&}LesHbaOF)`@Q8qvghK>(#L>Kj@11@w1ckL4nbz zN|Sqi?&I8Zv&YAParWy=FU#=ni(7ot`#QMA7|!7{JU+s~i_UI0c=z|WkXPR;WEa}+ zl2!{QOfo|pqcB>;7o2-1h_$#N&hWpu@B+`6k(d`+*dx7&AU=`GFqjh~xtxF+Lyb!I zv>}s%z-5*JCHR@4WPu(WWMb6wEQx*a2fhnYn8c!HEsU>;mzj zYVLS|JTSTopdgD8Yi`51%!Q{68rI~k+*8t8bJiN`<2JUua2rNE7K*six}te*h}O&; z<(^B?ND?2LfNQc)-u0lnBo-l2nh?5G^BeL&XzEJsWI3=2GVkWF8 z5V{Qa($zWrm_qyHG=)#+%S?B~=r?i4?Gz$*4ZD(IL_T+1$v4p-Qi#7*$_vefns_$Q zo3*mqijXTiLg<=HMogxP5tk%p>XcNM(dwu2ZxuK;x*4)+pR#+Nvzi7ixcl}aNKptT zIthfU4pmPlh3CfpQ)9gr&!?9zW0fIR$B4G`#Q(=I8#_n zD1~j-Q&@8uSa2+?b69)0SDZq@CR9p6Fs#QXk z<(%`K%XcmZd<+<;zrOa;6#u@u$9KJNgL{mj3t!;r365S=b~-_Oc({kW`CcMB&^oR) zn=oOL8R8g)(IWoAnRkL%i*w=(|BG`k@QfLWd9H;$)r$z?6R8Y?IWdyU38*ntsM>)x zWRerOPE(+q1Uu_^2HM-{Yr4`Ig?i(`OK%x%Xktjt95eNIU~EV;a}6Rmu_2lFKzzuW zJ02iQM&AR-$zsGB`!Fsu;VFZi14Rjj!ZWu-;*>t_s4fFF4RgC=xCa~^F0{c8C zuwk#lucMabmh6q zRQbFG>_=ee(Hb~PpzLBqz8ox#m~;!1ZDY7}CoorLW;My!b{b{r2$k%{{8Ob2EeksY zZPX}K6HdS~!2XlPYUsVw-VHQDKjKZaD?h98-w930(1q>J136*c!b<`nQLtKYhiBM| VFxd9^vi90h)k@b6n52AM`v;+KE3p6o diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt_splitk_parallel.cu deleted file mode 100644 index 997a5b23bf5b89ba6a67674701847edd5a9e21ad..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1589 zcma)6YfsxS6#brGaaAP5P)fHl&}LesHbaOF)`@Q8qvghK>(#L>Kj@11@w1ck2@Ikt zP44x%k8{t>9v=h7*{?6XEW^JqZt+d;>);k+IET;h_y`BjI=kKA-QV9qUVX2SU1+~c zS}m9`$qaFf!e|j+aPFNT*5ZOV!~f#K3p`^+VqR!rkMtsf_(UqhU`~wWasp}$H7eQD zhD-_qmstjslW@*Do`d#w`jW2-MxoKX^U_;Jo0=F>Fvm>e4Hz5B%w3ZRPHZUVEf627 z=8gx*1EX&N6l5`C&0QFmx$u-h!6x#3Ce06GFFYenb8!bGCJ~|J>0`317&q(?6$%5!x3jpADx( zvrX896$L_<;a<8rryo;jpPZ)f>3o^&ju`za?zo*o#I9jiGEB+mjw|^p`a=rww@P`T zxlj|&270qrR$CErWk(2IbIFLwR59X`#7v!%?J`>ZRQ|03$3{0pR_#-E&vRDOpapl| zegriN!9*v4aMhve0j2QN*xxqRoAGpj=`vOsVs(s|e2UOOXK@RMK|twR1C?)BoV>4d z?2a>q)r3;mWj%#8mw^Sx!bXS9hkL~-6l_AJL@4Sw-m8y5tJj2!#)O9yLZM)z5b+FK5wvOFoh2`Q7q!yK9+Q-hlYan77%l(+ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn.cu deleted file mode 100644 index 67a7c7644abf6915c42625fb804e2bf82ca1b50b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1650 zcma)6+fExX5PjdT7=eT++K@IB)S4txvPFmrnu@gL(Q?*K;>K%RzK|y3<7@A3u7yGr zZ8kG==5pq8$j5+j_VY_G%kZzOdwkdXI=IIey6_pEp5XXdWv3IgM@M_en{O4e1Fd&S zvk4O>nIVo*7%k!-oO>sTwYVV8@IScl0?(L{m={{uQ@w~FK9R~Wm=hzpoPZibjY%5%y!4jQh9-s-%rSHD28<19=B_~mCpIMW7Kjg7 zbH@W@$>>`E1zC((V;{z4E<9y$U`^i4JtfTzXU(xbYhlX^wa7z}-yB>5_#3Ce05ki>~zL0n1w=$PI*8S(K@6~+A^vBdNLi;`rtLzjH+xRS4;Kco=9>yTHP z3pMdn)T-Qyy-}Qps^%NYq~mT;#VHlTIbSXXYjN^R7NIM_ijY+A1SsyMV8hm8 z2ZEA1pVxr>2n;=114k)TU5v=*gJltuZDI0l443W%=E}^hCOO+qqbePtirtuhrj(&o zVMm~i8ii`Y30MZ$f3jE&y?5HXfkxEw_}%E!rHrw1#s diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn_splitk_parallel.cu deleted file mode 100644 index f918819aa1a018de556a7dab1d03ff7f01678636..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1589 zcma)6+fLg+5PkPojDmy!Iiw8*wM&AOxCqfgQ<1hjt~T~2*=BFr3uzD^-(B0e77AA6 z*fV?Pa^{Tp*cdR*etzy{8UA&7hi`jd26q_41$=_1CpdW4+3g1J{{9Z~`dfwULi;dj zwO~ReHN?>ZMvM4@bMH8@77OAG`-6q&ct(xHOlWS8^&*1UL@GmJPK;!70%{C3D%sP9 zOay_eEQ5lPaLziGgZ6g%lCKI@K%;r@rMI+bYGR0>j+(|BFgBE#yC&g`*ig({AU;&h z91D;~TD%1y$YMmByD%nm?h6VHYw}j^3({J1)*9=RHnvQ-4I>_NMOj z&m?aoiH}Xd4OuAfdeB`Gix4kO2xW?UA#cfVWzM%w_n$j`r{-Hne@qP}MW3sDHk=Yo zH!%|`1VUHgUb;GGA5v(ao~7{dVwvfl82u{lq@6;3E_H`nU7YAh% zk)T|KVS`&}D#H>(^dfZ0si4yJj-bH36l_>L?0^@f?&CFJKLP`>E8r-Fiu#CrK3Ntq z*#;)x#4zcOV6M!}YLe5VgbiSWpqj2^$k0QV4~DjY7mTYid`YvjvlRYX6HctKm1>qPj diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt.cu deleted file mode 100644 index a27a779cbedbacd4082c575f6f5b0442d7d50145..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1653 zcma)6TTk0C6n@XII4TliD5YB&Xfti4HbaOB)`@Q8(Q;$A_3GG`FLbT=@w1ckLI;B= z%H+lX^=)q@re1xNCmECU89vuqJQio|4voX6t`FTcu?UG$gixl0FXS!xt<1%f(}Opj{xLI*(1B3-d^97P?ZT$4 zC=j{|&r;Pn|CmDi>^y}}y|uG@V)U!H({>6GyMue4=KcZ0fVzV@+Sz!Ww$NnxMo z6t?Wu_>I>p-HN?&?1w7nYs#eKZdv&$6{7`Ttpw}zY7^~1p<`hqgv|jDic=`qluC(E zoP+qRmPp;*TfufhaINdaBJ_?*J|fq-3Y`LOshJE*0HPM5OTmf|RR0(#?x$eG;9>`Y zk~*H(fP)AOHQE5jDO6RA$mgA95tHp;@?8vqNKl(Q;$A_3GG`A9O|h_}NMNpo7t< zN|Sqi?&I8Zv&YAParX0bFU#<+%R7AA`!cx07%t!wJUzj|i_UI0c=z|Wkk{WTWEa|p zNvj1DCYd3QQ5Y@a3(mb0#9CYsXZRmnc!6ikNX!c@?6F=%5T8h87|e;0TuwlZp++Tp z+K@>>;3~_2auUv2$8*r$PG9p?!6-DE_g;F-Xj2nI3g(z;?0~VM%-l7J;KYVvc7ga% zHFrEf9vR&QP>{ulHMe10=E7434QujN?kQ=lIctseNgG>UxD6v73q@RMUC}%@L~CY_ zbI+w{B#Dnrzzta_?|RT(5{r;1O$cR5_(I;2-^!dnJKf*$^pB}wg!YBXXTvGcY!fzN zMS;*&xRqm4B~=Mk)Mj>_0cw%dwu}otl>eOqa3B5UXRvr$;GN^pR;`Cje zV|$z_tR|GgHtQ*@xeP2g7M40JKRhT-pC40{aGi_oQD1(U9K1QhpDuwnPG13^jM$6LUD1O{SPz)=bn^%41UvMge< z4NSg?;nE$!T$!2GBxjptRIMXa=^@siDP?HY*gj~ZMxmRq$d3W`pDLER^-g;?&$Ini}OKH2J zAr!)$pS$OtyYm4b1IF3UFTE_ozbF6a+4_3@9hzoOL`0?d|j>Uloi(qj~40w~RJ5F{EIQnZ_O%8))XPNdzZ0kl6>~ z18eShfGio^2T+j3h&6X%T;{@41`TWSR_-ZjZ8U3*_1P)5yl`7aJQj+$(z=#;Ziv>* zoaCNM(MS>>i-2pgP~P>Rvl12|QJN6CRr4G2Tbc7mNBjF6{V_F+(7sUlY&a#FZDS^^ zC=j{~&(hWY_&$ZR(-b~+SBdV3(fQWxL2G)!6sBngyI6kXLWnjW4$qK zRs+|*c5FiTsMI5BovSb@;1-(6u>BCd2wezP1X8_YptzTU4I7Ic2uf-`uL1iJ7<#k@ zj#8+)7?IB>%OWP*!sOc+F5NNAm6=&ha<)yQDjlIp-N-*v%FwE?L(oQzLN#IimjU(< z7OSE6PJ1`d2z`(Dpx{_ Ka=;|zfSZH5pVtP|bFN6U@f)~jP%e$W;1<7X%7g94*k zRHeziKKF6%xoPq-V4VH@+{-fj>-+{^_r45nForYu1P>3e|E#my4c@)IE#&353fYC@ zgQVSt36sna$0&>z@dfAJ31TfSh%@{TF1)}qW+di?7WPOlB8X3O!#!#b@ zrZ!|!5V*)Ppqzwr*6|#)x6_w=RWJ&T)~%P`GTPF_kb*g88arTYC^L60A~><3m|Y+~ zRLva^kb6dV0Tg60Vy$f$m$~qiLBpE7oqI~!YtGtZeR7N~FWiO^kA)(xw618L8=^Hc zN4e)xG?K)}Cg6%Jly^PoE{R1*lqQ63)cl(KR_654(f%$+e@qP{v@cXX8%~L4o0thJ z3WP4gy>xXxyi4KaIE9aA%S5-t=>LFS$}l0HJFeub=(j1v-zw#~=0Z(88|ckaS#3qg zl?@?u#U&#qQ^kl&5;Jv5rpsvcQ~9?F9Q)i1S#?j@9nV?Kf)?C;{T`Gk1QVSE!c~W= zhm*onWB;+S-ioKgOP8_A5UXRvGZ+b^4l1m!9W8{9%O85S6# z7ol^(3MO6e5Gd}YV8ha32ZEBikJo_x2n@uofTI*D>Lc>`WLd;y8<>0(!=*cfxiT}W zNzOLQs9Hy;(nG92Q_9e)u|3d6jY2nJgYN_EA5|=M>z(#)pb?5fD(o`OQo)Z>Q!;d6 mKZ`(4*qHE;LMRk$6e6BsD}pxd!D;f+H&H8{G?}D)ocskZCKkW| diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt.cu deleted file mode 100644 index 465b736e67f6198a1c26f93cf4c3fa565b84d718..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1649 zcmbVM+in^$5PjdT7!fH-qb0OSTeS`#!8VFgC8Y{&^Jtm10T-`r`4S4s$JgFnu1T7x z2rbLZoVlF29Plw>;4;gAauV#U<2h(=r_bpsXA~OEJ1@Ouw5f?91#`?aUV*V8&D=GK;KYVxUIXzV zYwmb}EE#@-BqkRV)TE&u4HJE&mC9tMfAHA;&&bLLUW-eo(=SN zs;rhG)qF*nblfefIHh7Z=gXyF?QZSjG*IYRxCY^Bz`f!W3O1oqA{2KZKC1&# zPxeNz*$!Om+OY}Uqmqxvb*{psfLmxL!vTP(Md(7XA|%y228w$r*s!$NfuN+$=Oti2 z0z;41z)=cS7bEiNU|GauTbO(s!=*chxiT}WNzS&@s7gnuVmIcWDP?F?*db`6MxmN; z0hR&wk1SS0@16E;pb`2WPoiD`aaAM)D5V>0(`H(vHbaOF)`@Q8qvghK>*1IDpey3X&rZ@OFwmkZ zP44x%k8{t>4j%)?`OnY2JjcJzuJKjx%itPgIE7E}@Bn*HI=kKI-QC?nUVN*NT{zlr zwA(OYido_rh0#2{;39ZIoWqiM%l}{*1iocfVJ?kuhi0BYe4?~vFe6rRH32<_8r9e_ zmP{mp^E?O2Nj&E~FTeyhea=?}qtI;K1Qi^kEki7k%ro131;)iP3*RDw6Bmnl4J5>> zx#tmb&**CalFUb}wGHE{5P>piI$N}hKuLSWS$k}bkFev?uNm=JXyQxjlIEEu#xQ$W z1g=E0(FmytxFmBOd=I({Vi6Lh388B}yCT1}J=r+gf9hziMUZOa^pB}!g!Y9lZiiE1 z*g9;&q(taE-b+{K!@CTQk23gpy2y1ytoaYvg^EK8h38tmh<=+v`YltQ87}lBuz^`G zmDN^+eAy5}ms~Mob1g?)k(8-Zl3hltpUS^w;MC`O$f|qF?s&m!7L4Hj>-Q)|A(-ei zAYOH-dOjIEHug7-^q)l2r6_OHu$+= zD()~OFG6R6Nv3@704VNd;Nsq44}y}qkC%Y`1dPNkfujs6>J#$mWO>5mYnWmk!<9dP znYOpikbs3r65mJIKdM;hHh2^Kz#x=_B;6v;LctrU qDH-~>pE8mYHzq!05DNvDgotmk6H%LP|D^HUXVGZW*kOwDapNyiPZ#z8 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn.cu deleted file mode 100644 index 023bd24211a6a8a2bad2fc2d692f431eb928309a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1649 zcmbVM+in^$5PjdT7!fH-q9wFRTeS`#!8VFgC8Y{&^Jtm10T-`r`4S4s$JgFnu1%V# z2rbLZoVlF29Plww@f@_b)8}-RGYXC7y_eoH+SJ65f;nayufW)lX6~9qaAHF;uYvfG zHFrEfmW;jzP>{ulHTPj$=E7434QujN?kQ<)IBSjd`5Csna9ct=7K*r1x~6$$y71olEg^ulI+r3{Z{_09mhJiT~^go_P}#ivCvK+d}?srkre(G z_IHK#Dr`Eo>Os2lRk~Q45tC058t5#p-7ri{x@mf)8y4^1)G_w^m%_RyDeUu{!iK#X zzm8g!Td_Ba<51OnO__AuEvh)BVmRl^rC{xD?cyv@=vcT0;cCEx;uH!tp;96gcOX8i z15!`+MzGlqT)^66k%#AI8Td>g~1JAt_}Gpk9?w$rFeN2p>q=AS8LXjRxDXro4< zns5P@0rrn9RzvTd_HLjN`W8>3UHMs!|4wL1hAu335y%Pa7G6>aiGtOFJ3PZygu%AQ R7s+!sRV$qwFiH71`2*{`3DRcjKYWQ!0jG!^Ob(Q?*K;>MSJkOuMNYwvE(LP1?c zo81||Io`b4Jw67E^Y0&fd5(Wv+~S+wr@<}8a1I~f;Qp3#>8Bw37DYZt~KiOIWm2W8DzN&NVPBVkm zgfiG=J%crukp<7;ADnw9h_$#N&hX#3@B+`6k(d`+*i*fTAU=`GFqjh~xtxF+Lybxf zv>}s%z*Uw3;v&3 zYVLS|EE(MgP>{ulHFsfL=E7434QujN?kQ<)IBSjd`5Csna9ct=7K*r1x~6$F-m+2<;1%&xTW?**0v# ziUOgl@GMpB5ARYqKTF|bcjfDz82ulxYZ>a~bH|l@75z4ac&}YvYA)2ovw_|YmDN&& zT$vC;H(WAeGF6PYB+*m59|`|eoDn~&X-HU+TGg3S)kCdun@xHfCt4X6l_AJL@16yd{#T8 zZtjg>Ga$IuwPO*wMs*CS0k_ahh8+M=i_oQDMF^^Q0u=XBuwibo13^h0&uhSb z1cn-|fuj_vDn{h<&a#NfwlMiNhD&z>b7f{$lbmgbQKgPh#cqs0Q_9dPu|v>CjY2kI z2`mHbA5pBD-aGBxKqK@mZbiGYvugh*t|=M1Fx*8TCyZP8NFf9YRtxU%3|kQz+a6ye PFRfLrbaKEX<>TZJTx~3` diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt_splitk_parallel.cu deleted file mode 100644 index f4638ce13c0b1889c84464c2940df10a8dfeb00a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1591 zcmbVMX-^w55dEHCF$xj_v>`3DRcjKYWQ!0jG!^Ob(Q?*K;>MSJkOuMNYwvCjQYcYZ z(PnqXZ;m%_c8`w%P81}0B?IE)g?)+(rcL;3bqonw2N8LTFh z!8Yp|thtOVcoug$?mpaUPC>E>RSKb`Z_Jdc_wf?2pMa6r6>yY6MSVg(oh(n7d;?Q#Vz}~0 zFxPhG3@O-V8CB~DReDJE=UQ7@HMS4N>QU?_?(uzu{iBMdZi6?$4-7&{NYX9iEEW7H rH6=qI_ftl4;>N^>3}T_+k`VC>I}x?%4$m6TeHo25jXkC)A2|P1tJWluk}X11&{U)?kCwA`5;tDk@sTwYVV8@V~h50?(O|m={{uQ@xBJK9$NaSP&z*oPwG_jY}*py0%P#l1GueL|s)?2}L zG;po!#3J;LNEZmF3J(+^RL(1l<{2&#Vq6!%lGVPdfZK}j9YYrsJSh8k^v z;}ohYM&$F(vWUrcF!?TqOLqbbW#(3sob85DrH)X=Zj3)u%FrsYL(oQzLpEXjR{{2) zDArByo%U|15&9ALpfSZH5pVtP|bFN6U@f)T?7#e$W;1<7X%7g94*f zo22*p+{d}+W{*z*`=F;3CU_auUv2$8*r$PG9oX!6>xaw_bY7Xj>CQO6Hhp?SQeN%-pq!;KYVvc7ga% zHFrEf?it+$P?E)%wYOnh=E743Eo<^l?kVZ4IqOXH$uYLPbQ?xI5sJ9Vx}te*h}O&; z<(^B?N)jKNfGe_4-u0okBo-l2nGm{B^K0@)nbSu{2fG~oIW>&Xfl&EuG$opCVhUE4 z2wjA0?f#L)P6>cE@wpu%HEZ-+lxo3c*A-f$(WU zHN#2asd4bw*lfkq;ib!1RfyFwV)7|ML!HG990l>D>-pEdQTgF*lVf|DDXb=x!ZzzE ztho#VDcG=d*nyy=>EkuvAOZuiE8sYVn)--*K3Ntq*#;)x#Bk{jVXn-~ zYLc_fGV0b5s`U`-&y+H>Zfqa4QRC1}*x>sB`%e{1-Fm0J8)}52kV?CZvsCb-)Rc@| n*v~SM6E-G1q!0=P8-<8x*ovS{`{p!x>7A&RPWG6je3JYFL9G_W diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt.cu deleted file mode 100644 index 576d6663e15ca34f8d4d3a409c9fd5830e604e59..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1649 zcma)6Sx*}=5PsiZF$xj_v>`3DRcn$+$rd3hXe!c{N6T3|i5stN`5;ZikFUMEIe|j7 z*<;4v9N!!Vd<+<8zrOae4FA5o!?(R}gFB3&3t!;z5ssc!b~-_Oc({kW{$3$FaCV%u zS}O})wM`6~K-3h`RIywqH%iDv`7 z9V)A(2)QyLgl@QG#AK=%aY>@5c1d<=t$r&XYr`?mZIe~@ls)jARV=g<2)`O!S0sg} z!v3zX9)+hBOINl^6YFEd}khieBl4#m9HGkNx_ku&zl8`#h(xVXwxo zy;kW~?2TeOR5@Q$CLMQ+%1@~n&iQgFSi4)BI13ay7Pdjy8t|Yvg@R3}lnBKci1%uN z)SbN%Y^DR(x^^r=_o(C}a-FNtDc}~G$*=$*Y7x2=tO!B%PJrTG3N{Qab|5IJ<9Q9( zkHAo)HE@(dRmF&W-dPqg*%l_>#&GFQV6M!}YLc_D83&ULma>BTUmlQ&vV71^5&#)DtvF-6i Q^3qP#N+$yD+W_5h#PEvqifIl(g5Jwa4b<7&|Wgh7pg2Ccd<;Xr5bQ46{c? z;7T+bjgXpvE3(kR_n^BZ79mlZ5W3OxYw}0i)2*ZZr;g@Y1gW-8|D0MzXkX}JHk=Z} zHenMcB|;bRTDm$P-)E41pU#%KZizK7!cLBNVV5cnDHNV-^&d2YDSlfVXM zvs6}F5%Oh22wic-h|RSeaYa(5PDyqdt$r$RtH7zx&5%|1l-==y)hrmn{kI=cj6yKc zX+Zqcq3Zc$@YvYjHr9*rcz)S3RvBV-jM!p|(7@zr35QWX*;)dXZzw;!t#j;7GlSKH zGT3E3gEg0t1<&F_$Hj*`%_&GWp-Lf?WSp+mJD?Wl+OQTlu6>=vONiq_7gA?y8@0fsHjiKr<3IglW$;(O$=B5 z5a!y>oFN6VxC)NSx4_<=zv2}!zToTY-d rQd2VYaX)1wCvHr9$RHL9E(sCOuoF?6?#*fAx!0o6rm@Eq<>SUbWY!n^ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn.cu deleted file mode 100644 index c4414a1c1f79356a8d5b78e239b3d88c1e73c180..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1649 zcma)6S#KIK5PsiZF(OiwMoVZ(TeS`#!8VFgC8Y{&^Jtm10T-`r`3ME&$JgFn&L&OJ zvd4_SIleiL_yjP{etzj^8UA&7hj05|hj$o54?e@=Bb+>`>~@3p`1k;M{jEZFq5URl zwP4C5GsH0p<7NE7xp#tCiwoim|APxJ@SGWmd7*_p)5{3rQ>hGt1u>G#DX0n5sN_f+ zGA#&PWf@RT!a3`B4%*w-wI=$!jV&+Sju20TBCeEfXkHkiH8W?q z=TbD1#K$DyhAfqLedw-b#NlWvP%=|;tex3!PM`lYa`NeYKNr?6$O z#&5h<=~nEGV>?tiUsEO>cgxC8sTeKzY9&~wSDR=D3LOjEAZ!hIP@F=+rc_FV;ta%l zwLt35-U_zUfookS7NK`i@)5btRp=COOU-0h01&kZT?$r&p!%mkaX$qch88;zl+^LO z1RO+QsL=*EPNAw|L_X~-iUTW^Ogf*=`tB>IhZr#`rU(46PD725rPD!~2|#k%Re)7}j=Lf_*`v}-%-_TO_&$;gG_E&@4W+`>x=AyBYdaEIsEiqP2h S%|-IuPSr{$M@&*aN&W)=Z7Y5N diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn_splitk_parallel.cu deleted file mode 100644 index 7f48f96f3e26c8ccee86b82bd6c787d765986aad..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1588 zcma)6Yflu%W+3iN}{{9Z~>RW~E!pUKy z-G&KM%o5Kij27t+E`k@tIV_2{{128v;2EG%hgs!<_#O7L#xFRW2rzE?KRzH=uRp8X;X2`00%I`pU- z)r2zGWj%v6myrd};zGy8hkMN_NH(ELA(UjCuGKrB7U$Zq7C5eboy6n$LD@tisL*lP z;1`CexWkaV2wey!nex3OptzTTi+hJX2ukWcUIO+LFcP~0jxwmIPspc}HQclfsdb$Ev{^x!i*KElbf%5FDkkB|3|*WW5+7us)< zRtu&~GD94rFkZ$FoO>sTwYVV8@IScl0?(O|m={{uGrf!;K9$NaSP&z*oPwG_jY^KR zA=84uRh9weB%HI3=b*ivy`-y*QD`*pz4VsRrY43I%rVnA0AoX#xoZ-^i4DOV0`Vbg z?s$N#7(E0~kmZ;)_hDS-!cztfYw}j^DQRsvYfbcd8(Utu9U-0wMO-P}(7Z52Yi7=J z&!uQ2iH}LZ4OuGh`p{hyi;yTq2xUt6Lf(?!%3M4-Jvi|6kC|bF4us0*qZ!d`7dB-@ zfzVaBm#WUk_bJ5xr{3DvJu&)K*m-*&b}d7leD1iCucF_j5U-8POU;FvdN$O%p|VbLT-F&y*UHCbg(*#pm6#X={6@T;z4RD-7RmF&W-dPqg*$yV(#c=6PVWG_2YLc_vFsjrMs@RS3XG$4bC3XzjsBy?9 zEP+*k{U?fb(|f1A8)}5U$E|4BcGm6x#5E-&7lykC;_WpIlzoWm!0e1wB%o!xHq?(c6QufA2tE}R@T z+HIIH#VqlR!f28H;39ZIoWqiM%l}{*1fDUgFqcNSW3xygK2h2-m=mkGnt&cdjcV)} zOD2-QWu61&B%X7g7hr;$zT~TdQE0aAf(nk&mLZl%=9z8ofN`+c2&Q5h#PEvqifIl(g5Jwa4c41UoMMh7pg2Ccd<;Xr5bQ470~Y z;7T+bjgXpvYqHS6_n^BZ79mlZ5X!X(Qr(c>+MYc*-QV%_kEvyZ_JuBH!znRr6EZkIy3Y_}f3|Vzg**z~<&4Ll!fBhcyCCnS$Mb`-i9}GL4|@=l)P1}L>?dF(b_E<|P*IbxFke8!%jqPy2G=^OK(P_O=FKK%EyhrkvSPL diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn.cu deleted file mode 100644 index ce7de93b46aa24fe768c796607f575fdf7896d77..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1644 zcmb7EZExBz5dNNDaiUVyR-tUICUq_#K~`0r+A)=F{b-rkfQMsSeklX>$Ini}OKH0@ zghIIUbNAeHcRt`_z&QK)rI%&+*VR3~>wO*EV+>vR3{OvR^s2Md3EsoQ9pufo3fY0P zz@ej_u6U16v5NG%wTzG+J%t*`&E$pdYL=c}yWf;tfkz7tdjiE** z2ilNHLEt*efN~PfS;uqG-cH~0Rlz7Unh#!j%V<**Lki}YY3zZqfoATSL~vpQnSCHW zu;z{j$db{000mi$SaTP~WiC8r(6At>|z(%64)theB0 zv(i#S9|m3pH% z3{}lHlu5_kqRLY`hI77L3fAt{A?G?QWXA$k$I608WMdM7|}F9jR+6*~}=)O_9p_9HO#Xbl{t zP<1gPUr&}rOtyu|w=rD06PPPAvzp{=n?_YSLY2Caf2Ne7RbhvqjT(h&!tyTz>>n&v zL+_pTZlDqR9&batinAL3XQ?R}y0F?s5GSl#cu64;1*-*jc!sSAgKdv5leex@t#oq0 IB<17eFD<<(-v9sr diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn_splitk_parallel.cu deleted file mode 100644 index b8319cd75310f5a27792f9579e9313bffcb90f84..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1583 zcmb7EYfsxS6#brGaaAM)D5V={5;HAQn<2Cf)`@Q8qvghK>eaC=Kj@11@w1ckL4nal zRhr!Eb06oPn>{`OjI*Dg`dNm5UEboG{+Hn`#&7{2;qeg;UUc?)!Mne|h5Y=jM)u(3 zFzIxlV3HZ)7=`g7zTn(DL9E3kafbiFr5AX{jKsXu!XE2I1o1*D!(dK~H+5uxjnYn8d!HEsU>;mzj zYVLS|JTSTopd^biYj4B2%!Q{6TGr&9+*8t7bJm&Y(-UlY={AgbA{23zbw%^s5UrUx z&OMi+l_Wkk0oP=qyz4`6Ni0I5G9h%U<~QWGGUrc@4t6>EV`>Zl?!JBxIuwG5ZUW(ILN%jF z;kj|})Y$CA^U#D^8(g1(gz^sNi_7*T2!_+OXC(ZhYOyB-hhZI7gV51Q63|kSjX%Ekn*S?5a>12;d$|uQR`;r!1 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt.cu deleted file mode 100644 index 254272c00316cfa4fdccb0d4ac65371853e8fd23..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1647 zcmb7E+fExX5PjdT7zGId+K?7hiJBx*vPFmrnu@gL(Q?*K;>K%RzK|y3<7@A3u0Wwt zwAswenai2W0Ura#+0QS%EW^L9?(tpk>);+^=)z}sdV-@Dm7Pw|9v@6k1zC((b05ZKE<9z>uqJQio|4vvv({LjpJB@jwFlU z+lEb8Q6O|3o~5dNk;3^|3K!i~s0U*7|D$eXXpzqySMp8p#}wjs9q~$Yp(dUU^md}G zmLlZJauB-Zk`a@sV#Fnhk=iBIrMLR6{8>AWRc^biYNzax=d9wOoj~~1;JP0vJQwzN zh4msl?^n9=Rk~Q45tC058t5!8-7w5cx@mc(8y25F)G_wEm%_RwDeUu{!iK#Xzm8g! zTd_Ba!%)?HLz#5kEvh)BVmRl^rC{xD?cyv@=vcS~;by?2;uH!tp;96gHy}Q%^HIYV_^y%cO%S?oYiQs?s)upfb; zM{D3Hg{q4Y`Esx`aaAM)D5V={5;HAQn<2Cf)`@Q8qvghK>(wv$L080&pPi&nV6do4 zlY4#c@pTAYeE}R@T z+HIIH#VqlR!f26Ra1p#9&S6Qs<$tgY0?(LLm`fwvu~{S#pD1k^%!yT8O+b&KMm6?~ zB@;>DD$jv(63;o$3oyY=U-DJKC^TDlK?TQX%MeQ>^UOBifN`Fk> z+k{P+ln7nLd+F+&WpH|u!P&(!)g7_s|52Y+98oAd*XmX9`wY@=mGRPWp(lY2%x0mi zwj$)qb`ZMeiV>S@IpT_>Or4VIGFtsq{;dM1E;mC~y;FA23s!Sr1ovORM=1)yM5h7q zszcS&$>6E6ziq77;_39VWvnv9>KL)b6rq91(+Uowda|_uD&J6kcvt7xon;2A0cEhu zdIoDQBMY9zb&hKf_nK3XY(kYnD2X`Tt1mz;&9z}IZ(RF2iN}kBvWY}cq2sW@FAP(0 zgCThlx)e+@<$Fg!aW4ZGw+?#{l+=B^2J9zbBz6THWl&L{kk2Q}6DHrl6q^{X{1MEx zojF4awpm8iIzp8mQvJEsmR61JgRy!PyNN6O5MlqQVyWBUP4EMQP!f`K%Q#B~Z>6SW o=;Le&?6kHM_o?$1VHr?TQjAbl2V1Xd9=*hfQ#3*dedZ@yN@F0@aR zRtu&~GD94rFkZ$#IQLEvYjHuG;lFX=1)eh_F)y^R=Xx1Id@7Y;upmZqIR!O=8kHPr zL#73R&shePlVE2Z&p~@Tdr4P0qtIwRcuqJQio|4v)j8t`FTcu?UG$gixl0FXSEhrOefy_24<{d$rgz{XR2{(1B3-d^97P z?P8{^C=mJ_o~5dDnZiXoh0ESL&I2*}|4}zGw8-a>+#7r_-lOrZ6pCo?(o$xCa diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn_splitk_parallel.cu deleted file mode 100644 index bd15a9c301cfa85ac24272084fd836add8bdbaca..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1586 zcmb7EYfsxS6#d>`aaAM)D5V={5;HAQn<2yo>qNKl(Q;$A_3D@Wpey3X&rZ_kV6do4 zlY4#c@*Iz4S7fucv z?KVu9VwQMDVYEmuxCmYl=ddK+^50knfoIGr%%u_T*enu=Pn5O{=EN$lCZNYqqZ)h0 zl8Gd6mFGY?iRYZ>1(@KbFZrrq6q>F3pn_wxWr!t`d1jk$z_?gu;afy-;$ksxfrMB! z_dG%#8GQ>tlEsL%c41r^4O?+ux(LA@r7-o-) zz?En=8X+|SH)NrM??HD-EJC6*A(U$oq`D=)v_0QC-GA=%ot|$U{XVsf(7w>cY&a!` zZDJ-&N`$WBy>xZXGB`cS;Ot_V=bl*e|EOyfM-&RrwR#o&K7;gIWxO<8=t*D$vsoys ztqA$D9fWSUV#MZJj<_NzQ>Uc5j8;FDf2+W$%gvBg@0308g4G-t!TpzSQHnw^(P==u z>QMD`GWgrr-!|53v7X*?9@#Qh8De#e*kX#%z~pHKhfzJ*S^$-AC_lWbbL`GCgVlgC z*kwI~HJ6bE&*D1AwTB1IDM&V diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt.cu deleted file mode 100644 index 58b093d41fbbd4bc99c97ac7e91700836f0de8c1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1650 zcmb7ETW{Jh6n@XIIHFS3R-tTElR6iWAgiWM?U>58K3XO=;NjSoFJ++q_}NLgl(j31 zqFm1LcP`(#9PtTYoc;LJ&ocb;`X1l)zYOm&h8}!`rzbdhQQ7SV?eXy*^5$!W>_YoA zX|-U=Bs0V@3gcz`!MS&WSc?nd4F8P_FYufhiFu)gJ=e9*>XZd81DSNqtnUJ9F%q_EF(3S0JS{KjjQ zZpGd>Hba&34Q0}Cx2*h>iqV3vR)TeUwTX71(6O)%!rFjG#VHhQN~J_74ncfY8>DXS ztzbJIxYl)I5qc*jACc=^g-!vt)J%pA08xw3wO~aEs(%I)_fxQ8YOw=BNgdByz(E9t z8f}2%6sjslQG&_<0zHem&< z0_-1Atef6D?cGo#^eyg0ySB4#|7Wf#8M!dpMIa}PTlh#J1PWFQ?(iI25gOZ`UL~(B NRjqV##3bdD`aaAM)D5V={5;HAQn<2yo>qNKl(Q;$A_3D@Wpey3X&rZ?@9gHrj z(&S#B`#AU9?C~*Rod5XL%X9qm@($njJ`e6Nh70%zPfu|0qO;qL-u?Y8yN4>1)0!7=>o*KB(XrZ5d*TWS-gP4j329EPRUyPFyTz7f6Uz zbI&8>kpiI$N}hKuLSeS$k|wPq5?CZy51dXyQxjisrc`#xQ$a z1g=E0(FmytxFHK2d=I)yVi6Lh387qzAk{7TrS18%)BPP!f1g@LXkX}JHk=Z}HenMc zB|=y6Ub;GG8JwPEaCWgwbx*AMf7G>#QwoLWTD=K=pF#SqGF}=k^dzu>*({XRR)l=n z4nj9vF=BHqM_iGVsZ+9DMysF7zg6JWlv)Mj4XH-H#%-UJZMfqvI$iRp`_zj+hPNcHDhTUs@?560?I>?UsUV}$*qiluIYH^C1KLP 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4 + using Operation_cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 128, 16>, + cutlass::gemm::GemmShape<1, 4, 2>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2.cu new file mode 100644 index 00000000..ed2bfa09 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2 + using Operation_cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 128, 16>, + cutlass::gemm::GemmShape<1, 2, 4>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1.cu new file mode 100644 index 00000000..09c0c11f --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1 + using Operation_cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 128, 2>, + cutlass::gemm::GemmShape<1, 1, 1>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4.cu new file mode 100644 index 00000000..09c84e49 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4 + using Operation_cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 128, 32>, + cutlass::gemm::GemmShape<1, 4, 4>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2.cu new file mode 100644 index 00000000..2be9035e --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2 + using Operation_cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 128, 4>, + cutlass::gemm::GemmShape<1, 2, 1>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1.cu new file mode 100644 index 00000000..1c751e12 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1 + using Operation_cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 128, 4>, + cutlass::gemm::GemmShape<1, 1, 2>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4.cu new file mode 100644 index 00000000..ece1c4d7 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4 + using Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 128, 8>, + cutlass::gemm::GemmShape<1, 4, 1>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2.cu new file mode 100644 index 00000000..13031cfb --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2 + using Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 128, 8>, + cutlass::gemm::GemmShape<1, 2, 2>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1.cu new file mode 100644 index 00000000..4dda125f --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1 + using Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 128, 8>, + cutlass::gemm::GemmShape<1, 1, 4>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4.cu new file mode 100644 index 00000000..d0653427 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4 + using Operation_cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 32, 128>, + cutlass::gemm::GemmShape<1, 4, 4>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2.cu new file mode 100644 index 00000000..4cfc6d63 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2 + using Operation_cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 32, 16>, + cutlass::gemm::GemmShape<1, 2, 1>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1.cu new file mode 100644 index 00000000..41b3a65d --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1 + using Operation_cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 32, 16>, + cutlass::gemm::GemmShape<1, 1, 2>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4.cu new file mode 100644 index 00000000..afe747f9 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4 + using Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 32, 32>, + cutlass::gemm::GemmShape<1, 4, 1>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2.cu new file mode 100644 index 00000000..f9b65472 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2 + using Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 32, 32>, + cutlass::gemm::GemmShape<1, 2, 2>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1.cu new file mode 100644 index 00000000..7b0f8c1f --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1 + using Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4.cu new file mode 100644 index 00000000..7fd28ef7 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4 + using Operation_cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 32, 64>, + cutlass::gemm::GemmShape<1, 4, 2>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2.cu new file mode 100644 index 00000000..b61d3b55 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2 + using Operation_cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 32, 64>, + cutlass::gemm::GemmShape<1, 2, 4>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1.cu new file mode 100644 index 00000000..b64ee3be --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1 + using Operation_cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4.cu new file mode 100644 index 00000000..efc6f3dd --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4 + using Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 64, 16>, + cutlass::gemm::GemmShape<1, 4, 1>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2.cu new file mode 100644 index 00000000..38eef285 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2 + using Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 64, 16>, + cutlass::gemm::GemmShape<1, 2, 2>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1.cu new file mode 100644 index 00000000..9db9b56a --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1 + using Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 64, 16>, + cutlass::gemm::GemmShape<1, 1, 4>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4.cu new file mode 100644 index 00000000..b3979fad --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4 + using Operation_cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 64, 32>, + cutlass::gemm::GemmShape<1, 4, 2>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2.cu new file mode 100644 index 00000000..29f0f783 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2 + using Operation_cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 64, 32>, + cutlass::gemm::GemmShape<1, 2, 4>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1.cu new file mode 100644 index 00000000..0814d571 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1 + using Operation_cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 64, 4>, + cutlass::gemm::GemmShape<1, 1, 1>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4.cu new file mode 100644 index 00000000..53567cdf --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4 + using Operation_cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 64, 64>, + cutlass::gemm::GemmShape<1, 4, 4>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2.cu new file mode 100644 index 00000000..ab26e0fd --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2 + using Operation_cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 64, 8>, + cutlass::gemm::GemmShape<1, 2, 1>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1.cu new file mode 100644 index 00000000..9321d8fc --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1 + using Operation_cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 2>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x16_1x2x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x16_1x2x4.cu deleted file mode 100644 index 95a8741c..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x16_1x2x4.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 128, 16>; -using ThreadShape = cutlass::gemm::GemmShape<1, 2, 4>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x16_1x4x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x16_1x4x2.cu deleted file mode 100644 index a620831f..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x16_1x4x2.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 128, 16>; -using ThreadShape = cutlass::gemm::GemmShape<1, 4, 2>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x2_1x1x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x2_1x1x1.cu deleted file mode 100644 index b3c0e76d..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x2_1x1x1.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 128, 2>; -using ThreadShape = cutlass::gemm::GemmShape<1, 1, 1>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x32_1x4x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x32_1x4x4.cu deleted file mode 100644 index 0870613f..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x32_1x4x4.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 128, 32>; -using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x4_1x1x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x4_1x1x2.cu deleted file mode 100644 index bcdfb0eb..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x4_1x1x2.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 128, 4>; -using ThreadShape = cutlass::gemm::GemmShape<1, 1, 2>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x4_1x2x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x4_1x2x1.cu deleted file mode 100644 index af6d0e49..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x4_1x2x1.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 128, 4>; -using ThreadShape = cutlass::gemm::GemmShape<1, 2, 1>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x8_1x1x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x8_1x1x4.cu deleted file mode 100644 index 37bc33c8..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x8_1x1x4.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 128, 8>; -using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x8_1x2x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x8_1x2x2.cu deleted file mode 100644 index 4ea842e8..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x8_1x2x2.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 128, 8>; -using ThreadShape = cutlass::gemm::GemmShape<1, 2, 2>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x8_1x4x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x8_1x4x1.cu deleted file mode 100644 index bc916cfc..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x8_1x4x1.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 128, 8>; -using ThreadShape = cutlass::gemm::GemmShape<1, 4, 1>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x128_1x4x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x128_1x4x4.cu deleted file mode 100644 index 5ed9df14..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x128_1x4x4.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 32, 128>; -using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x16_1x1x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x16_1x1x2.cu deleted file mode 100644 index d38317f2..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x16_1x1x2.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 32, 16>; -using ThreadShape = cutlass::gemm::GemmShape<1, 1, 2>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x16_1x2x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x16_1x2x1.cu deleted file mode 100644 index 7ebe415c..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x16_1x2x1.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 32, 16>; -using ThreadShape = cutlass::gemm::GemmShape<1, 2, 1>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x32_1x1x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x32_1x1x4.cu deleted file mode 100644 index e7647be1..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x32_1x1x4.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 32, 32>; -using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x32_1x2x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x32_1x2x2.cu deleted file mode 100644 index 2e0f0575..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x32_1x2x2.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 32, 32>; -using ThreadShape = cutlass::gemm::GemmShape<1, 2, 2>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x32_1x4x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x32_1x4x1.cu deleted file mode 100644 index c8252f5f..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x32_1x4x1.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 32, 32>; -using ThreadShape = cutlass::gemm::GemmShape<1, 4, 1>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x64_1x2x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x64_1x2x4.cu deleted file mode 100644 index bc53eefb..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x64_1x2x4.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 32, 64>; -using ThreadShape = cutlass::gemm::GemmShape<1, 2, 4>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x64_1x4x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x64_1x4x2.cu deleted file mode 100644 index 2c818beb..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x64_1x4x2.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 32, 64>; -using ThreadShape = cutlass::gemm::GemmShape<1, 4, 2>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x8_1x1x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x8_1x1x1.cu deleted file mode 100644 index 4efb152b..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x8_1x1x1.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 32, 8>; -using ThreadShape = cutlass::gemm::GemmShape<1, 1, 1>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x16_1x1x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x16_1x1x4.cu deleted file mode 100644 index 1ed408b3..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x16_1x1x4.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 16>; -using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x16_1x2x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x16_1x2x2.cu deleted file mode 100644 index ddf70bb8..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x16_1x2x2.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 16>; -using ThreadShape = cutlass::gemm::GemmShape<1, 2, 2>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x16_1x4x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x16_1x4x1.cu deleted file mode 100644 index f35e9ed3..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x16_1x4x1.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 16>; -using ThreadShape = cutlass::gemm::GemmShape<1, 4, 1>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x32_1x2x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x32_1x2x4.cu deleted file mode 100644 index 4b2e2fdf..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x32_1x2x4.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 32>; -using ThreadShape = cutlass::gemm::GemmShape<1, 2, 4>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x32_1x4x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x32_1x4x2.cu deleted file mode 100644 index c7771133..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x32_1x4x2.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 32>; -using ThreadShape = cutlass::gemm::GemmShape<1, 4, 2>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x4_1x1x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x4_1x1x1.cu deleted file mode 100644 index 8ab75b3e..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x4_1x1x1.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; -using ThreadShape = cutlass::gemm::GemmShape<1, 1, 1>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x64_1x4x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x64_1x4x4.cu deleted file mode 100644 index 2d281ce7..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x64_1x4x4.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 64>; -using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x8_1x1x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x8_1x1x2.cu deleted file mode 100644 index eeab0c50..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x8_1x1x2.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 8>; -using ThreadShape = cutlass::gemm::GemmShape<1, 1, 2>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x8_1x2x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x8_1x2x1.cu deleted file mode 100644 index aef942ea..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x8_1x2x1.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 8>; -using ThreadShape = cutlass::gemm::GemmShape<1, 2, 1>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif -- GitLab