From 973d2a0ac27d736a376c474ea9c1830beb5d3cdc Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Wed, 13 Jan 2021 16:42:34 +0800 Subject: [PATCH] feat(dnn/cuda): add cutlass matmul using split k parallel GitOrigin-RevId: 650209e35f813e8eb8373d2ddc1671d3abb1759e --- dnn/scripts/Makefile | 4 +- dnn/src/cuda/matrix_mul/algos.cpp | 20 ++ dnn/src/cuda/matrix_mul/algos.h | 27 +++ .../cutlass_float32_simt_split_k.cpp | 76 +++++++ .../matrix_mul/cutlass_matrix_mul_wrapper.cu | 190 +++++++++++++----- .../matrix_mul/cutlass_matrix_mul_wrapper.cuh | 6 +- ...trix_mul_fp32_simt_128x128x8_32x64x8_nn.cu | Bin 1572 -> 1648 bytes ...mt_128x128x8_32x64x8_nn_splitk_parallel.cu | Bin 0 -> 1587 bytes ...trix_mul_fp32_simt_128x128x8_32x64x8_nt.cu | Bin 1575 -> 1651 bytes ...mt_128x128x8_32x64x8_nt_splitk_parallel.cu | Bin 0 -> 1590 bytes ...trix_mul_fp32_simt_128x128x8_32x64x8_tn.cu | Bin 1575 -> 1651 bytes ...mt_128x128x8_32x64x8_tn_splitk_parallel.cu | Bin 0 -> 1590 bytes ...trix_mul_fp32_simt_128x128x8_32x64x8_tt.cu | Bin 1578 -> 1654 bytes ...mt_128x128x8_32x64x8_tt_splitk_parallel.cu | Bin 0 -> 1593 bytes ...atrix_mul_fp32_simt_128x32x8_64x32x8_nn.cu | Bin 1571 -> 1647 bytes ...imt_128x32x8_64x32x8_nn_splitk_parallel.cu | Bin 0 -> 1586 bytes ...atrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu | Bin 1574 -> 1650 bytes ...imt_128x32x8_64x32x8_nt_splitk_parallel.cu | Bin 0 -> 1589 bytes ...atrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu | Bin 1574 -> 1650 bytes ...imt_128x32x8_64x32x8_tn_splitk_parallel.cu | Bin 0 -> 1589 bytes ...atrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu | Bin 1577 -> 1653 bytes ...imt_128x32x8_64x32x8_tt_splitk_parallel.cu | Bin 0 -> 1592 bytes ...atrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu | Bin 1571 -> 1647 bytes ...imt_128x64x8_64x32x8_nn_splitk_parallel.cu | Bin 0 -> 1586 bytes ...atrix_mul_fp32_simt_128x64x8_64x32x8_nt.cu | Bin 1574 -> 1650 bytes ...imt_128x64x8_64x32x8_nt_splitk_parallel.cu | Bin 0 -> 1589 bytes ...atrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu | Bin 1574 -> 1650 bytes ...imt_128x64x8_64x32x8_tn_splitk_parallel.cu | Bin 0 -> 1589 bytes ...atrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu | Bin 1577 -> 1653 bytes ...imt_128x64x8_64x32x8_tt_splitk_parallel.cu | Bin 0 -> 1592 bytes ...atrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu | Bin 1571 -> 1647 bytes ...imt_16x128x8_16x64x8_nn_splitk_parallel.cu | Bin 0 -> 1586 bytes ...atrix_mul_fp32_simt_16x128x8_16x64x8_nt.cu | Bin 1574 -> 1650 bytes ...imt_16x128x8_16x64x8_nt_splitk_parallel.cu | Bin 0 -> 1589 bytes ...atrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu | Bin 1574 -> 1650 bytes ...imt_16x128x8_16x64x8_tn_splitk_parallel.cu | Bin 0 -> 1589 bytes ...atrix_mul_fp32_simt_16x128x8_16x64x8_tt.cu | Bin 1577 -> 1653 bytes ...imt_16x128x8_16x64x8_tt_splitk_parallel.cu | Bin 0 -> 1592 bytes ...matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu | Bin 1570 -> 1646 bytes ...simt_16x32x8_16x32x8_nn_splitk_parallel.cu | Bin 0 -> 1585 bytes ...matrix_mul_fp32_simt_16x32x8_16x32x8_nt.cu | Bin 1573 -> 1649 bytes ...simt_16x32x8_16x32x8_nt_splitk_parallel.cu | Bin 0 -> 1588 bytes ...matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu | Bin 1573 -> 1649 bytes ...simt_16x32x8_16x32x8_tn_splitk_parallel.cu | Bin 0 -> 1588 bytes ...matrix_mul_fp32_simt_16x32x8_16x32x8_tt.cu | Bin 1576 -> 1652 bytes ...simt_16x32x8_16x32x8_tt_splitk_parallel.cu | Bin 0 -> 1591 bytes ...matrix_mul_fp32_simt_16x64x8_16x64x8_nn.cu | Bin 1570 -> 1646 bytes ...simt_16x64x8_16x64x8_nn_splitk_parallel.cu | Bin 0 -> 1585 bytes ...matrix_mul_fp32_simt_16x64x8_16x64x8_nt.cu | Bin 1573 -> 1649 bytes ...simt_16x64x8_16x64x8_nt_splitk_parallel.cu | Bin 0 -> 1588 bytes ...matrix_mul_fp32_simt_16x64x8_16x64x8_tn.cu | Bin 1573 -> 1649 bytes ...simt_16x64x8_16x64x8_tn_splitk_parallel.cu | Bin 0 -> 1588 bytes ...matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu | Bin 1576 -> 1652 bytes ...simt_16x64x8_16x64x8_tt_splitk_parallel.cu | Bin 0 -> 1591 bytes ...atrix_mul_fp32_simt_256x32x8_64x16x8_nn.cu | Bin 1571 -> 1647 bytes ...imt_256x32x8_64x16x8_nn_splitk_parallel.cu | Bin 0 -> 1586 bytes ...atrix_mul_fp32_simt_256x32x8_64x16x8_nt.cu | Bin 1574 -> 1650 bytes ...imt_256x32x8_64x16x8_nt_splitk_parallel.cu | Bin 0 -> 1589 bytes ...atrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu | Bin 1574 -> 1650 bytes ...imt_256x32x8_64x16x8_tn_splitk_parallel.cu | Bin 0 -> 1589 bytes ...atrix_mul_fp32_simt_256x32x8_64x16x8_tt.cu | Bin 1577 -> 1653 bytes ...imt_256x32x8_64x16x8_tt_splitk_parallel.cu | Bin 0 -> 1592 bytes ...atrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu | Bin 1571 -> 1647 bytes ...imt_256x64x8_64x32x8_nn_splitk_parallel.cu | Bin 0 -> 1586 bytes ...atrix_mul_fp32_simt_256x64x8_64x32x8_nt.cu | Bin 1574 -> 1650 bytes ...imt_256x64x8_64x32x8_nt_splitk_parallel.cu | Bin 0 -> 1589 bytes ...atrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu | Bin 1574 -> 1650 bytes ...imt_256x64x8_64x32x8_tn_splitk_parallel.cu | Bin 0 -> 1589 bytes ...atrix_mul_fp32_simt_256x64x8_64x32x8_tt.cu | Bin 1577 -> 1653 bytes ...imt_256x64x8_64x32x8_tt_splitk_parallel.cu | Bin 0 -> 1592 bytes ...atrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu | Bin 1571 -> 1647 bytes ...imt_32x128x8_32x64x8_nn_splitk_parallel.cu | Bin 0 -> 1586 bytes ...atrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu | Bin 1574 -> 1650 bytes ...imt_32x128x8_32x64x8_nt_splitk_parallel.cu | Bin 0 -> 1589 bytes ...atrix_mul_fp32_simt_32x128x8_32x64x8_tn.cu | Bin 1574 -> 1650 bytes ...imt_32x128x8_32x64x8_tn_splitk_parallel.cu | Bin 0 -> 1589 bytes ...atrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu | Bin 1577 -> 1653 bytes ...imt_32x128x8_32x64x8_tt_splitk_parallel.cu | Bin 0 -> 1592 bytes ...atrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu | Bin 1571 -> 1647 bytes ...imt_32x256x8_16x64x8_nn_splitk_parallel.cu | Bin 0 -> 1586 bytes ...atrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu | Bin 1574 -> 1650 bytes ...imt_32x256x8_16x64x8_nt_splitk_parallel.cu | Bin 0 -> 1589 bytes ...atrix_mul_fp32_simt_32x256x8_16x64x8_tn.cu | Bin 1574 -> 1650 bytes ...imt_32x256x8_16x64x8_tn_splitk_parallel.cu | Bin 0 -> 1589 bytes ...atrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu | Bin 1577 -> 1653 bytes ...imt_32x256x8_16x64x8_tt_splitk_parallel.cu | Bin 0 -> 1592 bytes ...matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu | Bin 1570 -> 1646 bytes ...simt_32x32x8_32x32x8_nn_splitk_parallel.cu | Bin 0 -> 1585 bytes ...matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu | Bin 1573 -> 1649 bytes ...simt_32x32x8_32x32x8_nt_splitk_parallel.cu | Bin 0 -> 1588 bytes ...matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu | Bin 1573 -> 1649 bytes ...simt_32x32x8_32x32x8_tn_splitk_parallel.cu | Bin 0 -> 1588 bytes ...matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu | Bin 1576 -> 1652 bytes ...simt_32x32x8_32x32x8_tt_splitk_parallel.cu | Bin 0 -> 1591 bytes ...matrix_mul_fp32_simt_32x64x8_32x64x8_nn.cu | Bin 1570 -> 1646 bytes ...simt_32x64x8_32x64x8_nn_splitk_parallel.cu | Bin 0 -> 1585 bytes ...matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu | Bin 1573 -> 1649 bytes ...simt_32x64x8_32x64x8_nt_splitk_parallel.cu | Bin 0 -> 1588 bytes ...matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu | Bin 1573 -> 1649 bytes ...simt_32x64x8_32x64x8_tn_splitk_parallel.cu | Bin 0 -> 1588 bytes ...matrix_mul_fp32_simt_32x64x8_32x64x8_tt.cu | Bin 1576 -> 1652 bytes ...simt_32x64x8_32x64x8_tt_splitk_parallel.cu | Bin 0 -> 1591 bytes ...atrix_mul_fp32_simt_64x128x8_32x64x8_nn.cu | Bin 1571 -> 1647 bytes ...imt_64x128x8_32x64x8_nn_splitk_parallel.cu | Bin 0 -> 1586 bytes ...atrix_mul_fp32_simt_64x128x8_32x64x8_nt.cu | Bin 1574 -> 1650 bytes ...imt_64x128x8_32x64x8_nt_splitk_parallel.cu | Bin 0 -> 1589 bytes ...atrix_mul_fp32_simt_64x128x8_32x64x8_tn.cu | Bin 1574 -> 1650 bytes ...imt_64x128x8_32x64x8_tn_splitk_parallel.cu | Bin 0 -> 1589 bytes ...atrix_mul_fp32_simt_64x128x8_32x64x8_tt.cu | Bin 1577 -> 1653 bytes ...imt_64x128x8_32x64x8_tt_splitk_parallel.cu | Bin 0 -> 1592 bytes ...atrix_mul_fp32_simt_64x256x8_32x64x8_nn.cu | Bin 1571 -> 1647 bytes ...imt_64x256x8_32x64x8_nn_splitk_parallel.cu | Bin 0 -> 1586 bytes ...atrix_mul_fp32_simt_64x256x8_32x64x8_nt.cu | Bin 1574 -> 1650 bytes ...imt_64x256x8_32x64x8_nt_splitk_parallel.cu | Bin 0 -> 1589 bytes ...atrix_mul_fp32_simt_64x256x8_32x64x8_tn.cu | Bin 1574 -> 1650 bytes ...imt_64x256x8_32x64x8_tn_splitk_parallel.cu | Bin 0 -> 1589 bytes ...atrix_mul_fp32_simt_64x256x8_32x64x8_tt.cu | Bin 1577 -> 1653 bytes ...imt_64x256x8_32x64x8_tt_splitk_parallel.cu | Bin 0 -> 1592 bytes ...matrix_mul_fp32_simt_64x32x8_64x32x8_nn.cu | Bin 1570 -> 1646 bytes ...simt_64x32x8_64x32x8_nn_splitk_parallel.cu | Bin 0 -> 1585 bytes ...matrix_mul_fp32_simt_64x32x8_64x32x8_nt.cu | Bin 1573 -> 1649 bytes ...simt_64x32x8_64x32x8_nt_splitk_parallel.cu | Bin 0 -> 1588 bytes ...matrix_mul_fp32_simt_64x32x8_64x32x8_tn.cu | Bin 1573 -> 1649 bytes ...simt_64x32x8_64x32x8_tn_splitk_parallel.cu | Bin 0 -> 1588 bytes ...matrix_mul_fp32_simt_64x32x8_64x32x8_tt.cu | Bin 1576 -> 1652 bytes ...simt_64x32x8_64x32x8_tt_splitk_parallel.cu | Bin 0 -> 1591 bytes ...matrix_mul_fp32_simt_64x64x8_32x64x8_nn.cu | Bin 1570 -> 1646 bytes ...simt_64x64x8_32x64x8_nn_splitk_parallel.cu | Bin 0 -> 1585 bytes ...matrix_mul_fp32_simt_64x64x8_32x64x8_nt.cu | Bin 1573 -> 1649 bytes ...simt_64x64x8_32x64x8_nt_splitk_parallel.cu | Bin 0 -> 1588 bytes ...matrix_mul_fp32_simt_64x64x8_32x64x8_tn.cu | Bin 1573 -> 1649 bytes ...simt_64x64x8_32x64x8_tn_splitk_parallel.cu | Bin 0 -> 1588 bytes ...matrix_mul_fp32_simt_64x64x8_32x64x8_tt.cu | Bin 1576 -> 1652 bytes ...simt_64x64x8_32x64x8_tt_splitk_parallel.cu | Bin 0 -> 1591 bytes .../matrix_mul_fp32_simt_8x32x8_8x32x8_nn.cu | Bin 1568 -> 1644 bytes ...2_simt_8x32x8_8x32x8_nn_splitk_parallel.cu | Bin 0 -> 1583 bytes .../matrix_mul_fp32_simt_8x32x8_8x32x8_nt.cu | Bin 1571 -> 1647 bytes ...2_simt_8x32x8_8x32x8_nt_splitk_parallel.cu | Bin 0 -> 1586 bytes .../matrix_mul_fp32_simt_8x32x8_8x32x8_tn.cu | Bin 1571 -> 1647 bytes ...2_simt_8x32x8_8x32x8_tn_splitk_parallel.cu | Bin 0 -> 1586 bytes .../matrix_mul_fp32_simt_8x32x8_8x32x8_tt.cu | Bin 1574 -> 1650 bytes ...2_simt_8x32x8_8x32x8_tt_splitk_parallel.cu | Bin 0 -> 1589 bytes ...atrix_mul_float_simt_cutlass_wrapper.cuinl | 29 +-- dnn/src/cuda/matrix_mul/opr_impl.h | 1 + dnn/test/common/matrix_mul.cpp | 12 ++ dnn/test/common/matrix_mul.h | 1 + dnn/test/cuda/cutlass_matmul.cpp | 25 ++- 147 files changed, 328 insertions(+), 63 deletions(-) create mode 100644 dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt_splitk_parallel.cu diff --git a/dnn/scripts/Makefile b/dnn/scripts/Makefile index bc28b4ce..b093b4ae 100644 --- a/dnn/scripts/Makefile +++ b/dnn/scripts/Makefile @@ -9,9 +9,9 @@ ELEMWISE_IMPL := ../src/cuda/cond_take/kimpl \ ../src/cuda/elemwise_multi_type/kimpl CUDA_CONV_IMPL := ../src/cuda/conv_bias/int8/kimpl ../src/cuda/conv_bias/int8_imma/kimpl ../src/cuda/batch_conv_bias/int8/kimpl -CUDA_MATMUL_KIMPL := ../src/cuda/matrix_mul/fp32_simt/kimpl +CUDA_MATMUL_IMPL := ../src/cuda/matrix_mul/fp32_simt/kimpl -all: ${PARAM_DEFS} ${ELEMWISE_IMPL} ${CUDA_CONV_IMPL} $(CUDA_MATMUL_KIMPL) +all: ${PARAM_DEFS} ${ELEMWISE_IMPL} ${CUDA_CONV_IMPL} $(CUDA_MATMUL_IMPL) ../src/common/elemwise/each_mode.inl: gen_elemwise_each_mode.py ./$^ $@ diff --git a/dnn/src/cuda/matrix_mul/algos.cpp b/dnn/src/cuda/matrix_mul/algos.cpp index d2c44734..fa190980 100644 --- a/dnn/src/cuda/matrix_mul/algos.cpp +++ b/dnn/src/cuda/matrix_mul/algos.cpp @@ -37,6 +37,9 @@ MatrixMulForwardImpl::AlgoPack::AlgoPack() { for (auto&& algo : simt_float32) { all_algos.push_back(&algo); } + for (auto&& algo : simt_float32_split_k) { + all_algos.push_back(&algo); + } for (auto&& algo : all_algos) { m_all_algos_map.emplace(algo->info().desc, algo); @@ -62,6 +65,23 @@ void MatrixMulForwardImpl::AlgoPack::fill_cutlass_algos() { simt_float32.emplace_back(AlgoParam{16, 32, 8, 16, 32, 8}); simt_float32.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8}); simt_float32.emplace_back(AlgoParam{16, 128, 8, 16, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{64, 256, 8, 32, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{256, 64, 8, 64, 32, 8}); + simt_float32_split_k.emplace_back(AlgoParam{32, 256, 8, 16, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{256, 32, 8, 64, 16, 8}); + simt_float32_split_k.emplace_back(AlgoParam{128, 128, 8, 32, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{128, 64, 8, 64, 32, 8}); + simt_float32_split_k.emplace_back(AlgoParam{64, 128, 8, 32, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{128, 32, 8, 64, 32, 8}); + simt_float32_split_k.emplace_back(AlgoParam{32, 128, 8, 32, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{64, 64, 8, 32, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{32, 64, 8, 32, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{64, 32, 8, 64, 32, 8}); + simt_float32_split_k.emplace_back(AlgoParam{32, 32, 8, 32, 32, 8}); + simt_float32_split_k.emplace_back(AlgoParam{8, 32, 8, 8, 32, 8}); + simt_float32_split_k.emplace_back(AlgoParam{16, 32, 8, 16, 32, 8}); + simt_float32_split_k.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{16, 128, 8, 16, 64, 8}); } MatrixMulForwardImpl::AlgoPack MatrixMulForwardImpl::sm_algo_pack; diff --git a/dnn/src/cuda/matrix_mul/algos.h b/dnn/src/cuda/matrix_mul/algos.h index 65429370..d647c661 100644 --- a/dnn/src/cuda/matrix_mul/algos.h +++ b/dnn/src/cuda/matrix_mul/algos.h @@ -43,6 +43,7 @@ public: CUDA_NAIVE, CUDA_BFLOAT16, CUDA_FLOAT32_SIMT, + CUDA_FLOAT32_SIMT_SPLIT_K, }; using Mapper = std::unordered_map; @@ -198,6 +199,31 @@ private: std::string m_name; }; +class MatrixMulForwardImpl::AlgoFloat32SIMTSplitK final : public AlgoBase { +public: + using AlgoParam = MatrixMulForwardImpl::AlgoFloat32SIMT::AlgoParam; + AlgoFloat32SIMTSplitK(AlgoParam algo_param) + : m_algo_param{algo_param}, + m_name{ssprintf("CUTLASS_FLOAT32_SIMT_SPLIT_K_%s", + m_algo_param.to_string().c_str())} {} + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + const char* name() const override { return m_name.c_str(); } + void exec(const ExecArgs& args) const override; + bool is_reproducible() const override { return true; } + MEGDNN_DECL_ALGO_TYPE(CUDA_FLOAT32_SIMT_SPLIT_K) + + std::string param() const override { + std::string ret; + serialize_write_pod(m_algo_param, ret); + return ret; + } + +private: + AlgoParam m_algo_param; + std::string m_name; +}; + class MatrixMulForwardImpl::AlgoPack : NonCopyableObj { private: AlgoBase::Mapper m_all_algos_map; @@ -216,6 +242,7 @@ public: AlgoBFloat16 bfloat16; #endif std::vector simt_float32; + std::vector simt_float32_split_k; std::vector all_algos; const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; } diff --git a/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp b/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp new file mode 100644 index 00000000..50ccb67d --- /dev/null +++ b/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp @@ -0,0 +1,76 @@ +/** + * \file dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#include "src/cuda/handle.h" +#include "src/cuda/matrix_mul/algos.h" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace cutlass_wrapper; + +bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available( + const SizeArgs& args) const { + auto&& param = args.opr->param(); + int m = args.layout_c.shape[0], n = args.layout_c.shape[1], + k = args.layout_a.shape[param.transposeA ? 0 : 1]; + return args.opr->param().format == param::MatrixMul::Format::DEFAULT && + args.layout_a.dtype == dtype::Float32() && + args.layout_b.dtype == dtype::Float32() && + args.layout_c.dtype == dtype::Float32() && k > std::max(m, n); +} + +size_t MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::get_workspace_in_bytes( + const SizeArgs& args) const { + size_t lda = args.layout_a.stride[0], ldb = args.layout_b.stride[0], + ldc = args.layout_c.stride[0]; + auto&& param = args.opr->param(); + int m = args.layout_c.shape[0], n = args.layout_c.shape[1], + k = args.layout_a.shape[param.transposeA ? 0 : 1]; + GemmCoord problem_size{m, n, k}; + int split_k_slices = k / std::max(m, n); + return cutlass_matrix_mul_float32_simt_get_workspace_size( + param.transposeA, lda, param.transposeB, ldb, ldc, problem_size, + 1.f, 0.f, + GemmCoord{m_algo_param.threadblock_m, m_algo_param.threadblock_n, + m_algo_param.threadblock_k}, + GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n, + m_algo_param.warp_k}, + split_k_slices); +} + +void MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::exec( + const ExecArgs& args) const { + size_t lda = args.tensor_a.layout.stride[0], + ldb = args.tensor_b.layout.stride[0], + ldc = args.tensor_c.layout.stride[0]; + auto&& param = args.opr->param(); + int m = args.tensor_c.layout.shape[0], n = args.tensor_c.layout.shape[1], + k = args.tensor_a.layout.shape[param.transposeA ? 0 : 1]; + GemmCoord problem_size{m, n, k}; + int split_k_slices = k / std::max(m, n); + auto&& stream = cuda_stream(args.opr->handle()); + int* workspace = reinterpret_cast(args.workspace.raw_ptr); + return cutlass_matrix_mul_float32_simt( + args.tensor_a.ptr(), param.transposeA, lda, + args.tensor_b.ptr(), param.transposeB, ldb, + args.tensor_c.ptr(), ldc, workspace, problem_size, 1.f, + 0.f, + GemmCoord{m_algo_param.threadblock_m, m_algo_param.threadblock_n, + m_algo_param.threadblock_k}, + GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n, + m_algo_param.warp_k}, + stream, split_k_slices); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cu b/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cu index 6f6e079c..4907b4fa 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cu +++ b/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cu @@ -18,6 +18,7 @@ #if __CUDACC_VER_MAJOR__ > 9 || \ (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) #include "cutlass/gemm/device/gemm.h" +#include "cutlass/gemm/device/gemm_splitk_parallel.h" #endif #include "src/common/opr_param_defs_enumv.cuh" #include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh" @@ -62,14 +63,20 @@ void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_float32_simt( float* /* d_C */, size_t /* ldc */, int* /* workspace */, GemmCoord const& /* problem_size */, float /* alpha */, float /* beta */, const GemmCoord& /* threadblock_shape */, - const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {} + const GemmCoord& /* warp_shape */, cudaStream_t /* stream */, + int /* split_k_slices */) {} #else void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_float32_simt( const float* d_A, bool transpose_A, size_t lda, const float* d_B, bool transpose_B, size_t ldb, float* d_C, size_t ldc, int* workspace, GemmCoord const& problem_size, float alpha, float beta, const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, - cudaStream_t stream) { + cudaStream_t stream, int split_k_slices) { + static constexpr int kEpilogueElementsPerAccess = 1; + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + float, kEpilogueElementsPerAccess, float, float>; + typename EpilogueOp::Params epilogue{alpha, beta}; + if (split_k_slices == 1) { #define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_, \ warp_k_) \ if (threadblock_shape.m() == threadblock_m_ && \ @@ -93,29 +100,67 @@ void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_float32_simt( workspace, problem_size, \ epilogue, stream); \ } - static constexpr int kEpilogueElementsPerAccess = 1; - using EpilogueOp = cutlass::epilogue::thread::LinearCombination< - float, kEpilogueElementsPerAccess, float, float>; - typename EpilogueOp::Params epilogue{alpha, beta}; - if (!transpose_A && !transpose_B) { - using LayoutA = cutlass::layout::RowMajor; - using LayoutB = cutlass::layout::RowMajor; - DISPATCH(cb) - } else if (!transpose_A && transpose_B) { - using LayoutA = cutlass::layout::RowMajor; - using LayoutB = cutlass::layout::ColumnMajor; - DISPATCH(cb) - } else if (transpose_A && !transpose_B) { - using LayoutA = cutlass::layout::ColumnMajor; - using LayoutB = cutlass::layout::RowMajor; - DISPATCH(cb) + if (!transpose_A && !transpose_B) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + DISPATCH(cb) + } else if (!transpose_A && transpose_B) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + DISPATCH(cb) + } else if (transpose_A && !transpose_B) { + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::RowMajor; + DISPATCH(cb) + } else { + megdnn_assert(transpose_A && transpose_B); + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::ColumnMajor; + DISPATCH(cb) + } +#undef cb } else { - megdnn_assert(transpose_A && transpose_B); - using LayoutA = cutlass::layout::ColumnMajor; - using LayoutB = cutlass::layout::ColumnMajor; - DISPATCH(cb) +#define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_, \ + warp_k_) \ + if (threadblock_shape.m() == threadblock_m_ && \ + threadblock_shape.n() == threadblock_n_ && \ + threadblock_shape.k() == threadblock_k_ && \ + warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \ + warp_shape.k() == warp_k_) { \ + using ThreadBlockShape = \ + cutlass::gemm::GemmShape; \ + using WarpShape = cutlass::gemm::GemmShape; \ + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; \ + using Gemm = cutlass::gemm::device::GemmSplitKParallel< \ + float, LayoutA, float, LayoutB, float, \ + cutlass::layout::RowMajor, float, cutlass::arch::OpClassSimt, \ + cutlass::arch::Sm50, ThreadBlockShape, WarpShape, \ + InstructionShape, EpilogueOp>; \ + return cutlass_matrix_mul_wrapper( \ + d_A, lda, d_B, ldb, d_C, ldc, workspace, problem_size, \ + epilogue, stream, split_k_slices); \ } + if (!transpose_A && !transpose_B) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + DISPATCH(cb) + } else if (!transpose_A && transpose_B) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + DISPATCH(cb) + } else if (transpose_A && !transpose_B) { + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::RowMajor; + DISPATCH(cb) + } else { + megdnn_assert(transpose_A && transpose_B); + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::ColumnMajor; + DISPATCH(cb) + } #undef cb + } } #endif @@ -127,7 +172,7 @@ size_t megdnn::cuda::cutlass_wrapper:: bool /* transpose_B */, size_t /* ldb */, size_t /* ldc */, GemmCoord const& /* problem_size */, float /* alpha */, float /* beta */, const GemmCoord& /* threadblock_shape */, - const GemmCoord& /* warp_shape */) { + const GemmCoord& /* warp_shape */, int /* split_k_slices */) { return 0; } #else @@ -136,7 +181,12 @@ size_t megdnn::cuda::cutlass_wrapper:: bool transpose_A, size_t lda, bool transpose_B, size_t ldb, size_t ldc, GemmCoord const& problem_size, float alpha, float beta, const GemmCoord& threadblock_shape, - const GemmCoord& warp_shape) { + const GemmCoord& warp_shape, int split_k_slices) { + static constexpr int kEpilogueElementsPerAccess = 1; + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + float, kEpilogueElementsPerAccess, float, float>; + typename EpilogueOp::Params epilogue{alpha, beta}; + if (split_k_slices == 1) { #define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_, \ warp_k_) \ if (threadblock_shape.m() == threadblock_m_ && \ @@ -169,30 +219,80 @@ size_t megdnn::cuda::cutlass_wrapper:: split_k_slices}; \ return Gemm::get_workspace_size(arguments); \ } - static constexpr int kEpilogueElementsPerAccess = 1; - static constexpr int split_k_slices = 1; - using EpilogueOp = cutlass::epilogue::thread::LinearCombination< - float, kEpilogueElementsPerAccess, float, float>; - typename EpilogueOp::Params epilogue{alpha, beta}; - if (!transpose_A && !transpose_B) { - using LayoutA = cutlass::layout::RowMajor; - using LayoutB = cutlass::layout::RowMajor; - DISPATCH(cb) - } else if (!transpose_A && transpose_B) { - using LayoutA = cutlass::layout::RowMajor; - using LayoutB = cutlass::layout::ColumnMajor; - DISPATCH(cb) - } else if (transpose_A && !transpose_B) { - using LayoutA = cutlass::layout::ColumnMajor; - using LayoutB = cutlass::layout::RowMajor; - DISPATCH(cb) + if (!transpose_A && !transpose_B) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + DISPATCH(cb) + } else if (!transpose_A && transpose_B) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + DISPATCH(cb) + } else if (transpose_A && !transpose_B) { + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::RowMajor; + DISPATCH(cb) + } else { + megdnn_assert(transpose_A && transpose_B); + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::ColumnMajor; + DISPATCH(cb) + } +#undef cb } else { - megdnn_assert(transpose_A && transpose_B); - using LayoutA = cutlass::layout::ColumnMajor; - using LayoutB = cutlass::layout::ColumnMajor; - DISPATCH(cb) +#define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_, \ + warp_k_) \ + if (threadblock_shape.m() == threadblock_m_ && \ + threadblock_shape.n() == threadblock_n_ && \ + threadblock_shape.k() == threadblock_k_ && \ + warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \ + warp_shape.k() == warp_k_) { \ + using ThreadBlockShape = \ + cutlass::gemm::GemmShape; \ + using WarpShape = cutlass::gemm::GemmShape; \ + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; \ + using Gemm = cutlass::gemm::device::GemmSplitKParallel< \ + float, LayoutA, float, LayoutB, float, \ + cutlass::layout::RowMajor, float, cutlass::arch::OpClassSimt, \ + cutlass::arch::Sm50, ThreadBlockShape, WarpShape, \ + InstructionShape, EpilogueOp>; \ + using TensorRefA = cutlass::TensorRef; \ + using TensorRefB = cutlass::TensorRef; \ + using TensorRefC = cutlass::TensorRef; \ + using TensorRefD = cutlass::TensorRef; \ + TensorRefA tensor_A{nullptr, Gemm::LayoutA{static_cast(lda)}}; \ + TensorRefB tensor_B{nullptr, Gemm::LayoutB{static_cast(ldb)}}; \ + TensorRefC tensor_C{nullptr, Gemm::LayoutC{static_cast(ldc)}}; \ + TensorRefD tensor_D{nullptr, Gemm::LayoutC{static_cast(ldc)}}; \ + typename Gemm::Arguments arguments{problem_size, tensor_A, tensor_B, \ + tensor_C, tensor_D, epilogue, \ + split_k_slices}; \ + return Gemm::get_workspace_size(arguments); \ } + if (!transpose_A && !transpose_B) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + DISPATCH(cb) + } else if (!transpose_A && transpose_B) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + DISPATCH(cb) + } else if (transpose_A && !transpose_B) { + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::RowMajor; + DISPATCH(cb) + } else { + megdnn_assert(transpose_A && transpose_B); + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::ColumnMajor; + DISPATCH(cb) + } #undef cb + } } #endif diff --git a/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh b/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh index 3446842c..1947f773 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh +++ b/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh @@ -26,19 +26,19 @@ void cutlass_matrix_mul_wrapper( typename Gemm::ElementC* d_C, size_t ldc, int* workspace, GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices = 1); void cutlass_matrix_mul_float32_simt( const float* d_A, bool transpose_A, size_t lda, const float* d_B, bool transpose_B, size_t ldb, float* d_C, size_t ldc, int* workspace, GemmCoord const& problem_size, float alpha, float beta, const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices = 1); size_t cutlass_matrix_mul_float32_simt_get_workspace_size( bool transpose_A, size_t lda, bool transpose_B, size_t ldb, size_t ldc, GemmCoord const& problem_size, float alpha, float beta, - const GemmCoord& threadblock_shape, const GemmCoord& warp_shape); + const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, int split_k_slices = 1); } // namespace cutlass_wrapper } // namespace cuda diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn.cu index bdf8a6e3b3871607341ac7415cfb181dc3541dbf..3828423399ba519e605671a5151787c2cc95985f 100644 GIT binary patch delta 54 zcmZ3&^MPkW0^?+6HnEBKxY=?OE0a=nHwQ4TXA##?$jmEIC@#p!EQ!yKFV4wKPA%57 K=9_$56*UP`vPFm%nu@gL(Q?*K;>K%RzK{m-@wIn17Xp+- zk+Yfcnai0oyT`|XarWy=FU#=ni(7ot`#QMA7|!7{JU+s~i_UI0c=z|WkXPR;WEa}+ zl2!{QOfo|pqcB>;KREYJ5NmNkoZ)|Q;RT*CBQYWVK65~aybDth8mUZ zX+tIjfy*od%1O9q9nV2~JAKVp1*6bt-g)UQqfJc=DVSrXu>;12GIQ4?f)g8x*#+W5 z)!gv_d0=!GKtUEG*4&11nF~)DG_1*6xu>MH=Bzc=Cv9wb;WmtTEEI92bw%^s5UrUx z$~~8&kt9Ag0oP=qyz4=CNi0I5G$C}W<~QVzGG|Y&_IJ7Zb7~l&eWCK%a7r}W#7tOG zAaohd($zU`pQcbAr{~L5cf{y7fhX-0K7B}G4ZD(IMm~33$v4r*A!+!oQeJ2-)Woxa z-Yk~YR)k#H5<=HpGGa1SjJPB*Q>Wy*j8;FDKdZp8)6I}o|CHVHoYg#N!QHnXL5o5# z(MceD>QMD~Qh09cKQ-2S@qB#gGFBO4b&Z&OiqJr3aR-M%KIvKkm2X&ld|&6-o@WZH z38k>jdJ1bU0}GCYeGYpM_li>}*n~=nP*ia|t5-m+&9z~zZ(RF2k;n6cvWZAguEMav zEi{v1g&}$ox)7{j()A93;$8|itQ~eBD5?8+3)qjqKp-OT}vn^?qk6f*Nl6p9OSGE3sK0%Q?T delta 19 bcmey&vz%u`0^{cCjEh*9G_5BGuqguoNV^6n diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..e78d64a9bc30510d118c93487da1333369ef6b22 GIT binary patch literal 1590 zcma)6+fExX5PjdT7zGJYv>_$56*UP`vPFm%nu@gL(Q?*K;>K%RzK{m-@wIn1SD+9@ z&Su7EE@#f{9v=h7*{?6XEW^JqZt+d;>);k+IET;h_y`BjI=kKA-QV9qUVX2SU1+~c zS}m9`$qaFf!e|ly;M_Ywti=U!hX2Kd7kI{u#JteL9_d8{@rhK1!JHV$On!7MAbKxn2hBbLB_ms5OoVCXKq>U{v+=da4g(9xBu4tYcqBS!| zx#vIycRlDXiA6}1CWLO){D%Bd=4|U~|GBG~626dIw|`CzBeXA6J{wMn zW}C1HD++`z!&$mI$L-S;%H#BWneC1k{VMRJox-ONDXd{vGK|URjw|^p`Zy#F-&M*B z&4rqHHqe{Jvf7G}D_cV7noCAZriu}lBxdTAbeGZUr}AeNICi=jvg)6*d!DnJ2Q9e! z_9Mtq2qroSgijr+o=^%;js0z7y&6v^m@Z?LAy(Ij$)^YnbQafe7!;JQMNs*M#mDz` zj@@~tu$oW`yR4_M<}$G0SXk+>`f#r}g@R3}ln6y3$Fq71)bd;#))L3HuM>GZKPa1s z1m!9W8{9%O8MYXr7oiKm3MO6e5Gd}YV8iBN2ZEBikJo_x2n@uofTI*D>Lc>`WLd;y z8<>0(!=*cfxiT}WNzOLQs9Hy;(nG92Q_9e)v3<}+jY2nJjUNK+KUFMs>z(#)pb?5f yD(o`OQo&oPDH*!3p+z7kY)p7aAruNW3K7q+6+xT!-C6R|i%~0`>@i9CIQa*S0vKHY literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn.cu index 6b4ec037db902b3835c1c8fd720192add28883f9..0fe5a161cc69870a9562465b0f0a14d3232c6175 100644 GIT binary patch delta 55 zcmZ3^^Op-OT}vn^?qk6f*Nl6p9OSGE3sK0%Q?T delta 19 bcmey&vz%u`0^{cCjEh*9G_5BGuqguoNV^6n diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..964ef5252061492b51c82d6a1e8694452388ee73 GIT binary patch literal 1590 zcma)6Yfl~@29e}4yg{jEZFp?#RN zS}>uK8scaHqec9KbMH8@77OAG`-6q&ct(xHOlWS8^&*1UL@GmJPK;!70%{C3D%sP9 zOay_eEQ5lPaL+oHgZ6g%lCKI@K%;r@rMI+bYGR0>j+(|BFgBE#yC&g`*ig({AU;&h z91D;~TD%1y$YMmByD%nm?h6VHYw}j^3({J1)*9>6HnvQ-4I>_NMOj z&m?aoiH}Xd4OuAfdeB`Gix4kO2xW?UA#cfVWzM&5_n*6cr{-H%e@qP}MW3sDHk=Yo zH!%|`1VUHgEM1+G_E`$$adxpxbx(|b6?ocC;p6)h*05_C#^iIylzbI^5|W1RD&?hS zTunS3=*?njZAHkHEg^KnBqb(OV#FkgnK~uiWwiRK{8sl_6Hwh{>l24RjXQa2OPnu0>G!hT_A! zI>+ujQ&>$Xg*-3>(}+QyI1xq8Fh{P6d^&cLW9QrC`J6VF$b*bsw(*`w(caRMbc0^U1P^ z$u=b#~8#M~ugf)H)u>VxC)U9{gyMab1 z3Q5>yoTY-dQWs?C!iI`KPS}|6kU}UFY!o7%VatOy?csUy(u+|mo$OIruyOJilA0J@ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt.cu index a7b3fbea821bf428f5d1d7f4fb5fd28284f00c55..374a8d73a7653ce40ec5f278fe50034efe708a36 100644 GIT binary patch delta 55 zcmZ3*^NnXi0^?+TMn$&V#LA>p-OT}vTUo?)6f*Nl6p9OSGE3sKaqhXh$Hsth_VaTu%kZzuJAB*wGPuJSF5nY9J;A|?&Tcn&_xHDu*WW5+7uttO zs|6D(sUeOQFj~YvIQNbdYq22Cus>LMj%UZob#fU%*>+%*Yj#D-#af%s50 zb1Xm}X|W4Hkj02Lw_!}?+!quY*5s|+7o@f3tTooBZETrv8%8|lin!9cqIqtJ*3=y5 zo=M(F5+9p@8?sQ|^`N^X79n1m5XuzyLf(?!%A7yD-QV%{kEx-g=yR3NhEt;HCTv25 zKSFSj8;FDKdZp8)6I}o{{?+uIjwonoVl;xgCaSn zypurq)S>DjrSPw@|J+z_$9jl&YF z%2gOPxP_)NEHXqdLYJHhDqZgg3fxP3K delta 31 ncmaFQvzTW?0^{cCjPqESG_5DcF-mY3(#L>Kj@11@w1ckL4nbz zN|Sqi?&I8Zv&YAParWy=FU#=ni(7ot`#QMA7|!7{JU+s~i_UI0c=z|WkXPR;WEa}+ zl2!{QOfo|pqcB>;7o2-1h_$#N&hWpu@B+`6k(d`+*dx7&AU=`GFqjh~xtxF+Lyb!I zv>}s%z-5*JCHR@4WPu(WWMb6wEQx*a2fhnYn8c!HEsU>;mzj zYVLS|JTSTopdgD8Yi`51%!Q{68rI~k+*8t8bJiN`lQy=za2rNE7K*six}te*h}O&; z<(^B?ND?2LfNQc)-u0lnBo-l2nh?5G^BeL&XzEJsWI3=2GVkWF8 z5V{Qa($zU`pQdoqPT}-?ndy!g{U+|yhi%xE3?uTn<4V4XK29P2Rw*wu7i!|!KyTK{ zYAZsn>)%D+|M*yv`+s(s4tdCqDYwBYXBk03=MnCK)B zt~yjbofMuM`%jJaT0Ea#x{OtZSRErKpCUBSSzN(kP*1uRK;;`2AK%wGw#S*mYCU?6q{9HmfEACWI7%OWP*z~q}4F5Mx_ zm6=&ha<*AU)jC3z9%B8OQifKI?SnRI6uJp3{19OOsbZ;H@3eOVjZhR)VV7~13VxEB plA#L=S_E>!#)O9yLZM)z5b+FK5wvOFoh7e*6t&XH9+Q-hlYhbO7V!W8 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu index e4e097ccf0b2c9aa94e9c6ccb5228217cb5865a6..230297e8e11c84a1f5ac40878efeda3dce43a208 100644 GIT binary patch delta 55 zcmZ3+^ND9e0^?+TMn$&V#LA>p-OT}v8(GA46f*Nl6p9OSGE3sK(#L>Kj@11@w1ck2@Ikt zP44x%k8{t>9v=h7*{?6XEW^JqZt+d;>);k+IET;h_y`BjI=kKA-QV9qUVX2SU1+~c zS}m9`$qaFf!e|j+aPFNT*5ZOV!~f#K3p`^+VqR!rkMtsf_(UqhU`~wWasp}$H7eQD zhD-_qmstjslW@*Do`d#w`jW2-MxoKX^U_;Jo0=F>Fvm>e4Hz5B%w3ZRPHZUVEf627 z=8gx*1EX&N6l5`C&0QFmx$u-h!6x#3Ce06GFFYenb8!bGCJ~|J>0`317&q(?6$%5!x3jpADx( zvrX896$L_<;a<8r$L-S;PTDD)o-dQ#5u;zlefqEqyOLo_K6hNnSJB5Q#NR6Ah2}y{ zJR9iET3KyH$dw%-bj>9rCR4?TOA<46O18^r^;7w`3LG2V3|X~L**(u$O@kKPeftsA zCOonv>L zDXb=x!Y=D6tho#6!d#h|)g)(|WmK&rROunspDAT%)!060qeh{du*DAn_Ma-2y7f+bH_!-0Ar*ER vXQ|+=)RYWeSkNMn6E-G1q!0=P8-<8x*ovS{`|d1x>AR?vPWG6je4P9P=1dqT literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu index 35c35bb9260bc16077d7a7ba8112c27685545b63..04f80d335c10f9eafb47b31cb54c0871d70c8701 100644 GIT binary patch delta 55 zcmZ3+^ND9e0^?+TMn$&V#LA>p-OT}v8(GA46f*Nl6p9OSGE3sKqNKl(Q;$A_3GG`A9O|h_}NMN91Nl= zP44x%k8{t>9v=h7+0W0tEW^Jp@9=H!%is=UxPVXa^aKabI=kKA-QV9qUVp2QU1%RB ztrkp}WQI6KVYG-ZIQLEvYjHuG;eT-91)eb@F)y^R$9fS#d?J-$FegTGIRQ0>8kOv6 zLnZ}*t1JV`NjPU6&p~@TeaTk^qtIyHd+9BsO-&3bm}92#28<15=B`NuCpHxG7Kjg3 zbH@YZkLc5(H~R82<;1%&xTW? z*(PSfiUOgla4%h*llEB(r|lHZE|!_@iP5j(KEB_DUCS^fpF6JPtLT#y;%}AmQgfju zo(=S7t*o{pnW_c3@kVnHacuRJSa|~U=u1OLQ%)@UVQ{=b*>F-h2z@Si9B8$lubl} zautRRZlReBOAOJA(4}Anldg9J6!%iFVezm7K}p@mYruX424YvhQ3@6H5&3+wEMl?^ zOumWX(jCEEnVHojXPaeIts_+FA=aNMWoXsdK4_yxp_{P9j{)|dDwewSPJ1`d2t^?k xb{S`>;H}h@3|&~zB9Id{COo7N3I!X5h-cV}piO&tp1ky3)Ji9NOj15h{sQad7$*P# literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu index 9df595430beb61f4488b925ced89ed925fb8fe03..70fd338f09b70d2dda029a294e7f39ca5ccf62a1 100644 GIT binary patch delta 55 zcmZ3<^Oa{q0^?+TMn$&V#LA>p-OT}vTUf+(6f*Nl6p9OSGE3sKqNKl(Q;$A_3GG`A9O|h_}NMNpo7t< zN|Sqi?&I8Zv&YAParX0bFU#<+%R7AA`!cx07%t!wJUzj|i_UI0c=z|Wkk{WTWEa|p zNvj1DCYd3QQ5Y@a3(mb0#9CYsXZRmnc!6ikNX!c@?6F=%5T8h87|e;0TuwlZp++Tp z+K@>>;3~_2auUv2$8*r$PG9p?!6-DE_g;F-Xj2nI3g(z;?0~VM%-l7J;KYVvc7ga% zHFrEf9vR&QP>{ulHMe10=E7434QujN?kQ=lIctseX&YN!xD6v73q@RMUC}%@L~CY_ zbI+w{B#Dnrzzta_?|RT(5{r;1O$cR5_(I;2-^!dnJKf*$^pB}wg!YBXXTvGcY!fzN zMS;*&xR7H7t3V##OOD1AK!1ou4NdM&mC9tP4r0$@wZBOsku-S&jxz4 zR#saPa%D#d-Ehf>$y71olEh4%lJGKG{Z#&~0>?%-Lssom_P}#i)1U=+U%v-I3c*At zfpFEK>KUc*ud)B!STDzVhIeXS4lrHDDnqP}5tC058t5!8;xK3^UCW^I4T}%&>KxnS zOkp*l6t-DUVa;V=!LhK^Vfo=faS8>SP$?0LQjYiPJ5URBZCHyO*S=2V@#3ItA`+CV zFl=xO&1BePh+c#)1uK|zy(6Hwmx2wuhaCt?>OS5A_9HM5y8@0VxC)U9{gyMab13aPNm vI7TZp6Z9Ef literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu index 474114d8f7faf58046bfbd5e0b09b362e916cebc..bed908a162008352e0403c19eea24855eb0eb498 100644 GIT binary patch delta 53 zcmZ3?^PXoz0^?*RHnEBKxY=?OE0a=nHwQ4TV-eR;$jmEIC@#p!EQ!yKFV4wKPA%57 J=93K delta 31 ncmaFQvzTW?0^{cCjPqESG_5DcF-mY3qNKl(Q;!q_3GG`?a&qR<7X#n8w!j@ zRhr!SxtHgj%O0Ns#>uZQgCxPfFK+S8;OpoXV>pM;@c0M^FFN~u@7>?uLSB8ZkbUUB zi#i?1m}G`HMq#oHFE}lnAlBlXIK%(qybyTKjKnIH2u`doW*11| ztLBb-$OEIh0CKXNu=X~L%TyGULCc!7lNOY8Hk@^)`lO33&t1car$P}|TGuo$4AGjI zqqN{sw4$g8O~5r-s=^JRzakbPQJN6CRf`+)N13xHM~Ax{{W&v?(4kQ2d^{tXH8B~> zbA&Gay>#`CyQeXH`VhnE`6|;LG5SqhfSh!juq)|DRHTk8`6l`}hVWabywF^ztY9PE ztd-SPgk0GXLf2d}ViJ{4xFjJ{ucW$+RzH=0>%gJWX2`02%IJFQ!-h-Gf@6N2{o2F5;uLb0Q7I7$A`bWJ3s6gQW7xOS5A4g=5=y9Q2TsHhLfmy;y{lQb}C6T_uD zgoQG5t4Ye5WmK&rROunqpD1N$)!060qb9x^zrqh5_Ma+Nx-Fb8+(;u7gp}J=oRxx~ rq^4x-{DS759KSLCAqHP4*dRna$5wc4+IMHsYad0ebhO7L<p-OT}v8(GA46f*Nl6p9OSGE3sKqNKl(Q;$A_3GG`U+9YX@w1b(O$!X7 zDoyVE+{<&%Wsgq)(pC`<_5s>KcYqs-aX(ZO>^Qzd*Zw@&|@8Aj+psBAu( z5zXqbDa&(&F2lWWb&orz34HpH!0Gua*&Q+ZRa}Iebn37x8K&ej$CZ2)eVjo2ty5lT zF4WYsp|01;Dl0;+=m?=}E*UYY%Ew%in5kQ^T|}#%iobQ>*l0au**<0WJYy9NT5$L6 zM^K{>OmrI%t}0YDpah;82iwMKGoB7GS;Zn?J)s14Sx;cYWnjUvu+d@j;a+hHIh#@`5sEsF_wpl9sdHo4C>&S5ZshU&plBiz zl&LUma7)c(SYn7?gf0ZjnRNX_ptzra4U2~z2ui9xUIPvyFc7;2juR-UkI3hfr4f_X zFj*bLr8|U$GIOg*#_DC1ts_+GA=aNNWoX&hK4_!Hp_{P94*~X{DptDnPJ1`h2t^^~ xb`@u(;H}h@j9ggIJdhJMCOjk%3I!X5i09afpiTSktnt!!Q7hfpW0LYo;~)6v7%Tt) literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu index b55f2e2f6bb1b561e3d15f7207a7d77cf3c7730c..6eae3c183fccc5e700f4c75dc4c67fa8ab2fdd0a 100644 GIT binary patch delta 55 zcmZ3+^ND9e0^?+TMn$&V#LA>p-OT}v8(GA46f*Nl6p9OSGE3sKjq%FJAg2u^G$<}DB( zs^*Rd$Rnd~0pw&kX02TqmznUCLDQP7oq0;y8_wDjecHj6=dNbN6QPJJtZSMVhG@;q zapt)c%|^q=Cg6rFm3MvUt%yZP6efgHC44S#$!}%Ow@we9JAJ1XTStG)3?pSK(f`x+k5p1U|k`;Ot_R>7E$R~4!nPy+uN2iwMKGd2TUjU-vcDnhJ}5tGdj8tOD|;V1|w*=V5jjq(rg zsvNuHOkh2s1a?_ZV8dl#!LhK>Ve{cZaSAz`QYjINI*#}9BT%VxW7sGhSH5oK@#3Ip zA`+CTFl=y3&16_&h+c#)16E-G1BoGP(8->AR?vZtO8h`K0j|{(l%O literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu index 95d263d61404d146501f29c9fb7630989de0a289..3107bc36bc19887c93761ff52dc198d14e3a2a51 100644 GIT binary patch delta 55 zcmZ3<^Oa{q0^?+TMn$&V#LA>p-OT}vTUf+(6f*Nl6p9OSGE3sKp-=?M;AboP3|yT8AMy#7`qd(b&- zwA(PHk{aS@4&!Bf!I^iQSc?U5hW)|9b3CU;VkR`V$9fq-Y$}zZupmYN9;I%f%de4oJC#VXl7G5Sqhgq(Kjuxl9xV#!UTVhG)YGA^ z*GelZLayitp&KSCF{u(`CP~cHEeJ27RZqp=I&f^X9{8m%v^uj$on8+-j21dKqQw2$gz>^`}aiyliYAv{B>GO<3f|0Q*lBE8Tjhy&GzTqL74L w#aSu%S?ZjOTv$*M$O#)09uf$Jf{jANb8LCgrae4wy!L6-N;meX%-N*z7bcGxVgLXD literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu index c3a64d63c7b87f05c32256db47866cc6ccd062f6..ce92e1493134d6574866f856862e5a3c3b2ad203 100644 GIT binary patch delta 53 zcmZ3?^PXoz0^?*RHnEBKxY=?OE0a=nHwQ4TV-eR;$jmEIC@#p!EQ!yKFV4wKPA%57 J=93K delta 31 ncmaFQvzTW?0^{cCjPqESG_5DcF-mY3`3=7Ae;wXn3}^5e9v@-vMQ5)Uyt}&_$jk2)vIj@+ zl1>MvOfo|pqcC2?7o2-1h_$#N&hWpu@B-g4BQYMF=BzW(AC9o)g=-k`L@44)>x$;NAzCwY zkb5pgD@lB80sIEBNblN3(QmYHse(Ocn;KQ>{PGK|RQjw|^#IwTFhRmyYCg_?Rc)XiF1 zZAHkH9U*kZB_k$N#h6PHGj&U<%V_me`L_xj8*PTH+NbQE=d7kd3+}%C2vQV+iEaYn zszcS&N#VJ1@YGnZ#q;T<%UETI)iGl78A3yy#T6U{^`vV7RK8L1>3yAJbDSxxCX~V^ z>nW_c3@kVn);X*_+$&C@U{fk3LQ%x=UVQ;-X|4@xdE?sGjXa+1l}$u~autRRZlReB z8w}Bl(79j*ldiuH6!%lGVe7C1K}p@mTfjjC24YvhaS9dn5&3emEMl?-CU0W6bo($@ z=FV!8vt}7p>j+hPi1lYm8Co^A3)-l0=q9Z2LxBCKiluJ7)7}j=LQzPCUB+1|_(^I? pMlLL95y%M}6CP3sg@TPj#CO<=piTSkGp-OT}v8(GA46f*Nl6p9OSGE3sKmjvp7%Eg_ zPa87H30$QqP)@=*>v#s*+v#h*G8lzs>)uOm8Et7|NX{HH%^ff{l$p5}5uDgi%q|cg zs^*Rd$Rne>0CKVzvDP+>%S?F6plMCk&O9aUHD~Ry{&0dV&t1)k$3hWTSXVU94bhsJ zqs((DnvI5!O~4IVDDQgET@s6sC`<_5srfDWqs;ln(f&(EQzd*ZH%|YY8b)YesBAWz z63yzc3CnYYuEM==b&gLHI6gT`;Ot_V?4B6C6Ylh59d<3llzis6l5e9!((qfQywqH% ziDv^{ua#9+gj~@PLN{D8Vp5flxFj)Cr(nB?Ry`GetH80*ddRYU${u*eDjKxl?%R)` zMj@E!G$346sA@n7JU8|?jn!s6A7HYKRfJd_BPN?7G|*|>!eJ0lverQ98|I(hS2?!F znZRm732d{Tz?#dzf@5K$!{)<-;uLZ=p;96gbsX>IN1#&Y+OSqQu6&)y_=cAb_E_1g3b?cq>ZlDp0Ldxwj v&Qie}sVNz{u%LM$Cu~f3NFWpnHVP5XuoXd@_T72oweO-SUb^pO}V literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu index 526dfd3e6112e85bccd8ec8b7d1a8003f1c67f4d..28b32c915377c15b0e7d1ccae995b96e6d76c131 100644 GIT binary patch delta 55 zcmZ3+^ND9e0^?+TMn$&V#LA>p-OT}v8(GA46f*Nl6p9OSGE3sKasp}$6{@kP z4Vef6*J%nlBjKENECcQB^fg}@EQe<6!Aoyx-qOSnK^-;C9WXYOnYk9>jMz}jE)XB8 zW{w5O6U}!42(lQ_);5gEjQgBI)0(WE`JA-ZoVCaL{VBFgxSA1*xEx8cj8%kK9U~^2A~eux+`?fHP_ouQ=^KiV@2VWz z<4j;Rp#-*BPhibuV8PL_(P8u9QL!8Zo#av?6m=Z$zGSbwUN$;-y}K^rv+-GnWE3b6lFvDB@1+Pi^9C<;l~ wWt^pgH&W+h=)!`EKu*}0@Q^?#6l@eCo?**_Htpd>p-OT}vTUf+(6f*Nl6p9OSGE3sK)S%!aI+~S-5m*FkOa1Ni~=?V5;boP3|ySuxAy!uuldvLs; zbUHAlk{aS@0pmq{!MS&wSc?U5hW)|9b9_&Y#7t;z5A`B~*iatdk!H7eQB zhD-&4%PfO}k#NpBmV@?o_L{E>RzRzL=cTu_Xlr7KppKf>78o1K%w3ysMrtQ{uo;(T*HVbToG4VS2WKJ(VCjW z+%w5rN#bJ@a7`ATvLl49nWV&IN{pE#F;llByo^>qm4B~=Mk)Mj96UGH%dwu}t(unuOqa3B5UXRvKUFMs>z(#)s1b@n5_TD9 uso-a+3o>$HK}8@ZY)p7aAruNW3K8F9%Y!!U{#o+cr%@}N>`+;-N%9vWIT>L9 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu index 816cff198e51060eabbfef8554dd6f720f16e47c..225cdf3b09b4dda07f262e43c5a9790e0df3b427 100644 GIT binary patch delta 54 zcmZ3)^Nwdj0^?*xHnEBKxY=?OE0a=nHwQ4TWf9j=$jmEIC@#p!EQ!yKFV4wKPA%57 K=9>JURT%*NdJ?Dr delta 31 ncmaFIvxsLy0^{cCjB{C-G_5DcF-mY3(wvW4qXvHes+>RC@`u; zRhr!Eb06oPnpLT@c0P(FFL#3;N9EXLSB8VkX<-A zNZM_fFvTqKjKXLUUvOS{L7c;qc+3A_SqMC1R$(rUa7ShlL42aLWiTgJaWw%wh8mSL zjU^LF;4;gAauUut&vP(^o4)3&f>CI+?g~{nMq7qhBAI8lu>;11GIQS|f)f{t*#%OB zs=4O@^1$dW07(`j*4l<~m5YKhXgHg<^MaE0nzQ!UoStCErQa~(vCzbq))mcjON?Rm zC@;7YjU*{z6L3uyy6`>dE{R1*lqQ63_56nX*7oel(f%$+e@rbSv@di%8%~K~o0tic z5~0g*FI}DE4=J3Ur10^4ndpvK^B=G)6(&^Vo@@0sI)&r#Tcx}(Tb6oVx~^XbQ!IFD*sl2W1pKLtL`bg=Q*ocFoOH9--8l`V4{;ixav^# za8h_~>_0WuTk(8&=`vOsVs(tze2UP(WN`zBK|JYt{*`Yi-@mJKY)>6USp3VxEBlA#a# lDFZoSW5PoUp-^y9hp-OT}v8(7436f*Nl6p9OSGE3sK`aRm|rl+pp(w3!yE%@AUPb)wt&Xt}Z5di6_w&=v9HXD8_s7-&(I zCinW>$GPWbmyZDB{O9Lxp5tE^xA>;}rGJYtoWm!0e1!dHot;kf?(J8ma1p#9&S6Qs<$tgY0#BJ$m`fwvk(nnDA1iGc%!pN7jX{r~Mm2Vg zC1Xk8GS7i>63;o$3oyY=Uh-AJDC{)vf(nk&rXiL{=9%4j1IEQN3*RJy6Bmnl3nav< zx#tn`!01~5lFWy!xeep05P>q-akgj`fs)pWv)0I*o?yqNUo+y7(8QP4CCxKSjA8bu z2waJsMkAyq;F`>J@LlLEh($=0CWLPF?1uc-_H5&5@42J77D1|w(?2Gb5!w^Fm<}ey zuyxp&Nr}*9yqB)_@rMjfPcry;zQ}b)toaYvm5M_Oh38tmj?UmX{+1~(3>SJFSl_Id z%4#b@zHA7gYpxivxt2q&NXpbM$u6VSPvze-aO!hCWYs-o_q<>=3r2AN^?MYf5KOci z5U)B^J)aDo8he|@dNH2PFI&VaLoAOGTTBq@n>;PyAnGSuNucr#;xe+}SzPG2_;9Z|1t`^B5KneoHbthEE;VZyG&6&YWxK}f*0@r literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu index db1b87962dc781c3236eadd802f6d26657f59f83..e3ba197ff404d56cbf8547c35f3ea3f2affc05bb 100644 GIT binary patch delta 55 zcmZ3=^O0vm0^?+TMn$&V#LA>p-OT}v8(7436f*Nl6p9OSGE3sK`aaAM)D5V3mX)`TSo1uvf)`@Q8qvghK>(wv$L080&pPi)7!9a_u zG`ZL3KF&QidwdKS=RZF7@*MxXyu~-Y&x2cx;Q~Iu<0BkA>Fjo+cYl8edG)nIcH#7} z(Qd1@$10wwJ=XYH{$JH?JmzhT5IS@IB}*iA6}1CWLY=f>bx;m$v6yr~6NxzSZ-squ-~N5!x5Jm<^}I zuuaT_Nr}*>crRU@lXn@Moo4Xs#7L4Hj%eN>-A(-ei zAYOH-dOjKaZR~Fw>%~~lZ#j`{8LJGjI!0_UMQC91w1mT`pKL9G$~Tm6-_$vFrS$bx5aq2uDiz2+1on^2_?N-|FO>Kjmtb8T1)9M`^1;_>33Y$6d<=s0Zf z3&T|0VMtztE(McJ`Q8yw+{?hly~7>^C3PP!0s9FUiCqCl8C29KpnCRDZ6urB!45V5}a+ZsHO@MA$#7Sn4)-6a2s+l!PSRGR{)L sTd64-`naDmk`p&3K4cIJ1($?~XV{6TO?Pp-OT}vn_0wl6f*Nl6p9OSGE3sK`aaAM)D5V3mX)`TSo1uvf)`@Q8qvghK>(wv$L080&pPi%+IvCZW zDoyV7xsP+t%^n{E#`%wry*$T1FK_Wp@AKdmW4M41@c0M^&pNx^=-uDnLSB8XkX<-E zY_!`jVTxJe8HLdzz2G8vL7c;qc*}od83dj&t1y>FxMQ;eg~ zYVLW2JTSToK$69XwYFhg6(UduO=pXC5h!V|Icty2*(r8h`VAu<3r&1!UC}(Z#299e zi@=p=HX0!{0oP=qgYQ9iNi0I5G$E905u~~yzqCDna=O3c>F-m^2<;19%!X59*d}bk zq(tabyqB)d$-4~BPBVCavCMTxtoaYvm5Or;h38tmj?UmD{#Ge34HtS6*uZR-%4#b@ zzHA7gYpxivxt1fYNXpbH`7Wc?Pvze#aO!h2WYs-o_q<>=3r2AN)lw7@K(>u`DM#kWr)==Vv8w41Cysc97YLcYZX+!p?v$M&apks3|14$ zV4L*})?7vwJc~OWcOULGry$vcDuqx|a=KTafm)wy!&>9G_H`1E7YAh%iJ(HqVS`^7 zrs5hy@*;F8m}JWLj)3A`1}?51_8=&!`*;o5Pryj*3OLH3qCO#?PnIW4zJVz=F2L_=eBJURT%*NdJ?Dr delta 31 ncmaFIvxsLy0^{cCjB{C-G_5DcF-mY3X;7-v5}ce4!ty12zR-7o!HjNu$U!Q&(Bz3A+8f_Ha!19|nWLU!PI zKWVjK%p^0!F$%+Ze8IVQf>?_S;tcv#^@+sSLbDj0=E^Uh0e8EtA}NWmO4jV&-Xl$pCG5uDgi%r+1o zs^*Rd$OEI>017f6vgRg?%UpQMpkYnk$~`5m6=$uHJ~_sg7jDgnM?w)-T9-7>4AGjI z!`yQz8cE_~6L3xD%DXOf7Q`YXN)tl2YIZ|@D|7bbXm6XNKPH9|+7l|D4kkpib^(KsTk(8&=^|DcVtI_1e1cG4XK@1uK|JYd{*`Y~e0W#q*qmkx%L%2h$$APa zE&~gWg>4R75BG{wDA<@viBQyVyjLH9TA3@uO5M2jwIh$`du0=mpj?GvgPUt6!vaI} zB6J~G!KCXR0L9%DY*;$%Ku}Wm@fNTbfq~d1aF{|xeMG*TEQ^?I4U?~9xO4|FQ)X&4 z$=P}tRqF^p-OT}v8(7436f*Nl6p9OSGE3sK`aaAP5P)Y~e&}LesHbaOF)`@Q8qvghK>*1IDpey3X&rZ@OFwmkZ zP44x%k8{t>4j%)?`OnY2JjcJzuJKjx%itPgIE7E}@Bn*HI=kKI-QC?nUVN*NT{wEv zXt!a)6tl!L3Zr>?!A0p3&C;B$_GIH||EZ(77D1|w(?6z`5!x5JxE)T3 zVe7C7lM-Q)|A(-ei zAYOH-dOjIEHug7-^q)l2r6_OHu$+= zD()~OFG6R6Nv3>nA1Ll+;Nsq44}y}qkC%Y`1dPNkfujs6>J#$mWO>5mYnWmk!s3r65mJIKdM;hHh2^Kz#x=_B;6v;LctrU qDH-~>pE8mYHzq!05DNvDgotmk6H%M)%}L|A&!W+$vBMPQp-OT}v8(7436f*Nl6p9OSGE3sK`aaAM)D5ZmKXfrKRn<2yo>qNKl(Q;$A_3%r6&=v9HXD8`%FwmkZ zP44x%k8{t>4j%)?`HxS%JjXxJZ}4^R^WX+!ID?Py_y~JXI=kKI-QC?nUVg2RT{zxv zwA(OYido_rh0#2{;39ZIoWqiM%YS1T1ioWdVJ?kuhi0BYe4?~vFe6rRH32<_8r9e_ zmP{mpi#!L)Nj&E~FTeyhea=?}qtI;K1{EBmEki7k%ro131;)iP3*RDw6Bmnl4J5>> zx#tn`!02lLlFUb}wGHE{5P>piI$N}hKuLSWS$k~W9%IL)Uo+yd(8QP4CCxKSjA8b$ z2waI~qY+XQa7E@i_#Si@#3Ce06GFKbL8@!=OWV_p)BUGT-{{%K(eG2s2<;19+zqG1 zuyxFYNr})!yqB)d(Mbj;?=tvswn%hKtoaYvrHVreh38tmjLzUF{+21v4HtS6*ubop z%4#b@zHA7gE3O!^xt1fYNXpbH$u6VSPvze-aO!hCWYs-o_q<>=3r2ANnCRDZ6urB!3QV5}a+ZsHO@MA$#7Sm-u*6a2s+l!PSRBF;j= s8>uN7`naDmk`p&3K4cIJ1($?~@30e5n{NNK@!V(8Xw%qXit=&e4`w|V`Tzg` literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu index dc13d44b39ba61a86f53067c793c2656c5c1eb45..e5c3e2d2ea69dda538040c88a9f090678958a0e1 100644 GIT binary patch delta 55 zcmZ3%^Mz+a0^?+TMn$&V#LA>p-OT}vn_0wl6f*Nl6p9OSGE3sK`aaAM)D5ZmKXfrKRn<2yo>qNKl(Q;$A_3%r6&=v9HXD8`{4o0=8 zN|Sqi?&I8Zv%|-LasJ~|FVFGM^Ba8K`#iY87|!4$JU+tSv(9cedUtm>ke6R8WEYP2 z8|^ksm|~WAMqxBhFSrO^5a+NY-tymA27&LGRhUa7+@YB#5T7V*8O(@PTunfap++@! zj3pCE;3Cg~auUxu&kHcYO<(d=!6-Caw?PHRXv+{wB=gKRx4^hqX5m{zaN=Sy+dx9B zntL804~%XDkYqk$txXtLg$R^E)7he31WMW~&e~)1_82=Z{hASvg(kkVE@_@wVhpo~ zMc_&_8;y{ffGaZB!S|rMAQmA}nh?sh2vS{>U)r8NIo;p#^!KS{g!Y9l?uJuh*g9;& zq(tZ<-b+{K=p=)acNu&*TjaVW*8B(TQpGui!gH-&M`v&pf6J8Th6_CjY+%+)WwjL{ zUp9o$6<3VdT+0zxBxUN9e3#Mcr}A$ZIQ6+6vg)3)dtR`b1tYls@-6C72qroWh*uq| z9#IB=8~aa<^=_<2c%x_K{IW%?GQ{#2vBeajfyvVz4x@y!l?p1~P`-at=h&QP2FnR$ zu*rG`D=s4op2eMxyASu8Q;=*zl|m>fIo+$zK&{V}VWn|g`#Ooov%Rv3L{OpQu))s_ zQ*n(Uc@a7nOfuzr2S9Nz0~c2hdk~b=eY^(jCtxIY2^?imQJ;{{C(9EiU&9pZ7_R&Q z%(T68h7@eQjH-2nDm|q7bFD3{8rub9^(b}|_xK^g{!zt3x51m>2L_=eBrxeHo25jUA>aA23K delta 31 ncmaFQvzTW?0^{cCjPqESG_5DcF-mY3wA|Qjy*jq#2VD_Aes+>RC@>mT zX>za6eVlu4cK8S|&VGIAW*Po{d53SiU;B3$!v%bXrzhBZ)!FF;@9ypf^7?y)?7;DU z(rUq&NoI&+6o&Kof^+W#u@)D^8U7a+Uf?M+67xa}d#L9T#K%$@1~XzLmt#;Ps8Pv| zHe_57xXLo1oP=}M@f@_blec_TFba+4y_eoH+SJ65f;nayTVQM`Gj~lQII*FaZ6H2W z%^eSrM@F{+6l6YR%}p4Wx$u-h!&RQdVa*Qo6+?o-Ogd(oAE@_?_qBS#z zx#vnP7EWoCsaNiOo(Rdm@z8~ zgs#H9bhVF9&r&!!PT|wVBGWxF`d!@F#}tkNbOpPXVMIQ6T*-IQM*$pu%aoUz3pMtv zuh(m3wG|;(c7)IkmyDQ96+Hd0hbdIlN960tvWUsnF!?%$ zOLqV>Wu{h>oUNBpwT@7whgg56l%Z8)yP%C4hHk=^W?3MqEp-OT}v8(GA46f*Nl6p9OSGE3sKwA|Qjy*jq#2VD_Aes+>Rfk9NI z$-O@JaqhX<;UmB}`}L)pW%&2y9lq^;?cZSx7w{RLo?!1qXQvaqySrP+>+cn^1IPPG zs|8~wnIVo*7|!Dh&b<@FT3irk_+MOjfv3z!%nL2-p`J$&A4_Ez%!rX(jzNu}MkPDi zka0oaD$9U!63$u2bI{&SUh`GKC^VY)UV6)DQxii9=9p=`17ky(xoZ-^i4DcP2jWB3 z-0=W;Wb{3Ng3O1kxeeno7oIX`Sd+JMPf2UVS!<+Ejp8Qt_z(7u?UIMgwUOu-I71boNpZMy>v8F!WVMm^v{W5g!Y8Wr-KR6 zY#lacMS;*&xR@#$F#C&wv#x>zK;Cq}=CJNuZzQGl*s*D_4W=Z-7+Ci*CV!*7}L zQgflkp7r&5t*o{pL>_AXb_wg367lDD;C2*KRMSVoRoGgo& zYz>pIW4Lq&FjHn~HObj}8CB~DReFf^XG$4bHMR@dsA1?PZ1H1&{ilkBZoSjq^)*6K yNQGU*StxiTH6;TV7PJWDgpCOgDTG47Mj_%UwjyZL?w=>GeHXRT$qtj0kCK1%$rvjD literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu index 6878509e632750c7939a0b8eeb71dfa4cba0966b..7880c3cb34566e30bc90fc25b89235434f38b601 100644 GIT binary patch delta 55 zcmZ3+^ND9e0^?+TMn$&V#LA>p-OT}v8(GA46f*Nl6p9OSGE3sKqNKl(Q;$A_3GG`A9O|h_}NMN91Nl= zP44x%k8{t>9v=h7+0W0tEW^Jp@9=H!%is=UxPVXa^aKabI=kKA-QV9qUVp2QU1+~c zS}m9`$qaFf!e|j+aPFNT*5ZOV!~fvI3p`^+VqR!rkMtsf_(UqhU`~wWasp}$H7eQD zhD-_qS6K#>lW@*Do`d#w`jW2-MxoKX_tINNo0=F>Fvm>e4Hz5B%w3ZRPHZUVEf627 z=8gx*BcpEt6l5`C&0QFmx$u-h!6x#3Ce06GE91zL2-%w=(Bjr~A*HzEkt9qd%sG5!x3jpADx( zvrWu|6$L_9;a<8r?bEXqPTDDayjW(sCq}=DJNuBrae%I2*D_4W=Z-7+D*8Bp!*7-H zQgfjuo(=S7t*o{p?~= z$|fQ~xeCJux6n+6C5Grl=u)tPN!L3BihC*8uz1*kprr2OHDEsi1Fp-OT}vTUf+(6f*Nl6p9OSGE3sKp-=?M;AbauPJyT8AMy#7`pyU>1@ zv|2Esk{aS@0i#8H!MS&wSc?U5hW)|9b3CI)VkR`VM|u%KY$BDRFegSbIRQ0>8kOv6 zLnea2RhB`)NH}L5%RzfPea%+|E1=Q5_tINhG&M0qP)AK;2aFA6=B`ONBQ_MX3&e-2 znPUO+NQ+$nf-FX~xea47=f0rOuqJQiz96kNXRWb5X=BTT+c4rWSHzXp70q))w5H}L z_e}CelK9vJ+>nLxt_R&Eu?X?fgixlq7xI?;R_6TK>HdzVe@qP}MW3sDHk=YoH(?Vh z1VUHgUb;H%)3X##+9`azSSGtCM!$(W`;fwMfUaTJG7QS+jw$&j`Z$2YZ=Gp;6{ z4)kWNw6-GT%8n4aVUiM)DKTP_#7v!%@G@HcRQ|03$3{0pR_zz`f#tNOL38H5eh-4= zobpZr;i^N`GfLrKWB<9aUXJw)@6@~;V7iP|hFBdVCZ8fS&{VbD;zmO(+fM%6k(l^$aKnNp^x8ruhL)F^Zl7Wpy2{!_(Lx87;*1{$F# zBw?3vmI{8Bx*$Ur7E}as!p4M$6hfh3qY&{7TOPD&-<>C~eHyjW$sUyj8z+AOB5@gE literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu index 33ddd12c0f5b6bb268d35bfa0159219591b40b2a..7e4b278b4811f5a512309bae6c3bcaec0e90f2f4 100644 GIT binary patch delta 53 zcmZ3?^PXoz0^?*RHnEBKxY=?OE0a=nHwQ4TV-eR;$jmEIC@#p!EQ!yKFV4wKPA%57 J=93K delta 31 ncmaFQvzTW?0^{cCjPqESG_5DcF-mY3qNKl(Q;!q_3GG`?a&qR<7X#n8w!j@ zRhr!SxtHgj%O0Ns#>uZQgCxPfFK+S8;OpoXV>pM;@c0M^FFN~u@7>?uLSB8ZkbUUB zi#i?1m}G`HMq#oHFE}lnAlBlXIK%(qybyTKjKnIH2u`doW*11| ztLBb-$OEIh0CKXNu=X~L%TyGULCc!7lNOY8Hk@^)`nZcN&t1car$P}|TGuo$4AGjI zqqN{sw4$g8O~5r-s=^JRzakbPQJN6CRf`+)N13xHM~Ax{{W&v?(4kQ2d^{tXH8B~> zbA&Gay>#`uC#Nx-evIMM`6|;LG5SqhfE;(5uq)|DRHTk8`6l{94B@vL|MocJFQ!-h-Gf@6N2{o2F5;uLb0Q7I7$A`bWJ3s6gQW7xOS5A4g=5=y9Q2TsHhLfmy;y{lQb}C6T_uD zgoQG5t4Ye5WmK&rROunqpD1N$)!060qb9x^zrqh5_Ma+Nx-Fb8+(;u7gp}J=oRxx~ rq^4x-{DS759KSLCAqHP4*dRna$5wc4+IMHsYad0ebhO7L<p-OT}v8(GA46f*Nl6p9OSGE3sKqNKl(Q;$A_3GG`U+9YX@w1b(O$!X7 zDoyVE+{<&%Wsgq)(pC`<_5s>KcYqs-aX(ZO>^Qzd*Zw@&|@8Aj+psBAu( z5zXqbDa&(&F2lWWbvq}g37mdR;M4gk*&Q+ZRa}G|cj~Y!8K&ej$CZ2){UL$)Tc^Cx zT&SsMLtU?xRaS&t(Gfz|Try%(m5;e3F;ll-yNFgj6@Tl%vC(?SvVF?#dB!RlwBYXB zkDx{&nCLbjTve!QKnXlG4z`WeW;`8WvWiuNSRW%Mn;|sRY23n55KywwKn?J)s14Sx;cYWnjUvu+d@j;a+hHIh#@`5sEsF_wpl9sdHo4C>&S5ZshU&plBiz zl&LUma7)c(SYn7?gf0ZjnRNX_ptzra4U2~z2ui9xUIPvyFc7;2juR-UkI3hfr4f_X zFj*bLr8|U$GIOg*#_DC1ts_+GA=aNNWoX&hK4_!Hp_{P94*~X{DptDnPJ1`h2t^^~ xb`@u(;H}h@j9ggIJdhJMCOjk%3I!X5i09afpiTSktnt!!Q7hfpW0LYo;~xNX7%l(+ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu index b4529df8f5280a442f3a8dd77202165e45d7f4ba..0227b52148b0c738c5ef2bdc979bf8ae595c2017 100644 GIT binary patch delta 55 zcmZ3+^ND9e0^?+TMn$&V#LA>p-OT}v8(GA46f*Nl6p9OSGE3sKU zyZ6qyk8{u6JvIT1)1ROFX^MYc-r?K+m*E}8Z~>p-=?M;=b@qC}yT8AKy#7`qd(b&- zwA(PHk{aS@4&!Bf!I^iQSc?U5hW)|9b3CU;VkR`V$9fq-Y$}zZupmYjMz}jTOdAE z%^VAmN1DF{Ajon|Te~nOGwyQ=O>44t=5x~CaMqsalMc2_xSA19xFW8wu4!HvqBS+g znP-wW8x0?ufE%(@-u0okA{HTDm=H=8_d?#1-^!eCogO@Q`c5sjj{cY#O7a0$*?cr3 zn$|H>Dg;7T;a<49ozt@f&ORjY@nV(fo*4ZqE<#Q^b=b8GQ}UT(O1_GIpFsSrQ(kJu z)zs6WuGdN{D?+a52%#G$DKV)MVvi-Kc^2YqZJLBGxzm- zP$TD*cN-9{DpWO~1pYM+wvE+hYzDX*NwSJngjgRVCYvEN)M?zpQ4mnF(Lm`NiPLvg zj@@x4u%1u?yR0X$;WDt`XxQkm`S74v4uVc|DG`b~j`#8-P^oic*eD!VzHa34;-F|E z5|pVhY;a3WWmsZ}UW6_=6;!(Z5#+d^fDMa>9q^n~eY^%7L|`Cx4IC#>QXi4eCrcwH ztzoh{hDmn>3uWe3lZ@8OC|gIU)I+R4Rm$XLWBZ_u8i#Je7C#2qf2vsN);sOpP$LwD yBp-OT}vTUf+(6f*Nl6p9OSGE3sKac4W2IVuym3$NZK7sgKr@Yi$sHtZ| zU9Xi@R)k#95kfayGGbDdkGUitn=ZGlYgZjf*%68cH@YD1D>+^j(!> zdz=ZZCzQZ8>j`YQ3@kVnmO3myJSa{fXHzOALQ%@`UVaBEfo=>Nk>kqOjXYi)6iq~e zG8KjmZmF3JdkoQw(4}BGldgXR6!#ObVfU~DK}preTfjjC24dI1aRMdv5&3emG-A>k zCaYt(bVsmIW^OgfSiOw0b%aVi#QIaE3@sbm2W`|ibQ2c&F~I&)#Y(r{Y43&_p(v!> xuHvi|{46ykBNrAl59EZ62@eT`LcvBM;yJb=Xwx2^H(vWRYNZ={Oj15+`~@xy8Dsze literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu index 7539d009018dacadd71a1b384359003c356f1ab2..ccb3a6fb2b97063ec110cdbf3f35c0befcfe8ae3 100644 GIT binary patch delta 53 zcmZ3?^PXoz0^?*RHnEBKxY=?OE0a=nHwQ4TV-eR;$jmEIC@#p!EQ!yKFV4wKPA%57 J=93K delta 31 ncmaFQvzTW?0^{cCjPqESG_5DcF-mY3qNKl(Q;!q_3GG`?a&qR<7X#n8w!j@ zRhr!SxtHgj%O0Ns#>uZQgCxPfFK+S8;OpoXV>pM;@c0M^FFN~u@7>?uLSB8ZkbUUB zi#i?1m}G`HMq#oHFE}lnAlBlXIK%(qybyTKjKnIH2u`doW*11| ztLBb-$OEIh0CKXNu=X~L%TyGULCc!7lNOY8Hk@^)`lO33&t1car$P}|TGuo$4AGjI zqqN{sw4$g8O~5r-s=^JRzakbPQJN6CRf`+)N13xHM~Ax{{W&v?(4kQ2d^{tXH8B~> zbA&Gay>#_Xx-lGgPh&VeUuC)@M(>3C^q~p6l72)*>bR0`qK|!2|68ZL&|IjjU?bhE zmDN^+T-gyq*IY7U5|vN5Bq39;q`Hh&Kb3#$z@gD*$f|wH?s>{;8noc<+Yc{AA(-ez z;IBGVJ)Iby8;4Jg^;$fiUc8D`hFBjXCY>QP(n(msu~$#L5kTb|=O5qKIkv|c!+Jt7 zY_lH2hD*@H4)^K{P)l=T*vK2#zFy$*{Ge zq~Bl&UW6_L%b9e8L!fvNgY{d79SBP5KHdTj1JDz@22NtAs1L}OlO+L@G%#rs!=*cf zg)(!iNy?gKRIMXa=^@mgC}n8X*gj~ZCcYcL!VezypDI?mEu1dgNFx-4l-pIDm4ct7 qrey5=g65tazcKzH245)HAVfUJR(NgNcW2RSA4RQnw8tdn)94?;vKH|G literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu index 420a2271721c1c309add339748681c80aed6cd2c..296e163de22dd04e5c23ad2bf79cab37d616e187 100644 GIT binary patch delta 55 zcmZ3+^ND9e0^?+TMn$&V#LA>p-OT}v8(GA46f*Nl6p9OSGE3sKmjvp7%Eg_ zPa87H30$QqP)@=*>v#s*+v#h*G8lzs>)uOm8Et7|NX{HH%^ff{l$p5}5uDgi%q|cg zs^*Rd$Rne>0CKVzvDP+>%S?F6plMCk&O9aUHD~RyK0U#f=dNbNW1)yEtSg%5hG@;q zQRcZ6%|^q=Cg6rFly^PoE{R1*6efi3)cltGQRaN(X#b_7sS-Yy8>fFx4I{KKR5lw< ziDq@!gylIxSK(f`I;STI9G{#eaCWgwc2A7n3HRwk9d<3llzis6l5e7qL(=eDrM%Q! zsEKC-U9Xi@R)k#95kfayGGbDdkGLc;Q>S3Nh*munf2+W;(R#?Teaaqq#wr@L;O^Ux zphh8>=rkZ)Rj6t}2|PFUH;vV1JRe}Pj8%kK9U~^2A~eux+`?fHP_ouQ=^N%B-&Z-d z$CLIgW?o&Hlb1?6m=Z$n>3WAiaW4TI77sfRlvI7Z1?)#)Aa(^DB~VfykuN7pBPOk3 zvO0!KcL;N3W>%Ao)ypVbN2t_8tUp!C(6X_8&_<0yH(`q(1MELlEOqOh_HLjNibBfm wGR{)L8>uN7y0D;mASY~0ct{`=3N{K6&#)CioA%v#p-OT}v8(GA46f*Nl6p9OSGE3sKwA|Qjy*jq#7rG*T{Olxc(+)mB)e zrwy6p1g_H*C@0~Zbvy&@?esNY8H_@+_28wqjJ7l}BxjD9<_;Je%FJAg2u^G$W*3ML zRddG!x8>jm(oxWG|jiW!Nh7sBqDw_?b zM6)_(!txxU>u@hzowL&fPEOAgIKNzGdLTycg!}lu4!ez(#)pb?5f y%Iz}FQo$RkDH*!3pm`uCY)p7aAQTEV3K7q+6+xT!@S^eBcTp?d*kh9NapN!RsTd~! literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu index 4ff6119f137894c02fc7f78be0d12519faae7596..9bb8ea8a2e35dceaa177de902b3c844ae34f2814 100644 GIT binary patch delta 55 zcmZ3<^Oa{q0^?+TMn$&V#LA>p-OT}vTUf+(6f*Nl6p9OSGE3sKmbj}huyI3W=Cr0mt`}n>NyOv>4K66~jH_;~{Y51*EUTQAX)U%P@&J!IKFWe+@K6%AT&_w{=aq!3JW z8xXE4R5han{xuGs8>{8m%Reu73m+_Y<&T_pk#&N!7<&z(E8CV%NZN0wwhk`Es%}V$vEW zt7EuyN3c+4ZZ*kRy^OMTgi1Zc`ctJ0EgRbhZPYk)6BhX~!2VOkO1Iu=?}i$oD5Tu3 w;;a<>EHxz~7Zx-Rt<8 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu index 52d10eb7ccf6da3ed2b7e9bdbc98edf0808ee495..18047dc247afd53e9bf23c57273dd9ddc77eb3a3 100644 GIT binary patch delta 53 zcmZ3?^PXoz0^?*RHnEBKxY=?OE0a=nHwQ4TV-eR;$jmEIC@#p!EQ!yKFV4wKPA%57 J=93K delta 31 ncmaFQvzTW?0^{cCjPqESG_5DcF-mY3I12u`doW*12A ztLBb-$Rne>05Y;1v(`3@%T(l)LBpD~o#vFZH=MO6`lN#`&s@!jCqfZdSl2W!4AGjI zqcrDIG@>XEO~4IVs@(OVw;~oHQJ4_AQ;S>jN15|yM+dtc{W&v?(1B3td^97P)iG0+ zWe8pQd*SMybYke7p2hI#VwLHh82v8p*h9`f)?wGuk0?(aSMq&y49EVrPI;-hP}7_Z zb-h+rSrKwYM+n_;$%si*Hs+FqOx=R&B3kuS{H+6rM(ZKV_9=VdDXVDEg1c`&ycC6C zq8ov~s!-K*Vt8pBJU3Qr@p5|cDpnCD2O=R%P&AB&5dCrZ(RAhfyayeqKQCIs{F9Q zEj5#VgCTekx)dy9()AC4;(iR)Zyk0ZD5?5*4>$-wPwW~vj-jMJAYV_G1WZ!Hq;(9J z?hqEr%&jIVtCvx>j!>zGP=BJ7p=D!xpp6>)Zu|;Adf0!eSm`!*I(I{jP!LjPS8-Mf sewLb&k@E|hd2;;5_=gyLpwP_E|qqjbaTIs0CB;}LnAJ8ur^#A|> literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu index 1067544a79082c1cad31cc721aa71fdb2af57bb6..86899145251d59b31bd0027a459b066ac3986d4a 100644 GIT binary patch delta 55 zcmZ3+^ND9e0^?+TMn$&V#LA>p-OT}v8(GA46f*Nl6p9OSGE3sK#Qoc{XKOH=&&@($njz7Fm%h70%%PfxJ-qO;o#-re0T}t6z*U+8Fj>YbLadGvlT8sC=rnHOFbF7FYoPQE^V1Ji zj_q+Ku$oW;+pH(B<}$G0SlH;W`S74Pg`7>Oln6x~$9wq^sMNVOtQC$cUnlZ-u~#$^ z3CdI$Hn@dmGAuDfFG81sQ5{PZdkudZ)b`XoRAW ya=VPPRPaV>N`@{hXdcK38xtN92!(=;Lc}v{MbM_*KX1JDUDQf9c9^7m-1rCdjup-OT}v8(GA46f*Nl6p9OSGE3sKFOjJ7l}BxjD9<{L0Jl$p5}5uDgi%v&Hn zRLva^kVi(}0?5g7%v!rJE;HdNgQhiEJM)yZH=MO6`lN#`&t1)kCqfZdSl2W!4AGjI zqs((DnvI5!O~4IVD)0KxTM>(pC`<^YO88vflHbalZ=D`Iclu5(wvPUo8Aj+psBAu( z5zXqDDa&(&uEM==bx%48bWYC__;|6(bWe^1P_ofL=^N#z z@2ecU<4j;Zp#*kWPhi7kV8OAl(P8u9L2(K>n^GweiaL(>@*_~Gb7R;j99O<>SdIzBUI`k)}Jb6XxZ34XrsoVo3O=?0rsCNR=V|0dpFbw zMIq&O6=$X3t<;o^Tv*UNkP|j0JR}eb1sjEk=h%v%P5bV=@zQruE8W;*lJZI8FZw1J AD*ylh literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu index 965a98e2e02feff28ad0519a3ca98e2b1b8e1aac..ecd87aa35bc9b3b21bb70450f4480ea16b1f39a2 100644 GIT binary patch delta 55 zcmZ3<^Oa{q0^?+TMn$&V#LA>p-OT}vTUf+(6f*Nl6p9OSGE3sKp-=?M;AboP3|yT8AMy#7`qd(e5; zXt!ZXB{js+9LCG|f-~@{B*EQe<6-b-(3-qOSnK^-;C9WXYOnYk9>jMz}jE)XB8 zW{w5OBh7aK2(ld0);5gEjQgBI)0(WE`JA*joV6$Vq=PLJu4cp&u81qFYnm5^Xid#g z=9%QpM#IM@;D#)fcYWxsh((AOCWKPOy^y!$w=(C?P7iiG{bOb*$p>6z^U;iGT8B-k z5C~m`d*SMybQ0*Co+a?{VwLQk82u*hI6%%m)M3{$49aJYDfu=!f#dL7r@YjRtEs0$ zU9Xi^R)k#95kfaiQesji#!Ql!sap_UM5~^Pzjff)Xgy@veoh})Mk^XLXYT9wAV|(B z?=~P@Rj6u43H)muJU3R$v6tn=ZGlYgZjf*%68cH@YD19Sw`o7As zJNk>kqOjXYi)6iq~e zG8KjmZmFpZdkoQw&?TpWO4mPx9QPBjVfU~Do|CGNw}68P48*R1;{;0TBl6{BX~d*8 zOjgG*=?-C`%-m{{(RvwW>j;&4i1nvRnY?UlAGA^9&`nt6#{m0J6)W9(r@b3$grbmy xUBy``_*v?lj9gey5y%M}6CM%>g@TPj#B*$U(58KN-gxcPsFiN)QJJ$z<1ZpM8DRhb literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu index 11829eaf92b48d901a634120f8f1a36d3aa59f1e..ebb5a2f3abc168d6e1bf6e3f1d5938f8fd5a03aa 100644 GIT binary patch delta 54 zcmZ3)^Nwdj0^?*xHnEBKxY=?OE0a=nHwQ4TWf9j=$jmEIC@#p!EQ!yKFV4wKPA%57 K=9>JURT%*NdJ?Dr delta 31 ncmaFIvxsLy0^{cCjB{C-G_5DcF-mY3wuHbaOF)`@Q8qvghK>eaC=Kj@11@w1ckL4i@N zYLoO{pZhrX-0blQV4VH@+|M%n>*5yQ^uG*mF@|&a1doq!@S?NV3*PpNec1z@qC%+ju`zPuqzoR`A);v|WRp8j?X2`mG%I4m%K(G=01U97JFsb_E=#P*Wd~FDJ_)CfmT|n;0(L5zLjDSxs`b zSw`JDLbV=Z{h3mR){X6hHfkKY2^;(nVE?FMsax-~cSDU(6jEuIah3{xlA4l{3;S6H ja>B-hhZI7gV51Q63|kSjX%Ekm*WQU*>12;d$|uQR3fmU8 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu index f302afbffd4578dcd7fb88dce7b3bf04a7bb37b2..f330b6d73cc9eb10cb2d133068565f883618118e 100644 GIT binary patch delta 55 zcmZ3=^O0vm0^?+TMn$&V#LA>p-OT}v8(7436f*Nl6p9OSGE3sKKa zmwV^j$GPY39v=h7`OnY2JjcH-Zt+d;%itDcIEPR0_y`BjI=kKI-QV9qUVW>OT{t;x zwA(OYido_rh0!AY!A0piI$N}hKuLSeS$k|wPq5?CZy51dXyQxjisrc`#xQ$a z1g=E0(FmytxF!o7d=I)yVi6Lh387m(zahW1J=;3kf9`0mMUZOi^pB}!g!Y9lX2U5l zY!fzNQX+I2ucfPVdXhocY?jJu zD?+|(2%&4P7_qsQBd$ow)G5g>qt#F4Z5256xf!zRp0ayhu$l!Uxc~Y+icttAIt_@Q zI#fNM44xYM+s1k^p3W~@#wtUsjuBf-5gM30E#WZgCtFLP@(tyOcXf{4X=bpRPzJlK zXRzinvfx=<=(zZBuQ>(DCR8bel8n=}dI!|vTpQK`$F;AMcsxHSn@9u|Iu0BB!Y~zg z7?KyE3&A8)zIOx^_cCyC@303!N!`b5z;M1& literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu index 04753040fd76d14aafcc55b97221e22ec2b070ba..db6e22b935df5481a92cf228bd0af5c2ee891582 100644 GIT binary patch delta 55 zcmZ3=^O0vm0^?+TMn$&V#LA>p-OT}v8(7436f*Nl6p9OSGE3sK(wv$L080&pPi)7!9c6p zB)!+?KF&Qi`+N)-7e7AsiUR+Fjo+_uya$`RQwo?850$ z)^5XuDQ1ah6h@2m2baMM;vAO5TmBo%An=S?g}F4sotQ-e@rlxw!JJsd)dchy8dSD# zESX3GS49DolX%W~UV;g3`kb#0Mq#gY7gTVJwhXaEGSBSZD=;pWS^5?coVZxbYak(3 z%{`Bh2S#55kYq7ptz8&br3jS4p0j1U43xCjoVCa1>=ZjL{e}^bg(kkTu4tZHVhpn< zW#CHeWm!l~z%^Ou;Cs+r5{rOr47CDq8bY-By89pPM1;?kT(HC2LqPg8MJuq8No>qLV@V zG@+XLw@fhp1w4x@hZwFGM4P`-Q9Yu)!}3 zQ*nnOc@er4Ofuzr$3Sr}2N(Aadk~Z~eY^zhCtxIY1svs2Q=gDeCo2-B*ua#V7_R&= z%(b04LrS(;M%_9>wH{Lah1QnVjU9lodK9~fOZ*UF|EOZA+u%*`1A|Z!l61>BO9gMG qrex^je#%Hr+?e=~Lo5_r5+a^qC!#jp(RudVYtd+v?K4IBIQs)LoEPi> literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu index 11733646250e8918d70427f8f7830370ac21b0aa..161e1337315dd4ac59f5b824d5e3d6e3dc370a54 100644 GIT binary patch delta 55 zcmZ3%^Mz+a0^?+TMn$&V#LA>p-OT}vn_0wl6f*Nl6p9OSGE3sKwuHbZC|tP|bFN6U@f)~jFggRY1lKRZbubTH7W zHc9XGxsP+t%^sfs#>J11{i47>FK+O4|MTz$V>pKo@bCZ!Pda07;f(*4~A2Rf<3vw45zFWuT<9;jA+;rzhBP>9>q{A~f-pbxrfa5@VP> zE(2Ghm1Q9{0as+HgYQFcMJz(1G9grG5u~~%zqCDjbb9d0)8A*75jqgMoR4P2ux;3s zNr}*9yjHI6=}8Xh_x^m9>y}vaAFxj<&MB0hYxN@fZ4T+XPI+Ot(9^(%X1i3@SP}A7 zLkL}Q#fUAm9CJldrf$V|6|H%yZtK9Q&+U+P_mti7k~J(C!TpzSQIA3}(aj)!no!M% za`@Xgcx-HTV>7}Vy{P7wuVPgp*2jn~X9x{Vk@j#DC6sSeQ2R#m-J2%I?lf~)Pbi07 z)^pf!8Cmcw?sVLJxYL}1WK*gXLP^Q#TE7MweQpdJjpN4GO+20-R81s;N*#v{ercGB zYYfSY(1l=JURT%*NdJ?Dr delta 31 ncmaFIvxsLy0^{cCjB{C-G_5DcF-mY3fSZH5pVtP|bFN6U@f)~jP%e$W;1<7X%7g94*k zRHeziKKF6%xoPq-V4VH@+{-fj>-+{^_r45nForYu1P>3e|E#my4c@)IE#&353fYC@ zgQVSt36sna$0&>z@dfAJ31TfSh%@{TF1)}qW+di?7WPOlB8X3O!#!#b@ zrZ!|!5V*)Ppqzwr*6|#)x6_w=RWJ&T)~%P`GTPF_kb*g88arTYC^L60A~><3m|Y+~ zRLva^kb6dV0Tg60Vy$f$m$~qiLBpE7oqI~!YtGtZeR7N~FWiO^kA)(xw618L8=^Hc zN4e)xG?K)}Cg6%Jly^PoE{R1*lqQ63)cl(KR_654(f%$+e@qP{v@cXX8%~L4o0thJ z3WP4gy>xX>j#K#XE`^V0%S5-t=>LFS$}l0HJFeub=(j1v-zw#~=0Z(88|ckaS#3qg zl?@?u#U&#qQ^kl&5;Jv5rpsvcQ~9?F9Q)i1S#?j@9nV?Kf)?C;{T`Gk1QVSE!c~W= zhm*onWB;+S-ioKgOP8_A5UXRvGZ+b^4l1m!9W8{9%O85S6# z7ol^(3MO6e5Gd}YV8ha32ZEBikJo_x2n@uofTI*D>Lc>`WLd;y8<>0(!=*cfxiT}W zNzOLQs9Hy;(nG92Q_9e)u|3d6jY2nJgYN_EA5|=M>z(#)pb?5fD(o`OQo)Z>Q!;d6 mKZ`(4*qHE;LMRk$6e6BsD}pxd!D;f+H&H8{G?}D)ocskY{uaOh literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu index 7aac31e0cec013b05a0965d7e5791ba29f5df4d4..cf467004508bd41551eadc14db74497fd20d4eff 100644 GIT binary patch delta 55 zcmZ3=^O0vm0^?+TMn$&V#LA>p-OT}v8(7436f*Nl6p9OSGE3sK`aaAM)D5V>0(`H(vHbaOF)`@Q8qvghK>*1IDpey3X&rZ@OFwmkZ zP44x%k8{t>4j%)?`OnY2JjcJzuJKjx%itPgIE7E}@Bn*HI=kKI-QC?nUVN*NT{zlr zwA(OYido_rh0#2{;39ZIoWqiM%l}{*1iocfVJ?kuhi0BYe4?~vFe6rRH32<_8r9e_ zmP{mp^E?O2Nj&E~FTeyhea=?}qtI;K1Qi^kEki7k%ro131;)iP3*RDw6Bmnl4J5>> zx#tmb&**CalFUb}wGHE{5P>piI$N}hKuLSWS$k}bkFev?uNm=JXyQxjlIEEu#xQ$W z1g=E0(FmytxFmBOd=I({Vi6Lh388B}yCT1}J=r+gf9hziMUZOa^pB}!g!Y9lZiiE1 z*g9;&q(taE-b+{K_$Y%9?=tv!y2y1ytoaYvg^EK8h38tmh<=+v`YltQ87}lBuz^`G zmDN^+eAy5}ms~Mob1g?)k(8-Zl3hltpUS^w;MC`O$f|qF?s&m!7L4Hj>-Q)|A(-ei zAYOH-dOjIEHug7-^q)l2r6_OHu$+= zD()~OFG6R6Nv3@704VNd;Nsq44}y}qkC%Y`1dPNkfujs6>J#$mWO>5mYnWmk!<9dP znYOpikbs3r65mJIKdM;hHh2^Kz#x=_B;6v;LctrU qDH-~>pE8mYHzq!05DNvDgotmk6H%LP|D^HUXVGZW*kOwDapNyiCl~er literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu index 19ff9eb4ba5d68db8ade04f1d6e97a17ef3acb87..9fea507421b3a7c564298daa21c6271d8bc15882 100644 GIT binary patch delta 55 zcmZ3=^O0vm0^?+TMn$&V#LA>p-OT}v8(7436f*Nl6p9OSGE3sK`3DRcjKYWQ!0jG!^Ob(Q?*K;>MSJkOuMNYwvE(LP1?c zo81||Io`b4Jw67E^Y0&fd5(Wv+~S+wr@<}8a1I~f;Qp3#>8Bw37DYZt~KiOIWm2W8DzN&NVPBVkm zgfiG=J%crukp<7p-OT}vn_0wl6f*Nl6p9OSGE3sK`3DRcjKYWQ!0jG!^Ob(Q?*K;>MSJkOuMNYwvCjQYcYZ z(PnqXZ;m%_c8`w%P81}0B?IE)g?)+(rcL;3bqonw2N8LTFh z!8Yp|thtOVcoug$?mpaUPC>E>RSKb`Z_Jdc_wf?2pMa6r6>yY6MSVg(oh(n7d;?Q#Vz}~0 zFxPhG3@O-V8CB~DReDJE=UQ7@HMS4N>QU?_?(uzu{iBMdZi6?$4-7&{NYX9iEEW7H rH6=qI_ftl4;>N^>3}T_+k`VC>I}x?%4$m6TeHo25jXkC)A23K delta 31 ncmaFQvzTW?0^{cCjPqESG_5DcF-mY3(#L>Kj@11@w1ckL4nbz zN|Sqi?&I8Zv&YAParWy=FU#=ni(7ot`#QMA7|!7{JU+s~i_UI0c=z|WkXPR;WEa}+ zl2!{QOfo|pqcB>;7o2-1h_$#N&hWpu@B+`6k(d`+*dx7&AU=`GFqjh~xtxF+Lyb!I zv>}s%z-5*JCHR@4WPu(WWMb6wEQx*a2fhnYn8c!HEsU>;mzj zYVLS|JTSTopdgD8Yi`51%!Q{68rI~k+*8t8bJiN`lQy=za2rNE7K*six}te*h}O&; z<(^B?ND?2LfNQc)-u0lnBo-l2nh?5G^BeL&XzEJsWI3=2GVkWF8 z5V{Qa($)F&A%)}iX$q(3%S?B~=r?gE?Gz$*4ZD(IL_T+1$v4r*A!+!nQeJ2-)Woxa z-mI0?R)k#H5kl8oGGa1SjJPB*Q>Uc5j8;FDf2+W;(an%m`;^`DoYgdF!QHnXL5e~! z(Mcd&b*OqeDLgm!pBn46cs{*!8LJGjIz~)BMQEV2xPrr=o^&mM$~P=NzOQp^k28hU zgi_dMJ%u%wfd$9HI)}A~d&MafY(k|(D2h1Vt1mz;&9z}IZ(RF2k;n6cvWZAguEMav zEi{v1gCTkmx)7{j()A93;$8|iY#nwWD5?8+3)qjqKp-OT}v8(GA46f*Nl6p9OSGE3sK(#L>Kj@11@w1ck2@Ikt zP44x%k8{t>9v=h7*{?6XEW^JqZt+d;>);k+IET;h_y`BjI=kKA-QV9qUVX2SU1+~c zS}m9`$qaFf!e|j+aPFNT*5ZOV!~f#K3p`^+VqR!rkMtsf_(UqhU`~wWasp}$H7eQD zhD-_qmstjslW@*Do`d#w`jW2-MxoKX^U_;Jo0=F>Fvm>e4Hz5B%w3ZRPHZUVEf627 z=8gx*1EX&N6l5`C&0QFmx$u-h!6x#3Ce06GFFYenb8!bGCJ~|J>0`317&q(?6$%5!x3jpADx( zvrX896$L_<;a<8rpFX5;+&)d=^n97@ju`za?xdYU#I9jiGEB+mjw|^p`Zy#Fzg5Z$ z&4rqHHqe{3vf7G}D?38ynoCAZriu}lBxdTAY?smMr}A$VI5xT&vTC2Qd!DnJ1}(Vz z_9LiK2qroSgsTo!4=9DF#{Rak-i)UMOqa3B5UXRvGZKPa1s z1m!9W8{9%O8I~BL7oiKm3MO6e5Gd}YV8h~J2ZEBikJo_x2n@uofTI*D>Lc>`WLd;y z8<>0(!=*cfxiT}WNzOLQs9Hy;(nG92Q_9e)v3<}+jY2nJiys2)KUFMs>z(#)pb?5f yD(o`OQo&oPDH*!3phX}jY)p7aAruNW3K7q+6+xT!-C6R|cTp>y>@i9CIQa+o#TYCA literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn.cu index bffbaca96f8af413a6c13ccd911af7287fcfe1fa..919aad63bf936ea876d2811240d66030b16fee49 100644 GIT binary patch delta 55 zcmZ3+^ND9e0^?+TMn$&V#LA>p-OT}v8(GA46f*Nl6p9OSGE3sKZMvM4@bMH8@77OAG`-6q&ct(xHOlWS8^&*1UL@GmJPK;!70%{C3D%sP9 zOay_eEQ5lPaLziGgZ6g%lCKI@K%;r@rMI+bYGR0>j+(|BFgBE#yC&g`*ig({AU;&h z91D;~TD%1y$YMmByD%nm?h6VHYw}j^3({J1)*9>6HnvQ-4I>_NMOj z&m?aoiH}Xd4OuAfdeB`Gix4kO2xW?UA#cfVWzM%w_n$j`r{-Hne@qP}MW3sDHk=Yo zH!%|`1VUHgUb;FT-=}cWK1<>3Vwvfl82u{lw4Fl4u3^_QOv&etDfue;BqR;LRmw}v zxSDu6(3`c=+KP}XJ3{D&NlHwn#E3}}Gj&R~%V_me`L_xj8{G_9wO`N&meZOB&6)f9 zJ*bg$$~y^!s}5BUD20EG{cU5t8S4QqN0KgMl_6Hgh{>l24RjW_a2N!Xt~F5khT_A! zI>+ufQ&>$Xg@oC+#k?+6OqOTmW4!wz^s>ONir_9HM5y8@0 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt.cu index 001247cae57870c6d8fbd5b235d0011902b6c71c..f4928b1e3674bda3f884c208e1fdd2db7bac51a0 100644 GIT binary patch delta 55 zcmZ3<^Oa{q0^?+TMn$&V#LA>p-OT}vTUf+(6f*Nl6p9OSGE3sKp-=?M;AbauPJyT8AMy#7`pyU;#N zS}mARNeywdfYBnp;M_Y-ti^&j!~S65Ii67?F%z2GW4(wVHj&Cum=hzJoPZibjY{^k zArnF1D$Af?B%HI3<)FQtzUHff70_tjd+9ALnwl6QsH3K_1IC6jbJrxC5gUrx1>!^1 z%&`D@q{S`(K^7z0+=elkb6-$sSd+JMUy#vLFI}CF?^8HwpQUhiu}pSPjD8b$+D;*2*RX3D2IX_dlzbC?5|W1BD&?hSTunS3 z=*?PbZAHkH9U*kXBqb(OV#FkgnK~umWwiRK{96T%jc$gl+Art>%V|x6=FENl9t6ob z<(&k=Rfno)l)}Hp{&Qo!9P1h0sd+iTbQ!A*u{uUfK1FDtv$%-EprLdvgUUA)AKukD zw#S*mYC3K delta 31 ncmaFQvzTW?0^{cCjPqESG_5DcF-mY3(#L>Kj@11@w1ckL4nbz zN|Sqi?&I8Zv&YAParWy=FU#=ni(7ot`#QMA7|!7{JU+s~i_UI0c=z|WkXPR;WEa}+ zl2!{QOfo|pqcB>;7o2-1h_$#N&hWpu@B+`6k(d`+*dx7&AU=`GFqjh~xtxF+Lyb!I zv>}s%z-5*JCHR@4WPu(WWMb6wEQx*a2fhnYn8c!HEsU>;mzj zYVLS|JTSTopdgD8Yi`51%!Q{68rI~k+*8t8bJiN`<2JUua2rNE7K*six}te*h}O&; z<(^B?ND?2LfNQc)-u0lnBo-l2nh?5G^BeL&XzEJsWI3=2GVkWF8 z5V{Qa($zWrm_qyHG=)#+%S?B~=r?i4?Gz$*4ZD(IL_T+1$v4p-Qi#7*$_vefns_$Q zo3*mqijXTiLg<=HMogxP5tk%p>XcNM(dwu2ZxuK;x*4)+pR#+Nvzi7ixcl}aNKptT zIthfU4pmPlh3CfpQ)9gr&!?9zW0fIR$B4G`#Q(=I8#_n zD1~j-Q&@8uSa2+?b69)0SDZq@CR9pp-OT}v8(GA46f*Nl6p9OSGE3sK(#L>Kj@11@w1ck2@Ikt zP44x%k8{t>9v=h7*{?6XEW^JqZt+d;>);k+IET;h_y`BjI=kKA-QV9qUVX2SU1+~c zS}m9`$qaFf!e|j+aPFNT*5ZOV!~f#K3p`^+VqR!rkMtsf_(UqhU`~wWasp}$H7eQD zhD-_qmstjslW@*Do`d#w`jW2-MxoKX^U_;Jo0=F>Fvm>e4Hz5B%w3ZRPHZUVEf627 z=8gx*1EX&N6l5`C&0QFmx$u-h!6x#3Ce06GFFYenb8!bGCJ~|J>0`317&q(?6$%5!x3jpADx( zvrX896$L_<;a<8rryo;jpPZ)f>3o^&ju`za?zo*o#I9jiGEB+mjw|^p`a=rww@P`T zxlj|&270qrR$CErWk(2IbIFLwR59X`#7v!%?J`>ZRQ|03$3{0pR_#-E&vRDOpapl| zegriN!9*v4aMhve0j2QN*xxqRoAGpj=`vOsVs(s|e2UOOXK@RMK|twR1C?)BoV>4d z?2a>q)r3;mWj%#8mw^Sx!bXS9hkL~-6l_AJL@4Sw-m8y5tJj2!#)O9yLZM)z5b+FK5wvOFoh2`Q7q!yK9+Q-hlYan77%l(+ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn.cu index 203bd8959a004a5227fad8bd069d45607518076f..67a7c7644abf6915c42625fb804e2bf82ca1b50b 100644 GIT binary patch delta 55 zcmZ3+^ND9e0^?+TMn$&V#LA>p-OT}v8(GA46f*Nl6p9OSGE3sKZMvM4@bMH8@77OAG`-6q&ct(xHOlWS8^&*1UL@GmJPK;!70%{C3D%sP9 zOay_eEQ5lPaLziGgZ6g%lCKI@K%;r@rMI+bYGR0>j+(|BFgBE#yC&g`*ig({AU;&h z91D;~TD%1y$YMmByD%nm?h6VHYw}j^3({J1)*9=RHnvQ-4I>_NMOj z&m?aoiH}Xd4OuAfdeB`Gix4kO2xW?UA#cfVWzM%w_n$j`r{-Hne@qP}MW3sDHk=Yo zH!%|`1VUHgUb;GGA5v(ao~7{dVwvfl82u{lq@6;3E_H`nU7YAh% zk)T|KVS`&}D#H>(^dfZ0si4yJj-bH36l_>L?0^@f?&CFJKLP`>E8r-Fiu#CrK3Ntq z*#;)x#4zcOV6M!}YLe5VgbiSWpqj2^$k0QV4~DjY7mTYid`YvjvlRYX6HctKm1>qPj literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt.cu index c353c7a281d5da4cf006db37aada058c4b5ac290..a27a779cbedbacd4082c575f6f5b0442d7d50145 100644 GIT binary patch delta 55 zcmZ3<^Oa{q0^?+TMn$&V#LA>p-OT}vTUf+(6f*Nl6p9OSGE3sKqNKl(Q;$A_3GG`A9O|h_}NMNpo7t< zN|Sqi?&I8Zv&YAParX0bFU#<+%R7AA`!cx07%t!wJUzj|i_UI0c=z|Wkk{WTWEa|p zNvj1DCYd3QQ5Y@a3(mb0#9CYsXZRmnc!6ikNX!c@?6F=%5T8h87|e;0TuwlZp++Tp z+K@>>;3~_2auUv2$8*r$PG9p?!6-DE_g;F-Xj2nI3g(z;?0~VM%-l7J;KYVvc7ga% zHFrEf9vR&QP>{ulHMe10=E7434QujN?kQ=lIctseNgG>UxD6v73q@RMUC}%@L~CY_ zbI+w{B#Dnrzzta_?|RT(5{r;1O$cR5_(I;2-^!dnJKf*$^pB}wg!YBXXTvGcY!fzN zMS;*&xRqm4B~=Mk)Mj>_0cw%dwu}otl>eOqa3B5UXRvr$;GN^pR;`Cje zV|$z_tR|GgHtQ*@xeP2g7M40JKRhT-pC40{aGi_oQD1(U9K1QhpDuwnPG13^jM$6LUD1O{SPz)=bn^%41UvMge< z4NSg?;nE$!T$!2GBxjptRIMXa=^@siDP?HY*gj~ZMxmRq$d3W`pDLER^-g;?&JURT%*NdJ?Dr delta 31 ncmaFIvxsLy0^{cCjB{C-G_5DcF-mY3fSZH5pVtP|bFN6U@f)~jP%e$W;1<7X%7g94*k zRHeziKKF6%xoPq-V4VH@+{-fj>-+{^_r45nForYu1P>3e|E#my4c@)IE#&353fYC@ zgQVSt36sna$0&>z@dfAJ31TfSh%@{TF1)}qW+di?7WPOlB8X3O!#!#b@ zrZ!|!5V*)Ppqzwr*6|#)x6_w=RWJ&T)~%P`GTPF_kb*g88arTYC^L60A~><3m|Y+~ zRLva^kb6dV0Tg60Vy$f$m$~qiLBpE7oqI~!YtGtZeR7N~FWiO^kA)(xw618L8=^Hc zN4e)xG?K)}Cg6%Jly^PoE{R1*lqQ63)cl(KR_654(f%$+e@qP{v@cXX8%~L4o0thJ z3WP4gy>xXxyi4KaIE9aA%S5-t=>LFS$}l0HJFeub=(j1v-zw#~=0Z(88|ckaS#3qg zl?@?u#U&#qQ^kl&5;Jv5rpsvcQ~9?F9Q)i1S#?j@9nV?Kf)?C;{T`Gk1QVSE!c~W= zhm*onWB;+S-ioKgOP8_A5UXRvGZ+b^4l1m!9W8{9%O85S6# z7ol^(3MO6e5Gd}YV8ha32ZEBikJo_x2n@uofTI*D>Lc>`WLd;y8<>0(!=*cfxiT}W zNzOLQs9Hy;(nG92Q_9e)u|3d6jY2nJgYN_EA5|=M>z(#)pb?5fD(o`OQo)Z>Q!;d6 mKZ`(4*qHE;LMRk$6e6BsD}pxd!D;f+H&H8{G?}D)ocskZCKkW| literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt.cu index 30e837535733678179759ef5feb9b49bf6dbd78b..465b736e67f6198a1c26f93cf4c3fa565b84d718 100644 GIT binary patch delta 55 zcmZ3=^O0vm0^?+TMn$&V#LA>p-OT}v8(7436f*Nl6p9OSGE3sK`aaAM)D5V>0(`H(vHbaOF)`@Q8qvghK>*1IDpey3X&rZ@OFwmkZ zP44x%k8{t>4j%)?`OnY2JjcJzuJKjx%itPgIE7E}@Bn*HI=kKI-QC?nUVN*NT{zlr zwA(OYido_rh0#2{;39ZIoWqiM%l}{*1iocfVJ?kuhi0BYe4?~vFe6rRH32<_8r9e_ zmP{mp^E?O2Nj&E~FTeyhea=?}qtI;K1Qi^kEki7k%ro131;)iP3*RDw6Bmnl4J5>> zx#tmb&**CalFUb}wGHE{5P>piI$N}hKuLSWS$k}bkFev?uNm=JXyQxjlIEEu#xQ$W z1g=E0(FmytxFmBOd=I({Vi6Lh388B}yCT1}J=r+gf9hziMUZOa^pB}!g!Y9lZiiE1 z*g9;&q(taE-b+{K!@CTQk23gpy2y1ytoaYvg^EK8h38tmh<=+v`YltQ87}lBuz^`G zmDN^+eAy5}ms~Mob1g?)k(8-Zl3hltpUS^w;MC`O$f|qF?s&m!7L4Hj>-Q)|A(-ei zAYOH-dOjIEHug7-^q)l2r6_OHu$+= zD()~OFG6R6Nv3@704VNd;Nsq44}y}qkC%Y`1dPNkfujs6>J#$mWO>5mYnWmk!<9dP znYOpikbs3r65mJIKdM;hHh2^Kz#x=_B;6v;LctrU qDH-~>pE8mYHzq!05DNvDgotmk6H%LP|D^HUXVGZW*kOwDapNyiPZ#z8 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn.cu index 9dbb5d1b7eb64262f97281f110dee13909007251..023bd24211a6a8a2bad2fc2d692f431eb928309a 100644 GIT binary patch delta 55 zcmZ3=^O0vm0^?+TMn$&V#LA>p-OT}v8(7436f*Nl6p9OSGE3sK`3DRcjKYWQ!0jG!^Ob(Q?*K;>MSJkOuMNYwvE(LP1?c zo81||Io`b4Jw67E^Y0&fd5(Wv+~S+wr@<}8a1I~f;Qp3#>8Bw37DYZt~KiOIWm2W8DzN&NVPBVkm zgfiG=J%crukp<7p-OT}vn_0wl6f*Nl6p9OSGE3sK`3DRcjKYWQ!0jG!^Ob(Q?*K;>MSJkOuMNYwvCjQYcYZ z(PnqXZ;m%_c8`w%P81}0B?IE)g?)+(rcL;3bqonw2N8LTFh z!8Yp|thtOVcoug$?mpaUPC>E>RSKb`Z_Jdc_wf?2pMa6r6>yY6MSVg(oh(n7d;?Q#Vz}~0 zFxPhG3@O-V8CB~DReDJE=UQ7@HMS4N>QU?_?(uzu{iBMdZi6?$4-7&{NYX9iEEW7H rH6=qI_ftl4;>N^>3}T_+k`VC>I}x?%4$m6TeHo25jXkC)A2JURT%*NdJ?Dr delta 31 ncmaFIvxsLy0^{cCjB{C-G_5DcF-mY3fSZH5pVtP|bFN6U@f)T?7#e$W;1<7X%7g94*f zo22*p+{d}+W{*z*`=F;3CU_auUv2$8*r$PG9oX!6>xaw_bY7Xj>CQO6Hhp?SQeN%-pq!;KYVvc7ga% zHFrEf?it+$P?E)%wYOnh=E743Eo<^l?kVZ4IqOXH$uYLPbQ?xI5sJ9Vx}te*h}O&; z<(^B?N)jKNfGe_4-u0okBo-l2nGm{B^K0@)nbSu{2fG~oIW>&Xfl&EuG$opCVhUE4 z2wjA0?f#L)P6>cE@wpu%HEZ-+lxo3c*A-f$(WU zHN#2asd4bw*lfkq;ib!1RfyFwV)7|ML!HG990l>D>-pEdQTgF*lVf|DDXb=x!ZzzE ztho#VDcG=d*nyy=>EkuvAOZuiE8sYVn)--*K3Ntq*#;)x#Bk{jVXn-~ zYLc_fGV0b5s`U`-&y+H>Zfqa4QRC1}*x>sB`%e{1-Fm0J8)}52kV?CZvsCb-)Rc@| n*v~SM6E-G1q!0=P8-<8x*ovS{`{p!x>7A&RPWG6je3JYFL9G_W literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt.cu index 75404041f2bb2773e870f994cfab7a585486de4e..576d6663e15ca34f8d4d3a409c9fd5830e604e59 100644 GIT binary patch delta 55 zcmZ3=^O0vm0^?+TMn$&V#LA>p-OT}v8(7436f*Nl6p9OSGE3sKyD+W_5h#PEvqifIl(g5Jwa4b<7&|Wgh7pg2Ccd<;Xr5bQ46{c? z;7T+bjgXpvE3(kR_n^BZ79mlZ5W3OxYw}0i)2*ZZr;g@Y1gW-8|D0MzXkX}JHk=Z} zHenMcB|;bRTDm$P-)E41pU#%KZizK7!cLBNVV5cnDHNV-^&d2YDSlfVXM zvs6}F5%Oh22wic-h|RSeaYa(5PDyqdt$r$RtH7zx&5%|1l-==y)hrmn{kI=cj6yKc zX+Zqcq3Zc$@YvYjHr9*rcz)S3RvBV-jM!p|(7@zr35QWX*;)dXZzw;!t#j;7GlSKH zGT3E3gEg0t1<&F_$Hj*`%_&GWp-Lf?WSp+mJD?Wl+OQTlu6>=vONiq_7gA?y8@0fsHjiKr<3IglW$;(O$=B5 z5a!y>oFN6VxC)NSx4_<=zv2}!zToTY-d rQd2VYaX)1wCvHr9$RHL9E(sCOuoF?6?#*fAx!0o6rm@Eq<>SUbWY!n^ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn.cu index ab0a848d59744ee6cdc960230f9f6b6d7f8f218d..c4414a1c1f79356a8d5b78e239b3d88c1e73c180 100644 GIT binary patch delta 55 zcmZ3=^O0vm0^?+TMn$&V#LA>p-OT}v8(7436f*Nl6p9OSGE3sKu%W+3iN}{{9Z~>RW~E!pUKy z-G&KM%o5Kij27t+E`k@tIV_2{{128v;2EG%hgs!<_#O7L#xFRW2rzE?KRzH=uRp8X;X2`00%I`pU- z)r2zGWj%v6myrd};zGy8hkMN_NH(ELA(UjCuGKrB7U$Zq7C5eboy6n$LD@tisL*lP z;1`CexWkaV2wey!nex3OptzTTi+hJX2ukWcUIO+LFcP~0jxwmIPspc}p-OT}vn_0wl6f*Nl6p9OSGE3sK;_WpIlzoWm!0e1wB%o!xHq?(c6QufA2tE}R@T z+HIIH#VqlR!f28H;39ZIoWqiM%l}{*1fDUgFqcNSW3xygK2h2-m=mkGnt&cdjcV)} zOD2-QWu61&B%X7g7hr;$zT~TdQE0aAf(nk&mLZl%=9z8ofN`+c2&Q5h#PEvqifIl(g5Jwa4c41UoMMh7pg2Ccd<;Xr5bQ470~Y z;7T+bjgXpvYqHS6_n^BZ79mlZ5X!X(Qr(c>+MYc*-QV%_kEvyZ_JuBH!znRr6EZkIy3Y_}f3|Vzg**z~<&4Ll!fBhcyCCnS$Mb`-i9}GL4|@=l)P1}L>?dF(b_E<|P*IbxFke8!%jqPy2G=^OK(P_O=FKK%EyhrkvSPL literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn.cu index 8312b40c57107611f8ed5c4459055ec5b763400c..ce7de93b46aa24fe768c796607f575fdf7896d77 100644 GIT binary patch delta 51 zcmZ3$^M+?b0^?+TMn$&V#LA>p-OT}vt69W#6f*Nl6p9OSGE3sKeaC=Kj@11@w1ckL4nal zRhr!Eb06oPn>{`OjI*Dg`dNm5UEboG{+Hn`#&7{2;qeg;UUc?)!Mne|h5Y=jM)u(3 zFzIxlV3HZ)7=`g7zTn(DL9E3kafbiFr5AX{jKsXu!XE2I1o1*D!(dK~H+5uxjnYn8d!HEsU>;mzj zYVLS|JTSTopd^biYj4B2%!Q{6TGr&9+*8t7bJm&Y(-UlY={AgbA{23zbw%^s5UrUx z&OMi+l_Wkk0oP=qyz4`6Ni0I5G9h%U<~QWGGUrc@4t6>EV`>Zl?!JBxIuwG5ZUW(ILN%jF z;kj|})Y$CA^U#D^8(g1(gz^sNi_7*T2!_+OXC(ZhYOyB-hhZI7gV51Q63|kSjX%Ekn*S?5a>12;d$|uQR`;r!1 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt.cu index 6ad1b86b9182934e96b64fb6bb5b3155b3fd0d99..254272c00316cfa4fdccb0d4ac65371853e8fd23 100644 GIT binary patch delta 53 zcmZ3?^PXoz0^?*RHnEBKxY=?OE0a=nHwQ4TV-eR;$jmEIC@#p!EQ!yKFV4wKPA%57 J=93K delta 31 ncmaFQvzTW?0^{cCjPqESG_5DcF-mY3`aaAM)D5V={5;HAQn<2Cf)`@Q8qvghK>(wv$L080&pPi&nV6do4 zlY4#c@pTAYeE}R@T z+HIIH#VqlR!f26Ra1p#9&S6Qs<$tgY0?(LLm`fwvu~{S#pD1k^%!yT8O+b&KMm6?~ zB@;>DD$jv(63;o$3oyY=U-DJKC^TDlK?TQX%MeQ>^UOBifN`Fk> z+k{P+ln7nLd+F+&WpH|u!P&(!)g7_s|52Y+98oAd*XmX9`wY@=mGRPWp(lY2%x0mi zwj$)qb`ZMeiV>S@IpT_>Or4VIGFtsq{;dM1E;mC~y;FA23s!Sr1ovORM=1)yM5h7q zszcS&$>6E6ziq77;_39VWvnv9>KL)b6rq91(+Uowda|_uD&J6kcvt7xon;2A0cEhu zdIoDQBMY9zb&hKf_nK3XY(kYnD2X`Tt1mz;&9z}IZ(RF2iN}kBvWY}cq2sW@FAP(0 zgCThlx)e+@<$Fg!aW4ZGw+?#{l+=B^2J9zbBz6THWl&L{kk2Q}6DHrl6q^{X{1MEx zojF4awpm8iIzp8mQvJEsmR61JgRy!PyNN6O5MlqQVyWBUP4EMQP!f`K%Q#B~Z>6SW o=;Le&?6kHM_o?$1VHr?TQ3K delta 31 ncmaFQvzTW?0^{cCjPqESG_5DcF-mY3`aaAM)D5V={5;HAQn<2yo>qNKl(Q;$A_3D@Wpey3X&rZ_kV6do4 zlY4#c@*Iz4S7fucv z?KVu9VwQMDVYEmuxCmYl=ddK+^50knfoIGr%%u_T*enu=Pn5O{=EN$lCZNYqqZ)h0 zl8Gd6mFGY?iRYZ>1(@KbFZrrq6q>F3pn_wxWr!t`d1jk$z_?gu;afy-;$ksxfrMB! z_dG%#8GQ>tlEsL%c41r^4O?+ux(LA@r7-o-) zz?En=8X+|SH)NrM??HD-EJC6*A(U$oq`D=)v_0QC-GA=%ot|$U{XVsf(7w>cY&a!` zZDJ-&N`$WBy>xZXGB`cS;Ot_V=bl*e|EOyfM-&RrwR#o&K7;gIWxO<8=t*D$vsoys ztqA$D9fWSUV#MZJj<_NzQ>Uc5j8;FDf2+W$%gvBg@0308g4G-t!TpzSQHnw^(P==u z>QMD`GWgrr-!|53v7X*?9@#Qh8De#e*kX#%z~pHKhfzJ*S^$-AC_lWbbL`GCgVlgC z*kwI~HJ6bE&*D1AwTB1IDM&V literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt.cu index ea2147c6ac578a4c9d46b432c082c9779e2d5c66..58b093d41fbbd4bc99c97ac7e91700836f0de8c1 100644 GIT binary patch delta 55 zcmZ3+^ND9e0^?+TMn$&V#LA>p-OT}v8(GA46f*Nl6p9OSGE3sK`aaAM)D5V={5;HAQn<2yo>qNKl(Q;$A_3D@Wpey3X&rZ?@9gHrj z(&S#B`#AU9?C~*Rod5XL%X9qm@($njJ`e6Nh70%zPfu|0qO;qL-u?Y8yN4>1)0!7=>o*KB(XrZ5d*TWS-gP4j329EPRUyPFyTz7f6Uz zbI&8>kpiI$N}hKuLSeS$k|wPq5?CZy51dXyQxjisrc`#xQ$a z1g=E0(FmytxFHK2d=I)yVi6Lh387qzAk{7TrS18%)BPP!f1g@LXkX}JHk=Z}HenMc zB|=y6Ub;GG8JwPEaCWgwbx*AMf7G>#QwoLWTD=K=pF#SqGF}=k^dzu>*({XRR)l=n z4nj9vF=BHqM_iGVsZ+9DMysF7zg6JWlv)Mj4XH-H#%-UJZMfqvI$iRp`_zj+hPNcHDhTUs@?560?I>?UsUV}$*qiluIYH^C1KLP(d_A), - typename Gemm::LayoutA{static_cast(lda)}}; - typename Gemm::TensorRefB tensor_b{ - const_cast(d_B), - typename Gemm::LayoutB{static_cast(ldb)}}; - typename Gemm::TensorRefC tensor_c{ - nullptr, typename Gemm::LayoutC{static_cast(ldc)}}; - typename Gemm::TensorRefD tensor_d{ - d_C, typename Gemm::LayoutC{static_cast(ldc)}}; + cudaStream_t stream, int split_k_slices) { + using TensorRefA = cutlass::TensorRef; + using TensorRefB = cutlass::TensorRef; + using TensorRefC = cutlass::TensorRef; + using TensorRefD = + cutlass::TensorRef; + TensorRefA tensor_a{const_cast(d_A), + typename Gemm::LayoutA{static_cast(lda)}}; + TensorRefB tensor_b{const_cast(d_B), + typename Gemm::LayoutB{static_cast(ldb)}}; + TensorRefC tensor_c{nullptr, typename Gemm::LayoutC{static_cast(ldc)}}; + TensorRefD tensor_d{d_C, typename Gemm::LayoutC{static_cast(ldc)}}; typename Gemm::Arguments arguments{problem_size, tensor_a, @@ -42,7 +47,7 @@ void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( tensor_c, tensor_d.non_const_ref(), epilogue, - 1}; + split_k_slices}; Gemm gemm_op; cutlass_check(gemm_op.initialize(arguments, workspace)); cutlass_check(gemm_op(stream)); diff --git a/dnn/src/cuda/matrix_mul/opr_impl.h b/dnn/src/cuda/matrix_mul/opr_impl.h index 56215411..b554a9ea 100644 --- a/dnn/src/cuda/matrix_mul/opr_impl.h +++ b/dnn/src/cuda/matrix_mul/opr_impl.h @@ -42,6 +42,7 @@ public: class AlgoBFloat16; #endif class AlgoFloat32SIMT; + class AlgoFloat32SIMTSplitK; class AlgoPack; static const AlgoPack& algo_pack() { diff --git a/dnn/test/common/matrix_mul.cpp b/dnn/test/common/matrix_mul.cpp index 4cbe5c77..00b63cdd 100644 --- a/dnn/test/common/matrix_mul.cpp +++ b/dnn/test/common/matrix_mul.cpp @@ -117,6 +117,18 @@ std::vector matrix_mul::get_matmul_args() { return args; } +std::vector matrix_mul::get_matmul_args_split_k() { + std::vector args = get_matmul_args(); + for (auto iter = args.begin(); iter < args.end();) { + if (iter->k <= iter->n) { + iter = args.erase(iter); + } else { + iter++; + } + } + return args; +} + std::vector matrix_mul::get_batched_matmul_args_mask( uint8_t mask) { std::vector args; diff --git a/dnn/test/common/matrix_mul.h b/dnn/test/common/matrix_mul.h index d52f1814..ab3057e0 100644 --- a/dnn/test/common/matrix_mul.h +++ b/dnn/test/common/matrix_mul.h @@ -53,6 +53,7 @@ struct TestArg { std::vector get_matmul_args_no_mask(); std::vector get_matmul_args_mask(uint8_t mask); std::vector get_matmul_args(); +std::vector get_matmul_args_split_k(); std::vector get_batched_matmul_args_mask(uint8_t mask); std::vector get_batched_matmul_args(); std::vector get_batched_matmul_broadcast_args(); diff --git a/dnn/test/cuda/cutlass_matmul.cpp b/dnn/test/cuda/cutlass_matmul.cpp index 55b13c28..ae04cd02 100644 --- a/dnn/test/cuda/cutlass_matmul.cpp +++ b/dnn/test/cuda/cutlass_matmul.cpp @@ -21,7 +21,6 @@ #include "test/cuda/fixture.h" #include "test/cuda/utils.h" - #if CUDA_VERSION >= 9020 namespace megdnn { namespace test { @@ -284,6 +283,15 @@ TEST_F(CUDA, CUTLASS_GEMM_MULTI_BATCHSIZE) { param::MatrixMul::Format::DEFAULT); } +TEST_F(CUDA, CUTLASS_GEMM_SPLIT_K_MULTI_BATCHSIZE) { + auto args = matrix_mul::get_matmul_args_no_mask(); + test_multibatchsize( + handle_cuda(), dtype::Float32(), dtype::Float32(), dtype::Float32(), + "CUTLASS_FLOAT32_SIMT_SPLIT_K_128X128X8_32X64X8", args, + param::MatrixMul::Format::DEFAULT, + [](const matrix_mul::TestArg& arg) { return arg.k <= arg.n; }); +} + #define MEGDNN_FOREACH_CUTLASS_KERNEL(cb) \ cb(1, 64, 256, 8, 32, 64, 8); \ cb(2, 256, 64, 8, 64, 32, 8); \ @@ -314,6 +322,21 @@ TEST_F(CUDA, CUTLASS_GEMM_MULTI_BATCHSIZE) { MEGDNN_FOREACH_CUTLASS_KERNEL(cb) +#undef cb + +#define cb(name, tbm, tbn, tbk, wm, wn, wk) \ + TEST_F(CUDA, CUTLASS_GEMM_SPLIT_K_##name) { \ + matrix_mul::check_matrix_mul( \ + dtype::Float32(), dtype::Float32(), dtype::Float32(), \ + handle_cuda(), \ + "CUTLASS_FLOAT32_SIMT_SPLIT_K_" #tbm "X" #tbn "X" #tbk "_" #wm \ + "X" #wn "X" #wk, \ + param::MatrixMul::Format::DEFAULT, 8, 1e-3, \ + matrix_mul::get_matmul_args_split_k()); \ + } + +MEGDNN_FOREACH_CUTLASS_KERNEL(cb) + #undef cb #undef MEGDNN_FOREACH_CUTLASS_KERNEL -- GitLab