diff --git a/dnn/scripts/Makefile b/dnn/scripts/Makefile index bc28b4cea858c01e779f691a27fa13ba098d140f..b093b4aec773073a580da8ffe4c66c3a6d8dcaf3 100644 --- a/dnn/scripts/Makefile +++ b/dnn/scripts/Makefile @@ -9,9 +9,9 @@ ELEMWISE_IMPL := ../src/cuda/cond_take/kimpl \ ../src/cuda/elemwise_multi_type/kimpl CUDA_CONV_IMPL := ../src/cuda/conv_bias/int8/kimpl ../src/cuda/conv_bias/int8_imma/kimpl ../src/cuda/batch_conv_bias/int8/kimpl -CUDA_MATMUL_KIMPL := ../src/cuda/matrix_mul/fp32_simt/kimpl +CUDA_MATMUL_IMPL := ../src/cuda/matrix_mul/fp32_simt/kimpl -all: ${PARAM_DEFS} ${ELEMWISE_IMPL} ${CUDA_CONV_IMPL} $(CUDA_MATMUL_KIMPL) +all: ${PARAM_DEFS} ${ELEMWISE_IMPL} ${CUDA_CONV_IMPL} $(CUDA_MATMUL_IMPL) ../src/common/elemwise/each_mode.inl: gen_elemwise_each_mode.py ./$^ $@ diff --git a/dnn/src/cuda/matrix_mul/algos.cpp b/dnn/src/cuda/matrix_mul/algos.cpp index d2c44734f77f7553918ff65765e9a45c102932c9..fa19098057a7d0e8a8cb2637e1156a015c2b4607 100644 --- a/dnn/src/cuda/matrix_mul/algos.cpp +++ b/dnn/src/cuda/matrix_mul/algos.cpp @@ -37,6 +37,9 @@ MatrixMulForwardImpl::AlgoPack::AlgoPack() { for (auto&& algo : simt_float32) { all_algos.push_back(&algo); } + for (auto&& algo : simt_float32_split_k) { + all_algos.push_back(&algo); + } for (auto&& algo : all_algos) { m_all_algos_map.emplace(algo->info().desc, algo); @@ -62,6 +65,23 @@ void MatrixMulForwardImpl::AlgoPack::fill_cutlass_algos() { simt_float32.emplace_back(AlgoParam{16, 32, 8, 16, 32, 8}); simt_float32.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8}); simt_float32.emplace_back(AlgoParam{16, 128, 8, 16, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{64, 256, 8, 32, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{256, 64, 8, 64, 32, 8}); + simt_float32_split_k.emplace_back(AlgoParam{32, 256, 8, 16, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{256, 32, 8, 64, 16, 8}); + simt_float32_split_k.emplace_back(AlgoParam{128, 128, 8, 32, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{128, 64, 8, 64, 32, 8}); + simt_float32_split_k.emplace_back(AlgoParam{64, 128, 8, 32, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{128, 32, 8, 64, 32, 8}); + simt_float32_split_k.emplace_back(AlgoParam{32, 128, 8, 32, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{64, 64, 8, 32, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{32, 64, 8, 32, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{64, 32, 8, 64, 32, 8}); + simt_float32_split_k.emplace_back(AlgoParam{32, 32, 8, 32, 32, 8}); + simt_float32_split_k.emplace_back(AlgoParam{8, 32, 8, 8, 32, 8}); + simt_float32_split_k.emplace_back(AlgoParam{16, 32, 8, 16, 32, 8}); + simt_float32_split_k.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{16, 128, 8, 16, 64, 8}); } MatrixMulForwardImpl::AlgoPack MatrixMulForwardImpl::sm_algo_pack; diff --git a/dnn/src/cuda/matrix_mul/algos.h b/dnn/src/cuda/matrix_mul/algos.h index 654293704689f6834ee84745b875d4be88119252..d647c66131886546ea091c605b37f16be7892401 100644 --- a/dnn/src/cuda/matrix_mul/algos.h +++ b/dnn/src/cuda/matrix_mul/algos.h @@ -43,6 +43,7 @@ public: CUDA_NAIVE, CUDA_BFLOAT16, CUDA_FLOAT32_SIMT, + CUDA_FLOAT32_SIMT_SPLIT_K, }; using Mapper = std::unordered_map; @@ -198,6 +199,31 @@ private: std::string m_name; }; +class MatrixMulForwardImpl::AlgoFloat32SIMTSplitK final : public AlgoBase { +public: + using AlgoParam = MatrixMulForwardImpl::AlgoFloat32SIMT::AlgoParam; + AlgoFloat32SIMTSplitK(AlgoParam algo_param) + : m_algo_param{algo_param}, + m_name{ssprintf("CUTLASS_FLOAT32_SIMT_SPLIT_K_%s", + m_algo_param.to_string().c_str())} {} + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + const char* name() const override { return m_name.c_str(); } + void exec(const ExecArgs& args) const override; + bool is_reproducible() const override { return true; } + MEGDNN_DECL_ALGO_TYPE(CUDA_FLOAT32_SIMT_SPLIT_K) + + std::string param() const override { + std::string ret; + serialize_write_pod(m_algo_param, ret); + return ret; + } + +private: + AlgoParam m_algo_param; + std::string m_name; +}; + class MatrixMulForwardImpl::AlgoPack : NonCopyableObj { private: AlgoBase::Mapper m_all_algos_map; @@ -216,6 +242,7 @@ public: AlgoBFloat16 bfloat16; #endif std::vector simt_float32; + std::vector simt_float32_split_k; std::vector all_algos; const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; } diff --git a/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp b/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp new file mode 100644 index 0000000000000000000000000000000000000000..50ccb67db2f7abfe7e5ef75df2fd1b432a2f8c03 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp @@ -0,0 +1,76 @@ +/** + * \file dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#include "src/cuda/handle.h" +#include "src/cuda/matrix_mul/algos.h" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace cutlass_wrapper; + +bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available( + const SizeArgs& args) const { + auto&& param = args.opr->param(); + int m = args.layout_c.shape[0], n = args.layout_c.shape[1], + k = args.layout_a.shape[param.transposeA ? 0 : 1]; + return args.opr->param().format == param::MatrixMul::Format::DEFAULT && + args.layout_a.dtype == dtype::Float32() && + args.layout_b.dtype == dtype::Float32() && + args.layout_c.dtype == dtype::Float32() && k > std::max(m, n); +} + +size_t MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::get_workspace_in_bytes( + const SizeArgs& args) const { + size_t lda = args.layout_a.stride[0], ldb = args.layout_b.stride[0], + ldc = args.layout_c.stride[0]; + auto&& param = args.opr->param(); + int m = args.layout_c.shape[0], n = args.layout_c.shape[1], + k = args.layout_a.shape[param.transposeA ? 0 : 1]; + GemmCoord problem_size{m, n, k}; + int split_k_slices = k / std::max(m, n); + return cutlass_matrix_mul_float32_simt_get_workspace_size( + param.transposeA, lda, param.transposeB, ldb, ldc, problem_size, + 1.f, 0.f, + GemmCoord{m_algo_param.threadblock_m, m_algo_param.threadblock_n, + m_algo_param.threadblock_k}, + GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n, + m_algo_param.warp_k}, + split_k_slices); +} + +void MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::exec( + const ExecArgs& args) const { + size_t lda = args.tensor_a.layout.stride[0], + ldb = args.tensor_b.layout.stride[0], + ldc = args.tensor_c.layout.stride[0]; + auto&& param = args.opr->param(); + int m = args.tensor_c.layout.shape[0], n = args.tensor_c.layout.shape[1], + k = args.tensor_a.layout.shape[param.transposeA ? 0 : 1]; + GemmCoord problem_size{m, n, k}; + int split_k_slices = k / std::max(m, n); + auto&& stream = cuda_stream(args.opr->handle()); + int* workspace = reinterpret_cast(args.workspace.raw_ptr); + return cutlass_matrix_mul_float32_simt( + args.tensor_a.ptr(), param.transposeA, lda, + args.tensor_b.ptr(), param.transposeB, ldb, + args.tensor_c.ptr(), ldc, workspace, problem_size, 1.f, + 0.f, + GemmCoord{m_algo_param.threadblock_m, m_algo_param.threadblock_n, + m_algo_param.threadblock_k}, + GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n, + m_algo_param.warp_k}, + stream, split_k_slices); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cu b/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cu index 6f6e079ccd8cd498c225a4a5625448e692e3891f..4907b4fa386fba8d1166a6fbfb536fb8376fe430 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cu +++ b/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cu @@ -18,6 +18,7 @@ #if __CUDACC_VER_MAJOR__ > 9 || \ (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) #include "cutlass/gemm/device/gemm.h" +#include "cutlass/gemm/device/gemm_splitk_parallel.h" #endif #include "src/common/opr_param_defs_enumv.cuh" #include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh" @@ -62,14 +63,20 @@ void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_float32_simt( float* /* d_C */, size_t /* ldc */, int* /* workspace */, GemmCoord const& /* problem_size */, float /* alpha */, float /* beta */, const GemmCoord& /* threadblock_shape */, - const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {} + const GemmCoord& /* warp_shape */, cudaStream_t /* stream */, + int /* split_k_slices */) {} #else void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_float32_simt( const float* d_A, bool transpose_A, size_t lda, const float* d_B, bool transpose_B, size_t ldb, float* d_C, size_t ldc, int* workspace, GemmCoord const& problem_size, float alpha, float beta, const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, - cudaStream_t stream) { + cudaStream_t stream, int split_k_slices) { + static constexpr int kEpilogueElementsPerAccess = 1; + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + float, kEpilogueElementsPerAccess, float, float>; + typename EpilogueOp::Params epilogue{alpha, beta}; + if (split_k_slices == 1) { #define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_, \ warp_k_) \ if (threadblock_shape.m() == threadblock_m_ && \ @@ -93,29 +100,67 @@ void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_float32_simt( workspace, problem_size, \ epilogue, stream); \ } - static constexpr int kEpilogueElementsPerAccess = 1; - using EpilogueOp = cutlass::epilogue::thread::LinearCombination< - float, kEpilogueElementsPerAccess, float, float>; - typename EpilogueOp::Params epilogue{alpha, beta}; - if (!transpose_A && !transpose_B) { - using LayoutA = cutlass::layout::RowMajor; - using LayoutB = cutlass::layout::RowMajor; - DISPATCH(cb) - } else if (!transpose_A && transpose_B) { - using LayoutA = cutlass::layout::RowMajor; - using LayoutB = cutlass::layout::ColumnMajor; - DISPATCH(cb) - } else if (transpose_A && !transpose_B) { - using LayoutA = cutlass::layout::ColumnMajor; - using LayoutB = cutlass::layout::RowMajor; - DISPATCH(cb) + if (!transpose_A && !transpose_B) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + DISPATCH(cb) + } else if (!transpose_A && transpose_B) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + DISPATCH(cb) + } else if (transpose_A && !transpose_B) { + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::RowMajor; + DISPATCH(cb) + } else { + megdnn_assert(transpose_A && transpose_B); + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::ColumnMajor; + DISPATCH(cb) + } +#undef cb } else { - megdnn_assert(transpose_A && transpose_B); - using LayoutA = cutlass::layout::ColumnMajor; - using LayoutB = cutlass::layout::ColumnMajor; - DISPATCH(cb) +#define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_, \ + warp_k_) \ + if (threadblock_shape.m() == threadblock_m_ && \ + threadblock_shape.n() == threadblock_n_ && \ + threadblock_shape.k() == threadblock_k_ && \ + warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \ + warp_shape.k() == warp_k_) { \ + using ThreadBlockShape = \ + cutlass::gemm::GemmShape; \ + using WarpShape = cutlass::gemm::GemmShape; \ + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; \ + using Gemm = cutlass::gemm::device::GemmSplitKParallel< \ + float, LayoutA, float, LayoutB, float, \ + cutlass::layout::RowMajor, float, cutlass::arch::OpClassSimt, \ + cutlass::arch::Sm50, ThreadBlockShape, WarpShape, \ + InstructionShape, EpilogueOp>; \ + return cutlass_matrix_mul_wrapper( \ + d_A, lda, d_B, ldb, d_C, ldc, workspace, problem_size, \ + epilogue, stream, split_k_slices); \ } + if (!transpose_A && !transpose_B) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + DISPATCH(cb) + } else if (!transpose_A && transpose_B) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + DISPATCH(cb) + } else if (transpose_A && !transpose_B) { + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::RowMajor; + DISPATCH(cb) + } else { + megdnn_assert(transpose_A && transpose_B); + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::ColumnMajor; + DISPATCH(cb) + } #undef cb + } } #endif @@ -127,7 +172,7 @@ size_t megdnn::cuda::cutlass_wrapper:: bool /* transpose_B */, size_t /* ldb */, size_t /* ldc */, GemmCoord const& /* problem_size */, float /* alpha */, float /* beta */, const GemmCoord& /* threadblock_shape */, - const GemmCoord& /* warp_shape */) { + const GemmCoord& /* warp_shape */, int /* split_k_slices */) { return 0; } #else @@ -136,7 +181,12 @@ size_t megdnn::cuda::cutlass_wrapper:: bool transpose_A, size_t lda, bool transpose_B, size_t ldb, size_t ldc, GemmCoord const& problem_size, float alpha, float beta, const GemmCoord& threadblock_shape, - const GemmCoord& warp_shape) { + const GemmCoord& warp_shape, int split_k_slices) { + static constexpr int kEpilogueElementsPerAccess = 1; + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + float, kEpilogueElementsPerAccess, float, float>; + typename EpilogueOp::Params epilogue{alpha, beta}; + if (split_k_slices == 1) { #define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_, \ warp_k_) \ if (threadblock_shape.m() == threadblock_m_ && \ @@ -169,30 +219,80 @@ size_t megdnn::cuda::cutlass_wrapper:: split_k_slices}; \ return Gemm::get_workspace_size(arguments); \ } - static constexpr int kEpilogueElementsPerAccess = 1; - static constexpr int split_k_slices = 1; - using EpilogueOp = cutlass::epilogue::thread::LinearCombination< - float, kEpilogueElementsPerAccess, float, float>; - typename EpilogueOp::Params epilogue{alpha, beta}; - if (!transpose_A && !transpose_B) { - using LayoutA = cutlass::layout::RowMajor; - using LayoutB = cutlass::layout::RowMajor; - DISPATCH(cb) - } else if (!transpose_A && transpose_B) { - using LayoutA = cutlass::layout::RowMajor; - using LayoutB = cutlass::layout::ColumnMajor; - DISPATCH(cb) - } else if (transpose_A && !transpose_B) { - using LayoutA = cutlass::layout::ColumnMajor; - using LayoutB = cutlass::layout::RowMajor; - DISPATCH(cb) + if (!transpose_A && !transpose_B) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + DISPATCH(cb) + } else if (!transpose_A && transpose_B) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + DISPATCH(cb) + } else if (transpose_A && !transpose_B) { + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::RowMajor; + DISPATCH(cb) + } else { + megdnn_assert(transpose_A && transpose_B); + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::ColumnMajor; + DISPATCH(cb) + } +#undef cb } else { - megdnn_assert(transpose_A && transpose_B); - using LayoutA = cutlass::layout::ColumnMajor; - using LayoutB = cutlass::layout::ColumnMajor; - DISPATCH(cb) +#define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_, \ + warp_k_) \ + if (threadblock_shape.m() == threadblock_m_ && \ + threadblock_shape.n() == threadblock_n_ && \ + threadblock_shape.k() == threadblock_k_ && \ + warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \ + warp_shape.k() == warp_k_) { \ + using ThreadBlockShape = \ + cutlass::gemm::GemmShape; \ + using WarpShape = cutlass::gemm::GemmShape; \ + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; \ + using Gemm = cutlass::gemm::device::GemmSplitKParallel< \ + float, LayoutA, float, LayoutB, float, \ + cutlass::layout::RowMajor, float, cutlass::arch::OpClassSimt, \ + cutlass::arch::Sm50, ThreadBlockShape, WarpShape, \ + InstructionShape, EpilogueOp>; \ + using TensorRefA = cutlass::TensorRef; \ + using TensorRefB = cutlass::TensorRef; \ + using TensorRefC = cutlass::TensorRef; \ + using TensorRefD = cutlass::TensorRef; \ + TensorRefA tensor_A{nullptr, Gemm::LayoutA{static_cast(lda)}}; \ + TensorRefB tensor_B{nullptr, Gemm::LayoutB{static_cast(ldb)}}; \ + TensorRefC tensor_C{nullptr, Gemm::LayoutC{static_cast(ldc)}}; \ + TensorRefD tensor_D{nullptr, Gemm::LayoutC{static_cast(ldc)}}; \ + typename Gemm::Arguments arguments{problem_size, tensor_A, tensor_B, \ + tensor_C, tensor_D, epilogue, \ + split_k_slices}; \ + return Gemm::get_workspace_size(arguments); \ } + if (!transpose_A && !transpose_B) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + DISPATCH(cb) + } else if (!transpose_A && transpose_B) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + DISPATCH(cb) + } else if (transpose_A && !transpose_B) { + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::RowMajor; + DISPATCH(cb) + } else { + megdnn_assert(transpose_A && transpose_B); + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::ColumnMajor; + DISPATCH(cb) + } #undef cb + } } #endif diff --git a/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh b/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh index 3446842c99528331b1ca0ddcc8b92d6367f92d0e..1947f773d31ab6ff0687b48c17b3901de34f619e 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh +++ b/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh @@ -26,19 +26,19 @@ void cutlass_matrix_mul_wrapper( typename Gemm::ElementC* d_C, size_t ldc, int* workspace, GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices = 1); void cutlass_matrix_mul_float32_simt( const float* d_A, bool transpose_A, size_t lda, const float* d_B, bool transpose_B, size_t ldb, float* d_C, size_t ldc, int* workspace, GemmCoord const& problem_size, float alpha, float beta, const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices = 1); size_t cutlass_matrix_mul_float32_simt_get_workspace_size( bool transpose_A, size_t lda, bool transpose_B, size_t ldb, size_t ldc, GemmCoord const& problem_size, float alpha, float beta, - const GemmCoord& threadblock_shape, const GemmCoord& warp_shape); + const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, int split_k_slices = 1); } // namespace cutlass_wrapper } // namespace cuda diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn.cu index bdf8a6e3b3871607341ac7415cfb181dc3541dbf..3828423399ba519e605671a5151787c2cc95985f 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..a4b4e0c42b26a5c5d0fce9dba31f59e2097a20cb Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt.cu index 842f673e666f4fde52ddaa8527cb89d06e83bf5d..d69e3359f96c60c98817363753fed02afbe772b8 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..e78d64a9bc30510d118c93487da1333369ef6b22 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn.cu index 6b4ec037db902b3835c1c8fd720192add28883f9..0fe5a161cc69870a9562465b0f0a14d3232c6175 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..964ef5252061492b51c82d6a1e8694452388ee73 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt.cu index a7b3fbea821bf428f5d1d7f4fb5fd28284f00c55..374a8d73a7653ce40ec5f278fe50034efe708a36 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..d5795b04cf8078899a2c0186865c485e166a4001 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn.cu index d4cfccda7296a5e92d67d155f0526bdc4e10b025..87d80fd403e16791e42db575a4427977a09a785c 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..74b75ebd8d53cf72267f8c5c72d51f4833242cb7 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu index e4e097ccf0b2c9aa94e9c6ccb5228217cb5865a6..230297e8e11c84a1f5ac40878efeda3dce43a208 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..30dd6ad07d71f3a03f7c5e4fa6babe8c57e2ffce Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu index 35c35bb9260bc16077d7a7ba8112c27685545b63..04f80d335c10f9eafb47b31cb54c0871d70c8701 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..d702c06b400d79cf54d07225f41fa517652aafde Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu index 9df595430beb61f4488b925ced89ed925fb8fe03..70fd338f09b70d2dda029a294e7f39ca5ccf62a1 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..a54b66d81682c87e874227cbab51e5bbeb5d95b8 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu index 474114d8f7faf58046bfbd5e0b09b362e916cebc..bed908a162008352e0403c19eea24855eb0eb498 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..91c76b87e3edaff71dda946d22f2f2fc087b82c1 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt.cu index 9b39434a3aaf533c6fb7870a87df2cc380a6f7bb..a063706f12cffc0cf3063db3897156cdef7f28b4 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..161b9e5589680bebc567641aceb957a8f5e9299a Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu index b55f2e2f6bb1b561e3d15f7207a7d77cf3c7730c..6eae3c183fccc5e700f4c75dc4c67fa8ab2fdd0a 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..c8968eb31ded5a3689268f632eafb4323589a59f Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu index 95d263d61404d146501f29c9fb7630989de0a289..3107bc36bc19887c93761ff52dc198d14e3a2a51 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..22f137974860d6b4a3950a6cd555c6f4e630dc81 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu index c3a64d63c7b87f05c32256db47866cc6ccd062f6..ce92e1493134d6574866f856862e5a3c3b2ad203 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..5c51f78158b051aa0c244814fb66476d212cfaa6 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt.cu index 3f151dce69c7ad0d47e3b9945fa89664384bfbb5..50f5e49f97a4617a4e3601c0ec66039df3ede4c8 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..9ccf1190326149280628a6a5812820e99061d31d Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu index 526dfd3e6112e85bccd8ec8b7d1a8003f1c67f4d..28b32c915377c15b0e7d1ccae995b96e6d76c131 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..e25e44e35f6c4b424d2e1bba227c3c87d22c2bfd Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt.cu index 5bc0ed7afaff083c6a1c73bd5c68c8f9103013e7..4e1a9f6ccbeb1454456107a94917e61d7454b894 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..f7f1fb6924f03bea7a2c534dc7939f831c394ce7 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu index 816cff198e51060eabbfef8554dd6f720f16e47c..225cdf3b09b4dda07f262e43c5a9790e0df3b427 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..0050f6695f0dac33ef8c88a0a476b86a59e0310f Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt.cu index e463f557e5b39cb48b896bc7376350f32721535b..91c830c681c1936acf4e10015fc8c50dec6f93d5 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..c2fe7bcbf98c49c3096f070fd3660c86cec41734 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu index db1b87962dc781c3236eadd802f6d26657f59f83..e3ba197ff404d56cbf8547c35f3ea3f2affc05bb 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..fff368ea3961cdf40c9aaf87733fe8e6203cde33 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt.cu index b31e1e828674cf722864141bfdad769242ea33c5..9e41f58231cd9a4372c58ceb08f9acde8a6d64e0 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..511cd5573f9d5c5bb1bec1c39488d74422989699 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn.cu index 32f6726b87a3c1865f3bc6f318e722e162100e48..49de56074234c4b6d3eccdabf152ceaad6e06c26 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..07296250401ae1350e793015e0e797a4e5da420f Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt.cu index 34d8833366255a5f509cec88634c5b3f25e9a541..872b8ded4ccd7baa35769fe183b19003419907f0 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..c7774d6445374273224a8707b960eab0d91b5742 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn.cu index ba1e14b13bef16038713ff208d48613d00cfe36f..a6178562dd271e66c97018590559e4a5239d5aea 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..fccd72b0645add80e87fe857072483ad0f0787eb Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu index dc13d44b39ba61a86f53067c793c2656c5c1eb45..e5c3e2d2ea69dda538040c88a9f090678958a0e1 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..b2fa630980baf4fb37d268e0ff2cf73b6967bc71 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn.cu index fcd6e1a93ec387dd38fec2e4eca537abb316bcd0..a85bae3d807d98621057fde32dc5cf3bb6030743 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..cb855f5541af4bce823f32de29208e47f14c8feb Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt.cu index d88a6bd28868caa5ddd51f283560b5bab1325997..4d8cddb569a6e2d5c7b5897d74f735aa6e1bbe7f 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..ca5408f425724ee4ac413bdb36aacdea83efc183 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu index 6878509e632750c7939a0b8eeb71dfa4cba0966b..7880c3cb34566e30bc90fc25b89235434f38b601 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..6a77f8c79d00687dec9562a2efdd63bcce18b0ec Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt.cu index 121ed9e0149fc6618d67e206768d29aebd49fbe5..6e396c45574793167456b31f547e9cd1949beb29 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..a3a9ba6c2adb21e5b47ed8e001fe0b0589ac75fa Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu index 33ddd12c0f5b6bb268d35bfa0159219591b40b2a..7e4b278b4811f5a512309bae6c3bcaec0e90f2f4 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..05437d7de46595df005f5fce04882b693eae9f6b Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt.cu index d074d4f829584995f2c2fddd5409967848da6e07..61f578ad8758e2e20c3fa1c9d2e8de9f1f7fff0d 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..55eac3eb75786e1ff729ec14a029288894a0e7d3 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu index b4529df8f5280a442f3a8dd77202165e45d7f4ba..0227b52148b0c738c5ef2bdc979bf8ae595c2017 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..5a777e87d2ad465a632f83c95d3aba9447c10008 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt.cu index cac23b2deb5d2b66258b0927740671d0d0f75edf..90a24e98ec94ab1952a220946595a6ebbaec2cb1 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..3b268760120ec72e08baeb6a55edc121004c388b Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu index 7539d009018dacadd71a1b384359003c356f1ab2..ccb3a6fb2b97063ec110cdbf3f35c0befcfe8ae3 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..57f714577be6d41ed07fd5790b86e4b2bc638ef4 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu index 420a2271721c1c309add339748681c80aed6cd2c..296e163de22dd04e5c23ad2bf79cab37d616e187 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..7d9dae19af858ca5afe6248f06f6c2b8b9d179fc Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn.cu index 37d9265ea4c311c2d12a833486773ea0ef10cf22..c964aaf865f832aa9d815172a9c63014af335b41 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..c6be5d7b5c7725e43924fb9408db1dd805010f79 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu index 4ff6119f137894c02fc7f78be0d12519faae7596..9bb8ea8a2e35dceaa177de902b3c844ae34f2814 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..d5f9afb33809e9427f47e314f2d2fb428a611b2d Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu index 52d10eb7ccf6da3ed2b7e9bdbc98edf0808ee495..18047dc247afd53e9bf23c57273dd9ddc77eb3a3 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..7a66c1632b513fc2a87f2470cc87e705c293ac59 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu index 1067544a79082c1cad31cc721aa71fdb2af57bb6..86899145251d59b31bd0027a459b066ac3986d4a 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..78c0283e0865719001348d449c53abc1c2bb6f14 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn.cu index 8dc734f1ca07567bc9700052520694afe4309ca7..c65df06ed00b80a6f5171ff6d870ce4a178dba87 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..fcb716b7701900cabc37b80105a95f99c14910ff Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu index 965a98e2e02feff28ad0519a3ca98e2b1b8e1aac..ecd87aa35bc9b3b21bb70450f4480ea16b1f39a2 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..0afda5f1f3824ca85f65471fc2cfd29de973e3f4 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu index 11829eaf92b48d901a634120f8f1a36d3aa59f1e..ebb5a2f3abc168d6e1bf6e3f1d5938f8fd5a03aa 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..a678b28cbaa9cafa05a668d850588ccc0b67fed1 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu index f302afbffd4578dcd7fb88dce7b3bf04a7bb37b2..f330b6d73cc9eb10cb2d133068565f883618118e 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..44e8a1b14c1a0c9492b0937f41fa721923c51ecd Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu index 04753040fd76d14aafcc55b97221e22ec2b070ba..db6e22b935df5481a92cf228bd0af5c2ee891582 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..55cee82c6deadcbccdb82a1255d3b4277c2c7daf Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu index 11733646250e8918d70427f8f7830370ac21b0aa..161e1337315dd4ac59f5b824d5e3d6e3dc370a54 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..9269ac054465c4a44cf407ae57d5db1ae43e2374 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn.cu index b2c77ea6e73a19a3af36de64d7d69997ecf242e8..929bcdc68d8a0e679a3154ac49ce71b75c340b58 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..71aa87a19d65e3c821baa7d4a28d0c62fa257cf2 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu index 7aac31e0cec013b05a0965d7e5791ba29f5df4d4..cf467004508bd41551eadc14db74497fd20d4eff 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..88f5c826782159a57994196de69d2802574de459 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu index 19ff9eb4ba5d68db8ade04f1d6e97a17ef3acb87..9fea507421b3a7c564298daa21c6271d8bc15882 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..86c8a6e01b3b0e9435d875061afbfd22f01e3b16 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt.cu index 601b715b131ef8dabb1ce9edd6aa06e901056367..9976be6bca2c9da87f455a8a9c38d6300a4b1bc0 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..b452d8ed0c9e204adbae54f2e0b55243f8c08cc5 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn.cu index bd4dbe66d53416861c5edf269455ceabb0ee83d0..32175db5de86c79c1d0c9c1c401ec02b7bbbfbd2 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..a19eb570e95d89e422513f9039d86e3ab0bf86c6 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt.cu index b78e7892563d228e8fd86d612a59380f529cfa40..1ead99173635ea43238c742f3254511f0168cba4 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..71c9cab8696620bdf5fcef03d6aabf0e296c78af Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn.cu index bffbaca96f8af413a6c13ccd911af7287fcfe1fa..919aad63bf936ea876d2811240d66030b16fee49 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..541af533af06947d90ecb8988f4644e64e43fe89 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt.cu index 001247cae57870c6d8fbd5b235d0011902b6c71c..f4928b1e3674bda3f884c208e1fdd2db7bac51a0 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..593a73b6b7d47f1f3e6d6c0c93b486734e6c1d94 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn.cu index 4b7441978e98ac4e94cd4a77d64c8f9193a806f3..2ff883c039ef116e70650132acc06ced7271d828 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..c081e366b1522cb0b4fd5fc78ffd5bb4f96a3735 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt.cu index 5c43eb1a804f8e00a4045bb29433c336c08df75b..f6f214de2168f51da02be17a7e5c695a27ace8cb 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..997a5b23bf5b89ba6a67674701847edd5a9e21ad Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn.cu index 203bd8959a004a5227fad8bd069d45607518076f..67a7c7644abf6915c42625fb804e2bf82ca1b50b 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..f918819aa1a018de556a7dab1d03ff7f01678636 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt.cu index c353c7a281d5da4cf006db37aada058c4b5ac290..a27a779cbedbacd4082c575f6f5b0442d7d50145 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..5d8c09c7cb73628a6cccaabc92a76f584558a6ea Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn.cu index 83930042b27fc6ea9483a4ff50fdd596854810b9..48bd2b0668e4fce70e7b4f8c104f872cd583f663 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..c81cadd1253d4a885666a03c40238e712f5c716a Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt.cu index 30e837535733678179759ef5feb9b49bf6dbd78b..465b736e67f6198a1c26f93cf4c3fa565b84d718 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..e31365d5469d7eb95f490eea949f18b76221d87d Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn.cu index 9dbb5d1b7eb64262f97281f110dee13909007251..023bd24211a6a8a2bad2fc2d692f431eb928309a 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..5b89b5f6300a7d8946af26eb6c856cf6c32ce825 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt.cu index fa6ac73ba8d9cfc632afa2464039ecd724027065..927984ebaa1628b4684b57958f8529d8dae4a291 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..f4638ce13c0b1889c84464c2940df10a8dfeb00a Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn.cu index c2e26af435dfd9c5efd5fc7a60ce46690b09d7e3..2bef3b7d64128dc2bd6387c0ca5c3ff8d71f56c7 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..1ed27985d9654e93fdaa88ae9f4e1b9d2ffe94cb Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt.cu index 75404041f2bb2773e870f994cfab7a585486de4e..576d6663e15ca34f8d4d3a409c9fd5830e604e59 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..a600832e3a6e72403118854a1048cbd4b646b4ce Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn.cu index ab0a848d59744ee6cdc960230f9f6b6d7f8f218d..c4414a1c1f79356a8d5b78e239b3d88c1e73c180 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..7f48f96f3e26c8ccee86b82bd6c787d765986aad Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt.cu index 261779d812e77b4e4a1b144935e75602f66f4901..a4831f9070f730676cdf81b2c3ebe5b4cbb93a28 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..ac2dea6067ffbf08a8cb04f3a18c52ecfc04f497 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn.cu index 8312b40c57107611f8ed5c4459055ec5b763400c..ce7de93b46aa24fe768c796607f575fdf7896d77 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..b8319cd75310f5a27792f9579e9313bffcb90f84 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt.cu index 6ad1b86b9182934e96b64fb6bb5b3155b3fd0d99..254272c00316cfa4fdccb0d4ac65371853e8fd23 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..0b3cdc14631ec5e5ab9b3490d96864f4071b7269 Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn.cu index 0f5579bd035a1d078adff85906fdfc26c34f635d..ea150057f786943e5a855917b666eb9c467841d8 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..bd15a9c301cfa85ac24272084fd836add8bdbaca Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt.cu index ea2147c6ac578a4c9d46b432c082c9779e2d5c66..58b093d41fbbd4bc99c97ac7e91700836f0de8c1 100644 Binary files a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt.cu and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt_splitk_parallel.cu new file mode 100644 index 0000000000000000000000000000000000000000..e4a2e18e6bfa42ff2dcb13cf4b806c94af675a2c Binary files /dev/null and b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt_splitk_parallel.cu differ diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl b/dnn/src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl index d58c804874d508a3d7d2ea8e1532b5e5983018b2..4610a8f64dc9fb892dadaa273e2b4ef7e96bc287 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl +++ b/dnn/src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl @@ -11,6 +11,7 @@ * implied. */ #include "cutlass/gemm/device/gemm.h" +#include "cutlass/gemm/device/gemm_splitk_parallel.h" #include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh" using namespace megdnn; @@ -24,17 +25,21 @@ void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( typename Gemm::ElementC* d_C, size_t ldc, int* workspace, GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream) { - typename Gemm::TensorRefA tensor_a{ - const_cast(d_A), - typename Gemm::LayoutA{static_cast(lda)}}; - typename Gemm::TensorRefB tensor_b{ - const_cast(d_B), - typename Gemm::LayoutB{static_cast(ldb)}}; - typename Gemm::TensorRefC tensor_c{ - nullptr, typename Gemm::LayoutC{static_cast(ldc)}}; - typename Gemm::TensorRefD tensor_d{ - d_C, typename Gemm::LayoutC{static_cast(ldc)}}; + cudaStream_t stream, int split_k_slices) { + using TensorRefA = cutlass::TensorRef; + using TensorRefB = cutlass::TensorRef; + using TensorRefC = cutlass::TensorRef; + using TensorRefD = + cutlass::TensorRef; + TensorRefA tensor_a{const_cast(d_A), + typename Gemm::LayoutA{static_cast(lda)}}; + TensorRefB tensor_b{const_cast(d_B), + typename Gemm::LayoutB{static_cast(ldb)}}; + TensorRefC tensor_c{nullptr, typename Gemm::LayoutC{static_cast(ldc)}}; + TensorRefD tensor_d{d_C, typename Gemm::LayoutC{static_cast(ldc)}}; typename Gemm::Arguments arguments{problem_size, tensor_a, @@ -42,7 +47,7 @@ void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( tensor_c, tensor_d.non_const_ref(), epilogue, - 1}; + split_k_slices}; Gemm gemm_op; cutlass_check(gemm_op.initialize(arguments, workspace)); cutlass_check(gemm_op(stream)); diff --git a/dnn/src/cuda/matrix_mul/opr_impl.h b/dnn/src/cuda/matrix_mul/opr_impl.h index 562154113aa1353471234ec1ea5aee8184cf1eda..b554a9ea2c768746ebc277e559ebd13d01cd70c1 100644 --- a/dnn/src/cuda/matrix_mul/opr_impl.h +++ b/dnn/src/cuda/matrix_mul/opr_impl.h @@ -42,6 +42,7 @@ public: class AlgoBFloat16; #endif class AlgoFloat32SIMT; + class AlgoFloat32SIMTSplitK; class AlgoPack; static const AlgoPack& algo_pack() { diff --git a/dnn/test/common/matrix_mul.cpp b/dnn/test/common/matrix_mul.cpp index 4cbe5c77a00cf312351c0bcac2245c27f1424af7..00b63cdd09e3498e6c30444b917c31ad9ec7c506 100644 --- a/dnn/test/common/matrix_mul.cpp +++ b/dnn/test/common/matrix_mul.cpp @@ -117,6 +117,18 @@ std::vector matrix_mul::get_matmul_args() { return args; } +std::vector matrix_mul::get_matmul_args_split_k() { + std::vector args = get_matmul_args(); + for (auto iter = args.begin(); iter < args.end();) { + if (iter->k <= iter->n) { + iter = args.erase(iter); + } else { + iter++; + } + } + return args; +} + std::vector matrix_mul::get_batched_matmul_args_mask( uint8_t mask) { std::vector args; diff --git a/dnn/test/common/matrix_mul.h b/dnn/test/common/matrix_mul.h index d52f1814aac4120902b9dd68ffa87af0bd16c731..ab3057e09d2bb1ac11d21be56ce105f767508b8e 100644 --- a/dnn/test/common/matrix_mul.h +++ b/dnn/test/common/matrix_mul.h @@ -53,6 +53,7 @@ struct TestArg { std::vector get_matmul_args_no_mask(); std::vector get_matmul_args_mask(uint8_t mask); std::vector get_matmul_args(); +std::vector get_matmul_args_split_k(); std::vector get_batched_matmul_args_mask(uint8_t mask); std::vector get_batched_matmul_args(); std::vector get_batched_matmul_broadcast_args(); diff --git a/dnn/test/cuda/cutlass_matmul.cpp b/dnn/test/cuda/cutlass_matmul.cpp index 55b13c289517c21cb6085bae352cb5402bbb68e8..ae04cd028447ce1bc56455b7c3a762cf68fd137b 100644 --- a/dnn/test/cuda/cutlass_matmul.cpp +++ b/dnn/test/cuda/cutlass_matmul.cpp @@ -21,7 +21,6 @@ #include "test/cuda/fixture.h" #include "test/cuda/utils.h" - #if CUDA_VERSION >= 9020 namespace megdnn { namespace test { @@ -284,6 +283,15 @@ TEST_F(CUDA, CUTLASS_GEMM_MULTI_BATCHSIZE) { param::MatrixMul::Format::DEFAULT); } +TEST_F(CUDA, CUTLASS_GEMM_SPLIT_K_MULTI_BATCHSIZE) { + auto args = matrix_mul::get_matmul_args_no_mask(); + test_multibatchsize( + handle_cuda(), dtype::Float32(), dtype::Float32(), dtype::Float32(), + "CUTLASS_FLOAT32_SIMT_SPLIT_K_128X128X8_32X64X8", args, + param::MatrixMul::Format::DEFAULT, + [](const matrix_mul::TestArg& arg) { return arg.k <= arg.n; }); +} + #define MEGDNN_FOREACH_CUTLASS_KERNEL(cb) \ cb(1, 64, 256, 8, 32, 64, 8); \ cb(2, 256, 64, 8, 64, 32, 8); \ @@ -314,6 +322,21 @@ TEST_F(CUDA, CUTLASS_GEMM_MULTI_BATCHSIZE) { MEGDNN_FOREACH_CUTLASS_KERNEL(cb) +#undef cb + +#define cb(name, tbm, tbn, tbk, wm, wn, wk) \ + TEST_F(CUDA, CUTLASS_GEMM_SPLIT_K_##name) { \ + matrix_mul::check_matrix_mul( \ + dtype::Float32(), dtype::Float32(), dtype::Float32(), \ + handle_cuda(), \ + "CUTLASS_FLOAT32_SIMT_SPLIT_K_" #tbm "X" #tbn "X" #tbk "_" #wm \ + "X" #wn "X" #wk, \ + param::MatrixMul::Format::DEFAULT, 8, 1e-3, \ + matrix_mul::get_matmul_args_split_k()); \ + } + +MEGDNN_FOREACH_CUTLASS_KERNEL(cb) + #undef cb #undef MEGDNN_FOREACH_CUTLASS_KERNEL