From cc07b96f826dcbb972dc3f4c833a5481c6ea2dd5 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Wed, 11 Aug 2021 14:11:00 +0800
Subject: [PATCH] perf(dnn/relayout): disable copy_last_contiguous when contiguous_size is small

GitOrigin-RevId: 7bd5e793a4f252510d739ae8b39046cea969cb58
---
 dnn/src/cuda/relayout/kern_contiguous.cuh | 2 +-
 dnn/src/cuda/relayout/opr_impl.cpp        | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/dnn/src/cuda/relayout/kern_contiguous.cuh b/dnn/src/cuda/relayout/kern_contiguous.cuh
index 8c0740f44..9d6872e0b 100644
--- a/dnn/src/cuda/relayout/kern_contiguous.cuh
+++ b/dnn/src/cuda/relayout/kern_contiguous.cuh
@@ -232,7 +232,7 @@ DEFINE_CONTIG_RECEIVER(7, cb_header, cb_dispatch, m_param[1].layout)
         typedef OpCallerBinaryContiguous Caller;
         size_t size = m_param.size;
         int grid_size, block_size;
-        if (m_contiguous_size > 32) {
+        if (m_contiguous_size >= 32) {
             void (*fptr)(Caller, uint32_t, uint32_t, uint32_t, uint32_t);
             fptr = cuda_last_contiguous_large_kern;
             safe_size_in_kern(size);
diff --git a/dnn/src/cuda/relayout/opr_impl.cpp b/dnn/src/cuda/relayout/opr_impl.cpp
index a55da9348..dda9bb74e 100644
--- a/dnn/src/cuda/relayout/opr_impl.cpp
+++ b/dnn/src/cuda/relayout/opr_impl.cpp
@@ -176,7 +176,8 @@ bool RelayoutForwardImpl::Param::try_copy_last_contig() {
         !has_negative_stride(lsrc) && !has_negative_stride(ldst)) {
         size_t contiguous_size =
                 gcd(lsrc.shape[lsrc.ndim - 1], ldst.shape[ldst.ndim - 1]);
-        if (contiguous_size > 1) {
+        // FIXME: disable copy_last_contiguous when contiguous_size < 32 due to performance issue
+        if (contiguous_size >= 32) {
             copy_last_contiguous(m_dst, m_src, contiguous_size,
                                  m_opr->stream());
             return true;
-- 
GitLab