提交 cc07b96f 编写于 作者: M Megvii Engine Team

perf(dnn/relayout): disable copy_last_contiguous when contiguous_size is small

GitOrigin-RevId: 7bd5e793a4f252510d739ae8b39046cea969cb58
上级 b2918f40
......@@ -232,7 +232,7 @@ DEFINE_CONTIG_RECEIVER(7, cb_header, cb_dispatch, m_param[1].layout)
typedef OpCallerBinaryContiguous<PVis0, PVis1> Caller;
size_t size = m_param.size;
int grid_size, block_size;
-if (m_contiguous_size > 32) {
+if (m_contiguous_size >= 32) {
void (*fptr)(Caller, uint32_t, uint32_t, uint32_t, uint32_t);
fptr = cuda_last_contiguous_large_kern<Caller>;
safe_size_in_kern(size);
......
......@@ -176,7 +176,8 @@ bool RelayoutForwardImpl::Param::try_copy_last_contig() {
!has_negative_stride(lsrc) && !has_negative_stride(ldst)) {
size_t contiguous_size =
gcd(lsrc.shape[lsrc.ndim - 1], ldst.shape[ldst.ndim - 1]);
-if (contiguous_size > 1) {
+// FIXME: disable copy_last_contiguous when contiguous_size < 32 due to performance issue
+if (contiguous_size >= 32) {
copy_last_contiguous(m_dst, m_src, contiguous_size,
m_opr->stream());
return true;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册