From 8b94f4932853fe1a2f0eba5a1763b7102538ab32 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Fri, 10 Sep 2021 11:23:53 +0800 Subject: [PATCH] fix(dnn/cuda): fix elemwise and relayout int4 bug when last shape is 1 GitOrigin-RevId: e7d64c49871032deeda4176289f0457d4b9d85b8 --- dnn/src/common/basic_types.cpp | 8 ++++++++ dnn/src/cuda/elemwise_helper.cpp | 6 ++++++ dnn/src/cuda/elemwise_helper.cuh | 5 ++++- dnn/src/cuda/relayout/param_visitor.cpp | 6 ++++++ dnn/src/cuda/relayout/param_visitor.cuh | 5 ++++- dnn/test/cuda/elemwise_multi_type.cpp | 3 ++- dnn/test/cuda/relayout.cpp | 1 + dnn/test/cuda/type_cvt.cpp | 6 ++++-- 8 files changed, 35 insertions(+), 5 deletions(-) diff --git a/dnn/src/common/basic_types.cpp b/dnn/src/common/basic_types.cpp index eeee2479e..d5b93e581 100644 --- a/dnn/src/common/basic_types.cpp +++ b/dnn/src/common/basic_types.cpp @@ -424,12 +424,20 @@ size_t TensorLayout::access_bytes() const { if (dtype.is_low_bit()) { ret = 1; int align_size_in_elements = 8 / dtype.low_bit(); + auto min_stride = contig.stride[0]; for (size_t i = 0; i < contig.ndim; ++i) { if (contig.stride[i] == 1) { ret *= round_up((int)contig.shape[i], align_size_in_elements); } else { ret *= contig.shape[i]; } + if (min_stride > contig.stride[i]) { + min_stride = contig.stride[i]; + } + } + if (min_stride != 1) { + megdnn_assert(min_stride == align_size_in_elements); + ret *= min_stride; } ret /= align_size_in_elements; } else { diff --git a/dnn/src/cuda/elemwise_helper.cpp b/dnn/src/cuda/elemwise_helper.cpp index 12fb03403..4127f3953 100644 --- a/dnn/src/cuda/elemwise_helper.cpp +++ b/dnn/src/cuda/elemwise_helper.cpp @@ -240,6 +240,7 @@ template void ParamElemVisitor4bitBase::host_init( const TensorND& rv, int /*grid_size*/, int /*block_size*/) { m_ptr = reinterpret_cast(rv.raw_ptr); + auto min_stride = rv.layout.stride[0]; for (size_t i = 0; i < rv.layout.ndim; ++i) { m_stride[i] = rv.layout.stride[i]; m_shape[i] = rv.layout.shape[i]; @@ -251,7 +252,12 @@ void ParamElemVisitor4bitBase::host_init( else m_align_shape_highdim[i] = rv.layout.shape[i + 1]; } + if (min_stride > rv.layout.stride[i]) { + min_stride = rv.layout.stride[i]; + } } + megdnn_assert(min_stride == 1 || min_stride == 2); + m_is_min_stride_2 = (min_stride == 2); for (size_t i = rv.layout.ndim - 1; i < ndim - 1; ++i) { m_shape_highdim[i] = 1; m_align_shape_highdim[i] = 1; diff --git a/dnn/src/cuda/elemwise_helper.cuh b/dnn/src/cuda/elemwise_helper.cuh index 66712dde4..77c555363 100644 --- a/dnn/src/cuda/elemwise_helper.cuh +++ b/dnn/src/cuda/elemwise_helper.cuh @@ -542,6 +542,7 @@ protected: int m_stride[ndim]; int m_shape[ndim]; bool m_is_physical_contiguous; + bool m_is_min_stride_2; //! m_shape_highdim[i] = original_shape[i + 1] #ifdef _MSC_VER @@ -592,7 +593,7 @@ public: int idx = 0; if (m_is_physical_contiguous) { idx = access_idx; - } else { + } else if (!m_is_min_stride_2) { int shape_idx[ndim]; bool valid = true; get_shape_from_access(access_idx, shape_idx); @@ -605,6 +606,8 @@ public: idx = (idx + shape_idx[i]) * m_shape[i + 1]; } idx = valid ? idx + shape_idx[ndim - 1] : -1; + } else { // min_stride == 2 + idx = ((access_idx & 0x1) == 0) ? ((int)access_idx >> 1) : -1; } return idx; } diff --git a/dnn/src/cuda/relayout/param_visitor.cpp b/dnn/src/cuda/relayout/param_visitor.cpp index 73fab4c3f..899bb4b60 100644 --- a/dnn/src/cuda/relayout/param_visitor.cpp +++ b/dnn/src/cuda/relayout/param_visitor.cpp @@ -70,6 +70,7 @@ void ParamElemVisitor::host_init( const TensorND& rv, int /*grid_size*/, int /*block_size*/) { megdnn_assert(rv.layout.ndim && rv.layout.ndim <= ndim); m_ptr = reinterpret_cast(rv.raw_ptr); + auto min_stride = rv.layout.stride[0]; for (size_t i = 0; i < rv.layout.ndim; ++i) { m_stride[i] = rv.layout.stride[i]; m_shape[i] = rv.layout.shape[i]; @@ -81,7 +82,12 @@ void ParamElemVisitor::host_init( else m_align_shape_highdim[i] = rv.layout.shape[i + 1]; } + if (min_stride > rv.layout.stride[i]) { + min_stride = rv.layout.stride[i]; + } } + megdnn_assert(min_stride == 1 || min_stride == 2); + m_is_min_stride_2 = (min_stride == 2); for (size_t i = rv.layout.ndim - 1; i < ndim - 1; ++i) { m_shape_highdim[i] = 1; m_align_shape_highdim[i] = 1; diff --git a/dnn/src/cuda/relayout/param_visitor.cuh b/dnn/src/cuda/relayout/param_visitor.cuh index c33a27757..28af16a57 100644 --- a/dnn/src/cuda/relayout/param_visitor.cuh +++ b/dnn/src/cuda/relayout/param_visitor.cuh @@ -132,6 +132,7 @@ class ParamElemVisitor { int m_shape[ndim]; bool m_is_contiguous; bool m_is_physical_contiguous; + bool m_is_min_stride_2; //! m_shape_highdim[i] = original_shape[i + 1] #ifdef _MSC_VER @@ -197,7 +198,7 @@ public: int idx = 0; if (m_is_physical_contiguous) { idx = access_idx; - } else { + } else if (!m_is_min_stride_2) { int shape_idx[ndim]; bool valid = true; get_shape_from_access(access_idx, shape_idx); @@ -209,6 +210,8 @@ public: idx = (idx + shape_idx[i]) * m_shape[i + 1]; } idx = valid ? idx + shape_idx[ndim - 1] : -1; + } else { // min_stride == 2 + idx = ((access_idx & 0x1) == 0) ? ((int)access_idx >> 1) : -1; } return idx; } diff --git a/dnn/test/cuda/elemwise_multi_type.cpp b/dnn/test/cuda/elemwise_multi_type.cpp index ca771a5ac..6b8e2b4b3 100644 --- a/dnn/test/cuda/elemwise_multi_type.cpp +++ b/dnn/test/cuda/elemwise_multi_type.cpp @@ -152,7 +152,8 @@ static void run_test_q4(int arity, Checker& checker, .execs({{1, 4, 5, 5}, {1, 4, 5, 5}}); } else if (arity == 2) { checker.execs({{3, 4, 5, 6}, {3, 4, 5, 6}, {3, 4, 5, 6}}) - .execs({{1, 4, 5, 5}, {1, 4, 5, 5}, {1, 4, 5, 5}}); + .execs({{1, 4, 5, 5}, {1, 4, 5, 5}, {1, 4, 5, 5}}) + .execs({{2, 2, 3, 1}, {2, 2, 3, 1}, {2, 2, 3, 1}}); } else { megdnn_assert(0); } diff --git a/dnn/test/cuda/relayout.cpp b/dnn/test/cuda/relayout.cpp index 5ad2bacf9..82ae72847 100644 --- a/dnn/test/cuda/relayout.cpp +++ b/dnn/test/cuda/relayout.cpp @@ -925,6 +925,7 @@ TEST_F(CUDA, RELAYOUT_Q4) { .set_rng(1, &rng_int4) .set_dtype(0, dtype::QuantizedS4(1.f)) .set_dtype(1, dtype::QuantizedS4(1.f)) + .execs({{2, 2, 1, 1}, {1, 1, 2, 2}}) .execs({{1, 64, 15, 15}, {1, 15, 15, 64}}) .execs({{1, 5, 9, 32}, {1, 5, 32, 9}}) .execl(TensorLayoutArray{ diff --git a/dnn/test/cuda/type_cvt.cpp b/dnn/test/cuda/type_cvt.cpp index 5bdde41b9..5759d3f80 100644 --- a/dnn/test/cuda/type_cvt.cpp +++ b/dnn/test/cuda/type_cvt.cpp @@ -123,11 +123,13 @@ TEST_F(CUDA, QUANTIZED_TYPECVT_4BIT) { set_err(dst_dtype); checker.set_dtype(0, src_dtype) .set_dtype(1, dst_dtype) - .execs({{16, 3, 224, 223}, {16, 3, 224, 223}}); + .execs({{16, 3, 224, 223}, {16, 3, 224, 223}}) + .execs({{16, 3, 224, 1}, {16, 3, 224, 1}}); set_err(src_dtype); checker.set_dtype(0, dst_dtype) .set_dtype(1, src_dtype) - .execs({{16, 3, 224, 223}, {16, 3, 224, 223}}); + .execs({{16, 3, 224, 223}, {16, 3, 224, 223}}) + .execs({{16, 3, 224, 1}, {16, 3, 224, 1}}); }; run(dtype::Quantized4Asymm{1.19990518f, 8}, -- GitLab