diff --git a/dnn/src/common/basic_types.cpp b/dnn/src/common/basic_types.cpp index eeee2479eca205936c107b599e7e41b9dd9f74fb..d5b93e581fc11eb881b36c7c9caa82cf9ff1fc3e 100644 --- a/dnn/src/common/basic_types.cpp +++ b/dnn/src/common/basic_types.cpp @@ -424,12 +424,20 @@ size_t TensorLayout::access_bytes() const { if (dtype.is_low_bit()) { ret = 1; int align_size_in_elements = 8 / dtype.low_bit(); + auto min_stride = contig.stride[0]; for (size_t i = 0; i < contig.ndim; ++i) { if (contig.stride[i] == 1) { ret *= round_up((int)contig.shape[i], align_size_in_elements); } else { ret *= contig.shape[i]; } + if (min_stride > contig.stride[i]) { + min_stride = contig.stride[i]; + } + } + if (min_stride != 1) { + megdnn_assert(min_stride == align_size_in_elements); + ret *= min_stride; } ret /= align_size_in_elements; } else { diff --git a/dnn/src/cuda/elemwise_helper.cpp b/dnn/src/cuda/elemwise_helper.cpp index 12fb03403f175f7d72c4e6285882b0a79a682994..4127f3953e2b86e47476f23b0dd272700ac3c9be 100644 --- a/dnn/src/cuda/elemwise_helper.cpp +++ b/dnn/src/cuda/elemwise_helper.cpp @@ -240,6 +240,7 @@ template void ParamElemVisitor4bitBase::host_init( const TensorND& rv, int /*grid_size*/, int /*block_size*/) { m_ptr = reinterpret_cast(rv.raw_ptr); + auto min_stride = rv.layout.stride[0]; for (size_t i = 0; i < rv.layout.ndim; ++i) { m_stride[i] = rv.layout.stride[i]; m_shape[i] = rv.layout.shape[i]; @@ -251,7 +252,12 @@ void ParamElemVisitor4bitBase::host_init( else m_align_shape_highdim[i] = rv.layout.shape[i + 1]; } + if (min_stride > rv.layout.stride[i]) { + min_stride = rv.layout.stride[i]; + } } + megdnn_assert(min_stride == 1 || min_stride == 2); + m_is_min_stride_2 = (min_stride == 2); for (size_t i = rv.layout.ndim - 1; i < ndim - 1; ++i) { m_shape_highdim[i] = 1; m_align_shape_highdim[i] = 1; diff --git a/dnn/src/cuda/elemwise_helper.cuh b/dnn/src/cuda/elemwise_helper.cuh index 66712dde4a54fba193a194fe963cca1f90c4df1f..77c555363e4b312ed459d234f3952c280935d608 100644 --- a/dnn/src/cuda/elemwise_helper.cuh +++ b/dnn/src/cuda/elemwise_helper.cuh @@ -542,6 +542,7 @@ protected: int m_stride[ndim]; int m_shape[ndim]; bool m_is_physical_contiguous; + bool m_is_min_stride_2; //! m_shape_highdim[i] = original_shape[i + 1] #ifdef _MSC_VER @@ -592,7 +593,7 @@ public: int idx = 0; if (m_is_physical_contiguous) { idx = access_idx; - } else { + } else if (!m_is_min_stride_2) { int shape_idx[ndim]; bool valid = true; get_shape_from_access(access_idx, shape_idx); @@ -605,6 +606,8 @@ public: idx = (idx + shape_idx[i]) * m_shape[i + 1]; } idx = valid ? idx + shape_idx[ndim - 1] : -1; + } else { // min_stride == 2 + idx = ((access_idx & 0x1) == 0) ? ((int)access_idx >> 1) : -1; } return idx; } diff --git a/dnn/src/cuda/relayout/param_visitor.cpp b/dnn/src/cuda/relayout/param_visitor.cpp index 73fab4c3f8f88190efbc47fd618b2524439ee85c..899bb4b602c45680f108a9bb408e943259c62a45 100644 --- a/dnn/src/cuda/relayout/param_visitor.cpp +++ b/dnn/src/cuda/relayout/param_visitor.cpp @@ -70,6 +70,7 @@ void ParamElemVisitor::host_init( const TensorND& rv, int /*grid_size*/, int /*block_size*/) { megdnn_assert(rv.layout.ndim && rv.layout.ndim <= ndim); m_ptr = reinterpret_cast(rv.raw_ptr); + auto min_stride = rv.layout.stride[0]; for (size_t i = 0; i < rv.layout.ndim; ++i) { m_stride[i] = rv.layout.stride[i]; m_shape[i] = rv.layout.shape[i]; @@ -81,7 +82,12 @@ void ParamElemVisitor::host_init( else m_align_shape_highdim[i] = rv.layout.shape[i + 1]; } + if (min_stride > rv.layout.stride[i]) { + min_stride = rv.layout.stride[i]; + } } + megdnn_assert(min_stride == 1 || min_stride == 2); + m_is_min_stride_2 = (min_stride == 2); for (size_t i = rv.layout.ndim - 1; i < ndim - 1; ++i) { m_shape_highdim[i] = 1; m_align_shape_highdim[i] = 1; diff --git a/dnn/src/cuda/relayout/param_visitor.cuh b/dnn/src/cuda/relayout/param_visitor.cuh index c33a27757a9c992a68b82352c720d0e1a6079e22..28af16a5786e5b5a53a7811bb36b7eca31a1f9ce 100644 --- a/dnn/src/cuda/relayout/param_visitor.cuh +++ b/dnn/src/cuda/relayout/param_visitor.cuh @@ -132,6 +132,7 @@ class ParamElemVisitor { int m_shape[ndim]; bool m_is_contiguous; bool m_is_physical_contiguous; + bool m_is_min_stride_2; //! m_shape_highdim[i] = original_shape[i + 1] #ifdef _MSC_VER @@ -197,7 +198,7 @@ public: int idx = 0; if (m_is_physical_contiguous) { idx = access_idx; - } else { + } else if (!m_is_min_stride_2) { int shape_idx[ndim]; bool valid = true; get_shape_from_access(access_idx, shape_idx); @@ -209,6 +210,8 @@ public: idx = (idx + shape_idx[i]) * m_shape[i + 1]; } idx = valid ? idx + shape_idx[ndim - 1] : -1; + } else { // min_stride == 2 + idx = ((access_idx & 0x1) == 0) ? ((int)access_idx >> 1) : -1; } return idx; } diff --git a/dnn/test/cuda/elemwise_multi_type.cpp b/dnn/test/cuda/elemwise_multi_type.cpp index ca771a5acb7660b387f90510d68fdfd0f57978e0..6b8e2b4b3a5f292249f2ad1e09860aaadcfec90d 100644 --- a/dnn/test/cuda/elemwise_multi_type.cpp +++ b/dnn/test/cuda/elemwise_multi_type.cpp @@ -152,7 +152,8 @@ static void run_test_q4(int arity, Checker& checker, .execs({{1, 4, 5, 5}, {1, 4, 5, 5}}); } else if (arity == 2) { checker.execs({{3, 4, 5, 6}, {3, 4, 5, 6}, {3, 4, 5, 6}}) - .execs({{1, 4, 5, 5}, {1, 4, 5, 5}, {1, 4, 5, 5}}); + .execs({{1, 4, 5, 5}, {1, 4, 5, 5}, {1, 4, 5, 5}}) + .execs({{2, 2, 3, 1}, {2, 2, 3, 1}, {2, 2, 3, 1}}); } else { megdnn_assert(0); } diff --git a/dnn/test/cuda/relayout.cpp b/dnn/test/cuda/relayout.cpp index 5ad2bacf9a9f1af086f12a1e313ceec23c2770a5..82ae728474822943c5b6a9cd5f1f9c4cca694920 100644 --- a/dnn/test/cuda/relayout.cpp +++ b/dnn/test/cuda/relayout.cpp @@ -925,6 +925,7 @@ TEST_F(CUDA, RELAYOUT_Q4) { .set_rng(1, &rng_int4) .set_dtype(0, dtype::QuantizedS4(1.f)) .set_dtype(1, dtype::QuantizedS4(1.f)) + .execs({{2, 2, 1, 1}, {1, 1, 2, 2}}) .execs({{1, 64, 15, 15}, {1, 15, 15, 64}}) .execs({{1, 5, 9, 32}, {1, 5, 32, 9}}) .execl(TensorLayoutArray{ diff --git a/dnn/test/cuda/type_cvt.cpp b/dnn/test/cuda/type_cvt.cpp index 5bdde41b9a3a4f7bb730aec1644de7fa8f9188a5..5759d3f8058cf9af1e4128459721bcc9e1375ead 100644 --- a/dnn/test/cuda/type_cvt.cpp +++ b/dnn/test/cuda/type_cvt.cpp @@ -123,11 +123,13 @@ TEST_F(CUDA, QUANTIZED_TYPECVT_4BIT) { set_err(dst_dtype); checker.set_dtype(0, src_dtype) .set_dtype(1, dst_dtype) - .execs({{16, 3, 224, 223}, {16, 3, 224, 223}}); + .execs({{16, 3, 224, 223}, {16, 3, 224, 223}}) + .execs({{16, 3, 224, 1}, {16, 3, 224, 1}}); set_err(src_dtype); checker.set_dtype(0, dst_dtype) .set_dtype(1, src_dtype) - .execs({{16, 3, 224, 223}, {16, 3, 224, 223}}); + .execs({{16, 3, 224, 223}, {16, 3, 224, 223}}) + .execs({{16, 3, 224, 1}, {16, 3, 224, 1}}); }; run(dtype::Quantized4Asymm{1.19990518f, 8},