From 8da2f698a37dd9a6dfc09efb7c190cd468a4fffa Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Mon, 12 Apr 2021 19:48:33 +0800 Subject: [PATCH] feat(dnn/cuda): support warp perspective/pooling op when channel not aligned to 64 GitOrigin-RevId: 39f29ec990ebb4c61e8d217911976b51810df34c --- dnn/src/common/tensor_format.cpp | 2 +- dnn/src/cuda/elemwise_helper.cuh | 5 +- dnn/src/cuda/elemwise_multi_type/kern_ops.cuh | 49 ++-- dnn/src/cuda/integer_subbyte_utils.cuh | 146 ++++++++++ dnn/src/cuda/memory_utils.cuh | 253 ++++++++++++++++++ dnn/src/cuda/relayout/opr_impl.cpp | 8 +- dnn/src/cuda/relayout/param_visitor.cuh | 28 +- dnn/src/cuda/relayout_format/helper.cuh | 103 +++++++ .../cuda/relayout_format/relayout_format.cu | 140 +++++----- dnn/src/cuda/type_cvt/kern.cu | 10 +- dnn/src/cuda/utils.cuh | 99 ------- dnn/src/cuda/warp_perspective/forward.cpp | 1 + dnn/src/cuda/warp_perspective/forward.cu | 2 + dnn/test/cuda/pooling.cpp | 7 +- dnn/test/cuda/type_cvt.cpp | 2 +- dnn/test/cuda/warp_perspective.cpp | 4 +- src/gopt/impl/tensor_reformat.cpp | 24 +- src/gopt/test/inference.cpp | 189 ++++++------- src/opr/impl/imgproc.cpp | 3 +- src/opr/test/dnn/convolution.cpp | 31 ++- src/plugin/impl/opr_footprint.cpp | 20 +- test/src/helper.cpp | 32 ++- test/src/include/megbrain/test/helper.h | 9 + 23 files changed, 812 insertions(+), 355 deletions(-) create mode 100644 dnn/src/cuda/integer_subbyte_utils.cuh create mode 100644 dnn/src/cuda/memory_utils.cuh diff --git a/dnn/src/common/tensor_format.cpp b/dnn/src/common/tensor_format.cpp index 4e5a1c0d4..2ed854cdf 100644 --- a/dnn/src/common/tensor_format.cpp +++ b/dnn/src/common/tensor_format.cpp @@ -568,7 +568,7 @@ TensorLayout LowbitsAlignedTensorFormatBase::collapse_contiguous_spec( res.stride[0] = 1; return res; } - if (res.shape[i] == 1) { + if (res.shape[i] == 1 && res.stride[i] != 1) { res.remove_axis_inplace(i); } } diff --git a/dnn/src/cuda/elemwise_helper.cuh b/dnn/src/cuda/elemwise_helper.cuh index 3b62312b0..5424389bc 100644 --- a/dnn/src/cuda/elemwise_helper.cuh +++ b/dnn/src/cuda/elemwise_helper.cuh @@ -16,6 +16,7 @@ #include "src/cuda/int_fastdiv.cuh" #include "src/cuda/query_blocksize.cuh" #include "src/cuda/utils.cuh" +#include "src/cuda/integer_subbyte_utils.cuh" /* * please note that all arithmetics on GPU are 32-bit for best performance; this @@ -633,7 +634,7 @@ public: int vec_idx = offset_ >> 1; int lane_idx = offset_ & 0x1; - Storage item = Storage(unpack_integer_4bits( + Storage item = Storage(integer_subbyte::unpack_integer_4bits( *(Storage*)&Super::m_ptr[vec_idx], lane_idx * 4)); dt_qint4 result(item); @@ -664,7 +665,7 @@ public: int vec_idx = offset_ >> 1; int lane_idx = offset_ & 0x1; - Storage item = Storage(unpack_integer_4bits( + Storage item = Storage(integer_subbyte::unpack_integer_4bits( *(Storage*)&Super::m_ptr[vec_idx], lane_idx * 4)); dt_quint4 result(item); diff --git a/dnn/src/cuda/elemwise_multi_type/kern_ops.cuh b/dnn/src/cuda/elemwise_multi_type/kern_ops.cuh index 658f12d89..9386f4c70 100644 --- a/dnn/src/cuda/elemwise_multi_type/kern_ops.cuh +++ b/dnn/src/cuda/elemwise_multi_type/kern_ops.cuh @@ -15,6 +15,7 @@ #include "src/cuda/elemwise_helper_q4.cuh" #include "src/cuda/elemwise_multi_type/kern.cuh" #include "src/cuda/utils.cuh" +#include "src/cuda/integer_subbyte_utils.cuh" namespace megdnn { namespace cuda { @@ -380,10 +381,10 @@ struct QuantizedMultiTypeOp< } __device__ __forceinline__ void operator()(uint32_t idx, src_vect_type a) { - dst_storage x = apply( - src_storage(unpack_integer_4bits(a.x, 0))); - 
dst_storage y = apply( - src_storage(unpack_integer_4bits(a.x, 4))); + dst_storage x = apply(src_storage( + integer_subbyte::unpack_integer_4bits(a.x, 0))); + dst_storage y = apply(src_storage( + integer_subbyte::unpack_integer_4bits(a.x, 4))); *(dst_vect_type*)(&dst[idx]) = elemwise_intl::VectTypeTrait::make_vector(x, y); @@ -470,14 +471,14 @@ struct QuantizedMultiTypeOp< __device__ __forceinline__ void operator()(uint32_t idx, src_vect_type a, src_vect_type b) { - src_storage a_x = - src_storage(unpack_integer_4bits(a.x, 0)); - src_storage a_y = - src_storage(unpack_integer_4bits(a.x, 4)); - src_storage b_x = - src_storage(unpack_integer_4bits(b.x, 0)); - src_storage b_y = - src_storage(unpack_integer_4bits(b.x, 4)); + src_storage a_x = src_storage( + integer_subbyte::unpack_integer_4bits(a.x, 0)); + src_storage a_y = src_storage( + integer_subbyte::unpack_integer_4bits(a.x, 4)); + src_storage b_x = src_storage( + integer_subbyte::unpack_integer_4bits(b.x, 0)); + src_storage b_y = src_storage( + integer_subbyte::unpack_integer_4bits(b.x, 4)); dst_storage x = apply(a_x, b_x), y = apply(a_y, b_y); @@ -572,18 +573,18 @@ struct QuantizedMultiTypeOp< __device__ __forceinline__ void operator()(uint32_t idx, src_vect_type a, src_vect_type b, src_vect_type c) { - src_storage a_x = - src_storage(unpack_integer_4bits(a.x, 0)); - src_storage a_y = - src_storage(unpack_integer_4bits(a.x, 4)); - src_storage b_x = - src_storage(unpack_integer_4bits(b.x, 0)); - src_storage b_y = - src_storage(unpack_integer_4bits(b.x, 4)); - src_storage c_x = - src_storage(unpack_integer_4bits(c.x, 0)); - src_storage c_y = - src_storage(unpack_integer_4bits(c.x, 4)); + src_storage a_x = src_storage( + integer_subbyte::unpack_integer_4bits(a.x, 0)); + src_storage a_y = src_storage( + integer_subbyte::unpack_integer_4bits(a.x, 4)); + src_storage b_x = src_storage( + integer_subbyte::unpack_integer_4bits(b.x, 0)); + src_storage b_y = src_storage( + integer_subbyte::unpack_integer_4bits(b.x, 4)); + src_storage c_x = src_storage( + integer_subbyte::unpack_integer_4bits(c.x, 0)); + src_storage c_y = src_storage( + integer_subbyte::unpack_integer_4bits(c.x, 4)); dst_storage x = apply(a_x, b_x, c_x), y = apply(a_y, b_y, c_y); diff --git a/dnn/src/cuda/integer_subbyte_utils.cuh b/dnn/src/cuda/integer_subbyte_utils.cuh new file mode 100644 index 000000000..d9a80ac8a --- /dev/null +++ b/dnn/src/cuda/integer_subbyte_utils.cuh @@ -0,0 +1,146 @@ +/** + * \file dnn/src/cuda/integer_subbyte_utils.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#if MEGDNN_CC_CUDA +#pragma once +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace integer_subbyte { +template +struct integer_trait; + +template <> +struct integer_trait { + using type = int; +}; + +template <> +struct integer_trait { + using type = unsigned; +}; + +MEGDNN_DEVICE __forceinline__ static int transform_int8_to_int4x8( + int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7) { + unsigned out; +#if __CUDA_ARCH__ >= 750 && \ + ((__CUDACC_VER_MAJOR__ > 10) || \ + ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2))) + asm volatile( + "{ .reg .u32 r4;" + "cvt.pack.sat.s4.s32.b32 r4, %8, %7, 0;" + "cvt.pack.sat.s4.s32.b32 r4, %6, %5, r4;" + "cvt.pack.sat.s4.s32.b32 r4, %4, %3, r4;" + "cvt.pack.sat.s4.s32.b32 %0, %2, %1, r4;" + "}" + : "=r"(out) + : "r"(s0), "r"(s1), "r"(s2), "r"(s3), "r"(s4), "r"(s5), "r"(s6), + "r"(s7)); +#else +#define CVT_SAT_S4_S32(r, bits) \ + r = r <= -8 ? -8 : r; \ + r = r > 7 ? 7 : r; \ + r = (((unsigned)r & 0xf) << bits); + CVT_SAT_S4_S32(s0, 0) + CVT_SAT_S4_S32(s1, 4) + CVT_SAT_S4_S32(s2, 8) + CVT_SAT_S4_S32(s3, 12) + CVT_SAT_S4_S32(s4, 16) + CVT_SAT_S4_S32(s5, 20) + CVT_SAT_S4_S32(s6, 24) + CVT_SAT_S4_S32(s7, 28) + out = s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7; +#undef CVT_SAT_S4_S32 +#endif + return reinterpret_cast(out); +} + +MEGDNN_DEVICE __forceinline__ static int transform_int8_to_uint4x8( + int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7) { + unsigned out; +#if __CUDA_ARCH__ >= 750 && \ + ((__CUDACC_VER_MAJOR__ > 10) || \ + ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2))) + asm volatile( + "{ .reg .u32 r4;" + "cvt.pack.sat.u4.s32.b32 r4, %8, %7, 0;" + "cvt.pack.sat.u4.s32.b32 r4, %6, %5, r4;" + "cvt.pack.sat.u4.s32.b32 r4, %4, %3, r4;" + "cvt.pack.sat.u4.s32.b32 %0, %2, %1, r4;" + "}" + : "=r"(out) + : "r"(s0), "r"(s1), "r"(s2), "r"(s3), "r"(s4), "r"(s5), "r"(s6), + "r"(s7)); +#else +#define CVT_SAT_U4_S32(r, bits) \ + r = r <= 0 ? 0 : r; \ + r = r > 15 ? 15 : r; \ + r = (((unsigned)r & 0xf) << bits); + CVT_SAT_U4_S32(s0, 0) + CVT_SAT_U4_S32(s1, 4) + CVT_SAT_U4_S32(s2, 8) + CVT_SAT_U4_S32(s3, 12) + CVT_SAT_U4_S32(s4, 16) + CVT_SAT_U4_S32(s5, 20) + CVT_SAT_U4_S32(s6, 24) + CVT_SAT_U4_S32(s7, 28) + out = s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7; +#undef CVT_SAT_U4_S32 +#endif + return reinterpret_cast(out); +} + +template +MEGDNN_DEVICE __forceinline__ static int unpack_integer_4bits(T storage, + int bits) { + //! 
size in bits of 32 bit integer - 4 bits + static constexpr int shift = 28; + using type = typename integer_trait::type; + unsigned intermediate = static_cast(storage); + type result = reinterpret_cast(intermediate); + return (result << (shift - bits)) >> shift; +} + +MEGDNN_DEVICE __forceinline__ static void transform_int4x8_to_int8( + int (&result)[8], const int& source) { +#pragma unroll + for (int i = 0; i < 8; i++) { + result[i] = unpack_integer_4bits( + reinterpret_cast(source), (i << 2)); + } +} + +MEGDNN_DEVICE __forceinline__ static void transform_uint4x8_to_int8( + int (&result)[8], const int& source) { +#pragma unroll + for (int i = 0; i < 8; i++) { + result[i] = unpack_integer_4bits( + reinterpret_cast(source), (i << 2)); + } +} + +MEGDNN_DEVICE __forceinline__ static void transform_int4x2_to_int8( + int (&result)[2], const uint8_t& source) { + result[0] = unpack_integer_4bits(source, 0); + result[1] = unpack_integer_4bits(source, 4); +} + +MEGDNN_DEVICE __forceinline__ static void transform_uint4x2_to_int8( + int (&result)[2], const uint8_t& source) { + result[0] = unpack_integer_4bits(source, 0); + result[1] = unpack_integer_4bits(source, 4); +} +} // namespace integer_subbyte +} // namespace cuda +} // namespace megdnn +#endif +// vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/memory_utils.cuh b/dnn/src/cuda/memory_utils.cuh new file mode 100644 index 000000000..82c718ed3 --- /dev/null +++ b/dnn/src/cuda/memory_utils.cuh @@ -0,0 +1,253 @@ +/** + * \file dnn/src/cuda/memory_utils.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#if MEGDNN_CC_CUDA +#pragma once +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace memory { + +///////////////////////////////////////////////////////////////////////////////////////////////// +template +struct global_load; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Specializations +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// The redundant mov PTX instruction is used to enforce the compiler to +// initialize data to zero before ld.global +template +struct global_load { + MEGDNN_DEVICE __forceinline__ global_load(AccessType& D, void const* ptr, + bool pred_guard, int val = 0) { + uint4* data = reinterpret_cast(&D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %9, 0;\n" + " mov.b32 %0, %10;\n" + " mov.b32 %1, %10;\n" + " mov.b32 %2, %10;\n" + " mov.b32 %3, %10;\n" + " mov.b32 %4, %10;\n" + " mov.b32 %5, %10;\n" + " mov.b32 %6, %10;\n" + " mov.b32 %7, %10;\n" + " @p ld.global.v4.u32 {%0, %1, %2, %3}, [%8];\n" + " @p ld.global.v4.u32 {%4, %5, %6, %7}, [%11];\n" + "}\n" + : "=r"(data[0].x), "=r"(data[0].y), "=r"(data[0].z), + "=r"(data[0].w), "=r"(data[1].x), "=r"(data[1].y), + "=r"(data[1].z), "=r"(data[1].w) + : "l"(ptr), "r"((int)pred_guard), + "r"(reinterpret_cast(val)), + "l"(((uint8_t*)ptr) + 16)); + } +}; + +template +struct global_load { + MEGDNN_DEVICE __forceinline__ global_load(AccessType& D, void const* ptr, + bool pred_guard, int val) { + uint4& data = reinterpret_cast(D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %5, 0;\n" + " mov.b32 %0, %6;\n" + " mov.b32 %1, %6;\n" + " mov.b32 %2, %6;\n" + " mov.b32 %3, %6;\n" + " @p ld.global.v4.u32 {%0, %1, %2, %3}, [%4];\n" + "}\n" + : "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w) + : "l"(ptr), "r"((int)pred_guard), + "r"(reinterpret_cast(val))); + } +}; + +template +struct global_load { + MEGDNN_DEVICE __forceinline__ global_load(AccessType& D, void const* ptr, + bool pred_guard, int val) { + uint2& data = reinterpret_cast(D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %3, 0;\n" + " mov.b32 %0, %4;\n" + " mov.b32 %1, %4;\n" + " @p ld.global.v2.u32 {%0, %1}, [%2];\n" + "}\n" + : "=r"(data.x), "=r"(data.y) + : "l"(ptr), "r"((int)pred_guard), + "r"(reinterpret_cast(val))); + } +}; + +template +struct global_load { + MEGDNN_DEVICE __forceinline__ global_load(AccessType& D, void const* ptr, + bool pred_guard, int val) { + unsigned& data = reinterpret_cast(D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %2, 0;\n" + " mov.b32 %0, %3;\n" + " @p ld.global.u32 %0, [%1];\n" + "}\n" + : "=r"(data) + : "l"(ptr), "r"((int)pred_guard), + "r"(reinterpret_cast(val))); + } +}; + +template +struct global_load { + MEGDNN_DEVICE __forceinline__ global_load(AccessType& D, void const* ptr, + bool pred_guard, int val) { + if (pred_guard) + D = *(reinterpret_cast(ptr)); + else { + unsigned uv = reinterpret_cast(val); + uint8_t& data = reinterpret_cast(D); + data = uv & 0xff; + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + /// Fragment type to store loaded data + typename AccessType, + /// The bytes of loading + int LoadBytes> +struct global_store; + 
+///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Specializations +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct global_store { + MEGDNN_DEVICE __forceinline__ global_store(AccessType const& D, void* ptr, + bool pred_guard) { + uint4 const* data = reinterpret_cast(&D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %5, 0;\n" + " @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n" + " @p st.global.v4.u32 [%6], {%7, %8, %9, %10};\n" + "}\n" + : + : "l"(ptr), "r"(data[0].x), "r"(data[0].y), "r"(data[0].z), + "r"(data[0].w), "r"((int)pred_guard), + "l"(((uint8_t*)ptr) + 16), "r"(data[1].x), "r"(data[1].y), + "r"(data[1].z), "r"(data[1].w)); + } +}; + +template +struct global_store { + MEGDNN_DEVICE __forceinline__ global_store(AccessType const& D, void* ptr, + bool pred_guard) { + uint4 const& data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %5, 0;\n" + " @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n" + "}\n" + : + : "l"(ptr), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w), + "r"((int)pred_guard)); + } +}; + +template +struct global_store { + MEGDNN_DEVICE __forceinline__ global_store(AccessType const& D, void* ptr, + bool pred_guard) { + uint2 const& data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %3, 0;\n" + " @p st.global.v2.u32 [%0], {%1, %2};\n" + "}\n" + : + : "l"(ptr), "r"(data.x), "r"(data.y), "r"((int)pred_guard)); + } +}; + +template +struct global_store { + MEGDNN_DEVICE __forceinline__ global_store(AccessType const& D, void* ptr, + bool pred_guard) { + uint32_t const& data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %2, 0;\n" + " @p st.global.u32 [%0], %1;\n" + "}\n" + : + : "l"(ptr), "r"(data), "r"((int)pred_guard)); + } +}; + +template +struct global_store { + MEGDNN_DEVICE __forceinline__ global_store(AccessType const& D, void* ptr, + bool pred_guard) { + uint16_t const& data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %2, 0;\n" + " @p st.global.u16 [%0], %1;\n" + "}\n" + : + : "l"(ptr), "h"(data), "r"((int)pred_guard)); + } +}; + +template +struct global_store { + MEGDNN_DEVICE __forceinline__ global_store(AccessType const& D, void* ptr, + bool pred_guard) { + if (pred_guard) + *(reinterpret_cast(ptr)) = D; + } +}; + +} // namespace memory +} // namespace cuda +} // namespace megdnn +#endif + +// vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/relayout/opr_impl.cpp b/dnn/src/cuda/relayout/opr_impl.cpp index 64da13e7d..a55da9348 100644 --- a/dnn/src/cuda/relayout/opr_impl.cpp +++ b/dnn/src/cuda/relayout/opr_impl.cpp @@ -83,12 +83,7 @@ bool RelayoutForwardImpl::Param::try_copy_contig() { return false; if (lsrc.stride[0] != 1 || ldst.stride[0] != 1) return false; - size_t copy_size; - if (ldst.dtype.is_low_bit()) { - copy_size = ldst.access_bytes(); - } else { - copy_size = ldst.total_nr_elems() * dtype_size(); - } + size_t copy_size = ldst.span().dist_byte(); cuda_check(cudaMemcpyAsync(m_dst.raw_ptr, m_src.raw_ptr, copy_size, cudaMemcpyDeviceToDevice, m_opr->stream())); @@ -191,7 +186,6 @@ bool RelayoutForwardImpl::Param::try_copy_last_contig() { } void RelayoutForwardImpl::Param::copy_general() { - copy_noncontig_general(m_dst, m_src, m_opr->stream()); } diff --git a/dnn/src/cuda/relayout/param_visitor.cuh 
b/dnn/src/cuda/relayout/param_visitor.cuh index a44fb1b4d..c33a27757 100644 --- a/dnn/src/cuda/relayout/param_visitor.cuh +++ b/dnn/src/cuda/relayout/param_visitor.cuh @@ -6,14 +6,15 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "megdnn/basic_types.h" #include "src/cuda/int_fastdiv.cuh" +#include "src/cuda/integer_subbyte_utils.cuh" #include "src/cuda/utils.cuh" - #pragma once namespace megdnn { @@ -56,13 +57,13 @@ y template class ParamElemVisitor; #define PARAM_ELEM_VISITOR_COMMON_DEV \ - devfunc ctype *ptr() { return m_ptr; } \ - devfunc ctype &at(uint32_t idx) { return m_ptr[offset(idx)]; } + devfunc ctype* ptr() { return m_ptr; } \ + devfunc ctype& at(uint32_t idx) { return m_ptr[offset(idx)]; } //! specialization for CONTIG_OTHER template class ParamElemVisitor { - ctype *__restrict m_ptr; + ctype* __restrict m_ptr; int m_stride[ndim]; //! m_shape_highdim[i] = original_shape[i + 1] @@ -75,7 +76,7 @@ class ParamElemVisitor { public: static const int NDIM = ndim; - void host_init(const TensorND &rv, int grid_size, int block_size); + void host_init(const TensorND& rv, int grid_size, int block_size); #if MEGDNN_CC_CUDA devfunc void thread_init(uint32_t) {} @@ -86,7 +87,7 @@ public: int offset = 0; #pragma unroll for (int i = ndim - 1; i >= 1; --i) { - Uint32Fastdiv &shp = m_shape_highdim[i - 1]; + Uint32Fastdiv& shp = m_shape_highdim[i - 1]; uint32_t idx_div = idx / shp; offset += (idx - idx_div * shp.divisor()) * m_stride[i]; idx = idx_div; @@ -102,12 +103,12 @@ public: //! 
specialization for CONTIG_FULL template class ParamElemVisitor { - ctype *__restrict m_ptr; + ctype* __restrict m_ptr; public: static const int NDIM = ndim; - void host_init(const TensorND &rv, int grid_size, int block_size); + void host_init(const TensorND& rv, int grid_size, int block_size); #if MEGDNN_CC_CUDA devfunc void thread_init(uint32_t) {} @@ -126,7 +127,6 @@ template class ParamElemVisitor { using Storage = uint8_t; -protected: Storage* __restrict m_ptr; int m_stride[ndim]; int m_shape[ndim]; @@ -205,7 +205,6 @@ public: for (int i = 0; i < ndim; ++i) { valid &= (shape_idx[i] < m_shape[i]); } -#pragma unroll for (int i = 0; i < ndim - 1; ++i) { idx = (idx + shape_idx[i]) * m_shape[i + 1]; } @@ -213,7 +212,6 @@ public: } return idx; } - devfunc Storage* ptr() { return m_ptr; } devfunc Storage at(uint32_t idx) { @@ -221,7 +219,7 @@ public: int vec_idx = offset_ >> 1; int lane_idx = offset_ & 0x1; - Storage item = Storage(unpack_integer_4bits( + Storage item = Storage(integer_subbyte::unpack_integer_4bits( *(Storage*)&m_ptr[vec_idx], lane_idx * 4)); return item; @@ -235,7 +233,7 @@ public: #endif }; -} // namespace cuda -} // namespace megdnn +} // namespace cuda +} // namespace megdnn // vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/cuda/relayout_format/helper.cuh b/dnn/src/cuda/relayout_format/helper.cuh index f77c7cc9d..69d98b70f 100644 --- a/dnn/src/cuda/relayout_format/helper.cuh +++ b/dnn/src/cuda/relayout_format/helper.cuh @@ -143,6 +143,109 @@ struct global_load_with_zero_point { } }; +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + /// Fragment type to store loaded data + typename AccessType, + /// The bytes of loading + int LoadBytes> +struct global_store; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Specializations +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct global_store { + devfunc global_store(AccessType const& D, void* ptr, bool pred_guard) { + uint4 const* data = reinterpret_cast(&D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %5, 0;\n" + " @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n" + " @p st.global.v4.u32 [%6], {%7, %8, %9, %10};\n" + "}\n" + : + : "l"(ptr), "r"(data[0].x), "r"(data[0].y), "r"(data[0].z), + "r"(data[0].w), "r"((int)pred_guard), + "l"(((uint8_t*)ptr) + 16), "r"(data[1].x), "r"(data[1].y), + "r"(data[1].z), "r"(data[1].w)); + } +}; + +template +struct global_store { + devfunc global_store(AccessType const& D, void* ptr, bool pred_guard) { + uint4 const& data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %5, 0;\n" + " @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n" + "}\n" + : + : "l"(ptr), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w), + "r"((int)pred_guard)); + } +}; + +template +struct global_store { + devfunc global_store(AccessType const& D, void* ptr, bool pred_guard) { + uint2 const& data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %3, 0;\n" + " @p st.global.v2.u32 [%0], {%1, %2};\n" + "}\n" + : + : "l"(ptr), "r"(data.x), "r"(data.y), "r"((int)pred_guard)); + } +}; + +template +struct global_store { + devfunc global_store(AccessType const& D, void* ptr, bool pred_guard) { + uint32_t const& data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %2, 0;\n" + " @p st.global.u32 [%0], %1;\n" + 
"}\n" + : + : "l"(ptr), "r"(data), "r"((int)pred_guard)); + } +}; + +template +struct global_store { + devfunc global_store(AccessType const& D, void* ptr, bool pred_guard) { + uint16_t const& data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %2, 0;\n" + " @p st.global.u16 [%0], %1;\n" + "}\n" + : + : "l"(ptr), "h"(data), "r"((int)pred_guard)); + } +}; + +template +struct global_store { + devfunc global_store(AccessType const& D, void* ptr, bool pred_guard) { + if (pred_guard) + *(reinterpret_cast(ptr)) = D; + } +}; + #undef devfunc } // namespace relayout_format } // namespace cuda diff --git a/dnn/src/cuda/relayout_format/relayout_format.cu b/dnn/src/cuda/relayout_format/relayout_format.cu index c28d5ffcf..9c7b7b9f7 100644 --- a/dnn/src/cuda/relayout_format/relayout_format.cu +++ b/dnn/src/cuda/relayout_format/relayout_format.cu @@ -10,17 +10,14 @@ * implied. */ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "cutlass/fast_math.h" -#include "cutlass/arch/memory.h" -#pragma GCC diagnostic pop +#include "src/cuda/int_fastdiv.cuh" #include "src/cuda/query_blocksize.cuh" #include "src/cuda/relayout_format/relayout_format.cuh" -#include "src/cuda/relayout_format/helper.cuh" +#include "src/cuda/integer_subbyte_utils.cuh" +#include "src/cuda/memory_utils.cuh" using namespace megdnn; using namespace cuda; +using namespace integer_subbyte; namespace { @@ -322,26 +319,34 @@ struct Translayout<2, 64, SrcType, dtype::QuantizedS4, dtype::QuantizedS4, int* dst_frag = reinterpret_cast(dst_width); #pragma unroll for (int i = 0; i < 64; i += 8) { -#define unpack_int4x2(_idx) \ - intermediate[_idx][0] = unpack_integer_4bits( \ - reinterpret_cast(read_channel[i + _idx]), 0); \ - intermediate[_idx][1] = unpack_integer_4bits( \ - reinterpret_cast(read_channel[i + _idx]), 4); - // clang-format off - unpack_int4x2(0) - unpack_int4x2(1) - unpack_int4x2(2) - unpack_int4x2(3) - unpack_int4x2(4) - unpack_int4x2(5) - unpack_int4x2(6) - unpack_int4x2(7) - // clang-format on + transform_int4x2_to_int8( + intermediate[0], + reinterpret_cast(read_channel[i + 0])); + transform_int4x2_to_int8( + intermediate[1], + reinterpret_cast(read_channel[i + 1])); + transform_int4x2_to_int8( + intermediate[2], + reinterpret_cast(read_channel[i + 2])); + transform_int4x2_to_int8( + intermediate[3], + reinterpret_cast(read_channel[i + 3])); + transform_int4x2_to_int8( + intermediate[4], + reinterpret_cast(read_channel[i + 4])); + transform_int4x2_to_int8( + intermediate[5], + reinterpret_cast(read_channel[i + 5])); + transform_int4x2_to_int8( + intermediate[6], + reinterpret_cast(read_channel[i + 6])); + transform_int4x2_to_int8( + intermediate[7], + reinterpret_cast(read_channel[i + 7])); int frag_idx = i / 8; dst_frag[0 * 8 + frag_idx] = pack_channel(0); dst_frag[1 * 8 + frag_idx] = pack_channel(1); -#undef unpack_int4x2 } } using Fragment = array_wrapper; @@ -429,26 +434,34 @@ struct Translayout<2, 64, SrcType, dtype::Quantized4Asymm, int* dst_frag = reinterpret_cast(dst_width); #pragma unroll for (int i = 0; i < 64; i += 8) { -#define unpack_int4x2(_idx) \ - intermediate[_idx][0] = unpack_integer_4bits( \ - reinterpret_cast(read_channel[i + _idx]), 0); \ - intermediate[_idx][1] = unpack_integer_4bits( \ - reinterpret_cast(read_channel[i + _idx]), 4); - // clang-format off - unpack_int4x2(0) - unpack_int4x2(1) - unpack_int4x2(2) - unpack_int4x2(3) - unpack_int4x2(4) - unpack_int4x2(5) - 
unpack_int4x2(6) - unpack_int4x2(7) - // clang-format on + transform_uint4x2_to_int8( + intermediate[0], + reinterpret_cast(read_channel[i + 0])); + transform_uint4x2_to_int8( + intermediate[1], + reinterpret_cast(read_channel[i + 1])); + transform_uint4x2_to_int8( + intermediate[2], + reinterpret_cast(read_channel[i + 2])); + transform_uint4x2_to_int8( + intermediate[3], + reinterpret_cast(read_channel[i + 3])); + transform_uint4x2_to_int8( + intermediate[4], + reinterpret_cast(read_channel[i + 4])); + transform_uint4x2_to_int8( + intermediate[5], + reinterpret_cast(read_channel[i + 5])); + transform_uint4x2_to_int8( + intermediate[6], + reinterpret_cast(read_channel[i + 6])); + transform_uint4x2_to_int8( + intermediate[7], + reinterpret_cast(read_channel[i + 7])); int frag_idx = i / 8; dst_frag[0 * 8 + frag_idx] = pack_channel(0); dst_frag[1 * 8 + frag_idx] = pack_channel(1); -#undef unpack_int4x2 } } using Fragment = array_wrapper; @@ -744,6 +757,16 @@ inline __device__ int4 make_zero_pad(const uint8_t zero_point) { return {zero_point, zero_point, zero_point, zero_point}; } +template +inline __device__ int make_zero(int zero_point); + +template <> +inline __device__ int make_zero<4>(int zero_point) { + return transform_int8_to_uint4x8(zero_point, zero_point, zero_point, + zero_point, zero_point, zero_point, + zero_point, zero_point); +} + template inline __device__ void write_helper(DstDtype* ptr, DstDtype val) { *ptr = val; @@ -1062,11 +1085,11 @@ public: using AccessType = array_wrapper; using Fragment = array_wrapper; - MEGDNN_DEVICE TensorIteratorOverChannel() + MEGDNN_HOST TensorIteratorOverChannel() : pointer{nullptr}, chan_stride_in_elements{0}, channel{0} {} - MEGDNN_DEVICE TensorIteratorOverChannel(Type* pointer_, - int chan_stride_in_elements_, - int channel_, int, int) + MEGDNN_HOST TensorIteratorOverChannel(Type* pointer_, + int chan_stride_in_elements_, + int channel_, int, int) : pointer{pointer_}, chan_stride_in_elements{chan_stride_in_elements_}, channel{channel_} {} @@ -1093,8 +1116,7 @@ public: (lane_size_in_type / pack_size_in_type) + j; bool guard = i < channel; - relayout_format::global_load_with_zero_point( + memory::global_load( frag_ptr[frag_idx], reinterpret_cast(pointer_ + j * pack_size_in_type), @@ -1115,7 +1137,7 @@ public: (lane_size_in_type / pack_size_in_type) + j; bool guard = i < channel; - cutlass::arch::global_store( + memory::global_store( frag_ptr[frag_idx], reinterpret_cast(pointer_ + j * pack_size_in_type), @@ -1160,20 +1182,18 @@ public: using AccessType = array_wrapper; using Fragment = array_wrapper; - MEGDNN_HOST MEGDNN_DEVICE MaskedTensorIteratorOverChannel() + MEGDNN_HOST MaskedTensorIteratorOverChannel() : pointer{nullptr}, chan_stride_in_elements{0}, channel{0} {} - MEGDNN_HOST MEGDNN_DEVICE MaskedTensorIteratorOverChannel( + MEGDNN_HOST MaskedTensorIteratorOverChannel( Type* pointer_, int chan_stride_in_elements_, int channel_, int bound_, int div_) : pointer{pointer_}, chan_stride_in_elements{chan_stride_in_elements_}, channel{channel_}, bound{bound_}, - div{div_} { - cutlass::find_divisor(mul, shr, div); - } + div{uint32_t(div_)} {} MEGDNN_DEVICE __forceinline__ void initialize(int c_idx, int hw_idx) { pointer += (c_idx / pack_size) * chan_stride_in_elements; @@ -1187,8 +1207,8 @@ public: #pragma unroll for (int j = 0; j < lane_size_in_type / pack_size_in_type; j++) { int offset = hw_idx + j; - int h, w; - cutlass::fast_divmod(h, w, offset, div, mul, shr); + int h = (int)((uint32_t)(offset) / div); + int w = (int)((uint32_t)(offset) % 
div); bool guard = (i < channel) && (w < bound); int index = (i / pack_size) * (lane_size_in_type / pack_size_in_type) + @@ -1219,8 +1239,7 @@ public: int mask_index = (frag_idx >> 5); int mask_shift = (frag_idx & 0x1f); bool guard = (mask[mask_index] & (1 << mask_shift)); - relayout_format::global_load_with_zero_point( + memory::global_load( frag_ptr[frag_idx], reinterpret_cast(pointer_ + stride[j]), guard, zero_point); @@ -1242,7 +1261,7 @@ public: int mask_index = (frag_idx >> 5); int mask_shift = (frag_idx & 0x1f); bool guard = (mask[mask_index] & (1 << mask_shift)); - cutlass::arch::global_store( + memory::global_store( frag_ptr[frag_idx], reinterpret_cast(pointer_ + stride[j]), guard); } @@ -1260,9 +1279,7 @@ private: int chan_stride_in_elements; int channel; int bound; - int div; - uint32_t mul; - uint32_t shr; + Uint32Fastdiv div; uint32_t mask[mask_size]; size_t stride[lane_size_in_type / pack_size_in_type]; }; @@ -1355,8 +1372,7 @@ __global__ void relayout_kern(typename RelayoutProblem_::Param param) { param.dst_iterator.initialize(c_idx, hw_idx); typename SrcIterator::Fragment src_frag; typename DstIterator::Fragment dst_frag; - int zp = relayout_format::make_zero( - param.zero_point); + int zp = make_zero(param.zero_point); param.src_iterator.load(src_frag, zp); RelayoutProblem_::Transpose::trans( reinterpret_cast(dst_frag), @@ -1456,7 +1472,7 @@ void relayout_format::relayout_format_cuda_nchw_nchwx( megdnn_assert(src_layout.dtype.is_low_bit()); int n = src.layout[0]; int ic = src.layout[1]; - int oc = dst.layout[1] * 64; + int oc = dst.layout[1] * pack_oc; int h = src.layout[2]; // align to byte int w = src.layout[3]; diff --git a/dnn/src/cuda/type_cvt/kern.cu b/dnn/src/cuda/type_cvt/kern.cu index 349770aec..46063ddba 100644 --- a/dnn/src/cuda/type_cvt/kern.cu +++ b/dnn/src/cuda/type_cvt/kern.cu @@ -223,10 +223,12 @@ struct TypeCvtOpFromQuantizedToQuantized4bit< } __device__ __forceinline__ void operator()(uint32_t idx, src_vect_type src) { - dst_storage x = apply( - src_storage(unpack_integer_4bits(src.x, 0))); - dst_storage y = apply( - src_storage(unpack_integer_4bits(src.x, 4))); + dst_storage x = apply(src_storage( + integer_subbyte::unpack_integer_4bits(src.x, + 0))); + dst_storage y = apply(src_storage( + integer_subbyte::unpack_integer_4bits(src.x, + 4))); *(dst_vect_type*)(&dest[idx]) = VectTypeTrait::make_vector(x, y); diff --git a/dnn/src/cuda/utils.cuh b/dnn/src/cuda/utils.cuh index 87983f808..5e6244091 100644 --- a/dnn/src/cuda/utils.cuh +++ b/dnn/src/cuda/utils.cuh @@ -21,7 +21,6 @@ #include "cuda.h" #include "src/cuda/cudnn_with_check.h" #include "cutlass/cutlass.h" -#include "cutlass/platform/platform.h" #define cuda_check(_x) \ do { \ @@ -376,104 +375,6 @@ MEGDNN_DEVICE __forceinline__ static float4 operator+(float4 lval, lval.w + rval.w); } -MEGDNN_DEVICE __forceinline__ static int transform_int8_to_int4x8( - int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7) { - unsigned out; -#if __CUDA_ARCH__ >= 750 && \ - ((__CUDACC_VER_MAJOR__ > 10) || \ - ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2))) - asm volatile( - "{ .reg .u32 r4;" - "cvt.pack.sat.s4.s32.b32 r4, %8, %7, 0;" - "cvt.pack.sat.s4.s32.b32 r4, %6, %5, r4;" - "cvt.pack.sat.s4.s32.b32 r4, %4, %3, r4;" - "cvt.pack.sat.s4.s32.b32 %0, %2, %1, r4;" - "}" - : "=r"(out) - : "r"(s0), "r"(s1), "r"(s2), "r"(s3), "r"(s4), "r"(s5), "r"(s6), - "r"(s7)); -#else -#define CVT_SAT_S4_S32(r, bits) \ - r = r <= -8 ? -8 : r; \ - r = r > 7 ? 
7 : r; \ - r = (((unsigned)r & 0xf) << bits); - CVT_SAT_S4_S32(s0, 0) - CVT_SAT_S4_S32(s1, 4) - CVT_SAT_S4_S32(s2, 8) - CVT_SAT_S4_S32(s3, 12) - CVT_SAT_S4_S32(s4, 16) - CVT_SAT_S4_S32(s5, 20) - CVT_SAT_S4_S32(s6, 24) - CVT_SAT_S4_S32(s7, 28) - out = s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7; -#undef CVT_SAT_S4_S32 -#endif - return reinterpret_cast(out); -} - -MEGDNN_DEVICE __forceinline__ static int transform_int8_to_uint4x8( - int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7) { - unsigned out; -#if __CUDA_ARCH__ >= 750 && \ - ((__CUDACC_VER_MAJOR__ > 10) || \ - ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2))) - asm volatile( - "{ .reg .u32 r4;" - "cvt.pack.sat.u4.s32.b32 r4, %8, %7, 0;" - "cvt.pack.sat.u4.s32.b32 r4, %6, %5, r4;" - "cvt.pack.sat.u4.s32.b32 r4, %4, %3, r4;" - "cvt.pack.sat.u4.s32.b32 %0, %2, %1, r4;" - "}" - : "=r"(out) - : "r"(s0), "r"(s1), "r"(s2), "r"(s3), "r"(s4), "r"(s5), "r"(s6), - "r"(s7)); -#else -#define CVT_SAT_U4_S32(r, bits) \ - r = r <= 0 ? 0 : r; \ - r = r > 15 ? 15 : r; \ - r = (((unsigned)r & 0xf) << bits); - CVT_SAT_U4_S32(s0, 0) - CVT_SAT_U4_S32(s1, 4) - CVT_SAT_U4_S32(s2, 8) - CVT_SAT_U4_S32(s3, 12) - CVT_SAT_U4_S32(s4, 16) - CVT_SAT_U4_S32(s5, 20) - CVT_SAT_U4_S32(s6, 24) - CVT_SAT_U4_S32(s7, 28) - out = s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7; -#undef CVT_SAT_U4_S32 -#endif - return reinterpret_cast(out); -} - -template -MEGDNN_DEVICE __forceinline__ static int unpack_integer_4bits(T storage, - int bits) { - static constexpr int shift = 28; - using type = typename cutlass::platform::conditional::type; - unsigned intermediate = static_cast(storage); - type result = reinterpret_cast(intermediate); - return (result << (shift - bits)) >> shift; -} - -MEGDNN_DEVICE __forceinline__ static void transform_int4x8_to_int8( - int (&result)[8], const int& source) { -#pragma unroll - for (int i = 0; i < 8; i++) { - result[i] = unpack_integer_4bits( - reinterpret_cast(source), (i << 2)); - } -} - -MEGDNN_DEVICE __forceinline__ static void transform_uint4x8_to_int8( - int (&result)[8], const int& source) { -#pragma unroll - for (int i = 0; i < 8; i++) { - result[i] = unpack_integer_4bits( - reinterpret_cast(source), (i << 2)); - } -} #endif } // namespace cuda } // namespace megdnn diff --git a/dnn/src/cuda/warp_perspective/forward.cpp b/dnn/src/cuda/warp_perspective/forward.cpp index cffcb07ae..a1ad4f257 100644 --- a/dnn/src/cuda/warp_perspective/forward.cpp +++ b/dnn/src/cuda/warp_perspective/forward.cpp @@ -348,6 +348,7 @@ void WarpPerspectiveForwardImpl::exec(_megdnn_tensor_in ssrc, RelayoutFormat::Param trans_param; trans_param.mode = RelayoutFormat::Param::Mode::NCHW64_NCHW; + trans_param.oc = sdst.layout[1]; relayout_opr->param() = trans_param; relayout_opr->exec(dst, sdst, {}); } diff --git a/dnn/src/cuda/warp_perspective/forward.cu b/dnn/src/cuda/warp_perspective/forward.cu index 24b57ef6d..24cded33b 100644 --- a/dnn/src/cuda/warp_perspective/forward.cu +++ b/dnn/src/cuda/warp_perspective/forward.cu @@ -17,10 +17,12 @@ #include "src/common/rounding_converter.cuh" #include "megdnn/dtype.h" #include +#include "src/cuda/integer_subbyte_utils.cuh" using namespace megdnn; using namespace cuda; using namespace warp_perspective; +using namespace integer_subbyte; namespace { diff --git a/dnn/test/cuda/pooling.cpp b/dnn/test/cuda/pooling.cpp index a98b8028f..033be709b 100644 --- a/dnn/test/cuda/pooling.cpp +++ b/dnn/test/cuda/pooling.cpp @@ -247,14 +247,13 @@ TEST_F(CUDA, POOLING_FORWARD_NCHW_Q4) { using Param = param::Pooling; Checker 
checker(handle_cuda()); Param param{Param::Mode::MAX, 0, 0, 2, 2, 2, 2}; - checker.set_dtype(0, dtype::QuantizedS4(0.1f)); + checker.set_dtype(0, dtype::QuantizedS4(3.1415926f)); param.format = Param::Format::NCHW; - checker.set_epsilon(1 + 1e-3); checker.set_param(param).exec({{20, 64, 22, 33}, {}}); param.mode = Param::Mode::AVERAGE; - checker.set_param(param).exec({{20, 64, 22, 33}, {}}); + checker.set_param(param).exec({{20, 96, 22, 33}, {}}); param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING; - checker.set_param(param).exec({{20, 64, 22, 33}, {}}); + checker.set_param(param).exec({{20, 24, 22, 33}, {}}); } TEST_F(CUDA, POOLING_FORWARD_NCHW4) { diff --git a/dnn/test/cuda/type_cvt.cpp b/dnn/test/cuda/type_cvt.cpp index 8567c9f75..5bdde41b9 100644 --- a/dnn/test/cuda/type_cvt.cpp +++ b/dnn/test/cuda/type_cvt.cpp @@ -107,7 +107,7 @@ TEST_F(CUDA, QUANTIZED_TYPECVT) { } TEST_F(CUDA, QUANTIZED_TYPECVT_4BIT) { - UniformIntRNG int_rng{0, 8}; + UniformIntRNG int_rng{-8, 8}; Checker checker(handle_cuda()); checker.set_rng(0, &int_rng).set_rng(1, &int_rng); diff --git a/dnn/test/cuda/warp_perspective.cpp b/dnn/test/cuda/warp_perspective.cpp index 2d323a219..dd4627063 100644 --- a/dnn/test/cuda/warp_perspective.cpp +++ b/dnn/test/cuda/warp_perspective.cpp @@ -627,9 +627,9 @@ TEST_F(CUDA, WARP_PERSPECTIVE_FORWARD_QINT4) { Checker checker(handle_cuda()); WarpPerspectiveMatRNG rng; checker.set_rng(1, &rng); - checker.set_dtype(0, dtype::QuantizedS4(0.1f)) + checker.set_dtype(0, dtype::QuantizedS4(1.25f)) .set_dtype(1, dtype::Float32()) - .set_dtype(2, dtype::QuantizedS4(0.1f)); + .set_dtype(2, dtype::QuantizedS4(1.25f)); for (auto bmode : {WarpPerspective::BorderMode::WRAP, WarpPerspective::BorderMode::REFLECT, WarpPerspective::BorderMode::REPLICATE, diff --git a/src/gopt/impl/tensor_reformat.cpp b/src/gopt/impl/tensor_reformat.cpp index ef4500e8f..1b8a064e6 100644 --- a/src/gopt/impl/tensor_reformat.cpp +++ b/src/gopt/impl/tensor_reformat.cpp @@ -68,7 +68,7 @@ using namespace gopt; * oprs should not get involved in any actual computing. */ MGB_DEFINE_OPR_CLASS(TensorReformatPass::RelayoutPlaceholder, - cg::SingleCNOperatorNodeBase) // { + cg::SingleCNOperatorNodeBase) // { public: //! relayout type of this opr enum class LayoutType { @@ -124,14 +124,14 @@ public: NCHW4_TO_NCHW64, //! 
VarNode* { + const TensorShape& orig_shape) -> VarNode* { mgb_assert(inp->shape().ndim == 4); + mgb_assert(inp->shape()[0] == orig_shape[0]); + mgb_assert(inp->shape()[2] == orig_shape[2]); + mgb_assert(inp->shape()[3] == orig_shape[3]); + size_t orig_channels = orig_shape[1]; auto x = SymbolVar(inp); auto cv = [&x](int v) { return x.make_scalar(v); }; using AIdx = opr::Subtensor::AxisIndexer; @@ -4108,8 +4111,7 @@ void PaddingChannelPass::apply(OptState& opt) const { bool padding_cur_inp = padding_oprs.count(cur_inp->owner_opr()) > 0; if (padding_cur_inp) { - size_t orig_channels = cur_inp->shape()[1]; - inps[i] = extract_subtensor(inps[i], orig_channels); + inps[i] = extract_subtensor(inps[i], cur_inp->shape()); } } return serialization::copy_opr_shallow(*opr, inps, opr->config()); @@ -4133,8 +4135,7 @@ void PaddingChannelPass::apply(OptState& opt) const { auto cur_inp = opr->input(i); bool padding_cur_inp = padding_oprs.count(cur_inp->owner_opr()) > 0; if (padding_cur_inp) { - size_t orig_channels = cur_inp->shape()[1]; - inps[i] = extract_subtensor(inps[i], orig_channels); + inps[i] = extract_subtensor(inps[i], cur_inp->shape()); } } return serialization::copy_opr_shallow(*opr, inps, opr->config()); @@ -4142,6 +4143,8 @@ void PaddingChannelPass::apply(OptState& opt) const { opr_replace_funcs[opr::Reshape::typeinfo()] = replace_nonpadding_oprs; opr_replace_funcs[opr::GetVarShape::typeinfo()] = replace_nonpadding_oprs; opr_replace_funcs[opr::Concat::typeinfo()] = replace_nonpadding_oprs; + opr_replace_funcs[opr::Reduce::typeinfo()] = replace_nonpadding_oprs; + opr_replace_funcs[opr::Subtensor::typeinfo()] = replace_nonpadding_oprs; auto on_opr = [&opt, &rewriter, &opr_replace_funcs, &extract_subtensor](OperatorNodeBase* opr) { @@ -4169,8 +4172,7 @@ void PaddingChannelPass::apply(OptState& opt) const { auto dst = out1[i]; if (opt.graph().endpoint_contain(src) && !src->shape().eq_shape(dst->shape())) { - size_t orig_channels = src->shape()[1]; - dst = extract_subtensor(dst, orig_channels); + dst = extract_subtensor(dst, src->shape()); } rewriter.replace_var(src, dst, nullptr); } diff --git a/src/gopt/test/inference.cpp b/src/gopt/test/inference.cpp index 2483a2eeb..a875f3ccf 100644 --- a/src/gopt/test/inference.cpp +++ b/src/gopt/test/inference.cpp @@ -4183,14 +4183,7 @@ TEST(TestGoptInference, PaddingChannels) { REQUIRE_GPU(1); auto cn = CompNode::load("gpu0"); cn.activate(); - auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; - auto sm_ver = prop.major * 10 + prop.minor; - if (sm_ver < 61) { - printf("This testcast ignored due to insufficient cuda cap(got: %d, " - "expected: %d)\n", - sm_ver, 61); - return; - } + REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1); HostTensorGenerator gen; auto graph = ComputingGraph::make(); @@ -4263,15 +4256,8 @@ TEST(TestGoptInference, ConcatAfterPaddingChannels) { REQUIRE_GPU(1); auto cn = CompNode::load("gpu0"); cn.activate(); - auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; - auto sm_ver = prop.major * 10 + prop.minor; - if (sm_ver < 61) { - printf("This testcast ignored due to insufficient cuda cap(got: %d, " - "expected: %d)\n", - sm_ver, 61); - return; - } - + REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1); + HostTensorGenerator gen; auto graph = ComputingGraph::make(); graph->options().graph_opt_level = 0; @@ -4332,19 +4318,11 @@ TEST(TestGoptInference, ConcatAfterPaddingChannels) { MGB_ASSERT_TENSOR_EQ(t1, t2); } -// FIXME replace cpu with gpu to enable gpu validation TEST(TestGoptInference, PaddingChannelsWithPooling) { 
REQUIRE_GPU(1); auto cn = CompNode::load("gpu0"); cn.activate(); - auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; - auto sm_ver = prop.major * 10 + prop.minor; - if (sm_ver < 61) { - printf("This testcast ignored due to insufficient cuda cap(got: %d, " - "expected: %d)\n", - sm_ver, 61); - return; - } + REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1); HostTensorGenerator gen; auto graph = ComputingGraph::make(); @@ -4408,17 +4386,7 @@ TEST(TestGoptInference, PaddingChannelsWithPooling) { // FIXME replace cpu with gpu to enable gpu validation TEST(TestGoptInference, PaddingChannelsWithWarpPerspective) { - REQUIRE_GPU(1); auto cn = CompNode::load("cpu0"); -// cn.activate(); -// auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; -// auto sm_ver = prop.major * 10 + prop.minor; -// if (sm_ver < 61) { -// printf("This testcast ignored due to insufficient cuda cap(got: %d, " -// "expected: %d)\n", -// sm_ver, 61); -// return; -// } HostTensorGenerator gen; auto graph = ComputingGraph::make(); @@ -4488,16 +4456,9 @@ TEST(TestGoptInference, PaddingChannelsWithWarpPerspective) { TEST(TestGoptInference, EnableNCHW64Basic) { REQUIRE_GPU(1); - auto cn = CompNode::load("cpu0"); -// cn.activate(); -// auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; -// auto sm_ver = prop.major * 10 + prop.minor; -// if (sm_ver < 61) { -// printf("This testcast ignored due to insufficient cuda cap(got: %d, " -// "expected: %d)\n", -// sm_ver, 61); -// return; -// } + auto cn = CompNode::load("gpu0"); + cn.activate(); + REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); HostTensorGenerator gen; auto graph = ComputingGraph::make(); @@ -4517,8 +4478,8 @@ TEST(TestGoptInference, EnableNCHW64Basic) { }; auto x = mkvar("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f)), - w = mkcvar("w", {32, 4, 3, 3}, dtype::QuantizedS8(2.5f)), - b = mkcvar("b", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f)); + w = mkcvar("w", {16, 4, 3, 3}, dtype::QuantizedS8(2.5f)), + b = mkcvar("b", {1, 16, 1, 1}, dtype::QuantizedS32(6.25f)); opr::ConvBias::Param param; param.format = opr::ConvBias::Param::Format::NCHW; param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY; @@ -4527,7 +4488,7 @@ TEST(TestGoptInference, EnableNCHW64Basic) { auto y = opr::ConvBias::make(x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); - auto w1 = mkcvar("w1", {32, 32, 3, 3}, dtype::QuantizedS8(2.5f)), + auto w1 = mkcvar("w1", {32, 16, 3, 3}, dtype::QuantizedS8(2.5f)), b1 = mkcvar("b1", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f)); auto y1 = opr::ConvBias::make(y, w1, b1, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); @@ -4541,14 +4502,14 @@ TEST(TestGoptInference, EnableNCHW64Basic) { auto y3 = opr::ConvBias::make(y2, w3, b3, param, {}, OperatorNodeConfig{dtype::QuantizedS4{40.f}}); y3 = opr::TypeCvt::make(y3, dtype::QuantizedS8{2.5f}); - auto w4 = mkcvar("w4", {32, 64, 3, 3}, dtype::QuantizedS8(2.5f)), - b4 = mkcvar("b4", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f)); + auto w4 = mkcvar("w4", {16, 64, 3, 3}, dtype::QuantizedS8(2.5f)), + b4 = mkcvar("b4", {1, 16, 1, 1}, dtype::QuantizedS32(6.25f)); auto y4 = opr::ConvBias::make(y3, w4, b4, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode; auto y5 = opr::ElemwiseMultiType::make( {y, y4}, {ElemMultiMode::QFUSE_ADD_RELU}, - OperatorNodeConfig{dtype::QuantizedS8{1.2f}}); + OperatorNodeConfig{dtype::QuantizedS8{1.3f}}); y5 = opr::TypeCvt::make(y5, dtype::Float32()); SymbolVar y5_pad; 
unpack_vector( @@ -4573,10 +4534,10 @@ TEST(TestGoptInference, EnableNCHW64Basic) { ASSERT_EQ(o.param().format, Format::_fmt); \ } CHECK(0, NCHW4); - CHECK(1, NCHW32); + CHECK(1, NCHW4); CHECK(2, NCHW32); CHECK(3, NCHW64); - CHECK(4, NCHW32); + CHECK(4, NCHW4); #undef CHECK HostTensorND t1, t2; auto func1 = graph->compile({make_callback_copy(y5, t1)}); @@ -4588,16 +4549,9 @@ TEST(TestGoptInference, EnableNCHW64Basic) { TEST(TestGoptInference, EnableNCHW64PaddingChannel) { REQUIRE_GPU(1); - auto cn = CompNode::load("cpu0"); -// cn.activate(); -// auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; -// auto sm_ver = prop.major * 10 + prop.minor; -// if (sm_ver < 61) { -// printf("This testcast ignored due to insufficient cuda cap(got: %d, " -// "expected: %d)\n", -// sm_ver, 61); -// return; -// } + auto cn = CompNode::load("gpu0"); + cn.activate(); + REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); HostTensorGenerator gen; auto graph = ComputingGraph::make(); @@ -4616,8 +4570,8 @@ TEST(TestGoptInference, EnableNCHW64PaddingChannel) { dtype); }; - auto x = mkvar("x", {16, 3, 14, 14}, dtype::QuantizedS8(2.5f)), - w = mkcvar("w", {20, 3, 3, 3}, dtype::QuantizedS8(2.5f)), + auto x = mkvar("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f)), + w = mkcvar("w", {20, 4, 3, 3}, dtype::QuantizedS8(2.5f)), b = mkcvar("b", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f)); opr::ConvBias::Param param; param.format = opr::ConvBias::Param::Format::NCHW; @@ -4630,7 +4584,7 @@ TEST(TestGoptInference, EnableNCHW64PaddingChannel) { opr::Pooling::Param pool; pool.format = opr::Pooling::Param::Format::NCHW; y = opr::Pooling::make(y, pool); - + auto w1 = mkcvar("w1", {24, 20, 3, 3}, dtype::QuantizedS8(2.5f)), b1 = mkcvar("b1", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f)); auto y1 = opr::ConvBias::make(y, w1, b1, param, {}, @@ -4657,11 +4611,12 @@ TEST(TestGoptInference, EnableNCHW64PaddingChannel) { deconv.format = opr::ConvolutionBackwardData::Param::Format::NCHW; deconv.stride_h = deconv.stride_w = 2; deconv.pad_h = deconv.pad_w = 1; - auto w6 = mkcvar("w6", {20, 20, 4, 4}, dtype::QuantizedS8{2.5f}); + auto w6 = mkcvar("w6", {20, 64, 4, 4}, dtype::QuantizedS8{2.5f}); auto y6 = opr::ConvolutionBackwardData::make( w6, y5, deconv, {}, OperatorNodeConfig{dtype::QuantizedS8(2.0f)}); - + y6 = opr::TypeCvt::make(y6, dtype::QuantizedS4{32.f}); + std::shared_ptr mat = std::make_shared( cn, TensorShape{16, 3, 3}, dtype::Float32()); warp_perspective_mat_gen(*mat, 16, 14, 14); @@ -4676,25 +4631,31 @@ TEST(TestGoptInference, EnableNCHW64PaddingChannel) { opt.enable_nchw64(); unpack_vector(gopt::optimize_for_inference({y7}, opt), y7_pad); EXPECT_TRUE(y7.node()->shape().eq_shape(y7_pad.node()->shape())); - SmallVector oprs; - auto cb = [&oprs](cg::OperatorNodeBase* opr) { - if (opr->same_type()) { - oprs.push_back(opr); - } - }; - cg::DepOprIter{cb}.add(y7_pad.node()->owner_opr()); - ASSERT_EQ(oprs.size(), 5); + HostTensorND t1, t2; + auto func1 = graph->compile({make_callback_copy(y7, t1)}); + func1->execute(); + auto func2 = graph->compile({make_callback_copy(y7_pad, t2)}); + func2->execute(); + MGB_ASSERT_TENSOR_EQ(t1, t2); using Format = opr::ConvBiasForward::Param::Format; + SmallVector oprs; + auto cb = [&oprs](cg::OperatorNodeBase* opr) { + if (opr->same_type()) { + oprs.push_back(opr); + } + }; + cg::DepOprIter{cb}.add(y7_pad.node()->owner_opr()); + ASSERT_EQ(oprs.size(), 5); #define CHECK(_i, _fmt) \ { \ const auto& o = oprs[_i]->cast_final(); \ ASSERT_EQ(o.param().format, Format::_fmt); \ } - CHECK(0, NCHW4); - 
CHECK(1, NCHW32); - CHECK(2, NCHW32); - CHECK(3, NCHW64); - CHECK(4, NCHW64); + CHECK(0, NCHW4); + CHECK(1, NCHW32); + CHECK(2, NCHW32); + CHECK(3, NCHW64); + CHECK(4, NCHW64); #undef CHECK { const auto& deconv = find_opr(y7_pad); @@ -4702,30 +4663,19 @@ TEST(TestGoptInference, EnableNCHW64PaddingChannel) { const auto& pool = find_opr(y7_pad); ASSERT_EQ(pool.param().format, Format::NCHW4); const auto& warp = find_opr(y7_pad); - ASSERT_EQ(warp.param().format, Format::NCHW4); + ASSERT_EQ(warp.param().format, Format::NCHW64); } size_t nr_dimshuffle = find_opr_num(y7_pad); - HostTensorND t1, t2; - auto func1 = graph->compile({make_callback_copy(y7, t1)}); - func1->execute(); - auto func2 = graph->compile({make_callback_copy(y7_pad, t2)}); - func2->execute(); - MGB_ASSERT_TENSOR_EQ(t1, t2); + ASSERT_EQ(nr_dimshuffle, 8); } TEST(TestGoptInference, EnableNCHW64FuseConvBiasZ) { REQUIRE_GPU(1); - auto cn = CompNode::load("cpu0"); -// cn.activate(); -// auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; -// auto sm_ver = prop.major * 10 + prop.minor; -// if (sm_ver < 61) { -// printf("This testcast ignored due to insufficient cuda cap(got: %d, " -// "expected: %d)\n", -// sm_ver, 61); -// return; -// } + auto cn = CompNode::load("gpu0"); + cn.activate(); + REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); + HostTensorND t1, t2; HostTensorGenerator gen; auto graph = ComputingGraph::make(); graph->options().graph_opt_level = 0; @@ -4757,7 +4707,7 @@ TEST(TestGoptInference, EnableNCHW64FuseConvBiasZ) { auto w1 = mkcvar("w1", {64, 32, 3, 3}, dtype::QuantizedS8(2.5f)), b1 = mkcvar("b1", {1, 64, 1, 1}, dtype::QuantizedS32(6.25f)); auto y1 = opr::ConvBias::make(y, w1, b1, param, {}, - OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); + OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); y1 = opr::TypeCvt::make(y1, dtype::QuantizedS4{40.f}); auto w2 = mkcvar("w2", {64, 64, 3, 3}, dtype::QuantizedS4(2.5f)), b2 = mkcvar("b2", {1, 64, 1, 1}, dtype::QuantizedS32(100.f)); @@ -4772,6 +4722,9 @@ TEST(TestGoptInference, EnableNCHW64FuseConvBiasZ) { {y1, y3}, {ElemMultiMode::QFUSE_ADD_RELU}, OperatorNodeConfig{dtype::QuantizedS4{40.f}}); y4 = opr::TypeCvt::make(y4, dtype::Float32()); + auto y5 = opr::ConvBias::make(y2, w3, b3, y1, param, {}, + OperatorNodeConfig{dtype::QuantizedS4(40.f)}); + y5 = opr::TypeCvt::make(y5, dtype::Float32()); SymbolVar y4_pad; auto opt = gopt::OptimizeForInferenceOptions{}; opt.enable_nchw64(); @@ -4779,15 +4732,35 @@ TEST(TestGoptInference, EnableNCHW64FuseConvBiasZ) { EXPECT_TRUE(y4.node()->shape().eq_shape(y4_pad.node()->shape())); size_t nr_elem_mult_type = find_opr_num(y4_pad); ASSERT_EQ(nr_elem_mult_type, 0); - // FIXME need impl of elemwise/elemwise_multi_type on CUDA -#if 0 - HostTensorND t1, t2; - auto func1 = graph->compile({make_callback_copy(y4, t1)}); - func1->execute(); - auto func2 = graph->compile({make_callback_copy(y4_pad, t2)}); - func2->execute(); + auto func = graph->compile({make_callback_copy(y4_pad, t1)}); + func->execute(); + + { + opr::ConvBias::Param param; + param.format = opr::ConvBias::Param::Format::NCHW; + param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY; + param.stride_h = param.stride_w = 1; + param.pad_h = param.pad_w = 1; + + auto y = opr::ConvBias::make( + x, w, b, param, {}, + OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); + auto y1 = opr::ConvBias::make( + y, w1, b1, param, {}, + OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); + y1 = opr::TypeCvt::make(y1, dtype::QuantizedS4{40.f}); + auto y2 = opr::ConvBias::make( + y1, w2, b2, 
param, {}, + OperatorNodeConfig{dtype::QuantizedS4{40.f}}); + param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU; + auto y3 = opr::ConvBias::make( + y2, w3, b3, y1, param, {}, + OperatorNodeConfig{dtype::QuantizedS4(40.f)}); + y3 = opr::TypeCvt::make(y3, dtype::Float32()); + auto func = graph->compile({make_callback_copy(y3, t2)}); + func->execute(); + } MGB_ASSERT_TENSOR_EQ(t1, t2); -#endif } #endif diff --git a/src/opr/impl/imgproc.cpp b/src/opr/impl/imgproc.cpp index 1d424b407..eedc3e537 100644 --- a/src/opr/impl/imgproc.cpp +++ b/src/opr/impl/imgproc.cpp @@ -102,7 +102,8 @@ void WarpPerspectiveForward::outshape_by_symvar_do_get_output_shape( default: size_t height_idx = 0; if (param().format == Param::Format::NCHW || - param().format == Param::Format::NCHW4) { + param().format == Param::Format::NCHW4 || + param().format == Param::Format::NCHW64) { height_idx = 2; } else { height_idx = 1; diff --git a/src/opr/test/dnn/convolution.cpp b/src/opr/test/dnn/convolution.cpp index 12e6b182f..b30aab099 100644 --- a/src/opr/test/dnn/convolution.cpp +++ b/src/opr/test/dnn/convolution.cpp @@ -2604,11 +2604,21 @@ TEST_F(TestNoWeightPreprocess, NoPreprocess) { #endif namespace { -// FIXME change comp node from "cpu0" to "gpu0" TEST(TestOprDNN, ConvBiasInt4NCHW) { - auto run = [](size_t N, size_t C, size_t H, size_t W, size_t F, size_t S, - size_t P) { - auto cn = CompNode::load("cpu0"); + REQUIRE_GPU(1); + auto cn = CompNode::load("gpu0"); + cn.activate(); + auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; + auto sm_ver = prop.major * 10 + prop.minor; + if (sm_ver != 75) { + printf("This testcast ignored due to insufficient cuda cap(got: %d, " + "expected: %d)\n", + sm_ver, 75); + return; + } + + auto run = [&cn](size_t N, size_t C, size_t H, size_t W, size_t F, size_t S, + size_t P) { auto graph = ComputingGraph::make(); HostTensorGenerator gen; @@ -2671,6 +2681,18 @@ TEST(TestOprDNN, ConvBiasInt4NCHW) { } TEST(TestOprDNN, ConvBiasInt4NCHW64) { + REQUIRE_GPU(1); + auto cn = CompNode::load("gpu0"); + cn.activate(); + auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; + auto sm_ver = prop.major * 10 + prop.minor; + if (sm_ver != 75) { + printf("This testcast ignored due to insufficient cuda cap(got: %d, " + "expected: %d)\n", + sm_ver, 75); + return; + } + auto nchw2nchw64 = [](SymbolVar x) { auto y = opr::RelayoutFormat::make( x, opr::RelayoutFormat::Param::Mode::NCHW_NCHW64); @@ -2685,7 +2707,6 @@ TEST(TestOprDNN, ConvBiasInt4NCHW64) { auto run = [&](size_t N, size_t C, size_t H, size_t W, size_t F, size_t S, size_t P) { - auto cn = CompNode::load("cpu0"); auto graph = ComputingGraph::make(); HostTensorGenerator gen; diff --git a/src/plugin/impl/opr_footprint.cpp b/src/plugin/impl/opr_footprint.cpp index cc29f84be..e5f71e502 100644 --- a/src/plugin/impl/opr_footprint.cpp +++ b/src/plugin/impl/opr_footprint.cpp @@ -138,15 +138,19 @@ uint64_t eval_conv_computation(const TensorShape& src_shape, src_shape[1] / group * 2; return hybird_nchwx ? 
computation : computation * 4; } - if (param.format == Param::Format::NCHW32 || - param.format == Param::Format::NCHW32_NCHW4) { - return dst_shape.total_nr_elems() * fh * fw * src_shape[1] * 32 / - group * 2; + size_t packed_size; + if (param.format == Param::Format::NCHW64) { + packed_size = 64; + } else if (param.format == Param::Format::NCHW32 || + param.format == Param::Format::NCHW32_NCHW4) { + packed_size = 32; + } else { + mgb_assert(param.format == Param::Format::NCHW4 || + param.format == Param::Format::NCHW4_NCHW || + param.format == Param::Format::NCHW4_NCHW32, + "format should be NCHW4/NCHW4_NCHW/NCHW4_NCHW32"); + packed_size = 4; } - mgb_assert(param.format == Param::Format::NCHW4 || - param.format == Param::Format::NCHW4_NCHW || - param.format == Param::Format::NCHW4_NCHW32, - "format should be NCHW4/NCHW4_NCHW/NCHW4_NCHW32"); return dst_shape.total_nr_elems() * fh * fw * src_shape[1] * 4 / group * 2; }; diff --git a/test/src/helper.cpp b/test/src/helper.cpp index ccdd2bbef..9a2a0f311 100644 --- a/test/src/helper.cpp +++ b/test/src/helper.cpp @@ -390,7 +390,37 @@ bool mgb::check_compute_capability(int major, int minor) { MGB_CUDA_CHECK(cudaGetDevice(&dev)); cudaDeviceProp prop; MGB_CUDA_CHECK(cudaGetDeviceProperties(&prop, dev)); - return prop.major > major || (prop.major == major && prop.minor >= minor); + bool available = prop.major > major || (prop.major == major && prop.minor >= minor); + if (!available) { + mgb_log_warn( + "This testcase is ignored due to insufficient cuda cap(got: " + "%d.%d, " + "expected: %d.%d)", + prop.major, prop.minor, major, minor); + } + return available; +#else + MGB_MARK_USED_VAR(major); + MGB_MARK_USED_VAR(minor); + return false; +#endif +} + +bool mgb::check_compute_capability_eq(int major, int minor) { +#if MGB_CUDA + int dev; + MGB_CUDA_CHECK(cudaGetDevice(&dev)); + cudaDeviceProp prop; + MGB_CUDA_CHECK(cudaGetDeviceProperties(&prop, dev)); + bool available = prop.major == major && prop.minor == minor; + if (!available) { + mgb_log_warn( + "This testcase is ignored due to insufficient cuda cap(got: " + "%d.%d, " + "expected: %d.%d)", + prop.major, prop.minor, major, minor); + } + return available; #else MGB_MARK_USED_VAR(major); MGB_MARK_USED_VAR(minor); diff --git a/test/src/include/megbrain/test/helper.h b/test/src/include/megbrain/test/helper.h index 5bfee7e59..fc566033c 100644 --- a/test/src/include/megbrain/test/helper.h +++ b/test/src/include/megbrain/test/helper.h @@ -504,6 +504,9 @@ bool check_cambricon_device_available(size_t num); //! check current capability >= major.minor bool check_compute_capability(int major, int minor); +//! check current capability == major.minor +bool check_compute_capability_eq(int major, int minor); + //! check compnode avaiable bool check_device_type_avaiable(CompNode::DeviceType device_type); @@ -540,6 +543,12 @@ public: return; \ } while (0) +#define REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(major, minor) \ + do { \ + if (!check_compute_capability_eq(major, minor)) \ + return; \ + } while (0) + //! skip a testcase if amd gpu not available #define REQUIRE_AMD_GPU(n) do { \ if (!check_amd_gpu_available(n)) \ -- GitLab
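
Below is a minimal host-side sketch (not part of the patch) of the int4x8 packing scheme that the new `integer_subbyte` helpers implement on the non-PTX fallback path: saturate eight 32-bit ints to signed 4-bit nibbles, pack them into one 32-bit word, and sign-extend a nibble back out with a shift pair, as `transform_int8_to_int4x8` and `unpack_integer_4bits` do. The `pack_int4x8` / `unpack_int4` names and the host `main` are illustrative assumptions, not symbols from the patch.

```cpp
#include <cassert>

// Saturate each value to [-8, 7] and pack it into its 4-bit lane,
// mirroring the CVT_SAT_S4_S32 fallback in transform_int8_to_int4x8.
static unsigned pack_int4x8(const int (&s)[8]) {
    unsigned out = 0;
    for (int i = 0; i < 8; ++i) {
        int r = s[i];
        r = r < -8 ? -8 : (r > 7 ? 7 : r);              // signed 4-bit saturation
        out |= (static_cast<unsigned>(r) & 0xfu) << (i * 4);
    }
    return out;
}

// Extract the nibble starting at bit position `bits`, sign-extended,
// using the same (x << (28 - bits)) >> 28 trick as unpack_integer_4bits<true>.
static int unpack_int4(unsigned storage, int bits) {
    int v = static_cast<int>(storage << (28 - bits));   // nibble moved to the top
    return v >> 28;                                      // arithmetic shift sign-extends
}

int main() {
    int src[8]    = {-9, -8, -1, 0, 1, 7, 8, 3};        // -9 and 8 saturate
    int expect[8] = {-8, -8, -1, 0, 1, 7, 7, 3};
    unsigned packed = pack_int4x8(src);
    for (int i = 0; i < 8; ++i)
        assert(unpack_int4(packed, i * 4) == expect[i]); // round-trip per lane
    return 0;
}
```

The same layout is what the uint4 variants use, with the saturation range changed to [0, 15] and no sign extension on unpack.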