diff --git a/dnn/src/common/conv_bias.cpp b/dnn/src/common/conv_bias.cpp
index 4d33cee7bcdffce67835da0569ceec2cb3db1018..1bf32c87e0aca9927fdfac614f818c9208732d8d 100644
--- a/dnn/src/common/conv_bias.cpp
+++ b/dnn/src/common/conv_bias.cpp
@@ -35,7 +35,9 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec(
         const TensorLayout& bias, const TensorLayout& z,
         const TensorLayout& dst, size_t workspace_in_bytes,
         const PreprocessedFilter* preprocessed_filter) {
-    megdnn_assert(src.dtype.enumv() == filter.dtype.enumv());
+    megdnn_assert((src.dtype.enumv() == filter.dtype.enumv()) ||
+                  (src.dtype.enumv() == DTypeEnum::Quantized4Asymm &&
+                   filter.dtype.enumv() == DTypeEnum::QuantizedS4));
     // check compatibility of bias's scale
     if (src.dtype.category() == DTypeCategory::QUANTIZED) {
         if (bias.dtype.enumv() == DTypeEnum::QuantizedS32) {
diff --git a/dnn/src/common/convolution.cpp b/dnn/src/common/convolution.cpp
index 366618ddbcfb74aaf9f2b95994841faba7da4027..be94b9ec84cd0177181f60fc2b05a4b70ac5dc65 100644
--- a/dnn/src/common/convolution.cpp
+++ b/dnn/src/common/convolution.cpp
@@ -598,8 +598,10 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src,
     megdnn_assert_contiguous(src);
     megdnn_assert_contiguous(filter);
     megdnn_assert(src.ndim >= 3_z, "%s", errmsg().c_str());
-    megdnn_assert(src.dtype.enumv() == filter.dtype.enumv(), "%s",
-                  errmsg().c_str());
+    megdnn_assert(((src.dtype.enumv() == filter.dtype.enumv()) ||
+                   (src.dtype.enumv() == DTypeEnum::Quantized4Asymm &&
+                    filter.dtype.enumv() == DTypeEnum::QuantizedS4)),
+                  "%s", errmsg().c_str());
     check_or_deduce_dtype_fwd(src.dtype, filter.dtype, dst.dtype);
     size_t img_dim;
     if (param().format == Param::Format::NCHW ||
diff --git a/dnn/src/common/tensor_format.cpp b/dnn/src/common/tensor_format.cpp
index 2ed854cdf33d3199b00f54d597d73fba00c8a17e..ae1841ad7a033dd89afb5531d03b064a1145c449 100644
--- a/dnn/src/common/tensor_format.cpp
+++ b/dnn/src/common/tensor_format.cpp
@@ -488,6 +488,10 @@ void LowbitsAlignedTensorFormatBase::assert_valid(
                       "bad stride:%s, %zu", layout.to_string().c_str(),
                       layout.stride[i]);
     }
+    if (!has_dim_unity_stride &&
+        (int)layout.stride[layout.ndim - 1] ==
+                round_up(1, (int)m_align_size_in_elements))
+        has_dim_unity_stride = true;
     megdnn_assert(layout.ndim == 0 || has_dim_unity_stride,
                   "innermost dim not contiguous");
 }
@@ -546,7 +550,12 @@ bool LowbitsAlignedTensorFormatBase::is_contiguous_spec(
     assert_valid(layout);
     ptrdiff_t expected = 1;
     for (int i = static_cast<int>(layout.ndim) - 1; i >= 0; --i) {
-        if (layout.shape[i] != 1 && layout.stride[i] != expected)
+        bool is_valid_stride =
+                (layout.stride[i] == expected) ||
+                (expected == 1 &&
+                 (int)layout.stride[i] ==
+                         round_up(1, (int)m_align_size_in_elements));
+        if (layout.shape[i] != 1 && !is_valid_stride)
             return false;
         auto multiplier = layout.shape[i];
         if (i == static_cast<int>(layout.ndim) - 1)
@@ -568,7 +577,7 @@ TensorLayout LowbitsAlignedTensorFormatBase::collapse_contiguous_spec(
             res.stride[0] = 1;
             return res;
         }
-        if (res.shape[i] == 1 && res.stride[i] != 1) {
+        if (res.shape[i] == 1) {
             res.remove_axis_inplace(i);
         }
     }
diff --git a/dnn/src/common/utils.cpp b/dnn/src/common/utils.cpp
index 5609d2804b018251159b5fdba7470362508bf399..6d340de6e5671d0b00f57c76ef89ebdb37d428cf 100644
--- a/dnn/src/common/utils.cpp
+++ b/dnn/src/common/utils.cpp
@@ -232,6 +232,7 @@ float megdnn::mul_scale(DType lhs, DType rhs) {
         (rhs.enumv() == DTypeTrait<dt2>::enumv))                    \
         return lhs.param<dt1>().scale * rhs.param<dt2>().scale;
     cb_binary(::megdnn::dtype::QuantizedS8, ::megdnn::dtype::QuantizedS16)
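The new cb_binary(Quantized4Asymm, QuantizedS4) entry added just below, like the relaxed dtype asserts above, relies on a simple piece of algebra: when an asymmetric uint4 activation meets a symmetric int4 weight, the accumulator scale is still the plain product of the two scales, and the activation zero point only contributes a correction proportional to the sum of the weights (the quantity the reduce_with_scale_filter kernels later in this patch precompute). A minimal standalone sketch of that identity, illustrative only and not MegEngine code:

// Illustration: Quantized4Asymm activation (q_a, zero point zp, scale s_a)
// times QuantizedS4 weight (q_w, scale s_w) accumulates with scale s_a * s_w,
// plus a zero-point term that depends only on the weight sum.
#include <cassert>
#include <cmath>
#include <cstdint>
#include <vector>

int main() {
    const float s_a = 0.1f, s_w = 0.1f;  // activation / weight scales
    const int32_t zp = 8;                // activation zero point (uint4 asymm)
    std::vector<int32_t> q_a = {0, 3, 15, 8, 7, 12};  // uint4 activations
    std::vector<int32_t> q_w = {-8, -1, 0, 2, 5, 7};  // int4 weights

    // Reference path: dequantize, then multiply-accumulate in float.
    float ref = 0.f;
    for (size_t i = 0; i < q_a.size(); ++i)
        ref += s_a * float(q_a[i] - zp) * s_w * float(q_w[i]);

    // Integer path: raw int32 dot product plus zero-point compensation,
    // rescaled by the product of scales (cf. mul_scale).
    int32_t acc = 0, w_sum = 0;
    for (size_t i = 0; i < q_a.size(); ++i) {
        acc += q_a[i] * q_w[i];
        w_sum += q_w[i];
    }
    float out = s_a * s_w * float(acc - zp * w_sum);

    assert(std::fabs(ref - out) < 1e-4f);
    return 0;
}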
+ cb_binary(::megdnn::dtype::Quantized4Asymm, ::megdnn::dtype::QuantizedS4) #undef cb_binary megdnn_assert(lhs.enumv() == rhs.enumv()); diff --git a/dnn/src/cuda/conv_bias/algo.h b/dnn/src/cuda/conv_bias/algo.h index e25973500ea10a6c17e39a5af139a02f4523afdc..4cfdd6ed68164d1b887bf4c86aa154b2cb5b9f74 100644 --- a/dnn/src/cuda/conv_bias/algo.h +++ b/dnn/src/cuda/conv_bias/algo.h @@ -66,7 +66,8 @@ public: CUDA_IMPLICIT_GEMM_1X1_SASS_NCHW4_DOTPROD_INT8, CUDA_IMPLICIT_GEMM_SASS_NCHW32_IMMA_INT8, CUDA_IMPLICIT_GEMM_1X1_SASS_NCHW32_IMMA_INT8, - CUDA_IMPLICIT_GEMM_SASS_NCHW64_IMMA_INT4, + CUDA_IMPLICIT_GEMM_SASS_NCHW64_IMMA_INT4_INT4, + CUDA_IMPLICIT_GEMM_SASS_NCHW64_IMMA_UINT4_INT4, }; using Mapper = std::unordered_map; diff --git a/dnn/src/cuda/conv_bias/quint4x4x32_wmma.cpp b/dnn/src/cuda/conv_bias/quint4x4x32_wmma.cpp index a9bed4fb878df2980b43a21041e9eb306243f82c..3dab44629ad766c13287dfac70c1f989bcdf7f19 100644 --- a/dnn/src/cuda/conv_bias/quint4x4x32_wmma.cpp +++ b/dnn/src/cuda/conv_bias/quint4x4x32_wmma.cpp @@ -15,7 +15,7 @@ #include "./quint4x4x32_wmma/activation_u4.cuh" #include "./quint4x4x32_wmma/reduce_with_scale_data.cuh" -#include "./quint4x4x32_wmma/reduce_with_scale_filter.cuh" +#include "./reduce_with_scale_filter.cuh" #include "./quint4x4x32_wmma/wmma_conv_integer_u4.cuh" using namespace megdnn; @@ -75,7 +75,7 @@ WorkspaceBundle ConvBiasForwardImpl::AlgoQUInt4x4x32WMMA::get_workspace_bundle( // for reduce filter { size_t A = OC, B = IC * FH * FW / 8, C = 1; - ws_size_zp_filter += _do_dispatch_reduce_workspace_in_bytes(A, B, C); + ws_size_zp_filter += do_dispatch_reduce_workspace_in_bytes(A, B, C); } size_t ws_size_zp_data = N * OH * OW * sizeof(int32_t); size_t ws_size_relayout_filter = get_workspace_in_bytes_do_conv(args); @@ -135,11 +135,11 @@ void ConvBiasForwardImpl::AlgoQUInt4x4x32WMMA::exec( int32_t zp_data_filter = zp_data * zp_filter * FH * FW * IC; auto&& stream = cuda_stream(handle); // zp filter - _do_dispatch_reduce_with_scale_filter_u4( + do_dispatch_reduce_with_scale_filter_4bit( static_cast(args.filter_tensor->raw_ptr), -zp_data, OC, FH * FW * IC / 8, ws_zp_filter.ptr(), stream); // zp data - _do_dispatch_reduce_with_scale_data_u4( + do_dispatch_reduce_with_scale_data_u4( ws_zp_data.ptr(), static_cast(args.src_tensor->raw_ptr), N, IH, IW, OH, OW, PH, PW, FH, FW, SH, SW, IC, -zp_filter, @@ -173,12 +173,12 @@ void ConvBiasForwardImpl::AlgoQUInt4x4x32WMMA::exec( args.bias_tensor->compatible_ptr(), s0, s1, s2, s3}; auto&& param = args.opr->param(); if (param.nonlineMode == Param::NonlineMode::RELU) { - _do_dispatch_activation_u4( + do_dispatch_activation_u4( args.dst_tensor->compatible_ptr(), visitor, ws_zp_data.ptr(), ws_zp_filter.ptr(), zp_data_filter, N, OC, OH, OW, stream); } else if (param.nonlineMode == Param::NonlineMode::IDENTITY) { - _do_dispatch_activation_u4( + do_dispatch_activation_u4( args.dst_tensor->compatible_ptr(), visitor, ws_zp_data.ptr(), ws_zp_filter.ptr(), zp_data_filter, N, OC, OH, OW, stream); diff --git a/dnn/src/cuda/conv_bias/quint4x4x32_wmma/activation_u4.cu b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/activation_u4.cu index aaf42b198300be797dbab4ae634c211c7c986855..83fc87e32e21bdfe2c6c5bdc25fc5d6df2f1121a 100644 --- a/dnn/src/cuda/conv_bias/quint4x4x32_wmma/activation_u4.cu +++ b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/activation_u4.cu @@ -87,11 +87,10 @@ __global__ void kern_activation_u4(int32_t* dst, const int32_t* zp_data, } // namespace template -void _do_dispatch_activation_u4(int32_t* dst, BiasVisitor visitor, - const int32_t* zp_data, 
- const int32_t* zp_filter, - int32_t zp_data_filter, int batch_size, int co, - int ho, int wo, cudaStream_t stream) { +void do_dispatch_activation_u4(int32_t* dst, BiasVisitor visitor, + const int32_t* zp_data, const int32_t* zp_filter, + int32_t zp_data_filter, int batch_size, int co, + int ho, int wo, cudaStream_t stream) { void (*fptr)(int32_t*, const int32_t*, const int32_t*, int32_t, int, int OC, int, int, BiasVisitor) = kern_activation_u4; dim3 grids{0, 0, 0}; @@ -105,7 +104,7 @@ void _do_dispatch_activation_u4(int32_t* dst, BiasVisitor visitor, } #define INST(_op) \ - template void _do_dispatch_activation_u4<_op>( \ + template void do_dispatch_activation_u4<_op>( \ int32_t * dst, BiasVisitor visitor, const int32_t* zp_data, \ const int32_t* zp_filter, int32_t zp_data_filter, int batch_size, \ int co, int ho, int wo, cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/quint4x4x32_wmma/activation_u4.cuh b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/activation_u4.cuh index ee4e482d019c922dc780f24e7bbd43bfc954ad8e..162f031fbf0b05c248a217fe3a7db24a24077a52 100644 --- a/dnn/src/cuda/conv_bias/quint4x4x32_wmma/activation_u4.cuh +++ b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/activation_u4.cuh @@ -82,12 +82,10 @@ struct ActivationIdentity { } // namespace activation_u4 template -void _do_dispatch_activation_u4(int32_t* dst, - activation_u4::BiasVisitor visitor, - const int32_t* zp_data, - const int32_t* zp_filter, - int32_t zp_data_filter, int batch_size, int co, - int ho, int wo, cudaStream_t stream); +void do_dispatch_activation_u4(int32_t* dst, activation_u4::BiasVisitor visitor, + const int32_t* zp_data, const int32_t* zp_filter, + int32_t zp_data_filter, int batch_size, int co, + int ho, int wo, cudaStream_t stream); } // namespace cuda } // namespace megdnn diff --git a/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_data.cu b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_data.cu index e8f2ab10b9f830a1cf034a9f1dd00b133725d291..99853cf2bad1fe72cc8e2db4b8f14b7609c03f38 100644 --- a/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_data.cu +++ b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_data.cu @@ -444,7 +444,7 @@ reduce_in_spatial_block_and_along_input_channel_with_scale_u4_large_channels( } // namespace -void megdnn::cuda::_do_dispatch_reduce_with_scale_data_u4( +void megdnn::cuda::do_dispatch_reduce_with_scale_data_u4( int32_t* dst, const uint8_t* src, int batch_size, int ih, int iw, int oh, int ow, int ph, int pw, int fh, int fw, int sh, int sw, int ic, int32_t scale, uint8_t zp_data, cudaStream_t stream) { diff --git a/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_data.cuh b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_data.cuh index 3136795cd8de1a138adafbab3996b9ae6a5dc9ab..80b005635951c7929a7ba61038c5d98f7e1be433 100644 --- a/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_data.cuh +++ b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_data.cuh @@ -37,7 +37,7 @@ namespace megdnn { namespace cuda { -void _do_dispatch_reduce_with_scale_data_u4( +void do_dispatch_reduce_with_scale_data_u4( int32_t* dst, const uint8_t* src, int batch_size, int ih, int iw, int oh, int ow, int ph, int pw, int fh, int fw, int sh, int sw, int ic, int32_t scale, uint8_t zp_data, cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_filter.cu b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_filter.cu deleted file mode 100644 index 
280d9b39f2cdeb4d075ed520abf9beece2b8b9d6..0000000000000000000000000000000000000000 --- a/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_filter.cu +++ /dev/null @@ -1,100 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, are permitted - * provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright notice, this list of - * conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright notice, this list of - * conditions and the following disclaimer in the documentation and/or other materials - * provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used - * to endorse or promote products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/** - * \file dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_filter.cu - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ - -#include "./reduce_with_scale_filter.cuh" -#include "src/cuda/reduce_helper.cuh" - -using namespace megdnn; -using namespace cuda; - -namespace { - -struct ReduceWithScaleUInt4Op { - typedef int32_t wtype; - const uint8_t* src; - int32_t* dst; - int32_t scale; - static const wtype INIT = 0; - -#if MEGDNN_CC_CUDA - __host__ __device__ void write(uint32_t idx, wtype val) { - dst[idx] = val * scale; - } - - __host__ __device__ static wtype apply(wtype a, wtype b) { return a + b; } - - __device__ wtype read(uint32_t idx) { - constexpr uint32_t subbytes_per_pixel = 8; - const uint32_t* sptr = - (const uint32_t*)(src + subbytes_per_pixel * idx / 2); - uint32_t val = *sptr; - int32_t ret = 0; -#pragma unroll - for (int j = 0; j < 8; j++) { - uint8_t cur = (val & 0xF); - ret += cur; - val = (val >> 4); - } - return ret; - } -#endif -}; - -} // namespace - -void megdnn::cuda::_do_dispatch_reduce_with_scale_filter_u4( - const uint8_t* src, int32_t scale, uint32_t rows, uint32_t cols, - int32_t* dst, cudaStream_t stream) { - // rows = OC - // cols is measured in pixels, i.e. 
IC * FH * FW / 8, a pixel consists of 8 - // subbyte data, - ReduceWithScaleUInt4Op op; - op.src = src; - op.scale = scale; - op.dst = dst; - static_cast(op); - static_cast(stream); - static_cast(rows); - static_cast(cols); - run_reduce(dst + rows, rows, cols, 1, stream, - op); -} - -size_t megdnn::cuda::_do_dispatch_reduce_workspace_in_bytes(size_t A, size_t B, - size_t C) { - return get_reduce_workspace_in_bytes(A, B, C); -} - -// vim: ft=cpp syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_filter.cuh b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_filter.cuh deleted file mode 100644 index 9074f4eb75e33f508a3c5091261d1d31f5ef955c..0000000000000000000000000000000000000000 --- a/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_filter.cuh +++ /dev/null @@ -1,48 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, are permitted - * provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright notice, this list of - * conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright notice, this list of - * conditions and the following disclaimer in the documentation and/or other materials - * provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used - * to endorse or promote products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/** - * \file dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_filter.cuh - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- */ - -#include "src/cuda/utils.cuh" - -namespace megdnn { -namespace cuda { -void _do_dispatch_reduce_with_scale_filter_u4(const uint8_t* src, int32_t scale, - uint32_t rows, uint32_t cols, - int32_t* dst, - cudaStream_t stream); -size_t _do_dispatch_reduce_workspace_in_bytes(size_t A, size_t B, size_t C); -} // namespace cuda -} // namespace megdnn - -// vim: ft=cpp syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/reduce_with_scale_filter.cu b/dnn/src/cuda/conv_bias/reduce_with_scale_filter.cu new file mode 100644 index 0000000000000000000000000000000000000000..0fa352bf4d2bec3640ea9d4c7c62d0bc515e4c59 --- /dev/null +++ b/dnn/src/cuda/conv_bias/reduce_with_scale_filter.cu @@ -0,0 +1,114 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + *modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + *notice, this list of conditions and the following disclaimer in the + *documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its + *contributors may be used to endorse or promote products derived from this + *software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + *DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, + *INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + *DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + *OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TOR (INCLUDING + *NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + *EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/conv_bias/reduce_with_scale_filter.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ + +#include "./reduce_with_scale_filter.cuh" +#include "src/cuda/reduce_helper.cuh" +#include "src/cuda/integer_subbyte_utils.cuh" + +using namespace megdnn; +using namespace cuda; + +namespace { + +template +struct ReduceWithScaleInt4Op { + typedef int32_t wtype; + const uint8_t* src; + int32_t* dst; + int32_t scale; + static const wtype INIT = 0; + +#if MEGDNN_CC_CUDA + __host__ __device__ void write(uint32_t idx, wtype val) { + dst[idx] = val * scale; + } + + __host__ __device__ static wtype apply(wtype a, wtype b) { return a + b; } + + __device__ wtype read(uint32_t idx) { + constexpr uint32_t subbytes_per_pixel = 8; + const uint32_t* sptr = + (const uint32_t*)(src + subbytes_per_pixel * idx / 2); + uint32_t val = *sptr; + int32_t ret = 0; +#pragma unroll + for (int j = 0; j < 8; j++) { + ret += integer_subbyte::unpack_integer_4bits(val, + (j << 2)); + } + return ret; + } +#endif +}; + +} // namespace + +template +void megdnn::cuda::do_dispatch_reduce_with_scale_filter_4bit( + const uint8_t* src, int32_t scale, uint32_t rows, uint32_t cols, + int32_t* dst, cudaStream_t stream) { + // rows = OC + // cols is measured in pixels, i.e. IC * FH * FW / 8, a pixel consists of 8 + // subbyte data, + ReduceWithScaleInt4Op op; + op.src = src; + op.scale = scale; + op.dst = dst; + static_cast(op); + static_cast(stream); + static_cast(rows); + static_cast(cols); + run_reduce, false>(dst + rows, rows, cols, + 1, stream, op); +} + +#define INST(signedness) \ + template void \ + megdnn::cuda::do_dispatch_reduce_with_scale_filter_4bit( \ + const uint8_t* src, int32_t scale, uint32_t rows, uint32_t cols, \ + int32_t* dst, cudaStream_t stream) +INST(false); +INST(true); +#undef INST + +size_t megdnn::cuda::do_dispatch_reduce_workspace_in_bytes(size_t A, size_t B, + size_t C) { + return get_reduce_workspace_in_bytes>(A, B, C); +} + +// vim: ft=cpp syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/reduce_with_scale_filter.cuh b/dnn/src/cuda/conv_bias/reduce_with_scale_filter.cuh new file mode 100644 index 0000000000000000000000000000000000000000..595b3765fa259de9a9d9d1ceb7d5c488ad84af04 --- /dev/null +++ b/dnn/src/cuda/conv_bias/reduce_with_scale_filter.cuh @@ -0,0 +1,52 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + *modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + *notice, this list of conditions and the following disclaimer in the + *documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its + *contributors may be used to endorse or promote products derived from this + *software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + *DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, + *INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + *DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + *OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TOR (INCLUDING + *NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + *EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/conv_bias/reduce_with_scale_filter.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +template +void do_dispatch_reduce_with_scale_filter_4bit(const uint8_t* src, + int32_t scale, uint32_t rows, + uint32_t cols, int32_t* dst, + cudaStream_t stream); +size_t do_dispatch_reduce_workspace_in_bytes(size_t A, size_t B, size_t C); +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/sass_implicit_gemm_int4_nchw64_imma.cpp b/dnn/src/cuda/conv_bias/sass_implicit_gemm_int4_nchw64_imma.cpp deleted file mode 100644 index 4899531e894d0b805cf9870ca17242033ce9260a..0000000000000000000000000000000000000000 --- a/dnn/src/cuda/conv_bias/sass_implicit_gemm_int4_nchw64_imma.cpp +++ /dev/null @@ -1,312 +0,0 @@ -/** - * \file dnn/src/cuda/conv_bias/sass_implicit_gemm_int4_nchw64_imma.cpp - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or - * implied. 
- */ - -#include "./algo.h" -#include "src/cuda/conv_bias/sass_helper.cuh" -#include "src/cuda/sass_loader.h" -#include "src/cuda/utils.h" -#include "src/common/conv_bias.h" - -using namespace megdnn; -using namespace cuda; -using namespace sass; - -namespace { -#if !MEGDNN_TEGRA_X1 -// all stride are in bytes -void compute_conv2d_offset(size_t fh, size_t fw, size_t ics, size_t ihs, - Conv2dConstantOffset& constant_offset) { - constexpr int interleaved = 64; - constexpr int size_bits = 4; - constexpr int threablock_k = 128; - constexpr int inc_step = threablock_k / interleaved; - size_t i = 0; - int* s32 = reinterpret_cast(&(constant_offset.c_offset[0])); - for (; i < inc_step; i++) { - int c = i / (fh * fw); - int khkw = i % (fh * fw); - int kh = khkw / fw; - int kw = khkw % fw; - s32[2 * i] = c * ics + kh * ihs + kw * interleaved * size_bits / 8; - int8_t* s8 = reinterpret_cast(&(s32[2 * i + 1])); - s8[0] = kh; - s8[1] = kw; - s8[2] = -kh; - s8[3] = -kw; - } - for (; i < (inc_step + fh * fw * inc_step); i++) { - int c = i / (fh * fw); - int khkw = i % (fh * fw); - int kh = khkw / fw; - int kw = khkw % fw; - s32[2 * i] = c * ics + kh * ihs + kw * interleaved * size_bits / 8; - int8_t* s8 = reinterpret_cast(&(s32[2 * i + 1])); - s8[0] = kh; - s8[1] = kw; - s8[2] = -kh; - s8[3] = -kw; - int i_ = i - inc_step; - c = i_ / (fh * fw); - khkw = i_ % (fh * fw); - kh = khkw / fw; - kw = khkw % fw; - s32[2 * i] -= c * ics + kh * ihs + kw * interleaved * size_bits / 8; - } -} -#endif -}; // namespace - -std::string ConvBiasForwardImpl::AlgoSASSInt4NCHW64IMMAImplicitGemm::kernel_key( - const SizeArgs& args) const { - std::string kernel_key; - using NonlineMode = Param::NonlineMode; - auto&& param = args.opr->param(); - if (args.z_layout->ndim > 0) { - kernel_key = - ssprintf("%s_conv_bias_int4_fuse_z_imma8832_ldg16_%ux%u", - current_device_arch_name(), m_tile_nhw, m_tile_oc); - } else { - kernel_key = - ssprintf("%s_conv_bias_int4_imma8832_ldg16_%ux%u", - current_device_arch_name(), m_tile_nhw, m_tile_oc); - } - if (param.nonlineMode == NonlineMode::H_SWISH) { - kernel_key += "_hswish"; - } else { - megdnn_assert(param.nonlineMode == NonlineMode::RELU || - param.nonlineMode == NonlineMode::IDENTITY); - kernel_key += "_relu"; - } - return kernel_key; -} - -bool ConvBiasForwardImpl::AlgoSASSInt4NCHW64IMMAImplicitGemm::is_available( - const SizeArgs& args) const { - if (args.bias_layout->ndim <= 0) - return false; - using Param = param::ConvBias; - using Format = Param::Format; - using Sparse = Param::Sparse; - using Mode = Param::Mode; - bool available = true; - auto&& param = args.opr->param(); - auto&& fm = args.filter_meta; - if (!check_bias_share_in_channel(*(args.bias_layout), param.format)) - return false; - if (param.format != Format::NCHW64) - return false; - UNPACK_CONV_BIAS_NCHW64_PARAM(*(args.src_layout), fm, *(args.dst_layout), - param); - // TODO support group conv - available &= param.sparse == Sparse::DENSE; - // mode must be cross correlation - available &= param.mode == Mode::CROSS_CORRELATION; - // check data type - auto src_dtype = args.src_layout->dtype, - filter_dtype = args.filter_layout->dtype, - bias_dtype = args.bias_layout->dtype, - dst_dtype = args.dst_layout->dtype; - available &= (src_dtype.enumv() == DTypeEnum::QuantizedS4 && - filter_dtype.enumv() == DTypeEnum::QuantizedS4 && - bias_dtype.enumv() == DTypeEnum::QuantizedS32 && - dst_dtype.enumv() == DTypeEnum::QuantizedS4); - // TODO: support dialtion - available &= dh == 1 && dw == 1; - // ensure precomputed offsets are 
positive integers - available &= hi >= fh && wi >= fw; - // only support sm_75 or later, platform should have tensorcore int8 - // support - available &= is_compute_capability_required(7, 5); - // param buffer size is 4K, use 3K to store precomputed offset, fh * fw <= - // (3*1024/4/2/2) - 1 - available &= fh * fw <= 191; - return available; -} - -size_t -ConvBiasForwardImpl::AlgoSASSInt4NCHW64IMMAImplicitGemm::get_workspace_in_bytes( - const SizeArgs& args) const { - if (args.preprocessed_filter == nullptr) { - return args.filter_layout->span().dist_byte() + - args.bias_layout->span().dist_byte(); - } - return 0_z; -} - -void ConvBiasForwardImpl::AlgoSASSInt4NCHW64IMMAImplicitGemm::exec( - const ExecArgs& args) const { -#if MEGDNN_TEGRA_X1 - megdnn_throw("sass kernel is disabled at compile time for TX1"); -#else - using Format = Param::Format; - auto&& param = args.opr->param(); - auto&& fm = args.filter_meta; - UNPACK_CONV_BIAS_NCHW64_PARAM(*(args.src_layout), fm, *(args.dst_layout), - param); - auto&& stream = cuda_stream(args.opr->handle()); - constexpr int interleaved = 64; - - void* bias_ptr = nullptr; - void* filter_ptr = nullptr; - if (args.preprocessed_filter) { - megdnn_assert(args.preprocessed_filter->tensors.size() == 2); - filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr; - bias_ptr = args.preprocessed_filter->tensors[1].raw_ptr; - } else { - // reorder filter and bias - filter_ptr = reinterpret_cast(args.workspace.raw_ptr); - bias_ptr = - reinterpret_cast(args.workspace.raw_ptr + - args.filter_layout->span().dist_byte()); - if (args.z_layout->ndim > 0) { - reorder_imma_filter_bias<4, 64>( - reinterpret_cast(filter_ptr), - reinterpret_cast(bias_ptr), - reinterpret_cast(args.filter_tensor->raw_ptr), - args.bias_tensor->compatible_ptr(), co, ci, fh, fw, - stream); - } else { - reorder_imma_filter_bias<4, 64, true>( - reinterpret_cast(filter_ptr), - reinterpret_cast(bias_ptr), - reinterpret_cast(args.filter_tensor->raw_ptr), - args.bias_tensor->compatible_ptr(), co, ci, fh, fw, - stream); - } - } - - uint32_t u32_n = n, u32_ci = ci, u32_hi = hi, u32_wi = wi, u32_fh = fh, - u32_fw = fw, u32_sh = sh, u32_sw = sw, u32_ph = ph, u32_pw = pw, - u32_co = co, u32_ho = ho, u32_wo = wo; - Conv2dInt4Param kern_param(u32_n, u32_ci, u32_hi, u32_wi, u32_fh, u32_fw, - u32_sh, u32_sw, u32_ph, u32_pw, u32_co, u32_ho, - u32_wo, interleaved); - - Conv2dConstantOffset kern_coffset; - compute_conv2d_offset(fh, fw, kern_param.ics, kern_param.ihs, kern_coffset); - // The starting address of Turing param buffer is c[0x0][0x160] - kern_coffset.c_offset_param.begin = param_buffer_start_address(); - kern_coffset.c_offset_param.size = 16 * (1 + fh * fw); - kern_coffset.c_offset_param.max = 16 * fh * fw; - kern_coffset.c_offset_param.rewind = 16 * (1 - fh * fw); - - auto kern_key = kernel_key(args); - float src_scale = args.src_layout->dtype.param().scale, - filter_scale = - args.filter_layout->dtype.param().scale, - bias_scale = - args.bias_layout->dtype.param().scale, - dst_scale = args.dst_layout->dtype.param().scale; - float alpha = src_scale * filter_scale / dst_scale, - beta = bias_scale / dst_scale; - float inv_dst_scale = 1.f / dst_scale; - - unsigned int tx = m_threads, ty = 1; - unsigned int gridx = div_ceil( - static_cast(n * ho * wo), m_tile_nhw); - unsigned int gridy = - div_ceil(static_cast(co), m_tile_oc); - void* src_ptr = const_cast(args.src_tensor->raw_ptr); - void* dst_ptr = const_cast(args.dst_tensor->raw_ptr); - - using NonlineMode = Param::NonlineMode; - auto&& kernel = 
SASSKernelLoader::instance().get_kernel(kern_key, kern_key); - if (args.z_layout->ndim > 0) { - void* z_ptr = const_cast(args.z_tensor->raw_ptr); - float z_scale = args.z_layout->dtype.param().scale; - float gamma = z_scale / dst_scale; - std::vector params = {&src_ptr, &filter_ptr, &bias_ptr, &z_ptr, - &dst_ptr, &alpha, &beta, &gamma}; - kern_coffset.c_offset_param.begin += - sizeof(src_ptr) + sizeof(filter_ptr) + sizeof(bias_ptr) + - sizeof(z_ptr) + sizeof(dst_ptr) + sizeof(alpha) + sizeof(beta) + - sizeof(gamma); - - uint32_t relu = param.nonlineMode == NonlineMode::RELU ? 1 : 0; - if (param.nonlineMode == NonlineMode::H_SWISH) { - params.push_back(&dst_scale); - params.push_back(&inv_dst_scale); - kern_coffset.c_offset_param.begin += - sizeof(dst_scale) + sizeof(inv_dst_scale); - } else { - params.push_back(&relu); - kern_coffset.c_offset_param.begin += sizeof(relu); - } - params.push_back(&kern_param); - kern_coffset.c_offset_param.begin += sizeof(kern_param); - kern_coffset.c_offset_param.begin += - sizeof(kern_coffset.c_offset_param); - kern_coffset.c_offset_param.max += kern_coffset.c_offset_param.begin; - params.push_back(&kern_coffset); - cucheck(cuLaunchKernel(kernel, gridx, gridy, 1, tx, ty, 1, 0, stream, - params.data(), 0)); - } else { - std::vector params = {&src_ptr, &filter_ptr, &bias_ptr, - &dst_ptr, &alpha, &beta}; - - kern_coffset.c_offset_param.begin += - sizeof(src_ptr) + sizeof(filter_ptr) + sizeof(bias_ptr) + - sizeof(dst_ptr) + sizeof(alpha) + sizeof(beta); - - uint32_t relu = param.nonlineMode == NonlineMode::RELU ? 1 : 0; - if (param.nonlineMode == NonlineMode::H_SWISH) { - params.push_back(&dst_scale); - params.push_back(&inv_dst_scale); - kern_coffset.c_offset_param.begin += - sizeof(dst_scale) + sizeof(inv_dst_scale); - } else { - params.push_back(&relu); - kern_coffset.c_offset_param.begin += sizeof(relu); - } - params.push_back(&kern_param); - kern_coffset.c_offset_param.begin += sizeof(kern_param); - kern_coffset.c_offset_param.begin += - sizeof(kern_coffset.c_offset_param); - kern_coffset.c_offset_param.max += kern_coffset.c_offset_param.begin; - params.push_back(&kern_coffset); - cucheck(cuLaunchKernel(kernel, gridx, gridy, 1, tx, ty, 1, 0, stream, - params.data(), 0)); - } - after_kernel_launch(); -#endif -} - -size_t ConvBiasForwardImpl::AlgoSASSInt4NCHW64IMMAImplicitGemm:: - get_preprocess_workspace_in_bytes(const SizeArgs& args) const { - return 0_z; -} - -SmallVector ConvBiasForwardImpl:: - AlgoSASSInt4NCHW64IMMAImplicitGemm::deduce_preprocessed_filter_layout( - const SizeArgs& args) const { - return {args.filter_layout->collapse_contiguous(), - args.bias_layout->collapse_contiguous()}; -} - -void ConvBiasForwardImpl::AlgoSASSInt4NCHW64IMMAImplicitGemm::exec_preprocess( - const ExecArgs& args) const { - using Format = Param::Format; - auto&& param = args.opr->param(); - auto&& fm = args.filter_meta; - UNPACK_CONV_BIAS_NCHW64_PARAM(*(args.src_layout), fm, *(args.dst_layout), - param); - auto&& stream = cuda_stream(args.opr->handle()); - reorder_imma_filter_bias<4, 64>( - reinterpret_cast( - args.preprocessed_filter->tensors[0].raw_ptr), - args.preprocessed_filter->tensors[1].compatible_ptr(), - reinterpret_cast(args.filter_tensor->raw_ptr), - args.bias_tensor->compatible_ptr(), co, ci, fh, fw, - stream); -} - -// vim: syntax=cpp.doxygen diff --git a/dnn/src/naive/conv_bias/opr_impl.cpp b/dnn/src/naive/conv_bias/opr_impl.cpp index 227353dec1bd8eb78fb134e801e9c9d55281b11f..20fa04e4bca3542e4fb93576062810356048b81b 100644 --- 
a/dnn/src/naive/conv_bias/opr_impl.cpp +++ b/dnn/src/naive/conv_bias/opr_impl.cpp @@ -161,6 +161,38 @@ void forward_bias( forward_bias( new_src, new_flt, bias, dst, nullptr, new_filter_meta); } + +template <> +void forward_bias( + _megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_in bias, + _megdnn_tensor_out dst, dt_byte* workspace_ptr, + const ConvBiasForward::CanonizedFilterMeta& filter_meta) { + auto convert_layout_src = [](const TensorLayout& layout) { + auto ret = layout; + auto param = layout.dtype.param(); + ret.dtype = dtype::QuantizedS8(param.scale); + ret.format = TensorFormat(ret.dtype); + ret.init_contiguous_stride(); + return ret; + }; + auto convert_layout_flt = [](const TensorLayout& layout) { + auto ret = layout; + auto param = layout.dtype.param(); + ret.dtype = dtype::QuantizedS8(param.scale); + ret.format = TensorFormat(ret.dtype); + ret.init_contiguous_stride(); + return ret; + }; + TensorND new_src = {workspace_ptr, convert_layout_src(src.layout)}; + TensorND new_flt = {workspace_ptr + new_src.layout.span().dist_byte(), + convert_layout_flt(filter.layout)}; + uint4_to_int8(src, new_src); + int4_to_int8(filter, new_flt); + auto new_filter_meta = filter_meta; + new_filter_meta.dtype = new_flt.layout.dtype; + forward_bias( + new_src, new_flt, bias, dst, nullptr, new_filter_meta); +} } // namespace convolution size_t ConvBiasForwardImpl::get_workspace_in_bytes(const TensorLayout& src, @@ -211,9 +243,10 @@ void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, TensorLayout{dst.layout, bias.layout.dtype}}; workspace_ptr += sfb.layout.span().dist_byte(); } -#define DISPATCH_RAW(in_dt, bias_dt, out_dt, cmode, func) \ +#define DISPATCH_RAW(in_dt, flt_dt, bias_dt, out_dt, cmode, func) \ else if (src.layout.dtype.enumv() == DTypeTrait::enumv && \ - filter.layout.dtype.enumv() == DTypeTrait::enumv && \ + filter.layout.dtype.enumv() == \ + DTypeTrait::enumv && \ bias.layout.dtype.enumv() == DTypeTrait::enumv && \ sfb.layout.dtype.enumv() == DTypeTrait::enumv && \ param().compute_mode == Param::ComputeMode::cmode) { \ @@ -222,7 +255,7 @@ void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, } #define DISPATCH(in_dt, out_dt) \ DISPATCH_RAW( \ - in_dt, out_dt, out_dt, DEFAULT, \ + in_dt, in_dt, out_dt, out_dt, DEFAULT, \ (convolution::forward_bias::ctype, \ DTypeTrait::ctype, \ DTypeTrait::ctype, \ @@ -236,16 +269,21 @@ void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, DISPATCH(QuantizedS8, Float32) DISPATCH(Quantized8Asymm, QuantizedS32) DISPATCH(Quantized4Asymm, QuantizedS32) - DISPATCH_RAW(QuantizedS8, QuantizedS32, QuantizedS32, FLOAT32, + DISPATCH_RAW(QuantizedS8, QuantizedS8, QuantizedS32, QuantizedS32, + FLOAT32, (convolution::forward_bias)) DISPATCH(QuantizedS4, QuantizedS32) + DISPATCH_RAW(Quantized4Asymm, QuantizedS4, QuantizedS32, QuantizedS32, + DEFAULT, + (convolution::forward_bias)) #if !MEGDNN_DISABLE_FLOAT16 DISPATCH(Float16, Float16) - DISPATCH_RAW(Float16, Float16, Float16, FLOAT32, + DISPATCH_RAW(Float16, Float16, Float16, Float16, FLOAT32, (convolution::forward_bias)) - DISPATCH_RAW(BFloat16, BFloat16, BFloat16, FLOAT32, + DISPATCH_RAW(BFloat16, BFloat16, BFloat16, BFloat16, FLOAT32, (convolution::forward_bias)) #endif diff --git a/dnn/src/naive/lowbit_utils.cpp b/dnn/src/naive/lowbit_utils.cpp index aab9dbbaab939a49970d2d64bfce15ebfdbb75de..0a340b284d9df8d7872ad89d00bf060165dcf12c 100644 --- a/dnn/src/naive/lowbit_utils.cpp +++ b/dnn/src/naive/lowbit_utils.cpp @@ -57,6 
+57,54 @@ void megdnn::naive::uint8_to_uint4(const TensorND& in, const TensorND& out) { } } +void megdnn::naive::uint4_to_int8(const TensorND& in, const TensorND& out) { + auto in_ptr = static_cast(in.raw_ptr) + in.layout.span().low_byte; + auto out_ptr = out.compatible_ptr() + out.layout.span().low_byte; + const auto& ly = in.layout; + int8_t zero_point = + (int8_t)ly.dtype.param().zero_point; + auto dim_in = ly.shape[ly.ndim - 1]; + auto elems = ly.total_nr_elems(); + auto dim_out = elems / dim_in; + auto stride_out = div_ceil(dim_in, 2_z); + for (size_t i = 0; i < dim_out; ++i) { + for (size_t j = 0; j < dim_in; j += 2) { + uint8_t val = in_ptr[j / 2]; + out_ptr[j] = (int8_t)(val & 0xF) - zero_point; + if (j + 1 < dim_in) + out_ptr[j + 1] = (int8_t)((val >> 4) & 0xF) - zero_point; + } + in_ptr += stride_out; + out_ptr += dim_in; + } +} + +void megdnn::naive::int8_to_uint4(const TensorND& in, const TensorND& out) { + auto in_ptr = static_cast(in.raw_ptr) + in.layout.span().low_byte; + auto out_ptr = + static_cast(out.raw_ptr) + out.layout.span().low_byte; + auto zero_point = + out.layout.dtype.param().zero_point; + const auto& ly = in.layout; + auto dim_in = ly.shape[ly.ndim - 1]; + auto elems = ly.total_nr_elems(); + auto dim_out = elems / dim_in; + auto stride_out = div_ceil(dim_in, 2_z); + for (size_t i = 0; i < dim_out; ++i) { + for (size_t j = 0; j < dim_in; j += 2) { + uint8_t a = (uint8_t)std::max((int32_t)in_ptr[j] + zero_point, 0); + uint8_t b = 0; + if (j + 1 < dim_in) + b = (uint8_t)std::max((int32_t)in_ptr[j + 1] + zero_point, 0); + a = std::min(a, DTypeTrait::max()); + b = std::min(b, DTypeTrait::max()); + out_ptr[j / 2] = a + (b << 4); + } + in_ptr += dim_in; + out_ptr += stride_out; + } +} + // ==================================qint4====================================== void megdnn::naive::int4_to_int8(const TensorND& in, const TensorND& out) { auto in_ptr = static_cast(in.raw_ptr) + in.layout.span().low_byte; diff --git a/dnn/src/naive/lowbit_utils.h b/dnn/src/naive/lowbit_utils.h index f4d99e51725b8772b34182f0a7f36f3e398f9b9d..2d7021f798dd50a153a2bfed60a259c96d8a1f3b 100644 --- a/dnn/src/naive/lowbit_utils.h +++ b/dnn/src/naive/lowbit_utils.h @@ -20,6 +20,10 @@ void uint4_to_uint8(const TensorND& in, const TensorND& out); void uint8_to_uint4(const TensorND& in, const TensorND& out); +void uint4_to_int8(const TensorND& in, const TensorND& out); + +void int8_to_uint4(const TensorND& in, const TensorND& out); + void int4_to_int8(const TensorND& in, const TensorND& out); void int8_to_int4(const TensorND& in , const TensorND& out); diff --git a/dnn/test/common/conv_bias.cpp b/dnn/test/common/conv_bias.cpp index b12e4f2e1955f52ec614f9926b59a248c619275e..9ffe92b3ad1ad9d748134c2d2115a47b94674ce5 100644 --- a/dnn/test/common/conv_bias.cpp +++ b/dnn/test/common/conv_bias.cpp @@ -733,19 +733,33 @@ void check_conv_bias(DType src_dtype, DType filter_dtype, DType bias_dtype, param::ConvBias::Format format, const std::vector& args, bool fuse_z, bool stable_test) { - megdnn_assert(src_dtype.enumv() == filter_dtype.enumv()); + megdnn_assert((src_dtype.enumv() == filter_dtype.enumv()) || + (src_dtype.enumv() == DTypeEnum::Quantized4Asymm && + filter_dtype.enumv() == DTypeEnum::QuantizedS4)); Checker checker(handle, !stable_test); if (algo) { checker.set_before_exec_callback( ConvBiasAlgoChecker(algo)); } std::unique_ptr rng; + std::unique_ptr flt_rng; std::unique_ptr bias_rng; std::unique_ptr const_rng; std::unique_ptr zero_rng; // TODO: check range of rng if (src_dtype.enumv() == 
DTypeEnum::QuantizedS8) { rng = std::make_unique(-3, 3); + flt_rng = std::make_unique(-3, 3); + const_rng = std::make_unique(1, 1); + zero_rng = std::make_unique(0, 0); + megdnn_assert(bias_dtype.enumv() == DTypeEnum::QuantizedS32); + bias_rng = std::make_unique(-50, 50); + checker.set_epsilon(1 + 1e-3) + .set_max_avg_error(1e-1) + .set_max_avg_biased_error(1e-3); + } else if (src_dtype.enumv() == DTypeEnum::Quantized4Asymm) { + rng = std::make_unique(0, 6); + flt_rng = std::make_unique(-3, 3); const_rng = std::make_unique(1, 1); zero_rng = std::make_unique(0, 0); megdnn_assert(bias_dtype.enumv() == DTypeEnum::QuantizedS32); @@ -755,6 +769,7 @@ void check_conv_bias(DType src_dtype, DType filter_dtype, DType bias_dtype, .set_max_avg_biased_error(1e-3); } else if (src_dtype.enumv() == DTypeEnum::QuantizedS4) { rng = std::make_unique(-3, 3); + flt_rng = std::make_unique(-3, 3); const_rng = std::make_unique(1, 1); zero_rng = std::make_unique(0, 0); megdnn_assert(bias_dtype.enumv() == DTypeEnum::QuantizedS32); @@ -764,11 +779,13 @@ void check_conv_bias(DType src_dtype, DType filter_dtype, DType bias_dtype, .set_max_avg_biased_error(1e-3); } else if (src_dtype.enumv() == DTypeEnum::Float16) { rng = std::make_unique(2.f); + flt_rng = std::make_unique(2.f); megdnn_assert(bias_dtype.enumv() == DTypeEnum::Float16); bias_rng = std::make_unique(2.f); checker.set_epsilon(1e-2); } else if (src_dtype.enumv() == DTypeEnum::Float32) { rng = std::make_unique(2.f); + flt_rng = std::make_unique(2.f); megdnn_assert(bias_dtype.enumv() == DTypeEnum::Float32); bias_rng = std::make_unique(2.f); } @@ -819,9 +836,9 @@ void check_conv_bias(DType src_dtype, DType filter_dtype, DType bias_dtype, } return z; }; - megdnn_assert(rng != nullptr && bias_rng != nullptr); + megdnn_assert(rng != nullptr && flt_rng != nullptr && bias_rng != nullptr); checker.set_rng(0, rng.get()) - .set_rng(1, rng.get()) + .set_rng(1, flt_rng.get()) .set_rng(2, bias_rng.get()) .set_rng(3, rng.get()); if (stable_test) { diff --git a/dnn/test/cuda/conv_test_utils.cpp b/dnn/test/cuda/conv_test_utils.cpp index ff9ae0dfebd976337bdb4199e68959ee3ae4c671..f6c4abe397fe613c12d1ce9a99064bd3d3de720b 100644 --- a/dnn/test/cuda/conv_test_utils.cpp +++ b/dnn/test/cuda/conv_test_utils.cpp @@ -257,7 +257,9 @@ void benchmark_target_algo_with_cudnn_tsc( param::ConvBias::Format change_cudnn_format, DType change_cudnn_src_dtype, DType change_cudnn_filter_dtype, DType change_cudnn_bias_dtype, DType change_cudnn_dst_dtype) { - megdnn_assert(src_dtype.enumv() == filter_dtype.enumv()); + megdnn_assert((src_dtype.enumv() == filter_dtype.enumv()) || + (src_dtype.enumv() == DTypeEnum::Quantized4Asymm && + filter_dtype.enumv() == DTypeEnum::QuantizedS4)); CUBenchmarker benchmarker(handle); CUBenchmarker benchmarker_cudnn(handle); size_t RUNS = 200; @@ -299,30 +301,30 @@ void benchmark_target_algo_with_cudnn_tsc( using Param = ConvBias::Param; using Format = Param::Format; // helper function to change format - auto get_tensor_shape = [](TensorShape shape, + auto get_tensor_shape = [](TensorShape shape, DType dtype, Format format) -> TensorShape { TensorShape ret; if (format == Format::NCHW4) { ret = static_cast( - TensorLayout{shape, dtype::Int8()} + TensorLayout{shape, dtype} .reshape({shape[0], shape[1] / 4, 4, shape[2], shape[3]}) .dimshuffle({0, 1, 3, 4, 2})); } else if (format == Format::NCHW32) { ret = static_cast( - TensorLayout{shape, dtype::Int8()} + TensorLayout{shape, dtype} .reshape({shape[0], shape[1] / 32, 32, shape[2], shape[3]}) .dimshuffle({0, 1, 3, 4, 
2})); } else if (format == Format::NCHW64) { ret = static_cast( - TensorLayout{shape, dtype::QuantizedS4(1.f)} + TensorLayout{shape, dtype} .reshape({shape[0], shape[1] / 64, 64, shape[2], shape[3]}) .dimshuffle({0, 1, 3, 4, 2})); } else if (format == Format::CHWN4) { ret = static_cast( - TensorLayout{shape, dtype::Int8()} + TensorLayout{shape, dtype} .reshape({shape[0], shape[1] / 4, 4, shape[2], shape[3]}) .dimshuffle({1, 3, 4, 0, 2})); @@ -370,21 +372,24 @@ void benchmark_target_algo_with_cudnn_tsc( if (algo) { time_in_ms = algo_benchmark, - CUTimer>(benchmarker, - {get_tensor_shape(src, format), - get_tensor_shape(filter, format), - get_tensor_shape(bias, format), - {}, - {}}, - algo) / + CUTimer>( + benchmarker, + {get_tensor_shape(src, src_dtype, format), + get_tensor_shape(filter, filter_dtype, format), + get_tensor_shape(bias, bias_dtype, format), + {}, + {}}, + algo) / RUNS; } else { - time_in_ms = benchmarker.execs({get_tensor_shape(src, format), - get_tensor_shape(filter, format), - get_tensor_shape(bias, format), - {}, - {}}) / - RUNS; + time_in_ms = + benchmarker.execs( + {get_tensor_shape(src, src_dtype, format), + get_tensor_shape(filter, filter_dtype, format), + get_tensor_shape(bias, bias_dtype, format), + {}, + {}}) / + RUNS; } float time_in_ms_cudnn = 0; if (with_cudnn) { @@ -393,9 +398,11 @@ void benchmark_target_algo_with_cudnn_tsc( algo_benchmark, CUTimer>( benchmarker_cudnn, - {get_tensor_shape(src, format_cudnn), - get_tensor_shape(filter, format_cudnn), - get_tensor_shape(bias, format_cudnn), + {get_tensor_shape(src, src_dtype, format_cudnn), + get_tensor_shape(filter, filter_dtype, + format_cudnn), + get_tensor_shape(bias, bias_dtype, + format_cudnn), {}, {}}, change_cudnn_algo) / @@ -403,9 +410,11 @@ void benchmark_target_algo_with_cudnn_tsc( } else { time_in_ms_cudnn = benchmarker_cudnn.execs( - {get_tensor_shape(src, format_cudnn), - get_tensor_shape(filter, format_cudnn), - get_tensor_shape(bias, format_cudnn), + {get_tensor_shape(src, src_dtype, format_cudnn), + get_tensor_shape(filter, filter_dtype, + format_cudnn), + get_tensor_shape(bias, bias_dtype, + format_cudnn), {}, {}}) / RUNS; @@ -426,21 +435,24 @@ void benchmark_target_algo_with_cudnn_tsc( if (algo) { time_in_ms = algo_benchmark, - CUTimer>(benchmarker, - {get_tensor_shape(src, format), - get_tensor_shape(filter, format), - get_tensor_shape(bias, format), - get_tensor_shape(z, format), - {}}, - algo) / + CUTimer>( + benchmarker, + {get_tensor_shape(src, src_dtype, format), + get_tensor_shape(filter, filter_dtype, format), + get_tensor_shape(bias, bias_dtype, format), + get_tensor_shape(z, src_dtype, format), + {}}, + algo) / RUNS; } else { - time_in_ms = benchmarker.execs({get_tensor_shape(src, format), - get_tensor_shape(filter, format), - get_tensor_shape(bias, format), - get_tensor_shape(z, format), - {}}) / - RUNS; + time_in_ms = + benchmarker.execs( + {get_tensor_shape(src, src_dtype, format), + get_tensor_shape(filter, filter_dtype, format), + get_tensor_shape(bias, bias_dtype, format), + get_tensor_shape(z, src_dtype, format), + {}}) / + RUNS; } time_in_ms_cudnn = 0; if (with_cudnn) { @@ -449,20 +461,24 @@ void benchmark_target_algo_with_cudnn_tsc( algo_benchmark, CUTimer>( benchmarker_cudnn, - {get_tensor_shape(src, format_cudnn), - get_tensor_shape(filter, format_cudnn), - get_tensor_shape(bias, format_cudnn), - get_tensor_shape(z, format_cudnn), + {get_tensor_shape(src, src_dtype, format_cudnn), + get_tensor_shape(filter, filter_dtype, + format_cudnn), + get_tensor_shape(bias, bias_dtype, 
+ format_cudnn), + get_tensor_shape(z, src_dtype, format_cudnn), {}}, change_cudnn_algo) / RUNS; } else { time_in_ms_cudnn = benchmarker_cudnn.execs( - {get_tensor_shape(src, format_cudnn), - get_tensor_shape(filter, format_cudnn), - get_tensor_shape(bias, format_cudnn), - get_tensor_shape(z, format_cudnn), + {get_tensor_shape(src, src_dtype, format_cudnn), + get_tensor_shape(filter, filter_dtype, + format_cudnn), + get_tensor_shape(bias, bias_dtype, + format_cudnn), + get_tensor_shape(z, src_dtype, format_cudnn), {}}) / RUNS; } diff --git a/dnn/test/naive/conv_bias.cpp b/dnn/test/naive/conv_bias.cpp index b4ae4df07eb6240d5ed6e375a82f243beaaaa6ab..3a6381f51853d1531f66a26970f623d8b732ae76 100644 --- a/dnn/test/naive/conv_bias.cpp +++ b/dnn/test/naive/conv_bias.cpp @@ -746,6 +746,45 @@ TEST_F(NAIVE, CONV_BIAS_QUANTIZED4) { checker.set_param(param).exect(Testcase{input, filter, bias, z, {}}, Testcase{{}, {}, {}, {}, output}); + + // test qu4 x q4 + + for (size_t i = 0; i < input_values.size(); i++) { + input_values[i] = input_values[i] + 8; + } + + for (size_t i = 0; i < z_values.size(); i++) { + z_values[i] = z_values[i] + 8; + } + + std::vector output_uint4; + auto dtype_qu4 = dtype::Quantized4Asymm(0.01, 8); + for (size_t i = 0; i < output_values.size(); i++) { + int result = + static_cast(dtype_qu4.param() + .quantize(output_values.at(i) * 0.01) + .as_uint8()); + output_uint4.push_back(result); + } + + auto input_qu4 = TensorValueLowbit4( + {1, 1, 4, 4}, dtype::Quantized4Asymm(0.1, 8), input_values); + + auto filter_q4 = TensorValueLowbit4({3, 1, 3, 3}, dtype::QuantizedS4(0.1), + filter_values); + + auto bias_s32 = GenTensorValue({1, 3, 1, 1}, dtype::QuantizedS32(0.01), + bias_values); + + auto z_qu4 = TensorValueLowbit4({1, 3, 2, 2}, + dtype::Quantized4Asymm(0.01, 8), z_values); + + auto output_qu4 = TensorValueLowbit4( + {1, 3, 2, 2}, dtype::Quantized4Asymm(0.01, 8), output_uint4); + + checker.set_param(param).exect( + Testcase{input_qu4, filter_q4, bias_s32, z_qu4, {}}, + Testcase{{}, {}, {}, {}, output_qu4}); } TEST_F(NAIVE, CONV_BIAS_NCHW64_Q4) { @@ -3329,7 +3368,7 @@ TEST_F(NAIVE, CONV_BIAS_NCHW64_Q4) { auto input_64 = TensorValueLowbit4({1, 1, 4, 4, 64}, dtype::QuantizedS4(0.1), input_values); - auto fliter_64 = TensorValueLowbit4({64, 1, 3, 3, 64}, + auto filter_64 = TensorValueLowbit4({64, 1, 3, 3, 64}, dtype::QuantizedS4(0.1), filter_values); auto bias1_64 = GenTensorValue({1, 1, 1, 1, 64}, dtype::QuantizedS32(0.01), bias_1); @@ -3338,7 +3377,31 @@ TEST_F(NAIVE, CONV_BIAS_NCHW64_Q4) { {1, 1, 2, 2, 64}, dtype::QuantizedS4(1), output_values); checker.set_param(param).exect( - Testcase{input_64, fliter_64, bias1_64, {}, {}}, + Testcase{input_64, filter_64, bias1_64, {}, {}}, Testcase{{}, {}, {}, {}, output_64}); + + // test qu4 x q4 + + for (size_t i = 0; i < input_values.size(); i++) { + input_values[i] = input_values[i] + 8; + } + + for (size_t i = 0; i < output_values.size(); i++) { + output_values[i] = output_values[i] + 8; + } + + auto input_qu4_64 = TensorValueLowbit4( + {1, 1, 4, 4, 64}, dtype::Quantized4Asymm(0.1, 8), input_values); + auto filter_q4_64 = TensorValueLowbit4( + {64, 1, 3, 3, 64}, dtype::QuantizedS4(0.1), filter_values); + auto bias_64 = + GenTensorValue({1, 1, 1, 1, 64}, dtype::QuantizedS32(0.01), bias_1); + + auto output_q4_64 = TensorValueLowbit4( + {1, 1, 2, 2, 64}, dtype::Quantized4Asymm(1, 8), output_values); + + checker.set_param(param).exect( + Testcase{input_qu4_64, filter_q4_64, bias_64, {}, {}}, + Testcase{{}, {}, {}, {}, output_q4_64}); } // vim: 
syntax=cpp.doxygen
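The naive uint4_to_int8 / int8_to_uint4 helpers added in lowbit_utils.cpp, and the 4-bit test tensors above, share one packing convention: two 4-bit elements per byte, low nibble first, with the Quantized4Asymm zero point subtracted on unpack and re-added (clamped to [0, 15]) on pack. A self-contained round-trip sketch of that layout, illustrative only and not the MegEngine implementation:

// Pack/unpack two 4-bit values per byte with an asymmetric zero point.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<int8_t> unpack_u4(const std::vector<uint8_t>& packed,
                                     size_t n, int8_t zero_point) {
    std::vector<int8_t> out(n);
    for (size_t i = 0; i < n; ++i) {
        uint8_t byte = packed[i / 2];
        uint8_t nibble = (i % 2 == 0) ? (byte & 0xF) : (byte >> 4);
        out[i] = int8_t(nibble) - zero_point;  // uint4 -> zero-centered int8
    }
    return out;
}

static std::vector<uint8_t> pack_u4(const std::vector<int8_t>& vals,
                                    int8_t zero_point) {
    std::vector<uint8_t> out((vals.size() + 1) / 2, 0);
    for (size_t i = 0; i < vals.size(); ++i) {
        // Re-apply the zero point and clamp to the uint4 range [0, 15].
        int32_t q = std::min(std::max(int32_t(vals[i]) + zero_point, 0), 15);
        out[i / 2] |= uint8_t(q) << ((i % 2) * 4);  // even index -> low nibble
    }
    return out;
}

int main() {
    const int8_t zp = 8;
    std::vector<int8_t> vals = {-8, -3, 0, 5, 7, -1};  // representable with zp = 8
    std::vector<uint8_t> packed = pack_u4(vals, zp);
    std::vector<int8_t> round_trip = unpack_u4(packed, vals.size(), zp);
    assert(round_trip == vals);
    return 0;
}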