diff --git a/dnn/src/cuda/conv_bias/algo.cpp b/dnn/src/cuda/conv_bias/algo.cpp index 117a87506761664a8f4396a56780e8dd7c79fa96..5b71c8ec04a401ade4f6331ea6348d234f29d5b4 100644 --- a/dnn/src/cuda/conv_bias/algo.cpp +++ b/dnn/src/cuda/conv_bias/algo.cpp @@ -92,6 +92,12 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() { fill_dwconv_algos(); all_algos.push_back(&int8_chwn4_dotprod); all_algos.push_back(&fallback_nchw_qs8); + + fill_ptx_algos(); + for (auto&& algo : algo_ptx_conv2d_u4_s4) { + all_algos.push_back(&algo); + } + for (size_t i = all_algo_size; i < all_algos.size(); ++i) { non_cudnn_algos.push_back(all_algos[i]); } @@ -364,6 +370,15 @@ void ConvBiasForwardImpl::AlgoPack::fill_dp4a_algos() { int8_nchw4_dotprod.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8, 1, 1, 4, 2}); } +void ConvBiasForwardImpl::AlgoPack::fill_ptx_algos() { + algo_ptx_conv2d_u4_s4.emplace_back( + AlgoPTXUInt4Int4NCHW64IMMAImplicitGemm{128, 256, 256}); + algo_ptx_conv2d_u4_s4.emplace_back( + AlgoPTXUInt4Int4NCHW64IMMAImplicitGemm{128, 128, 128}); + algo_ptx_conv2d_u4_s4.emplace_back( + AlgoPTXUInt4Int4NCHW64IMMAImplicitGemm{256, 64, 128}); +} + ConvBiasForwardImpl::AlgoBase* ConvBiasForwardImpl::AlgoPack::cudnn_conv_from_enum( cudnnConvolutionFwdAlgo_t algo) { for (auto&& i : cudnn_convs) { diff --git a/dnn/src/cuda/conv_bias/algo.h b/dnn/src/cuda/conv_bias/algo.h index f57ed2196af7c5b496ba78a73fd11dbc46bee11c..57e5344f8dc9b6024a7e64ca1d08b825e77df94b 100644 --- a/dnn/src/cuda/conv_bias/algo.h +++ b/dnn/src/cuda/conv_bias/algo.h @@ -78,6 +78,7 @@ public: CUDA_SIMPLE_INT1, CUDA_CUDNN_CONV_V8, CUDA_CUDNN_CONVBIAS_V8, + CUDA_IMPLICIT_GEMM_PTX_NCHW64_IMMA_UINT4_INT4, }; using Mapper = std::unordered_map; @@ -1203,6 +1204,45 @@ private: WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const; }; +class ConvBiasForwardImpl::AlgoPTXUInt4Int4NCHW64IMMAImplicitGemm final + : public AlgoBase { +public: + AlgoPTXUInt4Int4NCHW64IMMAImplicitGemm( + unsigned int tile_nhw, unsigned int tile_oc, unsigned int threads) + : m_tile_nhw{tile_nhw}, m_tile_oc{tile_oc}, m_threads{threads} { + m_name = ConvBias::algo_name( + ssprintf( + "PTX_UINT4_INT4_NCHW64_IMMA_IMPLICIT_GEMM_%uX%u_%u", m_tile_nhw, + m_tile_oc, m_threads), + ConvBias::DirectParam{}); + } + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + const char* name() const override { return m_name.c_str(); } + AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; } + size_t get_preprocess_workspace_in_bytes(const SizeArgs& args) const override; + SmallVector deduce_preprocessed_filter_layout( + const SizeArgs& args) const override; + void exec_preprocess(const ExecArgs& args) const override; + MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_PTX_NCHW64_IMMA_UINT4_INT4) + std::string param() const override { + std::string ret; + serialize_write_pod(m_tile_nhw, ret); + serialize_write_pod(m_tile_oc, ret); + serialize_write_pod(m_threads, ret); + return ret; + } + +private: + std::string kernel_key(const SizeArgs& args) const; + unsigned int m_tile_nhw, m_tile_oc, m_threads; + std::string m_name; + void reorder_filter_bias( + const ExecArgs& args, void* reduce_filter, void* reordered_filter, + void* reordered_bias) const; +}; + class ConvBiasForwardImpl::AlgoPack : NonCopyableObj { private: AlgoBase::Mapper m_all_algos_map; @@ -1251,6 +1291,7 @@ public: AlgoCUDNNConvV8 cudnn_conv_v8; AlgoCUDNNConvBiasActivationV8 cudnn_conv_bias_activation_v8; #endif + std::vector algo_ptx_conv2d_u4_s4; AlgoBase* cudnn_conv_bias_act_from_enum(cudnnConvolutionFwdAlgo_t algo); @@ -1265,6 +1306,7 @@ private: void fill_cudnn_algos(); void fill_dp4a_algos(); void fill_dwconv_algos(); + void fill_ptx_algos(); }; } // namespace cuda diff --git a/dnn/src/cuda/conv_bias/opr_impl.h b/dnn/src/cuda/conv_bias/opr_impl.h index 7a7268a972e000551a5a855b22e888324becfd51..8749d7a745ca06d3904e901b5093a4b8321775da 100644 --- a/dnn/src/cuda/conv_bias/opr_impl.h +++ b/dnn/src/cuda/conv_bias/opr_impl.h @@ -72,6 +72,7 @@ public: class AlgoCUDNNConvV8; class AlgoCUDNNConvBiasActivationV8; #endif + class AlgoPTXUInt4Int4NCHW64IMMAImplicitGemm; class AlgoPack; diff --git a/dnn/src/cuda/conv_bias/ptx_helper.cu b/dnn/src/cuda/conv_bias/ptx_helper.cu new file mode 100644 index 0000000000000000000000000000000000000000..a7546192406244595f5ff33f729c97c4c9fa0ce8 --- /dev/null +++ b/dnn/src/cuda/conv_bias/ptx_helper.cu @@ -0,0 +1,156 @@ +#include "src/cuda/conv_bias/ptx_helper.cuh" +#include "src/cuda/integer_subbyte_utils.cuh" +#include "src/cuda/query_blocksize.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace ptx; + +namespace { +template +__device__ __forceinline__ void reorder_imma_filter_func( + int8_t* dst, const int8_t* src, uint32_t OC, uint32_t IC, uint32_t FH, + uint32_t FW, uint32_t lane) { + static constexpr uint32_t elements_per_lane = 128 / size_bits; + uint32_t elements = lane * elements_per_lane; + uint32_t row = elements / (IC * FH * FW); + uint32_t col = elements - row * IC * FH * FW; + uint32_t sec = row / 4; + uint32_t res = col & (interleaved - 1); + uint32_t sec_sec = row & 3; + uint32_t sec_res = (row & 15) / 4; + uint32_t crosswise_offset = ((sec_sec >> 1) * 2 * interleaved) + + (((sec_sec & 1) ^ (sec_res >> 1)) * interleaved); + uint32_t residue_offset = + ((res / elements_per_lane) ^ (sec_res & 1)) * elements_per_lane; + uint32_t dst_offset = + (sec / 2) * 8 * FH * FW * IC + (col / interleaved) * (8 * interleaved) + + (sec & 1) * (4 * interleaved) + crosswise_offset + residue_offset; + static constexpr uint32_t instruction_shape_col = 8; + // 4 threads per Quad + static constexpr uint32_t elements_per_thread = instruction_shape_col / 4; + // 4 threads per Quad + static constexpr uint32_t reordered_elements_per_thread = interleaved / 4; + + uint32_t elem_in_interleaved = row % interleaved; + uint32_t elem_in_interleaved_pack = elem_in_interleaved / elements_per_thread; + int elem_new = (row / interleaved * interleaved + + elem_in_interleaved_pack % 4 * reordered_elements_per_thread + + elem_in_interleaved_pack / 4 * elements_per_thread + + elem_in_interleaved % elements_per_thread) * + (IC * FH * FW) + + col; + + *(reinterpret_cast(dst + (dst_offset * size_bits / 8))) = + *(reinterpret_cast(src + (elem_new * size_bits / 8))); +} + +template +__device__ __forceinline__ void reorder_imma_bias_func( + float* __restrict__ dst, float src_value, uint32_t OC, uint32_t lane) { + dst[lane] = src_value; +} + +template +__global__ void reorder_imma_filter_bias_kernel( + int8_t* __restrict__ dst_filter, float* __restrict__ dst_bias, + const int8_t* __restrict__ src_filter, const int32_t* __restrict__ src_bias, + float bias_scale, uint32_t OC, uint32_t IC, uint32_t FH, uint32_t FW) { + static constexpr uint32_t elements_per_lane = 128 / size_bits; + const uint32_t size1 = OC * IC * FH * FW / elements_per_lane; + const uint32_t size2 = OC; + uint32_t lane = threadIdx.x + blockIdx.x * blockDim.x; + if (lane < size1) { + reorder_imma_filter_func( + dst_filter, src_filter, OC, IC, FH, FW, lane); + } else if (lane < size1 + size2) { + lane = lane - size1; + float src_bias_value = src_bias[lane] * bias_scale; + reorder_imma_bias_func(dst_bias, src_bias_value, OC, lane); + } +} + +template +__global__ void reorder_imma_filter_bias_fusion_zero_point_kernel( + int8_t* __restrict__ dst_filter, float* __restrict__ dst_bias, + const int8_t* __restrict__ src_filter, const int32_t* __restrict__ src_bias, + float bias_scale, const int32_t* reduce_filter, float zero_point, uint32_t OC, + uint32_t IC, uint32_t FH, uint32_t FW) { + static constexpr uint32_t elements_per_lane = 128 / size_bits; + const uint32_t size1 = OC * IC * FH * FW / elements_per_lane; + const uint32_t size2 = OC; + uint32_t lane = threadIdx.x + blockIdx.x * blockDim.x; + if (lane < size1) { + reorder_imma_filter_func( + dst_filter, src_filter, OC, IC, FH, FW, lane); + } else if (lane < size1 + size2) { + lane = lane - size1; + // fusion bias and zero_point + // zero_point = zero_point * src_scale * filter_scale + float src_bias_value = + src_bias[lane] * bias_scale - reduce_filter[lane] * zero_point; + reorder_imma_bias_func(dst_bias, src_bias_value, OC, lane); + } +} + +} // namespace + +template +void megdnn::cuda::ptx::reorder_imma_filter_bias( + int8_t* dst_filter, float* dst_bias, const int8_t* src_filter, + const int32_t* src_bias, float bias_scale, uint32_t OC, uint32_t IC, + uint32_t FH, uint32_t FW, cudaStream_t stream) { + static constexpr uint32_t elements_per_lane = 128 / size_bits; + uint32_t nr_threads = query_blocksize_for_kernel(reinterpret_cast( + reorder_imma_filter_bias_kernel)); + uint32_t vthreads = DIVUP(OC * IC * FH * FW, elements_per_lane) + OC; + nr_threads = std::min(nr_threads, vthreads); + uint32_t nr_blocks = DIVUP(vthreads, nr_threads); + reorder_imma_filter_bias_kernel + <<>>( + dst_filter, dst_bias, src_filter, src_bias, bias_scale, OC, IC, FH, + FW); + after_kernel_launch(); +} + +template +void megdnn::cuda::ptx::reorder_imma_filter_bias_fusion_zero_point( + int8_t* dst_filter, float* dst_bias, const int8_t* src_filter, + const int32_t* src_bias, float bias_scale, const int32_t* reduce_filter, + float zero_point, uint32_t OC, uint32_t IC, uint32_t FH, uint32_t FW, + cudaStream_t stream) { + static constexpr uint32_t elements_per_lane = 128 / size_bits; + uint32_t nr_threads = query_blocksize_for_kernel(reinterpret_cast( + reorder_imma_filter_bias_fusion_zero_point_kernel)); + uint32_t vthreads = DIVUP(OC * IC * FH * FW, elements_per_lane) + OC; + nr_threads = std::min(nr_threads, vthreads); + uint32_t nr_blocks = DIVUP(vthreads, nr_threads); + reorder_imma_filter_bias_fusion_zero_point_kernel + <<>>( + dst_filter, dst_bias, src_filter, src_bias, bias_scale, + reduce_filter, zero_point, OC, IC, FH, FW); + after_kernel_launch(); +} + +#define INST(_size_bits, _interleaved) \ + template void \ + megdnn::cuda::ptx::reorder_imma_filter_bias<_size_bits, _interleaved>( \ + int8_t * dst_filter, float* dst_bias, const int8_t* src_filter, \ + const int32_t* src_bias, float bias_scale, uint32_t OC, uint32_t IC, \ + uint32_t FH, uint32_t FW, cudaStream_t stream); + +INST(8, 32) +INST(4, 64) +#undef INST + +#define INST(_size_bits, _interleaved) \ + template void megdnn::cuda::ptx::reorder_imma_filter_bias_fusion_zero_point< \ + _size_bits, _interleaved>( \ + int8_t * dst_filter, float* dst_bias, const int8_t* src_filter, \ + const int32_t* src_bias, float bias_scale, const int32_t* reduce_filter, \ + float zero_point, uint32_t OC, uint32_t IC, uint32_t FH, uint32_t FW, \ + cudaStream_t stream); +INST(4, 64) +#undef INST + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/ptx_helper.cuh b/dnn/src/cuda/conv_bias/ptx_helper.cuh new file mode 100644 index 0000000000000000000000000000000000000000..4dff77dc4968738da813132e174c7b6a2e38ecbe --- /dev/null +++ b/dnn/src/cuda/conv_bias/ptx_helper.cuh @@ -0,0 +1,120 @@ +#pragma once +#include "src/cuda/int_fastdiv.cuh" +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace ptx { + +struct Conv2dInt8Param { + uint32_t n, ic, ih, iw, fh, fw, sh, sw, ph, pw, oc, oh, ow; + uint32_t ibs, ics, ihs; + uint32_t obs, ocs, ohs; + uint32_t icfhfw; + uint32_t nhw; + uint32_t oc32; + Uint32Fastdiv div_ohow; + Uint32Fastdiv div_ow; + Conv2dInt8Param( + uint32_t n, uint32_t ic, uint32_t ih, uint32_t iw, uint32_t fh, uint32_t fw, + uint32_t sh, uint32_t sw, uint32_t ph, uint32_t pw, uint32_t oc, + uint32_t oh, uint32_t ow, uint32_t interleaved) + : n(n), + ic(ic), + ih(ih), + iw(iw), + fh(fh), + fw(fw), + sh(sh), + sw(sw), + ph(ph), + pw(pw), + oc(oc), + oh(oh), + ow(ow) { + ibs = ic * ih * iw; + ics = ih * iw * interleaved; + ihs = iw * interleaved; + obs = oc * oh * ow; + ocs = oh * ow * interleaved; + ohs = ow * interleaved; + icfhfw = ic * fh * fw; + div_ohow = oh * ow; + div_ow = ow; + nhw = n * oh * ow; + // used for dp4a kernel, reduce usage of register file + oc32 = oc * 32; + } +}; + +struct Conv2dInt4Param { + uint32_t n, ic, ih, iw, fh, fw, sh, sw, ph, pw, oc, oh, ow; + uint32_t ibs, ics, ihs; + uint32_t obs, ocs, ohs; + uint32_t icfhfw; + uint32_t nhw; + Uint32Fastdiv div_ohow; + Uint32Fastdiv div_ow; + Conv2dInt4Param( + uint32_t n, uint32_t ic, uint32_t ih, uint32_t iw, uint32_t fh, uint32_t fw, + uint32_t sh, uint32_t sw, uint32_t ph, uint32_t pw, uint32_t oc, + uint32_t oh, uint32_t ow, uint32_t interleaved = 64) + : n(n), + ic(ic), + ih(ih), + iw(iw), + fh(fh), + fw(fw), + sh(sh), + sw(sw), + ph(ph), + pw(pw), + oc(oc), + oh(oh), + ow(ow) { + constexpr uint32_t size_bits = 4; + // all stride size in bytes + ibs = ic * ih * iw * size_bits / 8; + ics = ih * iw * interleaved * size_bits / 8; + ihs = iw * interleaved * size_bits / 8; + obs = oc * oh * ow * size_bits / 8; + ocs = oh * ow * interleaved * size_bits / 8; + ohs = ow * interleaved * size_bits / 8; + icfhfw = ic * fh * fw; + nhw = n * oh * ow; + div_ohow = oh * ow; + div_ow = ow; + } +}; + +struct Conv2dConstantOffsetParam { + int32_t begin; + int32_t size; + int32_t max; + int32_t rewind; +}; + +#define CONSTANT_BUFFER_SIZE 848 + +struct Conv2dConstantOffset { + Conv2dConstantOffsetParam c_offset_param; + int c_offset[CONSTANT_BUFFER_SIZE]; +}; + +template +void reorder_imma_filter_bias( + int8_t* dst_filter, float* dst_bias, const int8_t* src_filter, + const int32_t* src_bias, float bias_scale, uint32_t OC, uint32_t IC, + uint32_t FH, uint32_t FW, cudaStream_t stream); + +template +void reorder_imma_filter_bias_fusion_zero_point( + int8_t* dst_filter, float* dst_bias, const int8_t* src_filter, + const int32_t* src_bias, float bias_scale, const int32_t* reduce_filter, + float zero_point, uint32_t OC, uint32_t IC, uint32_t FH, uint32_t FW, + cudaStream_t stream); +} // namespace ptx +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/ptx_implicit_gemm_uint4_int4_nchw64_imma.cpp b/dnn/src/cuda/conv_bias/ptx_implicit_gemm_uint4_int4_nchw64_imma.cpp new file mode 100644 index 0000000000000000000000000000000000000000..16fb4ce3cf216936446f09fb1173dd8bb353aa35 --- /dev/null +++ b/dnn/src/cuda/conv_bias/ptx_implicit_gemm_uint4_int4_nchw64_imma.cpp @@ -0,0 +1,341 @@ +/** + * \file dnn/src/cuda/conv_bias/ptx_implicit_gemm_uint4_int4_nchw64_imma.cpp + */ + +#include "./algo.h" +#include "src/common/conv_bias.h" +#include "src/cuda/conv_bias/ptx_helper.cuh" +#include "src/cuda/conv_bias/reduce_filter.cuh" +#include "src/cuda/ptx/uint4_int4/kern.cuh" +#include "src/cuda/ptx_loader.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace ptx; + +namespace { +// all stride are in bytes +void compute_conv2d_offset( + size_t fh, size_t fw, size_t ics, size_t ihs, + Conv2dConstantOffset& constant_offset) { + constexpr int interleaved = 64; + constexpr int size_bits = 4; + constexpr int threablock_k = 128; + constexpr int inc_step = threablock_k / interleaved; + size_t i = 0; + int* s32 = &(constant_offset.c_offset[0]); + for (; i < inc_step; i++) { + int c = i / (fh * fw); + int khkw = i % (fh * fw); + int kh = khkw / fw; + int kw = khkw % fw; + s32[2 * i] = c * ics + kh * ihs + kw * interleaved * size_bits / 8; + int8_t* s8 = reinterpret_cast(&(s32[2 * i + 1])); + s8[0] = kh; + s8[1] = kw; + s8[2] = -kh; + s8[3] = -kw; + } + for (; i < (inc_step + fh * fw * inc_step); i++) { + int c = i / (fh * fw); + int khkw = i % (fh * fw); + int kh = khkw / fw; + int kw = khkw % fw; + s32[2 * i] = c * ics + kh * ihs + kw * interleaved * size_bits / 8; + int8_t* s8 = reinterpret_cast(&(s32[2 * i + 1])); + s8[0] = kh; + s8[1] = kw; + s8[2] = -kh; + s8[3] = -kw; + int i_ = i - inc_step; + c = i_ / (fh * fw); + khkw = i_ % (fh * fw); + kh = khkw / fw; + kw = khkw % fw; + s32[2 * i] -= c * ics + kh * ihs + kw * interleaved * size_bits / 8; + } +} +}; // namespace + +std::string ConvBiasForwardImpl::AlgoPTXUInt4Int4NCHW64IMMAImplicitGemm::kernel_key( + const SizeArgs& args) const { + std::string kernel_key; + using NonlineMode = Param::NonlineMode; + auto&& param = args.opr->param(); + if (args.z_layout->ndim > 0) { + kernel_key = ssprintf( + "%s_conv_bias_uint4_int4_fuse_z_imma8832_ldg16_%ux%u", + current_device_arch_name(), m_tile_nhw, m_tile_oc); + } else { + kernel_key = ssprintf( + "%s_conv_bias_uint4_int4_imma8832_ldg16_%ux%u", + current_device_arch_name(), m_tile_nhw, m_tile_oc); + } + megdnn_assert( + param.nonlineMode == NonlineMode::RELU || + param.nonlineMode == NonlineMode::IDENTITY); + kernel_key += "_relu"; + return kernel_key; +} + +bool ConvBiasForwardImpl::AlgoPTXUInt4Int4NCHW64IMMAImplicitGemm::is_available( + const SizeArgs& args) const { + if (!args.src_layout->is_contiguous() || !args.dst_layout->is_contiguous()) { + return false; + } + if (args.bias_layout->ndim <= 0) + return false; + using Param = param::ConvBias; + using Format = Param::Format; + using Sparse = Param::Sparse; + using Mode = Param::Mode; + using NonlineMode = Param::NonlineMode; + bool available = true; + auto&& param = args.opr->param(); + auto&& fm = args.filter_meta; + if (!check_bias_share_in_channel(*(args.bias_layout), param.format)) + return false; + if (param.format != Format::NCHW64) + return false; + UNPACK_CONV_BIAS_NCHW64_PARAM(*(args.src_layout), fm, *(args.dst_layout), param); + // TODO support group conv + available &= param.sparse == Sparse::DENSE; + // mode must be cross correlation + available &= param.mode == Mode::CROSS_CORRELATION; + // nonlineMode must be RELU or IDENTITY + available &= + (param.nonlineMode == NonlineMode::RELU || + param.nonlineMode == NonlineMode::IDENTITY); + // check data type + auto src_dtype = args.src_layout->dtype, filter_dtype = args.filter_layout->dtype, + bias_dtype = args.bias_layout->dtype, dst_dtype = args.dst_layout->dtype; + available &= + (src_dtype.enumv() == DTypeEnum::Quantized4Asymm && + filter_dtype.enumv() == DTypeEnum::QuantizedS4 && + bias_dtype.enumv() == DTypeEnum::QuantizedS32 && + dst_dtype.enumv() == DTypeEnum::Quantized4Asymm); + // TODO: support dialtion + available &= dh == 1 && dw == 1; + // ensure precomputed offsets are positive integers + available &= hi >= fh && wi >= fw; + // only support sm_86 or later, platform should have tensorcore int4 + // support + available &= + (is_compute_capability_equalto(8, 0) || + is_compute_capability_equalto(8, 6)); + // param buffer size is 4K, use 3K to store precomputed offset + size_t kMaxFilterPixels = CONSTANT_BUFFER_SIZE / (2 * 128 / 64) - 1; + available &= fh * fw <= kMaxFilterPixels; + return available; +} + +size_t ConvBiasForwardImpl::AlgoPTXUInt4Int4NCHW64IMMAImplicitGemm:: + get_workspace_in_bytes(const SizeArgs& args) const { + if (args.preprocessed_filter == nullptr) { + size_t OC = args.filter_layout->operator[](0), + IC = args.filter_layout->operator[](1) * 64, + FH = args.filter_layout->operator[](2), + FW = args.filter_layout->operator[](3); + size_t ws_size_reduce_filter = OC * sizeof(int32_t); + // for reduce filter + { + size_t A = OC, B = IC * FH * FW / 8, C = 1; + ws_size_reduce_filter += do_dispatch_reduce_workspace_in_bytes(A, B, C); + } + return args.filter_layout->span().dist_byte() + + args.bias_layout->span().dist_byte() + ws_size_reduce_filter; + } + return 0_z; +} + +void ConvBiasForwardImpl::AlgoPTXUInt4Int4NCHW64IMMAImplicitGemm::exec( + const ExecArgs& args) const { + using Format = Param::Format; + auto&& param = args.opr->param(); + auto&& fm = args.filter_meta; + UNPACK_CONV_BIAS_NCHW64_PARAM(*(args.src_layout), fm, *(args.dst_layout), param); + auto&& stream = cuda_stream(args.opr->handle()); + constexpr int interleaved = 64; + + void* bias_ptr = nullptr; + void* filter_ptr = nullptr; + if (args.preprocessed_filter) { + megdnn_assert(args.preprocessed_filter->tensors.size() == 2); + filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr(); + bias_ptr = args.preprocessed_filter->tensors[1].raw_ptr(); + } else { + // reorder filter and bias + filter_ptr = reinterpret_cast(args.workspace.raw_ptr); + bias_ptr = reinterpret_cast( + args.workspace.raw_ptr + args.filter_layout->span().dist_byte()); + void* reduce_filter_ptr = reinterpret_cast( + args.workspace.raw_ptr + args.filter_layout->span().dist_byte() + + args.bias_layout->span().dist_byte()); + reorder_filter_bias(args, reduce_filter_ptr, filter_ptr, bias_ptr); + } + + uint32_t u32_n = n, u32_ci = ci, u32_hi = hi, u32_wi = wi, u32_fh = fh, u32_fw = fw, + u32_sh = sh, u32_sw = sw, u32_ph = ph, u32_pw = pw, u32_co = co, + u32_ho = ho, u32_wo = wo; + Conv2dInt4Param kern_param( + u32_n, u32_ci, u32_hi, u32_wi, u32_fh, u32_fw, u32_sh, u32_sw, u32_ph, + u32_pw, u32_co, u32_ho, u32_wo, interleaved); + + Conv2dConstantOffset kern_coffset; + compute_conv2d_offset(fh, fw, kern_param.ics, kern_param.ihs, kern_coffset); + // begin is not need + kern_coffset.c_offset_param.begin = param_buffer_start_address(); + kern_coffset.c_offset_param.size = 4 * (1 + fh * fw); + kern_coffset.c_offset_param.max = 4 * fh * fw; + kern_coffset.c_offset_param.rewind = 4 * (1 - fh * fw); + + float src_scale = args.src_layout->dtype.param().scale, + dst_scale = args.dst_layout->dtype.param().scale, + filter_scale = args.filter_layout->dtype.param().scale; + + uint32_t src_zero_point = + (uint32_t)(args.src_layout->dtype.param() + .zero_point); + uint32_t pk_src_zero_point = 0; + for (int i = 0; i < 8; i++) { + pk_src_zero_point <<= 4; + pk_src_zero_point |= (src_zero_point & 0xF); + } + float dst_zero_point = + (float)(args.dst_layout->dtype.param().zero_point); + float alpha = src_scale * filter_scale / dst_scale, beta = 1.f; + + unsigned int tx = m_threads, ty = 1; + unsigned int gridx = + div_ceil(static_cast(n * ho * wo), m_tile_nhw); + unsigned int gridy = + div_ceil(static_cast(co), m_tile_oc); + void* src_ptr = const_cast(args.src_tensor->raw_ptr()); + void* dst_ptr = const_cast(args.dst_tensor->raw_ptr()); + + using NonlineMode = Param::NonlineMode; + + auto kern_key = kernel_key(args); + auto&& kernel = PTXKernelLoader::instance().get_kernel(kern_key); + if (args.z_layout->ndim > 0) { + void* z_ptr = const_cast(args.z_tensor->raw_ptr()); + auto z_param = args.z_layout->dtype.param(); + int32_t z_zero_point = (int32_t)z_param.zero_point; + float z_scale = z_param.scale; + + float gamma = z_scale / dst_scale; + std::vector params = {&src_ptr, &filter_ptr, &bias_ptr, &z_ptr, + &dst_ptr, &alpha, &beta, &gamma}; + kern_coffset.c_offset_param.begin += sizeof(src_ptr) + sizeof(filter_ptr) + + sizeof(bias_ptr) + sizeof(z_ptr) + + sizeof(dst_ptr) + sizeof(alpha) + + sizeof(beta) + sizeof(gamma); + + kern_coffset.c_offset_param.begin += sizeof(pk_src_zero_point); + params.push_back(&pk_src_zero_point); + kern_coffset.c_offset_param.begin += sizeof(z_zero_point); + params.push_back(&z_zero_point); + kern_coffset.c_offset_param.begin += sizeof(dst_zero_point); + params.push_back(&dst_zero_point); + + uint32_t relu = param.nonlineMode == NonlineMode::RELU ? 1 : 0; + params.push_back(&relu); + kern_coffset.c_offset_param.begin += sizeof(relu); + params.push_back(&kern_param); + kern_coffset.c_offset_param.begin += sizeof(kern_param); + kern_coffset.c_offset_param.begin += sizeof(kern_coffset.c_offset_param); + params.push_back(&kern_coffset); + + dim3 grid(gridx, gridy, 1); + dim3 block(tx, ty, 1); + + kernel(grid, block, stream, params.data()); + } else { + std::vector params = {&src_ptr, &filter_ptr, &bias_ptr, + &dst_ptr, &alpha, &beta}; + kern_coffset.c_offset_param.begin += sizeof(src_ptr) + sizeof(filter_ptr) + + sizeof(bias_ptr) + sizeof(dst_ptr) + + sizeof(alpha) + sizeof(beta); + + kern_coffset.c_offset_param.begin += sizeof(pk_src_zero_point); + params.push_back(&pk_src_zero_point); + kern_coffset.c_offset_param.begin += sizeof(dst_zero_point); + params.push_back(&dst_zero_point); + + uint32_t relu = param.nonlineMode == NonlineMode::RELU ? 1 : 0; + params.push_back(&relu); + kern_coffset.c_offset_param.begin += sizeof(relu); + params.push_back(&kern_param); + kern_coffset.c_offset_param.begin += sizeof(kern_param); + kern_coffset.c_offset_param.begin += sizeof(kern_coffset.c_offset_param); + params.push_back(&kern_coffset); + + dim3 grid(gridx, gridy, 1); + dim3 block(tx, ty, 1); + + kernel(grid, block, stream, params.data()); + } + after_kernel_launch(); +} + +size_t ConvBiasForwardImpl::AlgoPTXUInt4Int4NCHW64IMMAImplicitGemm:: + get_preprocess_workspace_in_bytes(const SizeArgs& args) const { + size_t OC = args.filter_layout->operator[](0), + IC = args.filter_layout->operator[](1) * 64, + FH = args.filter_layout->operator[](2), + FW = args.filter_layout->operator[](3); + size_t ws_size_reduce_filter = OC * sizeof(int32_t); + // for reduce filter + { + size_t A = OC, B = IC * FH * FW / 8, C = 1; + ws_size_reduce_filter += do_dispatch_reduce_workspace_in_bytes(A, B, C); + } + return ws_size_reduce_filter; +} + +SmallVector ConvBiasForwardImpl::AlgoPTXUInt4Int4NCHW64IMMAImplicitGemm:: + deduce_preprocessed_filter_layout(const SizeArgs& args) const { + return {args.filter_layout->collapse_contiguous(), + args.bias_layout->collapse_contiguous()}; +} + +void ConvBiasForwardImpl::AlgoPTXUInt4Int4NCHW64IMMAImplicitGemm::reorder_filter_bias( + const ExecArgs& args, void* reduce_filter, void* reordered_filter, + void* reordered_bias) const { + using Format = Param::Format; + auto&& param = args.opr->param(); + auto&& fm = args.filter_meta; + UNPACK_CONV_BIAS_NCHW64_PARAM(*(args.src_layout), fm, *(args.dst_layout), param); + auto&& stream = cuda_stream(args.opr->handle()); + + float src_scale = args.src_layout->dtype.param().scale, + filter_scale = args.filter_layout->dtype.param().scale, + bias_scale = args.bias_layout->dtype.param().scale, + dst_scale = args.dst_layout->dtype.param().scale; + + float scaled_src_zero_point = + args.src_layout->dtype.param().zero_point * + src_scale * filter_scale / dst_scale; + // NCHW64 reduce CHW64 + do_dispatch_reduce_with_scale_filter_4bit( + reinterpret_cast(args.filter_tensor->raw_ptr()), 1, co, + ci * fh * fw / 8, static_cast(reduce_filter), stream); + + reorder_imma_filter_bias_fusion_zero_point<4, 64>( + reinterpret_cast(reordered_filter), + reinterpret_cast(reordered_bias), + reinterpret_cast(args.filter_tensor->raw_ptr()), + args.bias_tensor->compatible_ptr(), bias_scale / dst_scale, + static_cast(reduce_filter), scaled_src_zero_point, co, ci, fh, fw, + stream); +} + +void ConvBiasForwardImpl::AlgoPTXUInt4Int4NCHW64IMMAImplicitGemm::exec_preprocess( + const ExecArgs& args) const { + reorder_filter_bias( + args, args.workspace.raw_ptr, + args.preprocessed_filter->tensors[0].raw_ptr(), + args.preprocessed_filter->tensors[1].raw_ptr()); +} +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/ptx_loader.cpp b/dnn/src/cuda/ptx_loader.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f0a971b6022e4ed268b16eb794c94d3d5f89f03a --- /dev/null +++ b/dnn/src/cuda/ptx_loader.cpp @@ -0,0 +1,49 @@ +/** + * \file dnn/src/cuda/ptx_loader.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#include "src/cuda/ptx_loader.h" +using namespace megdnn; +using namespace cuda; + +// ******************* PTXKernelLoader ********************* +const std::unordered_map PTXKernelLoader::KERNEL_MAP = + {{"ampere_conv_bias_uint4_int4_imma8832_ldg16_256x64_relu", + ptx::run_ampere_conv_bias_uint4_int4_imma8832_ldg16_256x64_relu}, + {"ampere_conv_bias_uint4_int4_imma8832_ldg16_128x128_relu", + ptx::run_ampere_conv_bias_uint4_int4_imma8832_ldgsts16_128x128_relu}, + {"ampere_conv_bias_uint4_int4_imma8832_ldg16_128x256_relu", + ptx::run_ampere_conv_bias_uint4_int4_imma8832_ldg16_128x256_relu}, + {"ampere_conv_bias_uint4_int4_fuse_z_imma8832_ldg16_256x64_relu", + ptx::run_ampere_conv_bias_uint4_int4_fuse_z_imma8832_ldg16_256x64_relu}, + {"ampere_conv_bias_uint4_int4_fuse_z_imma8832_ldg16_128x128_relu", + ptx::run_ampere_conv_bias_uint4_int4_fuse_z_imma8832_ldgsts16_128x128_relu}, + {"ampere_conv_bias_uint4_int4_fuse_z_imma8832_ldg16_128x256_relu", + ptx::run_ampere_conv_bias_uint4_int4_fuse_z_imma8832_ldg16_128x256_relu}}; + +PTXKernelLoader& PTXKernelLoader::instance() { + static PTXKernelLoader ins; + return ins; +} + +const PTXKernelLoader::kernel PTXKernelLoader::get_kernel( + const std::string& kernel_name) { + decltype(KERNEL_MAP.begin()) kernel_iter; + kernel_iter = KERNEL_MAP.find(kernel_name); + megdnn_throw_if( + kernel_iter == KERNEL_MAP.end(), megdnn_error, + ssprintf("kernel name %s not found in KERNEL_MAP", kernel_name.c_str()) + .c_str()); + + return kernel_iter->second; +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/ptx_loader.h b/dnn/src/cuda/ptx_loader.h new file mode 100644 index 0000000000000000000000000000000000000000..f14e6d87452fb877f997876e89eaa053787afbd7 --- /dev/null +++ b/dnn/src/cuda/ptx_loader.h @@ -0,0 +1,40 @@ +/** + * \file dnn/src/cuda/ptx_loader.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#pragma once + +#include +#include +#include "src/cuda/ptx/uint4_int4/kern.cuh" +#include "src/cuda/utils.h" +namespace megdnn { +namespace cuda { + +class PTXKernelLoader { +private: + PTXKernelLoader() = default; + using kernel = std::function; + +public: + PTXKernelLoader(const PTXKernelLoader&) = delete; + const PTXKernelLoader& operator=(const PTXKernelLoader&) = delete; + static PTXKernelLoader& instance(); + + const kernel get_kernel(const std::string& kernel_name); + + static const std::unordered_map KERNEL_MAP; +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}