/**
 * \file dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nchw64_imma.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

#include "./algo.h"
#include "src/cuda/conv_bias/cutlass_convolution_wrapper.cuh"
#include "src/cuda/conv_bias/reduce_filter.cuh"
#include "src/cuda/utils.h"

using namespace megdnn;
using namespace cuda;
using namespace convolution;

#if CUDA_VERSION >= 10020

size_t ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::
        get_workspace_in_bytes(const SizeArgs& args) const {
    if (args.preprocessed_filter) {
        return 0;
    } else {
        // workspace holds the reordered filter, the updated bias and the
        // reduce-filter scratch buffer
        size_t ws_filter = args.filter_layout->span().dist_byte(),
               ws_bias = args.bias_layout->span().dist_byte(),
               ws_reduce_filter = get_preprocess_workspace_in_bytes(args);
        return ws_filter + ws_bias + ws_reduce_filter;
    }
}

size_t ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::
        get_preprocess_workspace_in_bytes(const SizeArgs& args) const {
    size_t co = args.filter_layout->operator[](0),
           ci = args.filter_layout->operator[](1) * 64,
           fh = args.filter_layout->operator[](2),
           fw = args.filter_layout->operator[](3);
    size_t ws_size_reduce_filter = co * sizeof(int32_t);
    size_t A = co, B = ci * fh * fw / 8, C = 1;
    ws_size_reduce_filter += do_dispatch_reduce_workspace_in_bytes(A, B, C);
    return ws_size_reduce_filter;
}

SmallVector<TensorLayout> ConvBiasForwardImpl::
        AlgoUInt4Int4NCHW64IMMAImplicitGemm::deduce_preprocessed_filter_layout(
                const SizeArgs& args) const {
    return {args.filter_layout->collapse_contiguous(),
            args.bias_layout->collapse_contiguous()};
}

void ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::exec_preprocess(
        const ExecArgs& args) const {
    megdnn_assert(args.preprocessed_filter->tensors.size() == 2);
    void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
    void* bias_ptr = args.preprocessed_filter->tensors[1].raw_ptr;
    void* reduce_filter_ptr = reinterpret_cast<void*>(args.workspace.raw_ptr);
    void* reduce_workspace = reinterpret_cast<void*>(
            args.workspace.raw_ptr + args.bias_layout->span().dist_byte());
    reorder_filter(args, filter_ptr);
    update_bias(args, bias_ptr, reduce_filter_ptr, reduce_workspace);
}

std::tuple<void*, void*>
ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::prepare_filter_bias(
        const ExecArgs& args) const {
    void* filter_ptr = nullptr;
    void* bias_ptr = nullptr;
    if (args.preprocessed_filter) {
        megdnn_assert(args.preprocessed_filter->tensors.size() == 2);
        filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
        bias_ptr = args.preprocessed_filter->tensors[1].raw_ptr;
        return {filter_ptr, bias_ptr};
    } else {
        filter_ptr = reinterpret_cast<void*>(args.workspace.raw_ptr);
        bias_ptr = reinterpret_cast<void*>(
                args.workspace.raw_ptr + args.filter_layout->span().dist_byte());
        void* reduce_filter_ptr = reinterpret_cast<void*>(
                args.workspace.raw_ptr + args.filter_layout->span().dist_byte() +
                args.bias_layout->span().dist_byte());
        void* reduce_workspace = reinterpret_cast<void*>(
                args.workspace.raw_ptr + args.filter_layout->span().dist_byte() +
                args.bias_layout->span().dist_byte() +
                args.bias_layout->span().dist_byte());
        reorder_filter(args, filter_ptr);
        update_bias(args, bias_ptr, reduce_filter_ptr, reduce_workspace);
    }
    return {filter_ptr, bias_ptr};
}

std::tuple<float, float, float, float, float>
ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::get_constants(
        const ExecArgs& args) const {
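    // Epilogue constants of the quantized convolution; roughly,
    //   dst ~ alpha * conv(src, filter) + beta * bias + gamma * z + delta + theta
    // alpha/beta/gamma rescale the accumulator, the bias and the residual z
    // into the dst quantization scale, theta adds the dst zero point, and
    // delta compensates the z zero point.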
    float src_scale =
                  args.src_layout->dtype.param<dtype::Quantized4Asymm>().scale,
          filter_scale =
                  args.filter_layout->dtype.param<dtype::QuantizedS4>().scale,
          bias_scale =
                  args.bias_layout->dtype.param<dtype::QuantizedS32>().scale,
          dst_scale =
                  args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale;
    uint8_t dst_zero =
            args.dst_layout->dtype.param<dtype::Quantized4Asymm>().zero_point;
    float alpha = src_scale * filter_scale / dst_scale,
          beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f,
          theta = dst_zero;

    if (args.z_layout->ndim > 0) {
        float z_scale =
                args.z_layout->dtype.param<dtype::Quantized4Asymm>().scale;
        gamma = z_scale / dst_scale;
        uint8_t z_zero =
                args.z_layout->dtype.param<dtype::Quantized4Asymm>().zero_point;
        delta = -z_zero * gamma;
    }

    return {alpha, beta, gamma, delta, theta};
}

void ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::do_exec(
        const ExecArgs& args, void* filter_ptr, void* bias_ptr, void* z_ptr,
        ConvParam kern_param, uint32_t nonlinear_mode, float alpha, float beta,
        float gamma, float delta, float theta, cudaStream_t stream) const {
    float dst_scale =
            args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale;
    uint8_t src_zero =
            args.src_layout->dtype.param<dtype::Quantized4Asymm>().zero_point;
    cutlass_wrapper::GemmCoord threadblock_shape{m_algo_param.threadblock_m,
                                                 m_algo_param.threadblock_n,
                                                 m_algo_param.threadblock_k};

    cutlass_wrapper::GemmCoord warp_shape{
            m_algo_param.warp_m, m_algo_param.warp_n, m_algo_param.warp_k};

    cutlass_wrapper::do_conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64<
            true>(reinterpret_cast<uint8_t*>(args.src_tensor->raw_ptr),
                  reinterpret_cast<int8_t*>(filter_ptr),
                  reinterpret_cast<int32_t*>(bias_ptr),
                  reinterpret_cast<uint8_t*>(z_ptr),
                  reinterpret_cast<uint8_t*>(args.dst_tensor->raw_ptr), nullptr,
                  kern_param, nonlinear_mode, alpha, beta, gamma, delta, theta,
                  dst_scale, src_zero, threadblock_shape, warp_shape, stream);
}

void ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::update_bias(
        const ExecArgs& args, void* updated_bias, void* reduce_filter_ptr,
        void* reduce_workspace) const {
    size_t co = args.filter_layout->operator[](0),
           ci = args.filter_layout->operator[](1) * 64,
           fh = args.filter_layout->operator[](2),
           fw = args.filter_layout->operator[](3);

    auto&& stream = cuda_stream(args.opr->handle());

    // fold the src zero point into the bias, using the per-output-channel
    // reduction of the packed int4 filter weights
    int src_zero_point =
            args.src_tensor->layout.dtype.param<dtype::Quantized4Asymm>()
                    .zero_point;
    do_dispatch_reduce_filter_and_update_bias_4bit<true>(
            reinterpret_cast<uint8_t*>(args.filter_tensor->raw_ptr),
            args.bias_tensor->compatible_ptr<int32_t>(), co, ci * fh * fw / 8,
            reinterpret_cast<int32_t*>(updated_bias),
            reinterpret_cast<int32_t*>(reduce_workspace), src_zero_point,
            stream);
}
#endif

// vim: syntax=cpp.doxygen