/** * \file dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. */ #include "src/cuda/conv_bias/algo.h" #include "src/cuda/cutlass/singleton.h" namespace megdnn { namespace cuda { using namespace cutlass::library; using namespace cutlass::epilogue; ConvBiasForwardImpl::AlgoCutlassConvolutionBase::AlgoParam::AlgoParam( int threadblock_m_, int threadblock_n_, int threadblock_k_, int warp_m_, int warp_n_, int warp_k_, int instruction_m_, int instruction_n_, int instruction_k_, int stage_, int access_size_) : threadblock_m(threadblock_m_), threadblock_n(threadblock_n_), threadblock_k(threadblock_k_), warp_m(warp_m_), warp_n(warp_n_), warp_k(warp_k_), instruction_m(instruction_m_), instruction_n(instruction_m_), instruction_k(instruction_k_), stage(stage_), access_size(access_size_) {} std::string ConvBiasForwardImpl::AlgoCutlassConvolutionBase::AlgoParam::to_string() const { /// default algorithm if (threadblock_m == 128 && threadblock_n == 128 && threadblock_k == 32 && warp_m == 32 && warp_n == 64 && warp_k == 32 && stage == 2) { return ""; } return ssprintf("_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m, threadblock_n, threadblock_k, warp_m, warp_n, warp_k, stage); } namespace { using Base = ConvBiasForwardImpl::AlgoCutlassConvolutionBase; cutlass::conv::Operator convert_conv_op(Base::ConvOperator conv_op) { switch (conv_op) { case Base::ConvOperator::kFprop: return cutlass::conv::Operator::kFprop; case Base::ConvOperator::kDgrad: return cutlass::conv::Operator::kDgrad; case Base::ConvOperator::kWgrad: return cutlass::conv::Operator::kWgrad; default: megdnn_assert(0, "invalid conv op"); } } cutlass::conv::ConvType convert_conv_type(Base::ConvType conv_type) { switch (conv_type) { case Base::ConvType::kConvolution: return cutlass::conv::ConvType::kConvolution; case Base::ConvType::kBatchConvolution: return cutlass::conv::ConvType::kBatchConvolution; case Base::ConvType::kLocal: return cutlass::conv::ConvType::kLocal; case Base::ConvType::kLocalShare: return cutlass::conv::ConvType::kLocalShare; default: megdnn_assert(0, "invalid conv type"); } } NumericTypeID convert_dtype(DTypeEnum dtype) { switch (dtype) { case DTypeEnum::Float32: return NumericTypeID::kF32; case DTypeEnum::Float16: return NumericTypeID::kF16; case DTypeEnum::Int8: return NumericTypeID::kS8; case DTypeEnum::QuantizedS32: return NumericTypeID::kS32; case DTypeEnum::QuantizedS8: return NumericTypeID::kS8; case DTypeEnum::QuantizedS4: return NumericTypeID::kS4; case DTypeEnum::Quantized4Asymm: return NumericTypeID::kU4; default: megdnn_assert(0, "invalid dtype"); } } struct LayoutPack { LayoutTypeID src; LayoutTypeID filter; LayoutTypeID dst; LayoutTypeID bias; }; LayoutPack get_layout_pack(const param::ConvBias::Format format, int access_type) { using Format = param::ConvBias::Format; switch (format) { case Format::NCHW4: return {LayoutTypeID::kTensorNC4HW4, LayoutTypeID::kTensorC4RSK4, LayoutTypeID::kTensorNC4HW4, LayoutTypeID::kTensorNC4HW4}; case Format::NCHW4_NCHW: return {LayoutTypeID::kTensorNC4HW4, LayoutTypeID::kTensorC4RSK4, LayoutTypeID::kTensorNCHW, LayoutTypeID::kTensorNCHW}; case Format::NCHW4_NHWC: return {LayoutTypeID::kTensorNC4HW4, LayoutTypeID::kTensorC4RSK4, LayoutTypeID::kTensorNHWC, LayoutTypeID::kTensorNHWC}; case Format::NCHW4_NCHW32: return {LayoutTypeID::kTensorNC4HW4, LayoutTypeID::kTensorC4RSK4, LayoutTypeID::kTensorNC32HW32, LayoutTypeID::kTensorNC32HW32}; case Format::NCHW32: return {LayoutTypeID::kTensorNC32HW32, LayoutTypeID::kTensorC32RSK32, LayoutTypeID::kTensorNC32HW32, LayoutTypeID::kTensorNC32HW32}; case Format::NCHW32_NCHW4: return {LayoutTypeID::kTensorNC32HW32, LayoutTypeID::kTensorC32RSK32, LayoutTypeID::kTensorNC4HW4, LayoutTypeID::kTensorNC4HW4}; case Format::NCHW64: return {LayoutTypeID::kTensorNC64HW64, LayoutTypeID::kTensorC64RSK64, LayoutTypeID::kTensorNC64HW64, LayoutTypeID::kTensorNC64HW64}; case Format::NHWC: switch (access_type) { case 4: return {LayoutTypeID::kTensorNHWC, LayoutTypeID::kTensorNC4HW4, LayoutTypeID::kTensorNHWC, LayoutTypeID::kTensorNHWC}; case 8: return {LayoutTypeID::kTensorNHWC, LayoutTypeID::kTensorNC8HW8, LayoutTypeID::kTensorNHWC, LayoutTypeID::kTensorNHWC}; case 16: return {LayoutTypeID::kTensorNHWC, LayoutTypeID::kTensorNC16HW16, LayoutTypeID::kTensorNHWC, LayoutTypeID::kTensorNHWC}; case 32: return {LayoutTypeID::kTensorNHWC, LayoutTypeID::kTensorNC32HW32, LayoutTypeID::kTensorNHWC, LayoutTypeID::kTensorNHWC}; default: megdnn_assert(0, "invalid access_type"); } default: megdnn_assert(0, "invalid format"); } } EpilogueType get_epilogue_type(const param::ConvBias::NonlineMode mode, bool clamp) { using NonlineMode = param::ConvBias::NonlineMode; if (clamp) { if (mode == NonlineMode::IDENTITY) { return EpilogueType::kBiasAddLinearCombinationClamp; } else if (mode == NonlineMode::RELU) { return EpilogueType::kBiasAddLinearCombinationReluClamp; } else if (mode == NonlineMode::H_SWISH) { return EpilogueType::kBiasAddLinearCombinationHSwishClamp; } } else { if (mode == NonlineMode::IDENTITY) { return EpilogueType::kBiasAddLinearCombination; } else if (mode == NonlineMode::RELU) { return EpilogueType::kBiasAddLinearCombinationRelu; } else if (mode == NonlineMode::H_SWISH) { return EpilogueType::kBiasAddLinearCombinationHSwish; } } megdnn_assert(0, "invalid nonlinear mode"); } } // namespace const Operation* ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_conv_op( const SizeArgs& args, ConvOperator conv_op, ConvType conv_type, bool use_conv_filter_unity_opt, bool without_shared_load) const { auto&& param = args.opr->param(); auto layouts = get_layout_pack(param.format, m_algo_param.access_size); auto epilogue_type = get_epilogue_type( param.nonlineMode, args.dst_layout->dtype.enumv() != DTypeEnum::Float32); cutlass::conv::SpecialOptimizeDesc special_optimization = (use_conv_filter_unity_opt) ? cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY : cutlass::conv::SpecialOptimizeDesc::NONE; ConvolutionKey key{convert_conv_op(conv_op), convert_dtype(args.src_layout->dtype.enumv()), layouts.src, convert_dtype(args.filter_layout->dtype.enumv()), layouts.filter, convert_dtype(args.dst_layout->dtype.enumv()), layouts.dst, convert_dtype(args.bias_layout->dtype.enumv()), layouts.bias, convert_conv_type(conv_type), m_algo_param.threadblock_m, m_algo_param.threadblock_n, m_algo_param.threadblock_k, m_algo_param.warp_m, m_algo_param.warp_n, m_algo_param.warp_k, m_algo_param.instruction_m, m_algo_param.instruction_n, m_algo_param.instruction_k, epilogue_type, m_algo_param.stage, special_optimization, without_shared_load}; return Singleton::get().operation_table.find_op(key); } void ConvBiasForwardImpl::AlgoCutlassConvolutionBase::execute_cutlass_conv_op( const Operation* op, const void* src, const void* filter, const void* bias, const void* z, void* dst, void* workspace, size_t n, size_t hi, size_t wi, size_t ci, size_t co, size_t fh, size_t fw, size_t ho, size_t wo, size_t ph, size_t pw, size_t sh, size_t sw, size_t dh, size_t dw, const void* alpha, const void* beta, const void* gamma, const void* delta, const void* theta, const void* threshold, const void* dst_scale, cudaStream_t stream, const void* extra_param) const { // gcc prints warnings when size_t values are implicitly narrowed to int cutlass::conv::Conv2dProblemSize problem_size{ int(n), int(hi), int(wi), int(ci), int(co), int(fh), int(fw), int(ho), int(wo), int(ph), int(pw), int(sh), int(sw), int(dh), int(dw), cutlass::conv::Mode::kCrossCorrelation}; ConvolutionArguments conv_args{ problem_size, src, filter, bias, z, dst, alpha, beta, gamma, delta, theta, threshold, dst_scale, extra_param}; cutlass_check(op->run(&conv_args, workspace, stream)); } } // namespace cuda } // namespace megdnn