/**
 * \file dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

#include "src/cuda/conv_bias/algo.h"
#include "src/cuda/cutlass/singleton.h"

namespace megdnn {
namespace cuda {

using namespace cutlass::library;
using namespace cutlass::epilogue;

ConvBiasForwardImpl::AlgoCutlassConvolutionBase::AlgoParam::AlgoParam(
        int threadblock_m_, int threadblock_n_, int threadblock_k_, int warp_m_,
        int warp_n_, int warp_k_, int instruction_m_, int instruction_n_,
        int instruction_k_, int stage_, int access_size_)
        : threadblock_m(threadblock_m_),
          threadblock_n(threadblock_n_),
          threadblock_k(threadblock_k_),
          warp_m(warp_m_),
          warp_n(warp_n_),
          warp_k(warp_k_),
          instruction_m(instruction_m_),
          instruction_n(instruction_n_),
          instruction_k(instruction_k_),
          stage(stage_),
          access_size(access_size_) {}

std::string ConvBiasForwardImpl::AlgoCutlassConvolutionBase::AlgoParam::to_string()
        const {
    /// default algorithm
    if (threadblock_m == 128 && threadblock_n == 128 && threadblock_k == 32 &&
        warp_m == 32 && warp_n == 64 && warp_k == 32 && stage == 2) {
        return "";
    }
    return ssprintf(
            "_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m, threadblock_n, threadblock_k,
            warp_m, warp_n, warp_k, stage);
}

namespace {

using Base = ConvBiasForwardImpl::AlgoCutlassConvolutionBase;

cutlass::conv::Operator convert_conv_op(Base::ConvOperator conv_op) {
    switch (conv_op) {
        case Base::ConvOperator::kFprop:
            return cutlass::conv::Operator::kFprop;
        case Base::ConvOperator::kDgrad:
            return cutlass::conv::Operator::kDgrad;
        case Base::ConvOperator::kWgrad:
            return cutlass::conv::Operator::kWgrad;
        default:
            megdnn_assert(0, "invalid conv op");
    }
}

cutlass::conv::ConvType convert_conv_type(Base::ConvType conv_type) {
    switch (conv_type) {
        case Base::ConvType::kConvolution:
            return cutlass::conv::ConvType::kConvolution;
        case Base::ConvType::kBatchConvolution:
            return cutlass::conv::ConvType::kBatchConvolution;
        case Base::ConvType::kLocal:
            return cutlass::conv::ConvType::kLocal;
        case Base::ConvType::kLocalShare:
            return cutlass::conv::ConvType::kLocalShare;
        case Base::ConvType::kDepthwiseConvolution:
            return cutlass::conv::ConvType::kDepthwiseConvolution;
        default:
            megdnn_assert(0, "invalid conv type");
    }
}

NumericTypeID convert_dtype(DType dtype) {
    // just make convolution with no bias happy
    if (!dtype.valid())
        return NumericTypeID::kF32;
    switch (dtype.enumv()) {
        case DTypeEnum::Float32:
            return NumericTypeID::kF32;
        case DTypeEnum::Float16:
            return NumericTypeID::kF16;
        case DTypeEnum::Int8:
            return NumericTypeID::kS8;
        case DTypeEnum::QuantizedS32:
            return NumericTypeID::kS32;
        case DTypeEnum::QuantizedS8:
            return NumericTypeID::kS8;
        case DTypeEnum::QuantizedS4:
            return NumericTypeID::kS4;
        case DTypeEnum::Quantized4Asymm:
            return NumericTypeID::kU4;
        default:
            megdnn_assert(0, "invalid dtype");
    }
}

NumericTypeID get_accumulator_dtype(
        DType dtype, const param::ConvBias::ComputeMode comp_mode) {
    if (dtype.category() == DTypeCategory::QUANTIZED) {
        return NumericTypeID::kS32;
    } else {
        megdnn_assert(dtype.category() == DTypeCategory::FLOAT);
        if (comp_mode == param::ConvBias::ComputeMode::DEFAULT) {
            return convert_dtype(dtype);
        } else {
            megdnn_assert(comp_mode == param::ConvBias::ComputeMode::FLOAT32);
            return NumericTypeID::kF32;
        }
    }
}
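// Illustrative note (not from the original source): get_accumulator_dtype() keeps
// integer convolutions accumulating in 32-bit, e.g. a QuantizedS8 src yields
// NumericTypeID::kS32, while a Float16 src accumulates in kF16 under
// ComputeMode::DEFAULT and in kF32 under ComputeMode::FLOAT32.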
struct LayoutPack {
    LayoutTypeID src;
    LayoutTypeID filter;
    LayoutTypeID dst;
    LayoutTypeID bias;
};

LayoutPack get_layout_pack(const param::ConvBias::Format format, int access_type) {
    using Format = param::ConvBias::Format;
    switch (format) {
        case Format::NCHW4:
            return {LayoutTypeID::kTensorNC4HW4, LayoutTypeID::kTensorC4RSK4,
                    LayoutTypeID::kTensorNC4HW4, LayoutTypeID::kTensorNC4HW4};
        case Format::NCHW4_NCHW:
            return {LayoutTypeID::kTensorNC4HW4, LayoutTypeID::kTensorC4RSK4,
                    LayoutTypeID::kTensorNCHW, LayoutTypeID::kTensorNCHW};
        case Format::NCHW4_NHWC:
            return {LayoutTypeID::kTensorNC4HW4, LayoutTypeID::kTensorC4RSK4,
                    LayoutTypeID::kTensorNHWC, LayoutTypeID::kTensorNHWC};
        case Format::NCHW4_NCHW32:
            return {LayoutTypeID::kTensorNC4HW4, LayoutTypeID::kTensorC4RSK4,
                    LayoutTypeID::kTensorNC32HW32, LayoutTypeID::kTensorNC32HW32};
        case Format::NCHW32:
            return {LayoutTypeID::kTensorNC32HW32, LayoutTypeID::kTensorC32RSK32,
                    LayoutTypeID::kTensorNC32HW32, LayoutTypeID::kTensorNC32HW32};
        case Format::NCHW32_NCHW4:
            return {LayoutTypeID::kTensorNC32HW32, LayoutTypeID::kTensorC32RSK32,
                    LayoutTypeID::kTensorNC4HW4, LayoutTypeID::kTensorNC4HW4};
        case Format::NCHW64:
            return {LayoutTypeID::kTensorNC64HW64, LayoutTypeID::kTensorC64RSK64,
                    LayoutTypeID::kTensorNC64HW64, LayoutTypeID::kTensorNC64HW64};
        case Format::NHWC:
            switch (access_type) {
                case 4:
                    return {LayoutTypeID::kTensorNHWC, LayoutTypeID::kTensorNC4HW4,
                            LayoutTypeID::kTensorNHWC, LayoutTypeID::kTensorNHWC};
                case 8:
                    return {LayoutTypeID::kTensorNHWC, LayoutTypeID::kTensorNC8HW8,
                            LayoutTypeID::kTensorNHWC, LayoutTypeID::kTensorNHWC};
                case 16:
                    return {LayoutTypeID::kTensorNHWC, LayoutTypeID::kTensorNC16HW16,
                            LayoutTypeID::kTensorNHWC, LayoutTypeID::kTensorNHWC};
                case 32:
                    return {LayoutTypeID::kTensorNHWC, LayoutTypeID::kTensorNC32HW32,
                            LayoutTypeID::kTensorNHWC, LayoutTypeID::kTensorNHWC};
                default:
                    megdnn_assert(0, "invalid access_type");
            }
        case Format::NCHW:
            return {LayoutTypeID::kTensorNCHW, LayoutTypeID::kTensorNCHW,
                    LayoutTypeID::kTensorNCHW, LayoutTypeID::kTensorNCHW};
        default:
            megdnn_assert(0, "invalid format");
    }
}

EpilogueType get_epilogue_type(const param::ConvBias::NonlineMode mode, bool clamp) {
    using NonlineMode = param::ConvBias::NonlineMode;
    if (clamp) {
        if (mode == NonlineMode::IDENTITY) {
            return EpilogueType::kBiasAddLinearCombinationClamp;
        } else if (mode == NonlineMode::RELU) {
            return EpilogueType::kBiasAddLinearCombinationReluClamp;
        } else if (mode == NonlineMode::H_SWISH) {
            return EpilogueType::kBiasAddLinearCombinationHSwishClamp;
        }
    } else {
        if (mode == NonlineMode::IDENTITY) {
            return EpilogueType::kBiasAddLinearCombination;
        } else if (mode == NonlineMode::RELU) {
            return EpilogueType::kBiasAddLinearCombinationRelu;
        } else if (mode == NonlineMode::H_SWISH) {
            return EpilogueType::kBiasAddLinearCombinationHSwish;
        }
    }
    megdnn_assert(0, "invalid nonlinear mode");
}
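// Illustrative note (not from the original source): for Format::NCHW4 the pack above
// resolves to {kTensorNC4HW4, kTensorC4RSK4, kTensorNC4HW4, kTensorNC4HW4}, and a
// RELU nonlinearity with a quantized (non-float) dst selects
// EpilogueType::kBiasAddLinearCombinationReluClamp, because get_cutlass_conv_op()
// below passes clamp = (dst dtype category != FLOAT).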
std::pair<int, int> get_tensor_alignment(
        const param::ConvBias::Format format, const TensorLayout& src,
        const TensorLayout& filter, const Base::AlgoParam& algo_param,
        bool is_chanwise) {
    int alignment_src = 0;
    int alignment_filter = 0;
    using Format = param::ConvBias::Format;

    // get tensor alignment for tensor op operations
    // for tensor op operations, the alignment is determined by the size of a vector
    auto get_tensor_alignment_tensor_op = [&]() {
        switch (format) {
            /// case int8
            case Format::NCHW32:
            case Format::NCHW32_NCHW4:
                alignment_src = 16;
                alignment_filter = 16;
                break;
            /// case int4 or uint4
            case Format::NCHW64:
                alignment_src = 32;
                alignment_filter = 32;
                break;
            case Format::NHWC:
                alignment_src = alignment_filter = algo_param.access_size;
                break;
            default:
                megdnn_throw("invalid format");
        }
    };

    // get tensor alignment for dot product operations
    // for integer dot product operations, alignment src is always 4,
    // and the alignment filter is determined by the threadblock shape
    auto get_tensor_alignment_dp4a = [&]() {
        megdnn_assert(
                format == Format::NCHW4 || format == Format::NCHW4_NCHW ||
                format == Format::NCHW4_NHWC || format == Format::NCHW4_NCHW32);
        alignment_src = 4;
        // determine alignment filter
        constexpr int warp_size = 32;
        int threads = warp_size * algo_param.threadblock_m * algo_param.threadblock_n *
                      algo_param.threadblock_k /
                      (algo_param.warp_m * algo_param.warp_n * algo_param.warp_k);
        int threadblock_loads = filter.dtype.size(
                algo_param.threadblock_m * algo_param.threadblock_n *
                algo_param.threadblock_k);
        int load_per_thread = threadblock_loads / threads;
        if (load_per_thread >= 16)
            alignment_filter = 16;
        else if (load_per_thread >= 8)
            alignment_filter = 8;
        else {
            megdnn_assert(load_per_thread >= 4);
            alignment_filter = 4;
        }
    };

    // get tensor alignment for depthwise convolution
    auto get_tensor_alignment_dwconv2d_nchw = [&]() {
        alignment_filter = 1;
        size_t wi = src.dtype.size(src[3]);  // width extent in bytes
        for (size_t candidate : {16, 4, 2}) {
            if (wi % candidate == 0) {
                alignment_src = candidate;
                break;
            }
        }
        alignment_src /= src.dtype.size(1);
    };

    /// TODO: need a better way to check whether tensor core instruction is used
    if (format == Format::NCHW32 || format == Format::NCHW32_NCHW4 ||
        format == Format::NCHW64 || format == Format::NHWC) {
        get_tensor_alignment_tensor_op();
    } else if (
            format == Format::NCHW4 || format == Format::NCHW4_NCHW ||
            format == Format::NCHW4_NHWC || format == Format::NCHW4_NCHW32) {
        get_tensor_alignment_dp4a();
    } else {
        /// the following is used for depthwise convolution
        megdnn_assert(format == Format::NCHW && is_chanwise);
        get_tensor_alignment_dwconv2d_nchw();
    }
    megdnn_assert(alignment_src >= 1 && alignment_filter >= 1);
    return {alignment_src, alignment_filter};
}
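// Illustrative worked example (not from the original source), following the dp4a
// branch above with the default int8 NCHW4 tile 128x128x32 and 32x64x32 warps:
//   threads           = 32 * (128 * 128 * 32) / (32 * 64 * 32) = 256
//   threadblock_loads = 1 byte * 128 * 128 * 32                = 524288
//   load_per_thread   = 524288 / 256                           = 2048  (>= 16)
// so the returned pair is {alignment_src = 4, alignment_filter = 16}.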
}  // namespace

const Operation* ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_conv_op(
        const SizeArgs& args, ConvOperator conv_op, ConvType conv_type,
        bool use_conv_filter_unity_opt, bool without_shared_load) const {
    auto&& param = args.opr->param();
    auto layouts = get_layout_pack(param.format, m_algo_param.access_size);
    auto epilogue_type = get_epilogue_type(
            param.nonlineMode,
            args.dst_layout->dtype.category() != DTypeCategory::FLOAT);

    cutlass::conv::SpecialOptimizeDesc special_optimization =
            (use_conv_filter_unity_opt)
                    ? cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY
                    : cutlass::conv::SpecialOptimizeDesc::NONE;

    int alignment_src, alignment_filter;
    auto&& fm = args.filter_meta;
    bool is_chanwise = param.sparse == param::ConvBias::Sparse::GROUP &&
                       fm.icpg == 1 && fm.ocpg == 1;
    std::tie(alignment_src, alignment_filter) = get_tensor_alignment(
            param.format, *args.src_layout, *args.filter_layout, m_algo_param,
            is_chanwise);

    auto accumulator_dtype =
            get_accumulator_dtype(args.src_layout->dtype, param.compute_mode);

    ConvolutionKey key{
            convert_conv_op(conv_op),
            convert_dtype(args.src_layout->dtype),
            layouts.src,
            convert_dtype(args.filter_layout->dtype),
            layouts.filter,
            convert_dtype(args.dst_layout->dtype),
            layouts.dst,
            convert_dtype(args.bias_layout->dtype),
            layouts.bias,
            accumulator_dtype,
            convert_conv_type(conv_type),
            m_algo_param.threadblock_m,
            m_algo_param.threadblock_n,
            m_algo_param.threadblock_k,
            m_algo_param.warp_m,
            m_algo_param.warp_n,
            m_algo_param.warp_k,
            m_algo_param.instruction_m,
            m_algo_param.instruction_n,
            m_algo_param.instruction_k,
            epilogue_type,
            m_algo_param.stage,
            special_optimization,
            alignment_src,
            alignment_filter,
            without_shared_load};

    return Singleton::get().operation_table.find_op(key);
}

void ConvBiasForwardImpl::AlgoCutlassConvolutionBase::execute_cutlass_conv_op(
        const Operation* op, const void* src, const void* filter, const void* bias,
        const void* z, void* dst, void* workspace, size_t n, size_t hi, size_t wi,
        size_t ci, size_t co, size_t fh, size_t fw, size_t ho, size_t wo, size_t ph,
        size_t pw, size_t sh, size_t sw, size_t dh, size_t dw, const void* alpha,
        const void* beta, const void* gamma, const void* delta, const void* theta,
        const void* threshold, const void* dst_scale, cudaStream_t stream,
        const void* extra_param, size_t groups) const {
    // gcc prints warnings when size_t values are implicitly narrowed to int
    cutlass::conv::Conv2dProblemSize problem_size{
            int(n),  int(hi), int(wi), int(ci), int(co), int(fh), int(fw),
            int(ho), int(wo), int(ph), int(pw), int(sh), int(sw), int(dh),
            int(dw), cutlass::conv::Mode::kCrossCorrelation,
            1,            // split k slices, always 1
            int(groups),  // groups
    };

    ConvolutionArguments conv_args{
            problem_size, src,   filter,    bias,      z,
            dst,          alpha, beta,      gamma,     delta,
            theta,        threshold, dst_scale, extra_param};

    cutlass_check(op->run(&conv_args, workspace, stream));
}

}  // namespace cuda
}  // namespace megdnn
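// Usage sketch (illustrative only, not part of this file): a derived algorithm's
// exec() typically pairs the two helpers above, roughly as follows, where args and
// the pointers, scales, and problem sizes come from that algorithm's ExecArgs:
//
//     const Operation* op = get_cutlass_conv_op(
//             args, ConvOperator::kFprop, ConvType::kConvolution,
//             use_conv_filter_unity_opt, without_shared_load);
//     execute_cutlass_conv_op(
//             op, src_ptr, filter_ptr, bias_ptr, z_ptr, dst_ptr, workspace, n, hi,
//             wi, ci, co, fh, fw, ho, wo, ph, pw, sh, sw, dh, dw, &alpha, &beta,
//             &gamma, &delta, &theta, &threshold, &dst_scale, stream, nullptr,
//             groups);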