diff --git a/dnn/scripts/cutlass_generator/conv2d_operation.py b/dnn/scripts/cutlass_generator/conv2d_operation.py index 735f4d941c4b4261082b26d637236701d4d622c2..539574b983d6e9eb29eaedb48b431edac69e2578 100644 --- a/dnn/scripts/cutlass_generator/conv2d_operation.py +++ b/dnn/scripts/cutlass_generator/conv2d_operation.py @@ -19,8 +19,8 @@ class Conv2dOperation: # def __init__(self, conv_kind, conv_type, arch, tile_description, src, flt, bias, dst, element_epilogue, \ epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4, \ - need_load_from_const = True, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, \ - required_cuda_ver_major = 9, required_cuda_ver_minor = 2): + special_optimization = SpecialOptimizeDesc.NoneSpecialOpt, implicit_gemm_mode = ImplicitGemmMode.GemmNT, \ + without_shared_load = False, required_cuda_ver_major = 9, required_cuda_ver_minor = 2): self.operation_kind = OperationKind.Conv2d self.conv_kind = conv_kind @@ -34,7 +34,7 @@ class Conv2dOperation: self.element_epilogue = element_epilogue self.epilogue_functor = epilogue_functor self.swizzling_functor = swizzling_functor - self.need_load_from_const = need_load_from_const + self.special_optimization = special_optimization self.implicit_gemm_mode = implicit_gemm_mode self.without_shared_load = without_shared_load self.required_cuda_ver_major = required_cuda_ver_major @@ -60,16 +60,18 @@ class Conv2dOperation: else: inst_shape = '' - unity_kernel = '' - if not self.need_load_from_const: - unity_kernel = '_1x1' + special_opt = '' + if self.special_optimization == SpecialOptimizeDesc.ConvFilterUnity: + special_opt = '_1x1' + elif self.special_optimization == SpecialOptimizeDesc.DeconvDoubleUpsampling: + special_opt = '_s2' reorder_k = '' if self.without_shared_load: reorder_k = '_roc' return "%s%s%s%s%s%s_%s" % (ShortDataTypeNames[self.accumulator_type()], \ - inst_shape, intermediate_type, ConvKindNames[self.conv_kind], unity_kernel, \ + inst_shape, intermediate_type, ConvKindNames[self.conv_kind], special_opt, \ reorder_k, ShortEpilogueNames[self.epilogue_functor]) # @@ -183,7 +185,7 @@ using Convolution = ${stages}, ${alignment_src}, ${alignment_filter}, - ${nonuninity_kernel}, + ${special_optimization}, ${math_operator}, ${implicit_gemm_mode}, ${without_shared_load}>; @@ -226,7 +228,7 @@ using Convolution = 'stages': str(operation.tile_description.stages), 'alignment_src': str(operation.src.alignment), 'alignment_filter': str(operation.flt.alignment), - 'nonuninity_kernel': str(operation.need_load_from_const).lower(), + 'special_optimization': SpecialOptimizeDescTag[operation.special_optimization], 'math_operator': MathOperationTag[operation.tile_description.math_instruction.math_operation], 'implicit_gemm_mode': ImplicitGemmModeTag[operation.implicit_gemm_mode], 'without_shared_load': str(operation.without_shared_load).lower() @@ -266,7 +268,7 @@ using Deconvolution = ${stages}, ${alignment_src}, ${alignment_filter}, - ${nonuninity_kernel}, + ${special_optimization}, ${math_operator}, ${implicit_gemm_mode}>; """ @@ -308,7 +310,7 @@ using Deconvolution = 'stages': str(operation.tile_description.stages), 'alignment_src': str(operation.src.alignment), 'alignment_filter': str(operation.flt.alignment), - 'nonuninity_kernel': str(operation.need_load_from_const).lower(), + 'special_optimization': SpecialOptimizeDescTag[operation.special_optimization], 'math_operator': MathOperationTag[operation.tile_description.math_instruction.math_operation], 
'implicit_gemm_mode': ImplicitGemmModeTag[operation.implicit_gemm_mode] } @@ -323,9 +325,9 @@ using Deconvolution = ################################################################################################### # -def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_layout, dst_type, min_cc, src_align = 32, flt_align = 32, dst_align = 128, \ - skip_unity_kernel = False, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, required_cuda_ver_major = 9, \ - required_cuda_ver_minor = 2): +def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_layout, dst_type, min_cc, src_align = 32, flt_align = 32, dst_align = 32, \ + use_special_optimization = SpecialOptimizeDesc.NoneSpecialOpt, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, \ + required_cuda_ver_major = 9, required_cuda_ver_minor = 2): operations = [] element_epilogue = DataType.f32 @@ -412,10 +414,10 @@ def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_lay bias = TensorDescription(bias_type, dst_layout, max(1, int(32 / DataTypeSize[bias_type]))) dst = TensorDescription(dst_type, dst_layout, int(dst_align / DataTypeSize[dst_type])) - new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, True, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor) + new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, SpecialOptimizeDesc.NoneSpecialOpt, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor) operations.append(new_operation) - if not skip_unity_kernel: - new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, False, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor) + if use_special_optimization != SpecialOptimizeDesc.NoneSpecialOpt: + new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, use_special_optimization , implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor) operations.append(new_operation) return operations diff --git a/dnn/scripts/cutlass_generator/generator.py b/dnn/scripts/cutlass_generator/generator.py index 5a4d3d35defceb9d599cf1e9728831980a73b515..b5814746318974a6b84a1ed6e0a35eeb58e744ad 100644 --- a/dnn/scripts/cutlass_generator/generator.py +++ b/dnn/scripts/cutlass_generator/generator.py @@ -168,10 +168,10 @@ def GenerateConv2d_Simt(args): for dst_type, dst_layout in zip(dst_types, dst_layouts): if dst_type == DataType.s4 or dst_type == DataType.u4: min_cc = 75 - skip_unity_kernel = True + use_special_optimization = SpecialOptimizeDesc.NoneSpecialOpt else: min_cc = 61 - skip_unity_kernel = False + use_special_optimization = SpecialOptimizeDesc.ConvFilterUnity tile_descriptions = [ TileDescription([128, 128, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc), TileDescription([128, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc), @@ -182,10 +182,16 @@ def GenerateConv2d_Simt(args): TileDescription([ 64, 32, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc), TileDescription([ 16, 128, 16], 1, [1, 1, 1], math_inst, min_cc, max_cc), TileDescription([ 16, 64, 8], 2, [1, 1, 1], math_inst, min_cc, 
max_cc), - ] - operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1], - dst_layout, dst_type, min_cc, 32, 32, 32, - skip_unity_kernel) + ] + for tile in tile_descriptions: + if dst_layout == LayoutType.TensorNC32HW32 and tile.threadblock_shape[0] > 32: + continue + if (dst_layout == LayoutType.TensorNCHW or dst_layout == LayoutType.TensorNHWC) \ + and tile.threadblock_shape[0] > 16: + continue + operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], + dst_layout, dst_type, min_cc, 32, 32, 32, + use_special_optimization) return operations @@ -214,6 +220,8 @@ def GenerateConv2d_TensorOp_8816(args): DataType.s8, ] + use_special_optimization = SpecialOptimizeDesc.ConvFilterUnity + min_cc = 75 max_cc = 1024 @@ -232,28 +240,69 @@ def GenerateConv2d_TensorOp_8816(args): TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([128, 64, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([128, 32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 128, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([ 32, 128, 32], 1, [1, 2, 1], math_inst, min_cc, max_cc), ] operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1], - dst_layout, dst_type, min_cc, 128, 128, 64, - False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor) + dst_layout, dst_type, min_cc, 128, 128, 64, use_special_optimization, + ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor) else: assert dst_layout == LayoutType.TensorNC4HW4 tile_descriptions = [ - TileDescription([128, 256, 64], 2, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([256, 128, 64], 2, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 128, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 32, 128, 32], 1, [1, 2, 1], math_inst, min_cc, max_cc), ] operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1], - dst_layout, dst_type, min_cc, 128, 128, 64, - False, ImplicitGemmMode.GemmNT, False, cuda_major, cuda_minor) + dst_layout, dst_type, min_cc, 128, 128, 64, use_special_optimization, + ImplicitGemmMode.GemmNT, False, cuda_major, cuda_minor) + + layouts_nhwc = [ + (LayoutType.TensorNHWC, LayoutType.TensorNC4HW4, 32), + (LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 64), + (LayoutType.TensorNHWC, LayoutType.TensorNC16HW16, 128), + ] + + dst_layouts_nhwc = [ + LayoutType.TensorNHWC, + ] + + for math_inst in math_instructions: + for layout in layouts_nhwc: + for dst_layout in dst_layouts_nhwc: + dst_type = math_inst.element_b + tile_descriptions = [ + TileDescription([128, 32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc), + TileDescription([64, 16, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc), + ] + for tile in tile_descriptions: + dst_align = 32 if tile.threadblock_shape[1] == 16 else 64 + operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout, + dst_type, min_cc, layout[2], layout[2], dst_align, use_special_optimization, + ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor) + if tile.threadblock_shape[1] == 16 or 
tile.threadblock_shape[1] == 32: + operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout, + dst_type, min_cc, layout[2], layout[2], dst_align, use_special_optimization, + ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor) + + out_dtypes = [DataType.s4, DataType.u4, DataType.f32] + + #INT8x8x4 and INT8x8x32 + for math_inst in math_instructions: + for layout in layouts_nhwc: + for dst_layout in dst_layouts_nhwc: + for out_dtype in out_dtypes: + tile_descriptions = [ + TileDescription([128, 32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc), + TileDescription([64, 16, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc), + ] + for tile in tile_descriptions: + dst_align = 4 * DataTypeSize[out_dtype] if tile.threadblock_shape[1] == 16 or out_dtype == DataType.f32 \ + else 8 * DataTypeSize[out_dtype] + operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout, + out_dtype, min_cc, layout[2], layout[2], dst_align, use_special_optimization, + ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor) + if tile.threadblock_shape[1] == 16 or (tile.threadblock_shape[1] == 32 and out_dtype != DataType.f32): + operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout, + out_dtype, min_cc, layout[2], layout[2], dst_align, use_special_optimization, + ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor) return operations @@ -281,6 +330,8 @@ def GenerateConv2d_TensorOp_8832(args): LayoutType.TensorNC64HW64, ] + use_special_optimization = SpecialOptimizeDesc.ConvFilterUnity + min_cc = 75 max_cc = 1024 @@ -298,8 +349,8 @@ def GenerateConv2d_TensorOp_8832(args): TileDescription([128, 64, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc), ] operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1], - dst_layout, dst_type, min_cc, 128, 128, 64, - False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor) + dst_layout, dst_type, min_cc, 128, 128, 64, use_special_optimization, + ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor) layouts_nhwc = [ (LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 32), @@ -316,18 +367,39 @@ def GenerateConv2d_TensorOp_8832(args): for dst_layout in dst_layouts_nhwc: dst_type = math_inst.element_b tile_descriptions = [ + TileDescription([128, 16, 64], 2, [1, 1, 1], math_inst, min_cc, max_cc), TileDescription([128, 32, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc), TileDescription([128, 64, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc), ] for tile in tile_descriptions: - operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], - dst_layout, dst_type, min_cc, layout[2], layout[2], 32, - False, ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor) + dst_align = 16 if tile.threadblock_shape[1] == 16 else 32 + operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout, + dst_type, min_cc, layout[2], layout[2], dst_align, use_special_optimization, + ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor) if tile.threadblock_shape[1] == 32 or tile.threadblock_shape[1] == 64: dst_align = 32 if tile.threadblock_shape[1] == 32 else 64 - operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], - dst_layout, dst_type, min_cc, layout[2], layout[2], dst_align, - False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor) + operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout, + dst_type, min_cc, layout[2], layout[2], dst_align, use_special_optimization, + ImplicitGemmMode.GemmTN, True, 
cuda_major, cuda_minor) + # INT4x4x8 + for math_inst in math_instructions: + for layout in layouts_nhwc: + for dst_layout in dst_layouts_nhwc: + tile_descriptions = [ + TileDescription([128, 16, 64], 2, [1, 1, 1], math_inst, min_cc, max_cc), + TileDescription([128, 32, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc), + ] + for tile in tile_descriptions: + dst_align = 32 if tile.threadblock_shape[1] == 16 else 64 + operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout, + DataType.s8, min_cc, layout[2], layout[2], dst_align, use_special_optimization, + ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor) + if tile.threadblock_shape[1] == 32 or tile.threadblock_shape[1] == 64: + dst_align = 64 if tile.threadblock_shape[1] == 32 else 128 + operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout, + DataType.s8, min_cc, layout[2], layout[2], dst_align, use_special_optimization, + ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor) return operations @@ -354,6 +426,8 @@ def GenerateDeconv_Simt(args): DataType.s8, ] + use_special_optimization = SpecialOptimizeDesc.DeconvDoubleUpsampling + min_cc = 61 max_cc = 1024 @@ -361,7 +435,6 @@ def GenerateDeconv_Simt(args): for layout in layouts: for dst_type, dst_layout in zip(dst_types, dst_layouts): tile_descriptions = [ - TileDescription([64, 128, 32], 2, [1, 4, 1], math_inst, min_cc, max_cc), TileDescription([32, 128, 32], 2, [1, 2, 1], math_inst, min_cc, max_cc), TileDescription([16, 128, 16], 2, [1, 2, 1], math_inst, min_cc, max_cc), TileDescription([16, 128, 16], 1, [1, 1, 1], math_inst, min_cc, max_cc), @@ -369,7 +442,7 @@ def GenerateDeconv_Simt(args): ] operations += GenerateConv2d(ConvKind.Dgrad, tile_descriptions, layout[0], layout[1], dst_layout, dst_type, min_cc, 32, 32, 32, - True) + use_special_optimization) return operations ################################################################################ diff --git a/dnn/scripts/cutlass_generator/library.py b/dnn/scripts/cutlass_generator/library.py index 0559f28f745358d311c57709356e2ed0ac2502b6..9308357b90d2c275f53ab6293a28c79a6f68857d 100644 --- a/dnn/scripts/cutlass_generator/library.py +++ b/dnn/scripts/cutlass_generator/library.py @@ -562,6 +562,24 @@ StrideSupportNames = { StrideSupport.Unity: 'unity_stride', } +class SpecialOptimizeDesc(enum.Enum): + NoneSpecialOpt = enum_auto() + ConvFilterUnity = enum_auto() + DeconvDoubleUpsampling = enum_auto() + +SpecialOptimizeDescNames = { + SpecialOptimizeDesc.NoneSpecialOpt: 'none', + SpecialOptimizeDesc.ConvFilterUnity: 'conv_filter_unity', + SpecialOptimizeDesc.DeconvDoubleUpsampling: 'deconv_double_upsampling', +} + +SpecialOptimizeDescTag = { + SpecialOptimizeDesc.NoneSpecialOpt: 'cutlass::conv::SpecialOptimizeDesc::NONE', + SpecialOptimizeDesc.ConvFilterUnity: 'cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY', + SpecialOptimizeDesc.DeconvDoubleUpsampling: 'cutlass::conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING', +} + + class ImplicitGemmMode(enum.Enum): GemmNT = enum_auto() GemmTN = enum_auto() diff --git a/dnn/scripts/cutlass_generator/list.bzl b/dnn/scripts/cutlass_generator/list.bzl index 7aaae2d617014de4e40573511f1702b8ee537ac9..52b16821132642f6565526c45e58f0c99e5d0446 100644 Binary files a/dnn/scripts/cutlass_generator/list.bzl and b/dnn/scripts/cutlass_generator/list.bzl differ diff --git a/dnn/src/common/convolution.cpp b/dnn/src/common/convolution.cpp index 
04934ac3986bae57cc102b4712e8a3ff10adce44..f0aa61d850ed327f718c7e6e0fdf25cbe5304f0a 100644 --- a/dnn/src/common/convolution.cpp +++ b/dnn/src/common/convolution.cpp @@ -553,7 +553,10 @@ void ConvolutionBase::check_or_deduce_dtype_fwd(DType src, dst.valid() && (dst.enumv() == src.enumv() || ((dst.enumv() == DTypeEnum::QuantizedS4 || dst.enumv() == DTypeEnum::Quantized4Asymm) && - src.enumv() == DTypeEnum::QuantizedS8)); + src.enumv() == DTypeEnum::QuantizedS8) || + ((src.enumv() == DTypeEnum::QuantizedS4 || + src.enumv() == DTypeEnum::Quantized4Asymm) && + dst.enumv() == DTypeEnum::QuantizedS8)); if (cond_dst) { supported_dst_dtype.push_back(dst); } diff --git a/dnn/src/cuda/conv_bias/algo.cpp b/dnn/src/cuda/conv_bias/algo.cpp index 6c687dfec0deec01249d6f72ae15effd24054833..4475c07183c6da9d09e076de55f652bff9df9890 100644 --- a/dnn/src/cuda/conv_bias/algo.cpp +++ b/dnn/src/cuda/conv_bias/algo.cpp @@ -71,6 +71,9 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() { for (auto&& algo : int8_nchw32_imma) { all_algos.push_back(&algo); } + for (auto&& algo : int8_nhwc_imma) { + all_algos.push_back(&algo); + } for (auto&& algo : int4_int4_nchw64_imma) { all_algos.push_back(&algo); } @@ -236,7 +239,21 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() { int8_nchw32_imma.emplace_back( AlgoParam{32, 128, 32, 32, 64, 32, 8, 8, 16, 1}); } - + { + using AlgoParam = AlgoInt8NHWCIMMAImplicitGemm::AlgoParam; + int8_nhwc_imma.emplace_back( + AlgoParam{64, 16, 32, 64, 16, 32, 8, 8, 16, 2, 16}); + int8_nhwc_imma.emplace_back( + AlgoParam{64, 16, 32, 64, 16, 32, 8, 8, 16, 2, 8}); + int8_nhwc_imma.emplace_back( + AlgoParam{64, 16, 32, 64, 16, 32, 8, 8, 16, 2, 4}); + int8_nhwc_imma.emplace_back( + AlgoParam{128, 32, 32, 64, 32, 32, 8, 8, 16, 1, 16}); + int8_nhwc_imma.emplace_back( + AlgoParam{128, 32, 32, 64, 32, 32, 8, 8, 16, 1, 8}); + int8_nhwc_imma.emplace_back( + AlgoParam{128, 32, 32, 64, 32, 32, 8, 8, 16, 1, 4}); + } { using AlgoParam = AlgoInt4Int4NCHW64IMMAImplicitGemm::AlgoParam; int4_int4_nchw64_imma.emplace_back( @@ -261,6 +278,12 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() { } { using AlgoParam = AlgoInt4Int4NHWCIMMAImplicitGemm::AlgoParam; + int4_int4_nhwc_imma.emplace_back( + AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 32}); + int4_int4_nhwc_imma.emplace_back( + AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 16}); + int4_int4_nhwc_imma.emplace_back( + AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 8}); int4_int4_nhwc_imma.emplace_back( AlgoParam{128, 32, 64, 64, 32, 64, 8, 8, 32, 1, 32}); int4_int4_nhwc_imma.emplace_back( @@ -276,6 +299,12 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() { } { using AlgoParam = AlgoUInt4Int4NHWCIMMAImplicitGemm::AlgoParam; + uint4_int4_nhwc_imma.emplace_back( + AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 32}); + uint4_int4_nhwc_imma.emplace_back( + AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 16}); + uint4_int4_nhwc_imma.emplace_back( + AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 8}); uint4_int4_nhwc_imma.emplace_back( AlgoParam{128, 32, 64, 64, 32, 64, 8, 8, 32, 1, 32}); uint4_int4_nhwc_imma.emplace_back( diff --git a/dnn/src/cuda/conv_bias/algo.h b/dnn/src/cuda/conv_bias/algo.h index 2682391bcd86e891d3b0632c4c907593b9c7797f..90e0dc7b102923d712def5d39c1b295b8ac00b3b 100644 --- a/dnn/src/cuda/conv_bias/algo.h +++ b/dnn/src/cuda/conv_bias/algo.h @@ -72,6 +72,7 @@ public: CUDA_IMPLICIT_GEMM_REORDER_FILTER_CHWN4_IMMA_INT8, CUDA_IMPLICIT_GEMM_UNROLL_WIDTH_CHWN4_IMMA_INT8, CUDA_IMPLICIT_GEMM_IMMA_NCHW32_INT8, + 
        CUDA_IMPLICIT_GEMM_IMMA_NHWC_INT8,
         CUDA_IMPLICIT_GEMM_IMMA_NCHW64_INT4_INT4,
         CUDA_IMPLICIT_GEMM_IMMA_NCHW64_UINT4_INT4,
         CUDA_IMPLICIT_GEMM_IMMA_NHWC_INT4_INT4,
@@ -524,6 +525,7 @@ public:
  *                  +
  *                  +--- AlgoInt8NCHW4DotProdImplicitGemm
  *                  +--- AlgoInt8NCHW32IMMAImplicitGemm
+ *                  +--- AlgoInt8NHWCIMMAImplicitGemm
  *                  +
  *                  +--- AlgoInt4NCHW64IMMAImplicitGemmBase
  *                  +----+--- AlgoInt4Int4NCHW64IMMAImplicitGemm
@@ -582,7 +584,7 @@ public:
     // operation (cutlass kernel) from the global OperationTable
     const cutlass::library::Operation* get_cutlass_conv_op(
             const SizeArgs& args, ConvOperator conv_op, ConvType conv_type,
-            bool load_from_const, bool without_shared_load) const;
+            bool use_conv_filter_unity_opt, bool without_shared_load) const;
 
     // execute the cutlass kernel found by get_cutlass_conv_op. we give
     // subclasses full freedom to decide where and how these arguments are
@@ -829,6 +831,47 @@ private:
     std::string m_name;
 };
 
+class ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm final
+        : public AlgoCutlassConvolutionBase {
+public:
+    AlgoInt8NHWCIMMAImplicitGemm(AlgoParam algo_param)
+            : AlgoCutlassConvolutionBase(algo_param) {
+        m_name = ConvBias::algo_name<ConvBias::DirectParam>(
+                ssprintf("INT8_NHWC_IMMA_IMPLICIT_GEMM_%s",
+                         to_string(m_algo_param).c_str()),
+                ConvBias::DirectParam{});
+    }
+    bool is_available(const SizeArgs& args) const override;
+    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
+    void exec(const ExecArgs& args) const override;
+    const char* name() const override { return m_name.c_str(); }
+    AlgoAttribute attribute() const override {
+        return AlgoAttribute::REPRODUCIBLE;
+    }
+    static std::string to_string(AlgoParam algo_param);
+    size_t get_preprocess_workspace_in_bytes(
+            const SizeArgs& args) const override;
+    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
+            const SizeArgs& args) const override;
+    void exec_preprocess(const ExecArgs& args) const override;
+    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_IMMA_NHWC_INT8)
+
+    std::string param() const override {
+        std::string ret;
+        serialize_write_pod(m_algo_param, ret);
+        return ret;
+    }
+
+private:
+    std::tuple<float, float, float, float, float> get_constants(
+            const ExecArgs& args) const;
+
+    void reorder_filter(const ExecArgs& args, int interleaved,
+                        void* reordered_filter) const;
+
+    std::string m_name;
+};
+
 class ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase
         : public AlgoCutlassConvolutionBase {
 public:
@@ -1087,6 +1130,7 @@ public:
 #endif
 #if CUDA_VERSION >= 10020
     std::vector<AlgoInt8NCHW32IMMAImplicitGemm> int8_nchw32_imma;
+    std::vector<AlgoInt8NHWCIMMAImplicitGemm> int8_nhwc_imma;
    std::vector<AlgoInt4Int4NCHW64IMMAImplicitGemm> int4_int4_nchw64_imma;
    std::vector<AlgoUInt4Int4NCHW64IMMAImplicitGemm> uint4_int4_nchw64_imma;
    std::vector<AlgoInt4Int4NHWCIMMAImplicitGemm> int4_int4_nhwc_imma;
diff --git a/dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp b/dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp
index 4c4f04a26c50be11008aedf62d7fa234b7ddd1af..096686d13422f0021807e0e8e8393a94039f4749 100644
--- a/dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp
+++ b/dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp
@@ -140,6 +140,11 @@ LayoutPack get_layout_pack(const param::ConvBias::Format format,
                     LayoutTypeID::kTensorNC64HW64};
         case Format::NHWC:
             switch (access_type) {
+                case 4:
+                    return {LayoutTypeID::kTensorNHWC,
+                            LayoutTypeID::kTensorNC4HW4,
+                            LayoutTypeID::kTensorNHWC,
+                            LayoutTypeID::kTensorNHWC};
                 case 8:
                     return {LayoutTypeID::kTensorNHWC,
                             LayoutTypeID::kTensorNC8HW8,
@@ -192,12 +197,18 @@ EpilogueType get_epilogue_type(const param::ConvBias::NonlineMode mode,
 const Operation*
 ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_conv_op(
         const SizeArgs& args, ConvOperator conv_op, ConvType conv_type,
- bool load_from_const, bool without_shared_load) const { - using Format = param::ConvBias::Format; + bool use_conv_filter_unity_opt, bool without_shared_load) const { auto&& param = args.opr->param(); auto layouts = get_layout_pack(param.format, m_algo_param.access_size); - auto epilogue_type = get_epilogue_type(param.nonlineMode, - param.format != Format::NCHW4_NCHW); + auto epilogue_type = get_epilogue_type( + param.nonlineMode, + args.dst_layout->dtype.enumv() != DTypeEnum::Float32); + + cutlass::conv::SpecialOptimizeDesc special_optimization = + (use_conv_filter_unity_opt) + ? cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY + : cutlass::conv::SpecialOptimizeDesc::NONE; + ConvolutionKey key{convert_conv_op(conv_op), convert_dtype(args.src_layout->dtype.enumv()), layouts.src, @@ -219,7 +230,7 @@ ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_conv_op( m_algo_param.instruction_k, epilogue_type, m_algo_param.stage, - load_from_const, + special_optimization, without_shared_load}; return Singleton::get().operation_table.find_op(key); diff --git a/dnn/src/cuda/conv_bias/cutlass_reorder_filter.cu b/dnn/src/cuda/conv_bias/cutlass_reorder_filter.cu index 2f296aca3faf93bd693c4f0e39e7fb4eced04599..c5ca513c362ab957e9bbb9d67dbde9c1b9c3357b 100644 --- a/dnn/src/cuda/conv_bias/cutlass_reorder_filter.cu +++ b/dnn/src/cuda/conv_bias/cutlass_reorder_filter.cu @@ -144,28 +144,48 @@ void megdnn::cuda::cutlass_wrapper::reorder_ncxhwx_imma_filter( IC, FH, FW, trans_oc); after_kernel_launch(); } - -template +template void megdnn::cuda::cutlass_wrapper::reorder_nhwc_imma_filter( int8_t* dst_filter, const int8_t* src_filter, uint32_t OC, uint32_t IC, - uint32_t FH, uint32_t FW, bool trans_oc, uint32_t oc_interleaved, - cudaStream_t stream) { - static constexpr uint32_t elements_per_access = alignbits / size_bits; - uint32_t nr_threads = - query_blocksize_for_kernel(reinterpret_cast( - reorder_nhwc_imma_filter_kernel)); + uint32_t FH, uint32_t FW, bool trans_oc, uint32_t alignbits, + uint32_t interleaved, cudaStream_t stream) { + const uint32_t elements_per_access = alignbits / size_bits; + + void (*kern)(int8_t* __restrict__, const int8_t* __restrict__, uint32_t, + uint32_t, uint32_t, uint32_t, bool); + kern = nullptr; + + auto get_kern = [&kern](const uint32_t alignbits, + const uint32_t interleaved) { +#define DISPATCH_KERNEL(alignbits_, interleaved_) \ + if (alignbits == alignbits_ && interleaved == interleaved_) { \ + kern = reorder_nhwc_imma_filter_kernel; \ + return; \ + } + DISPATCH_KERNEL(128, 16); + DISPATCH_KERNEL(64, 16); + DISPATCH_KERNEL(32, 16); + DISPATCH_KERNEL(128, 32); + DISPATCH_KERNEL(64, 32); + DISPATCH_KERNEL(32, 32); + DISPATCH_KERNEL(128, 64); + DISPATCH_KERNEL(64, 64); + DISPATCH_KERNEL(32, 64); + +#undef DISPATCH_KERNEL + }; + + get_kern(alignbits, interleaved); + + uint32_t nr_threads = query_blocksize_for_kernel(kern); uint32_t vthreads = DIVUP(OC * IC * FH * FW, elements_per_access); nr_threads = std::min(nr_threads, vthreads); uint32_t nr_blocks = DIVUP(vthreads, nr_threads); - if (oc_interleaved == 32) { - reorder_nhwc_imma_filter_kernel - <<>>( - dst_filter, src_filter, OC, IC, FH, FW, trans_oc); - } else { - reorder_nhwc_imma_filter_kernel - <<>>( - dst_filter, src_filter, OC, IC, FH, FW, trans_oc); - } + + kern<<>>(dst_filter, src_filter, OC, IC, + FH, FW, trans_oc); + after_kernel_launch(); } @@ -180,15 +200,14 @@ INST(8, 32) INST(4, 64) #undef INST -#define INST(_size_bits, _alignbits) \ - template void 
megdnn::cuda::cutlass_wrapper::reorder_nhwc_imma_filter< \
-            _size_bits, _alignbits>(                                        \
-            int8_t * dst_filter, const int8_t* src_filter, uint32_t OC,     \
-            uint32_t IC, uint32_t FH, uint32_t FW, bool trans_oc,           \
-            uint32_t oc_interleaved, cudaStream_t stream);
-INST(4, 32)
-INST(4, 64)
-INST(4, 128)
+#define INST(_size_bits)                                                     \
+    template void                                                            \
+    megdnn::cuda::cutlass_wrapper::reorder_nhwc_imma_filter<_size_bits>(     \
+            int8_t * dst_filter, const int8_t* src_filter, uint32_t OC,      \
+            uint32_t IC, uint32_t FH, uint32_t FW, bool trans_oc,            \
+            uint32_t alignbits, uint32_t interleaved, cudaStream_t stream);
+INST(4)
+INST(8)
 #undef INST
 
 // vim: syntax=cuda.doxygen
diff --git a/dnn/src/cuda/conv_bias/cutlass_reorder_filter.cuh b/dnn/src/cuda/conv_bias/cutlass_reorder_filter.cuh
index dc46f9c60ba21a8dfa2e9a23a6ac7b970233149b..03f74e9028fb096374fb31ae67ed0b8bdebb6207 100644
--- a/dnn/src/cuda/conv_bias/cutlass_reorder_filter.cuh
+++ b/dnn/src/cuda/conv_bias/cutlass_reorder_filter.cuh
@@ -23,11 +23,11 @@ void reorder_ncxhwx_imma_filter(int8_t* dst_filter, const int8_t* src_filter,
                                 uint32_t FW, bool trans_oc,
                                 cudaStream_t stream);
 
-template <uint32_t size_bits, uint32_t alignbits>
+template <uint32_t size_bits>
 void reorder_nhwc_imma_filter(int8_t* dst_filter, const int8_t* src_filter,
                               uint32_t OC, uint32_t IC, uint32_t FH,
-                              uint32_t FW, bool trans_oc,
-                              uint32_t oc_interleaved, cudaStream_t stream);
+                              uint32_t FW, bool trans_oc, uint32_t alignbits,
+                              uint32_t interleaved, cudaStream_t stream);
 } // namespace cutlass_wrapper
 } // namespace cuda
 } // namespace megdnn
diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp
index cbe3bc5d45df16bc1eb5cbce7fd20a173dd0f6a5..8b5f1cda028ba69e35644cf73c3e087d56e1e33d 100644
--- a/dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp
+++ b/dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp
@@ -68,13 +68,27 @@ ConvBiasForwardImpl::AlgoInt4Int4NHWCIMMAImplicitGemm::get_constants(
                   args.filter_layout->dtype.param<dtype::QuantizedS4>().scale,
           bias_scale =
                   args.bias_layout->dtype.param<dtype::QuantizedS32>().scale,
-          dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale;
+          dst_scale;
+
+    if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4) {
+        dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale;
+    } else {  // DTypeEnum::QuantizedS8
+        megdnn_assert(args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8);
+        dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale;
+    }
 
     float alpha = src_scale * filter_scale / dst_scale,
           beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f, theta = 0.f;
 
     if (args.z_layout->ndim > 0) {
-        float z_scale = args.z_layout->dtype.param<dtype::QuantizedS4>().scale;
+        float z_scale;
+        if (args.z_layout->dtype.enumv() == DTypeEnum::QuantizedS4) {
+            z_scale = args.z_layout->dtype.param<dtype::QuantizedS4>().scale;
+        } else {  // DTypeEnum::QuantizedS8
+            megdnn_assert(args.z_layout->dtype.enumv() ==
+                          DTypeEnum::QuantizedS8);
+            z_scale = args.z_layout->dtype.param<dtype::QuantizedS8>().scale;
+        }
         gamma = z_scale / dst_scale;
     }
 
diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp
index 5f4f11ef4956d2684f455cfc7018777bedde12c0..40bb0e82fe99eefad90d600f1811ea94702f8a7b 100644
--- a/dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp
+++ b/dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp
@@ -76,6 +76,14 @@ bool ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::is_available(
     if (fh * fw > kMaxFilterPixels)
         return false;
 
+    bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
+    bool without_shared_load =
true; + const auto* op = get_cutlass_conv_op( + args, ConvOperator::kFprop, ConvType::kConvolution, + use_conv_filter_unity_opt, without_shared_load); + if (op == nullptr) + return false; + return true; } @@ -110,7 +118,7 @@ void ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::exec( float dst_scale = 0.f; float threshold = 0.f; uint8_t src_zero = 0; - bool load_from_const = !(fh == 1 && fw == 1); + bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); bool without_shared_load = true; if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { @@ -126,7 +134,7 @@ void ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::exec( const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop, ConvType::kConvolution, - load_from_const, without_shared_load); + use_conv_filter_unity_opt, without_shared_load); execute_cutlass_conv_op(op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr, z_ptr, args.dst_tensor->raw_ptr, nullptr, n, hi, wi, diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp index 353e26d992a1b604551015584982e25a12b32153..cf53fd39ca97b143b342decebe4cf3f1be9b1115 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp @@ -56,8 +56,11 @@ bool ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::is_available( if (args.src_layout->dtype.enumv() != src_dtype() || args.filter_layout->dtype.enumv() != DTypeEnum::QuantizedS4 || - args.bias_layout->dtype.enumv() != DTypeEnum::QuantizedS32 || - args.dst_layout->dtype.enumv() != src_dtype()) + args.bias_layout->dtype.enumv() != DTypeEnum::QuantizedS32) + return false; + + if (!(args.dst_layout->dtype.enumv() == src_dtype() || + args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8)) return false; // uint4 do not support H_SWISH activition @@ -83,6 +86,16 @@ bool ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::is_available( if ((co % 8 != 0) || (ci % m_algo_param.access_size != 0)) return false; + bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); + bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) && + (m_algo_param.threadblock_n == 32 || + m_algo_param.threadblock_n == 64)); + const auto* op = get_cutlass_conv_op( + args, ConvOperator::kFprop, ConvType::kConvolution, + use_conv_filter_unity_opt, without_shared_load); + if (op == nullptr) + return false; + return true; } @@ -117,26 +130,31 @@ void ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::exec( float dst_scale = 0.f; float threshold = 0.f; uint8_t src_zero = 0; - bool load_from_const = !(fh == 1 && fw == 1); + bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) && (m_algo_param.threadblock_n == 32 || m_algo_param.threadblock_n == 64)); + if (args.src_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { + src_zero = args.src_layout->dtype.param() + .zero_point; + } + if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { dst_scale = args.dst_layout->dtype.param().scale; - src_zero = args.src_layout->dtype.param() - .zero_point; - } else { // DTypeEnum::QuantizedS4 + } else if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4) { dst_scale = args.dst_layout->dtype.param().scale; + } else { // DTypeEnum::QuantizedS8 + dst_scale = args.dst_layout->dtype.param().scale; } cudaStream_t stream = cuda_stream(args.opr->handle()); const auto* op = get_cutlass_conv_op(args, 
ConvOperator::kFprop, ConvType::kConvolution, - load_from_const, without_shared_load); + use_conv_filter_unity_opt, without_shared_load); execute_cutlass_conv_op(op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr, z_ptr, args.dst_tensor->raw_ptr, nullptr, n, hi, wi, @@ -166,29 +184,18 @@ void ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::reorder_filter( cudaStream_t stream = cuda_stream(args.opr->handle()); // reformat filter from nhwc to ncxhwx and reorder oc - // use trans_oc threadblock_n must be 32 or 64 + // use trans_oc threadblock_n must be 32 or 64 and src dtype == dest dtype bool trans_oc = ((co % m_algo_param.threadblock_n == 0) && (m_algo_param.threadblock_n == 32 || m_algo_param.threadblock_n == 64)); - uint32_t oc_iterleave = (m_algo_param.threadblock_n == 64) ? 64 : 32; - - if (iterleaved == 8) { - cutlass_wrapper::reorder_nhwc_imma_filter<4, 32>( - reinterpret_cast(reordered_filter), - reinterpret_cast(args.filter_tensor->raw_ptr), co, ci, - fh, fw, trans_oc, oc_iterleave, stream); - } else if (iterleaved == 16) { - cutlass_wrapper::reorder_nhwc_imma_filter<4, 64>( - reinterpret_cast(reordered_filter), - reinterpret_cast(args.filter_tensor->raw_ptr), co, ci, - fh, fw, trans_oc, oc_iterleave, stream); - } else { - megdnn_assert(iterleaved == 32); - cutlass_wrapper::reorder_nhwc_imma_filter<4, 128>( - reinterpret_cast(reordered_filter), - reinterpret_cast(args.filter_tensor->raw_ptr), co, ci, - fh, fw, trans_oc, oc_iterleave, stream); - } + uint32_t oc_iterleaved = (m_algo_param.threadblock_n == 64) ? 64 : 32; + + uint32_t alignbits = iterleaved * 4; + + cutlass_wrapper::reorder_nhwc_imma_filter<4>( + reinterpret_cast(reordered_filter), + reinterpret_cast(args.filter_tensor->raw_ptr), co, ci, fh, + fw, trans_oc, alignbits, oc_iterleaved, stream); } #endif diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp index 4eda5cd7028d94072b19d45898b750882db781e6..f182f00bd2f1a83f94737d26d8576421f5f105c3 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp @@ -77,6 +77,14 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::is_available( // FIXME: too large filter size is not supported now size_t kMaxFilterPixels = 848 / (2 * m_algo_param.warp_k / 32) - 2; available &= fh * fw <= kMaxFilterPixels; + + bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); + bool without_shared_load = (param.format == Format::NCHW32); + const auto* op = get_cutlass_conv_op( + args, ConvOperator::kFprop, ConvType::kConvolution, + use_conv_filter_unity_opt, without_shared_load); + available &= (op != nullptr); + return available; } @@ -155,12 +163,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec( gamma = z_scale / dst_scale; } float delta = 0.f, theta = 0.f, threshold = 0.f; - bool load_from_const = !(fh == 1 && fw == 1); + bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); bool without_shared_load = (param.format == Format::NCHW32); const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop, ConvType::kConvolution, - load_from_const, without_shared_load); + use_conv_filter_unity_opt, without_shared_load); execute_cutlass_conv_op( op, args.src_tensor->raw_ptr, filter_ptr, args.bias_tensor->raw_ptr, diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp index 
348c122d4844ebadc2cb6729a4592291334ce349..83a0c341b29d8c5b9883c94f9bac93a8ec0369a4 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp @@ -98,7 +98,14 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available( // FIXME: too large filter size is not supported now size_t kMaxFilterPixels = 848 / (2 * m_algo_param.warp_k / 4) - 2; available &= fh * fw <= kMaxFilterPixels; - ; + + bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); + bool without_shared_load = false; + const auto* op = get_cutlass_conv_op( + args, ConvOperator::kFprop, ConvType::kConvolution, + use_conv_filter_unity_opt, without_shared_load); + available &= (op != nullptr); + return available; } @@ -213,12 +220,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( } } float threshold = 0.f; - bool load_from_const = !(fh == 1 && fw == 1); + bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); bool without_shared_load = false; - const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop, - ConvType::kConvolution, - load_from_const, without_shared_load); + const auto* op = get_cutlass_conv_op( + args, ConvOperator::kFprop, ConvType::kConvolution, + use_conv_filter_unity_opt, without_shared_load); execute_cutlass_conv_op( op, args.src_tensor->raw_ptr, filter_ptr, args.bias_tensor->raw_ptr, diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8e35d1e9c3cbbc7ab0b9d78d48e44ceab6ebef8e --- /dev/null +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp @@ -0,0 +1,294 @@ +/** + * \file dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ + +#include "src/common/conv_bias.h" +#include "src/cuda/conv_bias/algo.h" +#include "src/cuda/conv_bias/cutlass_reorder_filter.cuh" +#include "src/cuda/convolution_helper/parameter.cuh" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +#if CUDA_VERSION >= 10020 +bool ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::is_available( + const SizeArgs& args) const { + if (args.bias_layout->ndim <= 0) + return false; + + using Param = param::ConvBias; + using Format = Param::Format; + using Sparse = Param::Sparse; + using Mode = Param::Mode; + using NonlineMode = megdnn::param::ConvBias::NonlineMode; + + auto&& param = args.opr->param(); + + if (!check_bias_share_in_channel(*(args.bias_layout), param.format)) + return false; + + if (param.format != Format::NHWC || param.sparse != Sparse::DENSE || + param.mode != Mode::CROSS_CORRELATION) + return false; + + if (param.nonlineMode != NonlineMode::IDENTITY && + param.nonlineMode != NonlineMode::RELU && + param.nonlineMode != NonlineMode::H_SWISH) + return false; + + if (args.src_layout->dtype.enumv() != DTypeEnum::QuantizedS8 || + args.filter_layout->dtype.enumv() != DTypeEnum::QuantizedS8) + return false; + + auto dst_dtype = args.dst_layout->dtype.enumv(); + + if (!(dst_dtype == DTypeEnum::QuantizedS8 || + dst_dtype == DTypeEnum::QuantizedS4 || + dst_dtype == DTypeEnum::Quantized4Asymm || + dst_dtype == DTypeEnum::Float32)) + return false; + + if (!(args.bias_layout->dtype.enumv() == DTypeEnum::QuantizedS32 || + (args.bias_layout->dtype.enumv() == DTypeEnum::Float32 && + dst_dtype == DTypeEnum::Float32))) + return false; + + if (!is_compute_capability_required(7, 5)) + return false; + + size_t co = args.filter_layout->operator[](0), + ci = args.filter_layout->operator[](3), + fh = args.filter_layout->operator[](1), + fw = args.filter_layout->operator[](2); + + // param buffer size is 4K, use 3.4K to store precomputed offset + size_t kMaxFilterPixels = + 848 / (m_algo_param.warp_k / m_algo_param.access_size) - 1; + if (fh * fw > kMaxFilterPixels) + return false; + // co should be aligned with 4, and ci should be aligned with + // algo_param.access_size + if ((co % 4 != 0) || (ci % m_algo_param.access_size != 0)) + return false; + + bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); + bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) && + (m_algo_param.threadblock_n == 16 || + (m_algo_param.threadblock_n == 32 && + dst_dtype != DTypeEnum::Float32))); + const auto* op = get_cutlass_conv_op( + args, ConvOperator::kFprop, ConvType::kConvolution, + use_conv_filter_unity_opt, without_shared_load); + if (op == nullptr) + return false; + + return true; +} + +size_t +ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::get_workspace_in_bytes( + const SizeArgs& args) const { + if (args.preprocessed_filter) { + return 0; + } else { + return args.filter_layout->span().dist_byte(); + } +} + +size_t ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm:: + get_preprocess_workspace_in_bytes(const SizeArgs& args) const { + return 0; +} + +SmallVector ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm:: + deduce_preprocessed_filter_layout(const SizeArgs& args) const { + return {args.filter_layout->collapse_contiguous()}; +} + +void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::exec_preprocess( + const ExecArgs& args) const { + void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr; + reorder_filter(args, m_algo_param.access_size, filter_ptr); +} + +std::tuple 
+ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::get_constants( + const ExecArgs& args) const { + float src_scale = args.src_layout->dtype.param().scale, + filter_scale = + args.filter_layout->dtype.param().scale, + bias_scale = 1.f, dst_scale; + + if (args.bias_layout->dtype.enumv() == DTypeEnum::QuantizedS32) { + bias_scale = args.bias_layout->dtype.param().scale; + } + + uint8_t dst_zero = 0; + + if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8) { + dst_scale = args.dst_layout->dtype.param().scale; + } else if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4) { + dst_scale = args.dst_layout->dtype.param().scale; + } else if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { + dst_scale = + args.dst_layout->dtype.param().scale; + dst_zero = args.dst_layout->dtype.param() + .zero_point; + } else { // DTypeEnum::Float32 + megdnn_assert(args.dst_layout->dtype.enumv() == DTypeEnum::Float32); + dst_scale = 1.f; + } + + float alpha = src_scale * filter_scale / dst_scale, + beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f, + theta = dst_zero; + + if (args.z_layout->ndim > 0) { + float z_scale; + if (args.z_layout->dtype.enumv() == DTypeEnum::QuantizedS8) { + z_scale = args.z_layout->dtype.param().scale; + gamma = z_scale / dst_scale; + } else if (args.z_layout->dtype.enumv() == DTypeEnum::QuantizedS4) { + z_scale = args.z_layout->dtype.param().scale; + gamma = z_scale / dst_scale; + } else if (args.z_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { + z_scale = + args.z_layout->dtype.param().scale; + uint8_t z_zero = + args.z_layout->dtype.param() + .zero_point; + gamma = z_scale / dst_scale; + delta = -z_zero * gamma; + } else { // DTypeEnum::Float32 + megdnn_assert(args.z_layout->dtype.enumv() == DTypeEnum::Float32); + gamma = 1.f; + } + } + + if (args.opr->param().nonlineMode == + param::ConvBias::NonlineMode::IDENTITY) { + delta += theta; + theta = 0.f; + } + + return {alpha, beta, gamma, delta, theta}; +} + +void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::exec( + const ExecArgs& args) const { + auto&& param = args.opr->param(); + auto&& fm = args.filter_meta; + size_t n = args.src_layout->operator[](0), + ci = args.src_layout->operator[](3), + hi = args.src_layout->operator[](1), + wi = args.src_layout->operator[](2); + size_t co = args.dst_layout->operator[](3), + ho = args.dst_layout->operator[](1), + wo = args.dst_layout->operator[](2); + UNPACK_CONV_PARAMETER(fm, param); + MARK_USED_VAR + + void* filter_ptr = nullptr; + void* bias_ptr = nullptr; + void* z_ptr = nullptr; + + if (args.preprocessed_filter) { + filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr; + } else { + filter_ptr = reinterpret_cast(args.workspace.raw_ptr); + reorder_filter(args, m_algo_param.access_size, filter_ptr); + } + bias_ptr = args.bias_tensor->raw_ptr; + + if (args.z_layout->ndim > 0) + z_ptr = args.z_tensor->raw_ptr; + + // \note these constants of cutlass epilogue will be passed to method + // `execute_cutlass_conv_op` by pointer and interpreted as ElementCompute*, + // a different dtype here results in undefined epilogue behaviors + float alpha, beta, gamma, delta, theta; + std::tie(alpha, beta, gamma, delta, theta) = get_constants(args); + + float dst_scale = 1.f; + float threshold = 0.f; + bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); + + auto dst_dtype = args.dst_layout->dtype.enumv(); + + bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) && + (m_algo_param.threadblock_n == 16 || + (m_algo_param.threadblock_n == 
32 && + dst_dtype != DTypeEnum::Float32))); + + if (dst_dtype == DTypeEnum::QuantizedS8) { // DTypeEnum::QuantizedS8 + dst_scale = args.dst_layout->dtype.param().scale; + } else if (dst_dtype == DTypeEnum::QuantizedS4) { + dst_scale = args.dst_layout->dtype.param().scale; + } else if (dst_dtype == DTypeEnum::Quantized4Asymm) { + dst_scale = + args.dst_layout->dtype.param().scale; + } else { // DTypeEnum::Float32 + dst_scale = 1.f; + } + + cudaStream_t stream = cuda_stream(args.opr->handle()); + + const auto* op = get_cutlass_conv_op( + args, ConvOperator::kFprop, ConvType::kConvolution, + use_conv_filter_unity_opt, without_shared_load); + + execute_cutlass_conv_op(op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr, + z_ptr, args.dst_tensor->raw_ptr, nullptr, n, hi, wi, + ci, co, fh, fw, ho, wo, ph, pw, sh, sw, dh, dw, + &alpha, &beta, &gamma, &delta, &theta, &threshold, + &dst_scale, stream); + + after_kernel_launch(); +} + +std::string ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::to_string( + AlgoParam algo_param) { + return ssprintf("%dX%dX%d_%dX%dX%d_%d_%d", algo_param.threadblock_m, + algo_param.threadblock_n, algo_param.threadblock_k, + algo_param.warp_m, algo_param.warp_n, algo_param.warp_k, + algo_param.stage, algo_param.access_size); +} + +void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::reorder_filter( + const ExecArgs& args, const int iterleaved, + void* reordered_filter) const { + size_t co = args.filter_layout->operator[](0), + ci = args.filter_layout->operator[](3), + fh = args.filter_layout->operator[](1), + fw = args.filter_layout->operator[](2); + + cudaStream_t stream = cuda_stream(args.opr->handle()); + + // reformat filter from nhwc to ncxhwx and reorder oc + // use trans_oc threadblock_n must be 16 or 32 and src dtype == dest dtype + bool trans_oc = ((co % m_algo_param.threadblock_n == 0) && + (m_algo_param.threadblock_n == 16 || + (m_algo_param.threadblock_n == 32 && + args.dst_layout->dtype.enumv() != DTypeEnum::Float32))); + uint32_t oc_iterleaved = (m_algo_param.threadblock_n == 32) ? 
32 : 16; + + uint32_t alignbits = iterleaved * 8; + + cutlass_wrapper::reorder_nhwc_imma_filter<8>( + reinterpret_cast(reordered_filter), + reinterpret_cast(args.filter_tensor->raw_ptr), co, ci, fh, + fw, trans_oc, alignbits, oc_iterleaved, stream); +} +#endif + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp index d271449e2ee7ef74564a9b9271a948061d4f54d2..73bb4b170e8cbae1c675492d35da3f9aaa2ddb57 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp @@ -102,22 +102,41 @@ ConvBiasForwardImpl::AlgoUInt4Int4NHWCIMMAImplicitGemm::get_constants( args.filter_layout->dtype.param().scale, bias_scale = args.bias_layout->dtype.param().scale, - dst_scale = - args.dst_layout->dtype.param().scale; + dst_scale; + + uint8_t dst_zero = 0; + + if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { + dst_scale = + args.dst_layout->dtype.param().scale; + + dst_zero = args.dst_layout->dtype.param() + .zero_point; + } else { // DTypeEnum::QuantizedS8 + megdnn_assert(args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8); + dst_scale = args.dst_layout->dtype.param().scale; + } - uint8_t dst_zero = - args.dst_layout->dtype.param().zero_point; float alpha = src_scale * filter_scale / dst_scale, beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f, theta = dst_zero; if (args.z_layout->ndim > 0) { - float z_scale = - args.z_layout->dtype.param().scale; - gamma = z_scale / dst_scale; - uint8_t z_zero = - args.z_layout->dtype.param().zero_point; - delta = -z_zero * gamma; + float z_scale; + if (args.z_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { + z_scale = + args.z_layout->dtype.param().scale; + uint8_t z_zero = + args.z_layout->dtype.param() + .zero_point; + gamma = z_scale / dst_scale; + delta = -z_zero * gamma; + } else { // DTypeEnum::QuantizedS8 + megdnn_assert(args.z_layout->dtype.enumv() == + DTypeEnum::QuantizedS8); + z_scale = args.z_layout->dtype.param().scale; + gamma = z_scale / dst_scale; + } } // identity epilogue has no theta: diff --git a/dnn/src/cuda/conv_bias/opr_impl.h b/dnn/src/cuda/conv_bias/opr_impl.h index 4af7896b7d39da3b557918e8af42c5ab0293140d..d6ea710ede2dde350fc63b39423a75d73c8e3ee9 100644 --- a/dnn/src/cuda/conv_bias/opr_impl.h +++ b/dnn/src/cuda/conv_bias/opr_impl.h @@ -65,6 +65,7 @@ public: class AlgoInt8CHWN4IMMAImplicitGemmReorderFilter; class AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth; class AlgoInt8NCHW32IMMAImplicitGemm; + class AlgoInt8NHWCIMMAImplicitGemm; class AlgoInt4NCHW64IMMAImplicitGemmBase; class AlgoInt4Int4NCHW64IMMAImplicitGemm; class AlgoUInt4Int4NCHW64IMMAImplicitGemm; diff --git a/dnn/src/cuda/convolution/backward_data/algo.h b/dnn/src/cuda/convolution/backward_data/algo.h index e9b3545471174b294604ec1445539e9e5ae9b65f..f87479e5b71a2fc56cc6bd2878a6f8d0a234e323 100644 --- a/dnn/src/cuda/convolution/backward_data/algo.h +++ b/dnn/src/cuda/convolution/backward_data/algo.h @@ -275,6 +275,7 @@ public: private: WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, const SizeArgs& args) const; + const void* get_available_op(const SizeArgs& args) const; AlgoParam m_algo_param; std::string m_name; }; @@ -295,6 +296,7 @@ public: private: WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, const SizeArgs& args) const; + const void* get_available_op(const SizeArgs& args) const; }; class ConvolutionBackwardDataImpl::AlgoPack : NonCopyableObj { 
diff --git a/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp b/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp index c4f631e6e4925d433e0e4b83a20b71688d7ef8f1..fc4107dbf8c904595abe32b10445917266eaca21 100644 --- a/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp +++ b/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp @@ -20,6 +20,43 @@ using namespace megdnn; using namespace cuda; +const void* +ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::get_available_op( + const SizeArgs& args) const { + using namespace cutlass::library; + auto&& fm = args.filter_meta; + size_t sh = fm.stride[0], sw = fm.stride[1]; + cutlass::conv::SpecialOptimizeDesc special_optimization = + (sh == 2 && sw == 2) ? cutlass::conv::SpecialOptimizeDesc:: + DECONV_DOUBLE_UPSAMPLING + : cutlass::conv::SpecialOptimizeDesc::NONE; + ConvolutionKey key{ + cutlass::conv::Operator::kDgrad, + NumericTypeID::kS8, + LayoutTypeID::kTensorNC4HW4, + NumericTypeID::kS8, + LayoutTypeID::kTensorK4RSC4, + NumericTypeID::kS8, + LayoutTypeID::kTensorNC4HW4, + NumericTypeID::kS32, + LayoutTypeID::kTensorNC4HW4, + cutlass::conv::ConvType::kConvolution, + m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k, + m_algo_param.warp_m, + m_algo_param.warp_n, + m_algo_param.warp_k, + 1, + 1, + 4, + cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp, + m_algo_param.stage, + special_optimization, + false}; + return (void*)Singleton::get().operation_table.find_op(key); +} + bool ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm:: is_available(const SizeArgs& args) const { auto&& fm = args.filter_meta; @@ -51,6 +88,7 @@ bool ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm:: // FIXME: too large filter size is not supported now available &= fm.spatial[0] * fm.spatial[1] <= (uint32_t)(848 / (2 * m_algo_param.warp_k / 4) - 2); + available &= (get_available_op(args) != nullptr); // only support sm_61 or later, platform should have fast native int8 // support available &= is_compute_capability_required(6, 1); @@ -105,40 +143,14 @@ void ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( args.grad_layout->dtype.param().scale; // \note these constants of cutlass epilogue will be passed to struct - // `ConvolutionArguments` by pointer and interpreted as ElementCompute*, a - // different dtype here results in undefined epilogue behaviors + // `ConvolutionArguments` by pointer and interpreted as ElementCompute*, + // a different dtype here results in undefined epilogue behaviors float alpha = diff_scale * filter_scale / grad_scale, beta = 0.f, gamma = 0.f, delta = 0.f; using namespace cutlass::library; - // only use 16x64x8_16x64x8_2stages impl - ConvolutionKey key{ - cutlass::conv::Operator::kDgrad, - NumericTypeID::kS8, - LayoutTypeID::kTensorNC4HW4, - NumericTypeID::kS8, - LayoutTypeID::kTensorK4RSC4, - NumericTypeID::kS8, - LayoutTypeID::kTensorNC4HW4, - NumericTypeID::kS32, - LayoutTypeID::kTensorNC4HW4, - cutlass::conv::ConvType::kConvolution, - m_algo_param.threadblock_m, - m_algo_param.threadblock_n, - m_algo_param.threadblock_k, - m_algo_param.warp_m, - m_algo_param.warp_n, - m_algo_param.warp_k, - 1, - 1, - 4, - cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp, - m_algo_param.stage, - true, - false}; - - const Operation* op = Singleton::get().operation_table.find_op(key); + const Operation* op = (const Operation*)get_available_op(args); // 
gcc prints warnings when size_t values are implicitly narrowed to int cutlass::conv::Conv2dProblemSize problem_size{ @@ -167,7 +179,6 @@ void ConvolutionBackwardDataImpl::AlgoPack::fill_int8_dp4a_algos() { int8_nchw4_dotprod.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2}); int8_nchw4_dotprod.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1}); int8_nchw4_dotprod.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2}); - int8_nchw4_dotprod.emplace_back(AlgoParam{64, 128, 32, 64, 32, 32, 2}); } // vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp b/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp index f45188028492cd488ccb58ce9444b32300527c00..50ebde323bac8e49b87449fbc58d66e8ff1867b8 100644 --- a/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp +++ b/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp @@ -19,6 +19,44 @@ using namespace megdnn; using namespace cuda; +const void* +ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::get_available_op( + const SizeArgs& args) const { + using namespace cutlass::library; + auto&& fm = args.filter_meta; + size_t sh = fm.stride[0], sw = fm.stride[1]; + cutlass::conv::SpecialOptimizeDesc special_optimization = + (sh == 2 && sw == 2) ? cutlass::conv::SpecialOptimizeDesc:: + DECONV_DOUBLE_UPSAMPLING + : cutlass::conv::SpecialOptimizeDesc::NONE; + // only use 16x64x8_16x64x8_2stages impl + ConvolutionKey key{ + cutlass::conv::Operator::kDgrad, + NumericTypeID::kS8, + LayoutTypeID::kTensorNC4HW4, + NumericTypeID::kS8, + LayoutTypeID::kTensorK4RSC4, + NumericTypeID::kS8, + LayoutTypeID::kTensorNC4HW4, + NumericTypeID::kS32, + LayoutTypeID::kTensorNC4HW4, + cutlass::conv::ConvType::kConvolution, + 16, + 64, + 8, + 16, + 64, + 8, + 1, + 1, + 4, + cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp, + 2, + special_optimization, + false}; + return (void*)Singleton::get().operation_table.find_op(key); +} + bool ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::is_available( const SizeArgs& args) const { auto&& fm = args.filter_meta; @@ -52,6 +90,9 @@ bool ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::is_available( available &= (fm.dilation[0] == 1 && fm.dilation[1] == 1); // FIXME: too large filter size is not supported now available &= fm.spatial[0] * fm.spatial[1] <= (848 / (2 * 8 / 4) - 2); + + available &= (get_available_op(args) != nullptr); + // only support sm_61 or later, platform should have fast native int8 // support available &= is_compute_capability_required(6, 1); @@ -138,33 +179,7 @@ void ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::exec( using namespace cutlass::library; - // only use 16x64x8_16x64x8_2stages impl - ConvolutionKey key{ - cutlass::conv::Operator::kDgrad, - NumericTypeID::kS8, - LayoutTypeID::kTensorNC4HW4, - NumericTypeID::kS8, - LayoutTypeID::kTensorK4RSC4, - NumericTypeID::kS8, - LayoutTypeID::kTensorNC4HW4, - NumericTypeID::kS32, - LayoutTypeID::kTensorNC4HW4, - cutlass::conv::ConvType::kConvolution, - 16, - 64, - 8, - 16, - 64, - 8, - 1, - 1, - 4, - cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp, - 2, - true, - false}; - - const Operation* op = Singleton::get().operation_table.find_op(key); + const Operation* op = (const Operation*)get_available_op(args); // gcc prints warnings when size_t values are implicitly narrowed to int cutlass::conv::Conv2dProblemSize problem_size{ diff --git 
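The "gcc prints warnings when size_t values are implicitly narrowed to int" comment carried over above refers to building the CUTLASS Conv2dProblemSize from MegDNN's size_t tensor shapes. A trimmed-down illustration of the explicit casts that keep the compiler quiet follows; Conv2dProblemSizeLike is a simplified stand-in, not the real CUTLASS struct.

#include <cstddef>

namespace sketch {

// Simplified stand-in for cutlass::conv::Conv2dProblemSize (int fields only).
struct Conv2dProblemSizeLike {
    int n, h, w, c, k, r, s, pad_h, pad_w, stride_h, stride_w;
};

// Explicit casts avoid the narrowing warnings when the shapes come in as size_t.
inline Conv2dProblemSizeLike make_problem_size(
        size_t n, size_t h, size_t w, size_t c, size_t k, size_t r, size_t s,
        size_t pad_h, size_t pad_w, size_t stride_h, size_t stride_w) {
    return {static_cast<int>(n),      static_cast<int>(h),
            static_cast<int>(w),      static_cast<int>(c),
            static_cast<int>(k),      static_cast<int>(r),
            static_cast<int>(s),      static_cast<int>(pad_h),
            static_cast<int>(pad_w),  static_cast<int>(stride_h),
            static_cast<int>(stride_w)};
}

}  // namespace sketch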
a/dnn/src/cuda/cutlass/convolution_operation.h b/dnn/src/cuda/cutlass/convolution_operation.h index 663bf8b9cd4cd09d406f50e1e180179d31f93c60..c2405b8b684f2e30d83cacc2770f09d5bb3c3966 100644 --- a/dnn/src/cuda/cutlass/convolution_operation.h +++ b/dnn/src/cuda/cutlass/convolution_operation.h @@ -119,8 +119,8 @@ public: m_description.threadblock_swizzle = ThreadblockSwizzleMap< typename Operator::ThreadblockSwizzle>::kId; - m_description.need_load_from_const_mem = - Operator::kNeedLoadFromConstMem; + m_description.special_optimization = + Operator::kSpecialOpt; m_description.gemm_mode = Operator::kGemmMode; m_description.without_shared_load = Operator::kWithoutSharedLoad; } diff --git a/dnn/src/cuda/cutlass/library.h b/dnn/src/cuda/cutlass/library.h index 9907fad1b266b3446d383ed2da11502288f2505c..b9dd76a289b421fc0db9ecdd10434d1cef0ed5b3 100644 --- a/dnn/src/cuda/cutlass/library.h +++ b/dnn/src/cuda/cutlass/library.h @@ -487,7 +487,7 @@ struct ConvolutionDescription : public OperationDescription { ThreadblockSwizzleID threadblock_swizzle; - bool need_load_from_const_mem; + conv::SpecialOptimizeDesc special_optimization; conv::ImplicitGemmMode gemm_mode; bool without_shared_load; }; diff --git a/dnn/src/cuda/cutlass/operation_table.cpp b/dnn/src/cuda/cutlass/operation_table.cpp index 166559180b91311359f4a77ecb2de047ae4ea9cb..2da08ea354c44dd2ca570c43b43835ea9653c827 100644 --- a/dnn/src/cuda/cutlass/operation_table.cpp +++ b/dnn/src/cuda/cutlass/operation_table.cpp @@ -124,7 +124,7 @@ ConvolutionKey get_convolution_key_from_desc( key.epilogue_type = desc.epilogue_type; key.stages = desc.tile_description.threadblock_stages; - key.need_load_from_const_mem = desc.need_load_from_const_mem; + key.special_optimization = desc.special_optimization; key.without_shared_load = desc.without_shared_load; return key; @@ -156,23 +156,25 @@ void OperationTable::append(Manifest const& manifest) { ///////////////////////////////////////////////////////////////////////////////////////////////// Operation const* OperationTable::find_op(GemmKey const& key) const { - megdnn_assert(gemm_operations.count(key) > 0, - "key not found in cutlass operation table"); - auto const& ops = gemm_operations.at(key); - megdnn_assert(ops.size() == 1, "exactly one kernel expected, got %zu", - ops.size()); - return ops[0]; + if (gemm_operations.count(key)) { + auto const& ops = gemm_operations.at(key); + megdnn_assert(ops.size() == 1, "exactly one kernel expected, got %zu", + ops.size()); + return ops[0]; + } + return nullptr; } ///////////////////////////////////////////////////////////////////////////////////////////////// Operation const* OperationTable::find_op(ConvolutionKey const& key) const { - megdnn_assert(convolution_operations.count(key) > 0, - "key not found in cutlass operation table"); - auto const& ops = convolution_operations.at(key); - megdnn_assert(ops.size() == 1, "exactly one kernel expected, got %zu", - ops.size()); - return ops[0]; + if (convolution_operations.count(key) > 0) { + auto const& ops = convolution_operations.at(key); + megdnn_assert(ops.size() == 1, "exactly one kernel expected, got %zu", + ops.size()); + return ops[0]; + } + return nullptr; } ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/dnn/src/cuda/cutlass/operation_table.h b/dnn/src/cuda/cutlass/operation_table.h index 420e1e55c49c746c59fdfabbc7b516e3743fd5a6..1fef2ff783385a434e886b1b56990e18fc807077 100644 --- a/dnn/src/cuda/cutlass/operation_table.h +++ 
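A key behavioural change in the operation_table.cpp hunk above: find_op() no longer asserts on a missing key but returns nullptr, so the new get_available_op()/is_available() pairs can treat "no kernel generated for this key" as "algorithm unavailable" instead of aborting at runtime. A compact sketch of that lookup, using a generic hash-map stand-in rather than the real OperationTable:

#include <cassert>
#include <unordered_map>
#include <vector>

namespace sketch {

struct Operation {};  // stand-in for cutlass::library::Operation

// Missing key -> nullptr (caller decides); present key -> exactly one kernel.
template <typename Key, typename Hash>
const Operation* find_op(
        const std::unordered_map<Key, std::vector<const Operation*>, Hash>& table,
        const Key& key) {
    auto it = table.find(key);
    if (it == table.end())
        return nullptr;
    assert(it->second.size() == 1);  // one generated kernel per key is expected
    return it->second[0];
}

}  // namespace sketch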
b/dnn/src/cuda/cutlass/operation_table.h @@ -211,7 +211,7 @@ struct ConvolutionKey { epilogue::EpilogueType epilogue_type; int stages; - bool need_load_from_const_mem; + conv::SpecialOptimizeDesc special_optimization; bool without_shared_load; inline bool operator==(ConvolutionKey const& rhs) const { @@ -234,7 +234,7 @@ struct ConvolutionKey { (instruction_shape_n == rhs.instruction_shape_n) && (instruction_shape_k == rhs.instruction_shape_k) && (epilogue_type == rhs.epilogue_type) && (stages == rhs.stages) && - (need_load_from_const_mem == rhs.need_load_from_const_mem) && + (special_optimization == rhs.special_optimization) && (without_shared_load == rhs.without_shared_load); } @@ -270,8 +270,8 @@ struct ConvolutionKey { "\n instruction_shape: " + instruction_shape_str + "\n epilogue_type: " + to_string(epilogue_type) + "\n stages: " + std::to_string(stages) + - "\n need_load_from_const_mem: " + - to_string(need_load_from_const_mem) + + "\n special_optimization: " + + to_string(special_optimization) + "\n without_shared_load: " + to_string(without_shared_load) + "\n}"; } @@ -308,8 +308,8 @@ struct ConvolutionKeyHasher { sizeof(key.instruction_shape_k)) .update(&key.epilogue_type, sizeof(key.epilogue_type)) .update(&key.stages, sizeof(key.stages)) - .update(&key.need_load_from_const_mem, - sizeof(key.need_load_from_const_mem)) + .update(&key.special_optimization, + sizeof(key.special_optimization)) .update(&key.without_shared_load, sizeof(key.without_shared_load)) .digest(); diff --git a/dnn/src/cuda/cutlass/util.cu b/dnn/src/cuda/cutlass/util.cu index 11e1d301463d64cb2ab0dc21c5faca5a91120b9e..6efcfa98077afa2e88364f0d41c95d1813bcbda1 100644 --- a/dnn/src/cuda/cutlass/util.cu +++ b/dnn/src/cuda/cutlass/util.cu @@ -1566,6 +1566,35 @@ char const* to_string(MathOperationID math_op, bool pretty) { /////////////////////////////////////////////////////////////////////////////////////////////////// +static struct { + char const* text; + char const* pretty; + conv::SpecialOptimizeDesc enumerant; +} SpecialOptimizeDesc_enumerants[] = { + {"none_special_opt", "NoneSpecialOpt", conv::SpecialOptimizeDesc::NONE}, + {"conv_filter_unity", "ConvFilterUnity", + conv::SpecialOptimizeDesc::CONV_FILTER_UNITY}, + {"deconv_double_upsampling", "DeconvDoubleUpsampling", + conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING}, +}; + +/// Converts an SpecialOptimizeDesc enumerant to a string +char const* to_string(conv::SpecialOptimizeDesc special_opt, bool pretty) { + for (auto const& possible : SpecialOptimizeDesc_enumerants) { + if (special_opt == possible.enumerant) { + if (pretty) { + return possible.pretty; + } else { + return possible.text; + } + } + } + + return pretty ? 
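Because SpecialOptimizeDesc replaces the old bool in ConvolutionKey, it has to appear in both operator== and the byte-wise hasher, otherwise kernels differing only in this field would be confused in the table. A reduced illustration of keeping the two in sync; FNV-1a stands in here for whatever hasher MegDNN actually chains through .update():

#include <cstddef>
#include <cstdint>

namespace sketch {

enum class SpecialOptimizeDesc { NONE, CONV_FILTER_UNITY, DECONV_DOUBLE_UPSAMPLING };

struct KeySketch {
    int stages;
    SpecialOptimizeDesc special_optimization;
    bool without_shared_load;

    bool operator==(const KeySketch& rhs) const {
        // every field that distinguishes generated kernels must participate here
        return stages == rhs.stages &&
               special_optimization == rhs.special_optimization &&
               without_shared_load == rhs.without_shared_load;
    }
};

// Byte-wise hash over the same fields, mirroring the .update(...) chain in
// ConvolutionKeyHasher: hashing and equality must agree on the field set.
inline size_t hash_key(const KeySketch& k) {
    auto mix = [](uint64_t h, const void* p, size_t n) {
        const unsigned char* b = static_cast<const unsigned char*>(p);
        for (size_t i = 0; i < n; ++i) {
            h ^= b[i];
            h *= 1099511628211ull;  // FNV-1a prime
        }
        return h;
    };
    uint64_t h = 1469598103934665603ull;  // FNV-1a offset basis
    h = mix(h, &k.stages, sizeof(k.stages));
    h = mix(h, &k.special_optimization, sizeof(k.special_optimization));
    h = mix(h, &k.without_shared_load, sizeof(k.without_shared_load));
    return static_cast<size_t>(h);
}

}  // namespace sketch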
"Invalid" : "invalid"; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + static struct { char const* text; char const* pretty; diff --git a/dnn/src/cuda/cutlass/util.h b/dnn/src/cuda/cutlass/util.h index cb6e6f999cbaf7eafc021e034bb3811aaa238768..da36393e257c093cb5006e1d247a7dfd9c018ba0 100644 --- a/dnn/src/cuda/cutlass/util.h +++ b/dnn/src/cuda/cutlass/util.h @@ -207,6 +207,10 @@ char const* to_string(bool val, bool pretty = false); /// Converts a MathOperationID enumerant to a string char const* to_string(MathOperationID math_op, bool pretty = false); +/// Converts a SpecialOptimizeDesc enumerant to a string +char const* to_string(conv::SpecialOptimizeDesc special_opt, + bool pretty = false); + /// Converts an ImplicitGemmMode enumerant to a string char const* to_string(conv::ImplicitGemmMode mode, bool pretty = false); diff --git a/dnn/src/cuda/matrix_mul/algos.h b/dnn/src/cuda/matrix_mul/algos.h index aa2807ebc598fb9a722876d9b4c35ca4abfb0c6b..df2c23cc5a6ee4f66277a5017c630d1a5959927b 100644 --- a/dnn/src/cuda/matrix_mul/algos.h +++ b/dnn/src/cuda/matrix_mul/algos.h @@ -235,6 +235,7 @@ public: m_name{ssprintf("CUTLASS_FLOAT32_SIMT_%s", m_algo_param.to_string().c_str())} {} bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; const char* name() const override { return m_name.c_str(); } AlgoAttribute attribute() const override { @@ -260,6 +261,7 @@ private: void do_exec(const ExecArgs& args) const override; int min_alignment_requirement() const override { return 1; } std::string m_name; + const void* get_available_op(const SizeArgs& args) const; }; class MatrixMulForwardImpl::AlgoFloat32SIMTSplitK final @@ -270,6 +272,7 @@ public: m_name{ssprintf("CUTLASS_FLOAT32_SIMT_SPLIT_K_%s", m_algo_param.to_string().c_str())} {} bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; const char* name() const override { return m_name.c_str(); } AlgoAttribute attribute() const override { @@ -297,6 +300,7 @@ private: void do_exec(const ExecArgs& args) const override; int min_alignment_requirement() const override { return 1; } std::string m_name; + const void* get_available_op(const SizeArgs& args) const; }; class MatrixMulForwardImpl::AlgoFloat32SIMTGemvBatchedStrided final diff --git a/dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp b/dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp index 7278202d2839d9d534a53196b12eea2735407173..96d35303a09f2ad2cf399599a65be36b517f2820 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp +++ b/dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp @@ -19,6 +19,39 @@ using namespace megdnn; using namespace cuda; +const void* MatrixMulForwardImpl::AlgoFloat32SIMT::get_available_op( + const SizeArgs& args) const { + using namespace cutlass::library; + auto&& param = args.opr->param(); + auto layoutA = param.transposeA ? LayoutTypeID::kColumnMajor + : LayoutTypeID::kRowMajor; + auto layoutB = param.transposeB ? 
LayoutTypeID::kColumnMajor + : LayoutTypeID::kRowMajor; + + int alignment = min_alignment_requirement(); + GemmKey key{NumericTypeID::kF32, + layoutA, + NumericTypeID::kF32, + layoutB, + NumericTypeID::kF32, + LayoutTypeID::kRowMajor, + NumericTypeID::kF32, + m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k, + m_algo_param.warp_m, + m_algo_param.warp_n, + m_algo_param.warp_k, + 1, + 1, + 1, + 2, + alignment, + alignment, + SplitKMode::kNone}; + return (void*)Singleton::get().operation_table.find_op(key); +} + bool MatrixMulForwardImpl::AlgoFloat32SIMT::is_available( const SizeArgs& args) const { bool available = @@ -34,6 +67,8 @@ bool MatrixMulForwardImpl::AlgoFloat32SIMT::is_available( m_algo_param.threadblock_n <= y_grid_limit); + available &= (get_available_op(args) != nullptr); + return available; } @@ -61,34 +96,7 @@ void MatrixMulForwardImpl::AlgoFloat32SIMT::do_exec( using namespace cutlass::library; - auto layoutA = param.transposeA ? LayoutTypeID::kColumnMajor - : LayoutTypeID::kRowMajor; - auto layoutB = param.transposeB ? LayoutTypeID::kColumnMajor - : LayoutTypeID::kRowMajor; - - int alignment = min_alignment_requirement(); - GemmKey key{NumericTypeID::kF32, - layoutA, - NumericTypeID::kF32, - layoutB, - NumericTypeID::kF32, - LayoutTypeID::kRowMajor, - NumericTypeID::kF32, - m_algo_param.threadblock_m, - m_algo_param.threadblock_n, - m_algo_param.threadblock_k, - m_algo_param.warp_m, - m_algo_param.warp_n, - m_algo_param.warp_k, - 1, - 1, - 1, - 2, - alignment, - alignment, - SplitKMode::kNone}; - - const Operation* op = Singleton::get().operation_table.find_op(key); + const Operation* op = (const Operation*)get_available_op(args); GemmArguments gemm_args{problem_size, args.tensor_a.raw_ptr, diff --git a/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp b/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp index dfbc3a069b09dc7624ae0c963449108fe7fc2488..d2128a1ac33d5cada22be761f6e660d57481b57f 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp +++ b/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp @@ -19,6 +19,39 @@ using namespace megdnn; using namespace cuda; +const void* MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::get_available_op( + const SizeArgs& args) const { + using namespace cutlass::library; + auto&& param = args.opr->param(); + auto layoutA = param.transposeA ? LayoutTypeID::kColumnMajor + : LayoutTypeID::kRowMajor; + auto layoutB = param.transposeB ? 
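The availability term compared against y_grid_limit above (alongside the new nullptr check) guards the CUDA launch limits: the number of threadblocks along the M dimension becomes one grid dimension, which tops out at 65535. A small sketch of that ceiling-division check; y_grid_limit's real definition lives elsewhere in MegDNN, and 65535 is assumed here.

#include <cstddef>

namespace sketch {

constexpr size_t y_grid_limit = 65535;  // assumed gridDim.y limit

inline size_t ceil_div(size_t a, size_t b) { return (a + b - 1) / b; }

// e.g. m = 100000, threadblock_m = 64 -> 1563 blocks along y, well within the limit
inline bool fits_grid_y(size_t m, size_t threadblock_m) {
    return ceil_div(m, threadblock_m) <= y_grid_limit;
}

}  // namespace sketch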
LayoutTypeID::kColumnMajor + : LayoutTypeID::kRowMajor; + + int alignment = min_alignment_requirement(); + GemmKey key{NumericTypeID::kF32, + layoutA, + NumericTypeID::kF32, + layoutB, + NumericTypeID::kF32, + LayoutTypeID::kRowMajor, + NumericTypeID::kF32, + m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k, + m_algo_param.warp_m, + m_algo_param.warp_n, + m_algo_param.warp_k, + 1, + 1, + 1, + 2, + alignment, + alignment, + SplitKMode::kParallel}; + return (void*)Singleton::get().operation_table.find_op(key); +} + bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available( const SizeArgs& args) const { auto&& param = args.opr->param(); @@ -35,6 +68,8 @@ bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available( available &= ((m + m_algo_param.threadblock_m - 1) / m_algo_param.threadblock_m <= y_grid_limit); + available &= (get_available_op(args) != nullptr); + return available; } @@ -66,35 +101,7 @@ void MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::do_exec( float alpha = 1.f, beta = 0.f; using namespace cutlass::library; - - auto layoutA = param.transposeA ? LayoutTypeID::kColumnMajor - : LayoutTypeID::kRowMajor; - auto layoutB = param.transposeB ? LayoutTypeID::kColumnMajor - : LayoutTypeID::kRowMajor; - - int alignment = min_alignment_requirement(); - GemmKey key{NumericTypeID::kF32, - layoutA, - NumericTypeID::kF32, - layoutB, - NumericTypeID::kF32, - LayoutTypeID::kRowMajor, - NumericTypeID::kF32, - m_algo_param.threadblock_m, - m_algo_param.threadblock_n, - m_algo_param.threadblock_k, - m_algo_param.warp_m, - m_algo_param.warp_n, - m_algo_param.warp_k, - 1, - 1, - 1, - 2, - alignment, - alignment, - SplitKMode::kParallel}; - - Operation const* op = Singleton::get().operation_table.find_op(key); + const Operation* op = (const Operation*)get_available_op(args); GemmArguments gemm_args{problem_size, args.tensor_a.raw_ptr, diff --git a/dnn/test/cuda/conv_bias_int8.cpp b/dnn/test/cuda/conv_bias_int8.cpp index 8f2da45cea2922e9a0fd5eb98514b74b6a00a3c5..aaabe442ea2b77f0fba766e582b8fc5ec94f3cfb 100644 --- a/dnn/test/cuda/conv_bias_int8.cpp +++ b/dnn/test/cuda/conv_bias_int8.cpp @@ -882,6 +882,125 @@ TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_IMMA) { ConvBias::DirectParam{}); check(algo); } + +TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC) { + require_compute_capability(7, 5); + Checker checker(handle_cuda()); + auto check = [&checker](const std::string& algo) { + checker.set_before_exec_callback( + conv_bias::ConvBiasAlgoChecker(algo.c_str())); + UniformIntRNG rng{-8, 8}; + UniformIntRNG bias_rng{-50, 50}; + checker.set_rng(0, &rng) + .set_rng(1, &rng) + .set_rng(2, &bias_rng) + .set_rng(3, &rng) + .set_dtype(0, dtype::QuantizedS8{1.2f}) + .set_dtype(1, dtype::QuantizedS8{1.3f}) + .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f}) + .set_dtype(3, dtype::QuantizedS8{19.990229f}) + .set_dtype(4, dtype::QuantizedS8{19.990228f}) + .set_epsilon(1e-3); + param::ConvBias param; + param.pad_h = param.pad_w = 1; + param.stride_h = param.stride_w = 1; + param.format = param::ConvBias::Format::NHWC; + checker.set_param(param).execs( + {{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}}); + param.pad_h = param.pad_w = 0; + param.nonlineMode = param::ConvBias::NonlineMode::RELU; + checker.set_param(param).execs( + {{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}}); + }; + std::string algo = ConvBias::algo_name( + "INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16", + ConvBias::DirectParam{}); + check(algo); + algo = ConvBias::algo_name( + 
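In the NHWC int8 test above, the bias dtype is QuantizedS32{1.2f * 1.3f}, i.e. the product of the src and filter scales. That is not arbitrary: the int32 bias is added directly onto the int32 accumulator, whose implicit scale is src_scale * filter_scale, so giving the bias the same scale keeps the addition in a single fixed-point domain. As a one-liner:

namespace sketch {

// Scale of the int32 accumulator (and therefore of the bias tensor) in an int8
// conv: acc = sum(src_q * flt_q), where src = src_q * s_src and flt = flt_q * s_flt.
inline float bias_scale(float src_scale, float filter_scale) {
    return src_scale * filter_scale;  // 1.2f * 1.3f in the test above
}

}  // namespace sketch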
"INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16", + ConvBias::DirectParam{}); + check(algo); +} + +TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC_UINT4_WEIGHT_PREPROCESS) { + require_compute_capability(7, 5); + Checker> checker( + handle_cuda()); + auto check = [&checker](const std::string& algo) { + checker.set_before_exec_callback( + conv_bias::ConvBiasAlgoChecker(algo.c_str())); + UniformIntRNG rng{-8, 8}; + UniformIntRNG bias_rng{-50, 50}; + UniformIntRNG rng_u4{0, 15}; + checker.set_rng(0, &rng) + .set_rng(1, &rng) + .set_rng(2, &bias_rng) + .set_rng(3, &rng_u4) + .set_dtype(0, dtype::QuantizedS8{0.2f}) + .set_dtype(1, dtype::QuantizedS8{0.3f}) + .set_dtype(2, dtype::QuantizedS32{0.2f * 0.3f}) + .set_dtype(3, dtype::Quantized4Asymm{0.5f, 8}) + .set_dtype(4, dtype::Quantized4Asymm{0.5f, 4}) + .set_epsilon(1 + 1e-3); + param::ConvBias param; + param.pad_h = param.pad_w = 1; + param.stride_h = param.stride_w = 1; + param.format = param::ConvBias::Format::NHWC; + checker.set_param(param).execs( + {{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}}); + param.pad_h = param.pad_w = 0; + param.nonlineMode = param::ConvBias::NonlineMode::RELU; + checker.set_param(param).execs( + {{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}}); + }; + std::string algo = ConvBias::algo_name( + "INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16", + ConvBias::DirectParam{}); + check(algo); + algo = ConvBias::algo_name( + "INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16", + ConvBias::DirectParam{}); + check(algo); +} + +TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC_FLOAT) { + require_compute_capability(7, 5); + Checker checker(handle_cuda()); + auto check = [&checker](const std::string& algo) { + checker.set_before_exec_callback( + conv_bias::ConvBiasAlgoChecker(algo.c_str())); + UniformIntRNG rng{-8, 8}; + UniformFloatRNG float_rng{-50, 50}; + checker.set_rng(0, &rng) + .set_rng(1, &rng) + .set_rng(2, &float_rng) + .set_rng(3, &float_rng) + .set_dtype(0, dtype::QuantizedS8(1.9980618f)) + .set_dtype(1, dtype::QuantizedS8(1.9980927f)) + .set_dtype(2, dtype::Float32()) + .set_dtype(3, dtype::Float32()) + .set_dtype(4, dtype::Float32()); + param::ConvBias param; + param.pad_h = param.pad_w = 1; + param.stride_h = param.stride_w = 1; + param.format = param::ConvBias::Format::NHWC; + checker.set_param(param).execs( + {{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}}); + param.pad_h = param.pad_w = 0; + param.nonlineMode = param::ConvBias::NonlineMode::RELU; + checker.set_param(param).execs( + {{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}}); + }; + std::string algo = ConvBias::algo_name( + "INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16", + ConvBias::DirectParam{}); + check(algo); + algo = ConvBias::algo_name( + "INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16", + ConvBias::DirectParam{}); + check(algo); +} + #endif TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW4_NCHW) { @@ -969,7 +1088,7 @@ TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_NCHW4) { checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker< ConvBiasForward>( ConvBias::algo_name( - "INT8_NCHW32_IMMA_IMPLICIT_GEMM_128X128X64_64X64X64_2", + "INT8_NCHW32_IMMA_IMPLICIT_GEMM_32X128X32_32X64X32_1", ConvBias::DirectParam{}) .c_str())); checker.set_dtype(0, dtype::QuantizedS8(1.9980618f)) @@ -1109,6 +1228,16 @@ TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32) { "DIRECT:INT8_NCHW32_IMMA_IMPLICIT_GEMM", param::ConvBias::Format::NCHW32); } + +TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NHWC) { + require_compute_capability(7, 
5); + benchmark_target_algo_with_cudnn_tsc( + handle_cuda(), get_det_first_bench_args(16), + dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f}, + dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f}, + "DIRECT:INT8_NHWC_IMMA_IMPLICIT_GEMM", + param::ConvBias::Format::NHWC); +} #endif TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW4) { diff --git a/dnn/test/cuda/conv_test_utils.cpp b/dnn/test/cuda/conv_test_utils.cpp index 387e97052b559326a61cd5065d1e5c02c9fcf0f7..91cac30c149cf473c20c24aaf600cf721de83440 100644 --- a/dnn/test/cuda/conv_test_utils.cpp +++ b/dnn/test/cuda/conv_test_utils.cpp @@ -102,9 +102,7 @@ std::vector get_det_first_bench_args(size_t batch) { args.emplace_back(BenchArgs{batch, 16, 384, 640, 16, 3, 1}); args.emplace_back(BenchArgs{batch, 16, 384, 640, 32, 3, 2}); args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 3, 1}); - args.emplace_back(BenchArgs{batch, 32, 384, 640, 64, 3, 2}); args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 1, 1}); - args.emplace_back(BenchArgs{batch, 32, 384, 640, 64, 1, 2}); return args; } @@ -333,6 +331,9 @@ void benchmark_target_algo_with_cudnn_tsc( .reshape({shape[0], shape[1] / 4, 4, shape[2], shape[3]}) .dimshuffle({1, 3, 4, 0, 2})); + } else if (format == Format::NHWC) { + ret = static_cast( + TensorLayout{shape, dtype}.dimshuffle({0, 2, 3, 1})); } return ret; }; @@ -363,6 +364,9 @@ void benchmark_target_algo_with_cudnn_tsc( if ((format == Format::CHWN4 || format == Format::NCHW4) && (arg.ci % 16 != 0)) continue; + // skip testcase which cannot enable nhwc tensorcore + if ((format == Format::NHWC) && (arg.ci % 4 != 0 || arg.co % 4 != 0)) + continue; Format format_cudnn = arg.ci % 32 == 0 && arg.co % 32 == 0 ? Format::NCHW32 : Format::NCHW4; diff --git a/dnn/test/cuda/convolution.cpp b/dnn/test/cuda/convolution.cpp index bef4bfd5a1e36f96369788ebb7712c6858e434be..b4172536a4507eed97acea33052a7c2a4cfb8708 100644 --- a/dnn/test/cuda/convolution.cpp +++ b/dnn/test/cuda/convolution.cpp @@ -327,7 +327,6 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NCHW4_DP4A) { all_params.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2}); all_params.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1}); all_params.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2}); - all_params.emplace_back(AlgoParam{64, 128, 32, 64, 32, 32, 2}); for (auto algo_param : all_params) { Checker checker(handle_cuda());
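The conv_test_utils.cpp changes above make the NCHW-specified benchmark shapes usable for Format::NHWC: layouts are permuted with dimshuffle({0, 2, 3, 1}) and cases whose channel counts are not multiples of 4 are skipped, per the "cannot enable nhwc tensorcore" comment. A plain-C++ restatement of those two rules:

#include <array>
#include <cstddef>

namespace sketch {

// dimshuffle({0, 2, 3, 1}) applied to an NCHW shape: (N, C, H, W) -> (N, H, W, C).
inline std::array<size_t, 4> nchw_to_nhwc(const std::array<size_t, 4>& s) {
    return {s[0], s[2], s[3], s[1]};
}

// Cases with channel counts not divisible by 4 are skipped, matching the
// benchmark's guard for the NHWC tensor-core path.
inline bool nhwc_case_supported(size_t ci, size_t co) {
    return ci % 4 == 0 && co % 4 == 0;
}

}  // namespace sketch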