From 11f022ff7cf5466896fe9f2a6e1d24cb8827e875 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Wed, 11 Aug 2021 19:01:26 +0800 Subject: [PATCH] feat(dnn/cuda): add nhwc int8 imma conv and conv fuse typecvt GitOrigin-RevId: 229e1eb4be0fe29932117f226523b446c9ca665e --- .../cutlass_generator/conv2d_operation.py | 36 ++- dnn/scripts/cutlass_generator/generator.py | 131 ++++++-- dnn/scripts/cutlass_generator/library.py | 18 ++ dnn/scripts/cutlass_generator/list.bzl | Bin 72638 -> 97199 bytes dnn/src/common/convolution.cpp | 5 +- dnn/src/cuda/conv_bias/algo.cpp | 31 +- dnn/src/cuda/conv_bias/algo.h | 46 ++- .../conv_bias/cutlass_convolution_base.cpp | 21 +- .../cuda/conv_bias/cutlass_reorder_filter.cu | 71 +++-- .../cuda/conv_bias/cutlass_reorder_filter.cuh | 6 +- .../implicit_gemm_int4_int4_nhwc_imma.cpp | 18 +- .../implicit_gemm_int4_nchw64_imma_base.cpp | 12 +- .../implicit_gemm_int4_nhwc_imma_base.cpp | 61 ++-- .../implicit_gemm_int8_nchw32_imma.cpp | 12 +- .../implicit_gemm_int8_nchw4_dp4a.cpp | 17 +- .../implicit_gemm_int8_nhwc_imma.cpp | 294 ++++++++++++++++++ .../implicit_gemm_uint4_int4_nhwc_imma.cpp | 39 ++- dnn/src/cuda/conv_bias/opr_impl.h | 1 + dnn/src/cuda/convolution/backward_data/algo.h | 2 + .../implicit_gemm_int8_nchw4_dp4a.cpp | 71 +++-- .../implicit_gemm_int8_nchw_dp4a.cpp | 69 ++-- dnn/src/cuda/cutlass/convolution_operation.h | 4 +- dnn/src/cuda/cutlass/library.h | 2 +- dnn/src/cuda/cutlass/operation_table.cpp | 28 +- dnn/src/cuda/cutlass/operation_table.h | 12 +- dnn/src/cuda/cutlass/util.cu | 29 ++ dnn/src/cuda/cutlass/util.h | 4 + dnn/src/cuda/matrix_mul/algos.h | 4 + .../cuda/matrix_mul/cutlass_float32_simt.cpp | 64 ++-- .../cutlass_float32_simt_split_k.cpp | 65 ++-- dnn/test/cuda/conv_bias_int8.cpp | 131 +++++++- dnn/test/cuda/conv_test_utils.cpp | 8 +- dnn/test/cuda/convolution.cpp | 1 - 33 files changed, 1042 insertions(+), 271 deletions(-) create mode 100644 dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp diff --git a/dnn/scripts/cutlass_generator/conv2d_operation.py b/dnn/scripts/cutlass_generator/conv2d_operation.py index 735f4d941..539574b98 100644 --- a/dnn/scripts/cutlass_generator/conv2d_operation.py +++ b/dnn/scripts/cutlass_generator/conv2d_operation.py @@ -19,8 +19,8 @@ class Conv2dOperation: # def __init__(self, conv_kind, conv_type, arch, tile_description, src, flt, bias, dst, element_epilogue, \ epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4, \ - need_load_from_const = True, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, \ - required_cuda_ver_major = 9, required_cuda_ver_minor = 2): + special_optimization = SpecialOptimizeDesc.NoneSpecialOpt, implicit_gemm_mode = ImplicitGemmMode.GemmNT, \ + without_shared_load = False, required_cuda_ver_major = 9, required_cuda_ver_minor = 2): self.operation_kind = OperationKind.Conv2d self.conv_kind = conv_kind @@ -34,7 +34,7 @@ class Conv2dOperation: self.element_epilogue = element_epilogue self.epilogue_functor = epilogue_functor self.swizzling_functor = swizzling_functor - self.need_load_from_const = need_load_from_const + self.special_optimization = special_optimization self.implicit_gemm_mode = implicit_gemm_mode self.without_shared_load = without_shared_load self.required_cuda_ver_major = required_cuda_ver_major @@ -60,16 +60,18 @@ class Conv2dOperation: else: inst_shape = '' - unity_kernel = '' - if not self.need_load_from_const: - unity_kernel = '_1x1' + special_opt = '' + if self.special_optimization == SpecialOptimizeDesc.ConvFilterUnity: + special_opt = '_1x1' + elif self.special_optimization == SpecialOptimizeDesc.DeconvDoubleUpsampling: + special_opt = '_s2' reorder_k = '' if self.without_shared_load: reorder_k = '_roc' return "%s%s%s%s%s%s_%s" % (ShortDataTypeNames[self.accumulator_type()], \ - inst_shape, intermediate_type, ConvKindNames[self.conv_kind], unity_kernel, \ + inst_shape, intermediate_type, ConvKindNames[self.conv_kind], special_opt, \ reorder_k, ShortEpilogueNames[self.epilogue_functor]) # @@ -183,7 +185,7 @@ using Convolution = ${stages}, ${alignment_src}, ${alignment_filter}, - ${nonuninity_kernel}, + ${special_optimization}, ${math_operator}, ${implicit_gemm_mode}, ${without_shared_load}>; @@ -226,7 +228,7 @@ using Convolution = 'stages': str(operation.tile_description.stages), 'alignment_src': str(operation.src.alignment), 'alignment_filter': str(operation.flt.alignment), - 'nonuninity_kernel': str(operation.need_load_from_const).lower(), + 'special_optimization': SpecialOptimizeDescTag[operation.special_optimization], 'math_operator': MathOperationTag[operation.tile_description.math_instruction.math_operation], 'implicit_gemm_mode': ImplicitGemmModeTag[operation.implicit_gemm_mode], 'without_shared_load': str(operation.without_shared_load).lower() @@ -266,7 +268,7 @@ using Deconvolution = ${stages}, ${alignment_src}, ${alignment_filter}, - ${nonuninity_kernel}, + ${special_optimization}, ${math_operator}, ${implicit_gemm_mode}>; """ @@ -308,7 +310,7 @@ using Deconvolution = 'stages': str(operation.tile_description.stages), 'alignment_src': str(operation.src.alignment), 'alignment_filter': str(operation.flt.alignment), - 'nonuninity_kernel': str(operation.need_load_from_const).lower(), + 'special_optimization': SpecialOptimizeDescTag[operation.special_optimization], 'math_operator': MathOperationTag[operation.tile_description.math_instruction.math_operation], 'implicit_gemm_mode': ImplicitGemmModeTag[operation.implicit_gemm_mode] } @@ -323,9 +325,9 @@ using Deconvolution = ################################################################################################### # -def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_layout, dst_type, min_cc, src_align = 32, flt_align = 32, dst_align = 128, \ - skip_unity_kernel = False, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, required_cuda_ver_major = 9, \ - required_cuda_ver_minor = 2): +def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_layout, dst_type, min_cc, src_align = 32, flt_align = 32, dst_align = 32, \ + use_special_optimization = SpecialOptimizeDesc.NoneSpecialOpt, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, \ + required_cuda_ver_major = 9, required_cuda_ver_minor = 2): operations = [] element_epilogue = DataType.f32 @@ -412,10 +414,10 @@ def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_lay bias = TensorDescription(bias_type, dst_layout, max(1, int(32 / DataTypeSize[bias_type]))) dst = TensorDescription(dst_type, dst_layout, int(dst_align / DataTypeSize[dst_type])) - new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, True, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor) + new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, SpecialOptimizeDesc.NoneSpecialOpt, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor) operations.append(new_operation) - if not skip_unity_kernel: - new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, False, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor) + if use_special_optimization != SpecialOptimizeDesc.NoneSpecialOpt: + new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, use_special_optimization , implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor) operations.append(new_operation) return operations diff --git a/dnn/scripts/cutlass_generator/generator.py b/dnn/scripts/cutlass_generator/generator.py index 5a4d3d35d..b58147463 100644 --- a/dnn/scripts/cutlass_generator/generator.py +++ b/dnn/scripts/cutlass_generator/generator.py @@ -168,10 +168,10 @@ def GenerateConv2d_Simt(args): for dst_type, dst_layout in zip(dst_types, dst_layouts): if dst_type == DataType.s4 or dst_type == DataType.u4: min_cc = 75 - skip_unity_kernel = True + use_special_optimization = SpecialOptimizeDesc.NoneSpecialOpt else: min_cc = 61 - skip_unity_kernel = False + use_special_optimization = SpecialOptimizeDesc.ConvFilterUnity tile_descriptions = [ TileDescription([128, 128, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc), TileDescription([128, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc), @@ -182,10 +182,16 @@ def GenerateConv2d_Simt(args): TileDescription([ 64, 32, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc), TileDescription([ 16, 128, 16], 1, [1, 1, 1], math_inst, min_cc, max_cc), TileDescription([ 16, 64, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc), - ] - operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1], - dst_layout, dst_type, min_cc, 32, 32, 32, - skip_unity_kernel) + ] + for tile in tile_descriptions: + if dst_layout == LayoutType.TensorNC32HW32 and tile.threadblock_shape[0] > 32: + continue + if (dst_layout == LayoutType.TensorNCHW or dst_layout == LayoutType.TensorNHWC) \ + and tile.threadblock_shape[0] > 16: + continue + operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], + dst_layout, dst_type, min_cc, 32, 32, 32, + use_special_optimization) return operations @@ -214,6 +220,8 @@ def GenerateConv2d_TensorOp_8816(args): DataType.s8, ] + use_special_optimization = SpecialOptimizeDesc.ConvFilterUnity + min_cc = 75 max_cc = 1024 @@ -232,28 +240,69 @@ def GenerateConv2d_TensorOp_8816(args): TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([128, 64, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([128, 32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 128, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([ 32, 128, 32], 1, [1, 2, 1], math_inst, min_cc, max_cc), ] operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1], - dst_layout, dst_type, min_cc, 128, 128, 64, - False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor) + dst_layout, dst_type, min_cc, 128, 128, 64, use_special_optimization, + ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor) else: assert dst_layout == LayoutType.TensorNC4HW4 tile_descriptions = [ - TileDescription([128, 256, 64], 2, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([256, 128, 64], 2, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 128, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 32, 128, 32], 1, [1, 2, 1], math_inst, min_cc, max_cc), ] operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1], - dst_layout, dst_type, min_cc, 128, 128, 64, - False, ImplicitGemmMode.GemmNT, False, cuda_major, cuda_minor) + dst_layout, dst_type, min_cc, 128, 128, 64, use_special_optimization, + ImplicitGemmMode.GemmNT, False, cuda_major, cuda_minor) + + layouts_nhwc = [ + (LayoutType.TensorNHWC, LayoutType.TensorNC4HW4, 32), + (LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 64), + (LayoutType.TensorNHWC, LayoutType.TensorNC16HW16, 128), + ] + + dst_layouts_nhwc = [ + LayoutType.TensorNHWC, + ] + + for math_inst in math_instructions: + for layout in layouts_nhwc: + for dst_layout in dst_layouts_nhwc: + dst_type = math_inst.element_b + tile_descriptions = [ + TileDescription([128, 32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc), + TileDescription([64, 16, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc), + ] + for tile in tile_descriptions: + dst_align = 32 if tile.threadblock_shape[1] == 16 else 64 + operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout, + dst_type, min_cc, layout[2], layout[2], dst_align, use_special_optimization, + ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor) + if tile.threadblock_shape[1] == 16 or tile.threadblock_shape[1] == 32: + operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout, + dst_type, min_cc, layout[2], layout[2], dst_align, use_special_optimization, + ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor) + + out_dtypes = [DataType.s4, DataType.u4, DataType.f32] + + #INT8x8x4 and INT8x8x32 + for math_inst in math_instructions: + for layout in layouts_nhwc: + for dst_layout in dst_layouts_nhwc: + for out_dtype in out_dtypes: + tile_descriptions = [ + TileDescription([128, 32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc), + TileDescription([64, 16, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc), + ] + for tile in tile_descriptions: + dst_align = 4 * DataTypeSize[out_dtype] if tile.threadblock_shape[1] == 16 or out_dtype == DataType.f32 \ + else 8 * DataTypeSize[out_dtype] + operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout, + out_dtype, min_cc, layout[2], layout[2], dst_align, use_special_optimization, + ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor) + if tile.threadblock_shape[1] == 16 or (tile.threadblock_shape[1] == 32 and out_dtype != DataType.f32): + operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout, + out_dtype, min_cc, layout[2], layout[2], dst_align, use_special_optimization, + ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor) return operations @@ -281,6 +330,8 @@ def GenerateConv2d_TensorOp_8832(args): LayoutType.TensorNC64HW64, ] + use_special_optimization = SpecialOptimizeDesc.ConvFilterUnity + min_cc = 75 max_cc = 1024 @@ -298,8 +349,8 @@ def GenerateConv2d_TensorOp_8832(args): TileDescription([128, 64, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc), ] operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1], - dst_layout, dst_type, min_cc, 128, 128, 64, - False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor) + dst_layout, dst_type, min_cc, 128, 128, 64, use_special_optimization, + ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor) layouts_nhwc = [ (LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 32), @@ -316,18 +367,39 @@ def GenerateConv2d_TensorOp_8832(args): for dst_layout in dst_layouts_nhwc: dst_type = math_inst.element_b tile_descriptions = [ + TileDescription([128, 16, 64], 2, [1, 1, 1], math_inst, min_cc, max_cc), TileDescription([128, 32, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc), TileDescription([128, 64, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc), ] for tile in tile_descriptions: - operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], - dst_layout, dst_type, min_cc, layout[2], layout[2], 32, - False, ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor) + dst_align = 16 if tile.threadblock_shape[1] == 16 else 32 + operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout, + dst_type, min_cc, layout[2], layout[2], dst_align, use_special_optimization, + ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor) if tile.threadblock_shape[1] == 32 or tile.threadblock_shape[1] == 64: dst_align = 32 if tile.threadblock_shape[1] == 32 else 64 - operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], - dst_layout, dst_type, min_cc, layout[2], layout[2], dst_align, - False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor) + operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout, + dst_type, min_cc, layout[2], layout[2], dst_align, use_special_optimization, + ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor) + # INT4x4x8 + for math_inst in math_instructions: + for layout in layouts_nhwc: + for dst_layout in dst_layouts_nhwc: + tile_descriptions = [ + TileDescription([128, 16, 64], 2, [1, 1, 1], math_inst, min_cc, max_cc), + TileDescription([128, 32, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc), + ] + for tile in tile_descriptions: + dst_align = 32 if tile.threadblock_shape[1] == 16 else 64 + operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout, + DataType.s8, min_cc, layout[2], layout[2], dst_align, use_special_optimization, + ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor) + if tile.threadblock_shape[1] == 32 or tile.threadblock_shape[1] == 64: + dst_align = 64 if tile.threadblock_shape[1] == 32 else 128 + operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout, + DataType.s8, min_cc, layout[2], layout[2], dst_align, use_special_optimization, + ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor) return operations @@ -354,6 +426,8 @@ def GenerateDeconv_Simt(args): DataType.s8, ] + use_special_optimization = SpecialOptimizeDesc.DeconvDoubleUpsampling + min_cc = 61 max_cc = 1024 @@ -361,7 +435,6 @@ def GenerateDeconv_Simt(args): for layout in layouts: for dst_type, dst_layout in zip(dst_types, dst_layouts): tile_descriptions = [ - TileDescription([64, 128, 32], 2, [1, 4, 1], math_inst, min_cc, max_cc), TileDescription([32, 128, 32], 2, [1, 2, 1], math_inst, min_cc, max_cc), TileDescription([16, 128, 16], 2, [1, 2, 1], math_inst, min_cc, max_cc), TileDescription([16, 128, 16], 1, [1, 1, 1], math_inst, min_cc, max_cc), @@ -369,7 +442,7 @@ def GenerateDeconv_Simt(args): ] operations += GenerateConv2d(ConvKind.Dgrad, tile_descriptions, layout[0], layout[1], dst_layout, dst_type, min_cc, 32, 32, 32, - True) + use_special_optimization) return operations ################################################################################ diff --git a/dnn/scripts/cutlass_generator/library.py b/dnn/scripts/cutlass_generator/library.py index 0559f28f7..9308357b9 100644 --- a/dnn/scripts/cutlass_generator/library.py +++ b/dnn/scripts/cutlass_generator/library.py @@ -562,6 +562,24 @@ StrideSupportNames = { StrideSupport.Unity: 'unity_stride', } +class SpecialOptimizeDesc(enum.Enum): + NoneSpecialOpt = enum_auto() + ConvFilterUnity = enum_auto() + DeconvDoubleUpsampling = enum_auto() + +SpecialOptimizeDescNames = { + SpecialOptimizeDesc.NoneSpecialOpt: 'none', + SpecialOptimizeDesc.ConvFilterUnity: 'conv_filter_unity', + SpecialOptimizeDesc.DeconvDoubleUpsampling: 'deconv_double_upsampling', +} + +SpecialOptimizeDescTag = { + SpecialOptimizeDesc.NoneSpecialOpt: 'cutlass::conv::SpecialOptimizeDesc::NONE', + SpecialOptimizeDesc.ConvFilterUnity: 'cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY', + SpecialOptimizeDesc.DeconvDoubleUpsampling: 'cutlass::conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING', +} + + class ImplicitGemmMode(enum.Enum): GemmNT = enum_auto() GemmTN = enum_auto() diff --git a/dnn/scripts/cutlass_generator/list.bzl b/dnn/scripts/cutlass_generator/list.bzl index 7aaae2d617014de4e40573511f1702b8ee537ac9..52b16821132642f6565526c45e58f0c99e5d0446 100644 GIT binary patch literal 97199 zcmcJY-EJH=5{2*c6h`)Tkw8By=w`3?Ar^~)BP*5=TQ)Rul)U{GyQjKX{OM+~PA!7O z>!d%Pu5(zV*zK8reE84f%j4_Ex5rN(e*62wr8!3DK^eII*PSc7b_CVNJE$FzYU4F^fe>;M0cz0evOOOoBZ6vUyrPvOBO=wtYw84pRC@Dk zN2Dv9QL-bpdHUx4X?qYLi|*hBRf^06y5Le*$B;Gx&CpB#n;p_jpbKmotHPQI)DoO` zX(nWA?rdf42r3LWq1i$02r7-!PctEPN~fWA1gZ2E*N#Xsoz=A?Cmu1@#BO`)p zW4xm33wA`Rjn~u(Mq2)y`L!d`70xKx5i(CF_kUJEw&+fdXeQ7FT-W&_Z3LR3oS(@d z%>=r@rm>oyW&+J;=Uti!`I_nR^cL5SNHU$( zwIe74)S%a8dp<@+1l7iPMb#JVh*TS|sS}K}{5kV$N2Dv9Q93}-(`b4iPEWL#huh!2 zr$^_{-(Jp*$JuX>FXwNs-@c!Izq&h}ukP;emkZPXOM@g&QVO?6;`if>Di#Q6VkDzU z99EbY%EI6kOTs+RZq;uitTaHmVwIH!Ml)kgD-BQUPF}m3Al#ld9VpU11(**Gg}@APch2f4N%& zlq*(QX<&3K=7F-Xs_F<~9%$D}Z@Xw(X@GLYs;V0p<5n7=EUc=!fzcJ_fmT-X!6&rA zPC&V0o0SucZpA!M7PeI#Da-@yTIp>UYF6^y8lYUUs_F*DxRnMd3#+PbV049fpj|7y z?V@R=0m>DttTZsX74tw@SXFg|Fb}l060{%FR@$LlF{!$pF<_+~%EF}Tc1Bm22impL z+b)_`8lYUU%1Q&HTQLumg;iBY2=hR@R(jh-(@F!BD^^wAz!8*$d)~x34wBMFa?tt?>w{tQmx9Yex1x!`9efx|+6Y53I4lv^BPC zn?l4I8?0%~;Ebz;cwkLyW`%fQO=~8DXlL#%uSjF~XEJ#%r5GHfxNqrZs~zt`g#bHLaNy;(;}-@d&A`8H6>f zIXL6StQmwgt2uaWCBy@3TH_H?Su+UJ*4VZXRYg27T}`JD4@_GF5F*ytV9ja{&bTpa z24T%=4qjUc@xYqac!X5e48oe$49>W!A|6JDZQI zr6wld>NvA&;y9vPttuGfOqNv@d>ql02pq_5Ed@HBDJf6UUict*YST zh^8#6RDHre&SY6t!N(C@Sq^4x;>eaKUA48orYvounaR}>6U~UGEN!Bh$+D#2W<*z( zgV{CF*&v!Gjx)Jhm5JksZnYdtmQ@uTAj`q*n&@m0O%unNT&=3$(IL_p1RVI!jy47+pSyokWfGh{IHW9Pm(k7ajTrDZM88Kj@ znaQ%G;ATWumV?t19?7qASbBeBz@PRcJmM+dZ7zF0G_ESfsu_MVXZ0V5Lj9LOQGAU^NT5JJF4% zjIo-99J{p2hJ)2Cq(cVr^jZ8Et7*#a#5I~S#%dOF?9wq)##l{L#x8Bjs7x7SX;Vh_ zHKV4OSlSfRrOj&4ZCxU2Vrf%Mmo{aHDJE9al(9)x$#AfmrcBCku$rbgWK^b%v6_V( zn`C3AjIo-99J{oV;b1jQamc7l8Dl9^_~>VH+v8xVLONwQSjrTDjLj4qt69jgNmfPF z!D<$AQig-oG{qsKGG&a_EaccE8#85$)hy)LrIic^t7(cuMrFzvOPgZaBUIIJuyi4P zG8`;zieHA9Vq!H5IX210Oc`S}3psXaCBwmLn&OaAnKH&|nld)Ys%kh`%|cGfaIl)D zIAm0&f1-b0U5SJUU83c&0F0kRy>C-Y9+LZ)CMNLs-}kO&|#n8=Y9Fp(oA zU?N6zy@X&QMO6RP0N>_25)nv*0L(EWK$e1u7-g+{`C3v<0twDyaawar?dqCHefG}Jzb;bSBB(?11NP^;_B(%md?JYB53RZf|bux1Wz15(DXb-5I0Y$ zjnFhDu}g&;R)DJ6}csz+@|%0Q^h zQNJb_OgZV-1;njoKVE_9^ycRJ>HVgE?ztUWh?`I>n*oL9a~28~i=bFNhoR^#2}{TC z?x1Fa0HbC(fH8|0z_=;}FlrS7C;7*1ao%{mQlDO)-XD4o!i2|O4JdWq!T2;q zy7XEcy~RLxw*0j`{#JUMVUr=F7oA3!Guj%WRn9QY8DoMpkJ><9LoGz1KvCnrohCkH-&P7WMjCkH-+ zPR_bU(ckHqgDg%@a)65!l3wNp$>y@>u|e|d)FA1sUfN9x4RQp@3{hA38`wWbpXC_KHX9msJnL*R*%%DfmnV}^K z<|=Ey{Pa%!5lJsIgJiSvJTpjsof#yZ)k~jAnL&=QIy%<9F2{3?Be|WQze5LLR|EA2 z4LYVqkB*`Ih6N90Vl z40jjH=y4f&N5?C(Qo%^}DXrNsjc}-V1JxOTl@0(^8i38`KU^z30ebes^#&y2cn@&@ z^uGJqTKef?>a1uc+qAa=+EwDIVh>hn1XrdiR%-+|?*VwV_#EgifL9!mhrJ@C4}D57!%zgyZ%4>h9_N>Tc#Ba*SCh5_a0770QkQ zk$H0fV#OsOG-(Y1%lrka4FTtUAwX|H$aGf-(2Pn2{gDxQi}-lYkUj`&cgLW3(h~Oi zqK(OB;fQ%dz){5@98CP}T~Mm7e`@y^w&VKgNsP1880hn1Xnr$R%-+|?*VwV_;l$mfL9!m2lpO8PX%cMjwYy) z>=O)_Hvt$_yn!K;&H${;RH)JbY(D?tTH&$MvmdS(l7!`x0Fax20w~kp)BOr4I|W4M zEdYoWkAP6!27q99>o0r~^Bv(YnCv}{m|Ksdit{+8HXg@?bn3#VrK;O85LRnY21qTQ z3|j;`^@Zdtef*7O5jcD`L-C{y?DZ-ela0a=^EQB^iaj`{ZUZ>3v;iDbw*eeCX${Ii z2wArQTphT+5ji{D4%nt_k_tw00(O=cyGA%vyn*Trz)A;zDh0K8g!x^x%7D~`y6 zd%@OILE3<$32G$!1Ow(x00tFrV92C104p;Usx$zb&wsd9c&zm7hwFtTp*UmkZvqOS zOn*=JE1>KY5Sh0CAXYpALUkJeg59mZ@JY;fguh_2_c&s1J&r2Qws2f-VBgju?b1l4FO4( zmVl)4G&|mASbfXeLWJUotm)Pep*kjYhuZ|Jc_|ifC_A>ZOwQ;Gz=}&?ts!9P3}Ce( z;Jhyc=nY7N?g{~#L8;JZGKh|@iP98&ol#TSFeGzX)dtRxkK2vbDn_Bz#*k$KqxHs+ z^WG4lI3jDhJ4C3C$p&C~*ONoqgR4bqC_4s3=FI_!6_^r0NhL$)H|dIJQ)G_?2KfRqi`UQE* zVe`$k{(`*QQ)*F{@4~u#LmvIk@~uO6^rt)N%VZwzZs{xX?gl>Rg5hjVYFHryNUB2c ztbYwx$PjtPLI&V_;HZTRlI}uGPDx7Hp?;TI9zI3j0m>AC@$5jJBJdD(iogT#J?IpH z2hl0IL?LN{wTL^$?41fZqGWa&GOyx%QRblwS8gC(K~q>m^8zo`RY#J8AAnz=nTN>l^m!w1T03;M`f?EU4KEl z$@SZUJmzDVxk(t}GOd(2C9df@%!`Ypwh3KW+WM`W#DDbT88QYZbPx3c;A1B#k^hGLfe0mao$ zp{R9#K#_HSKv8+N;S*N&2ejgdtkwMits0d&{VD~yCvW6*H#|qrDK#@CBZ9m57NRzW ztZL9(E}W&sV2k_5K)3cQmSMn`eulAy3&nPhGBE-N(p3%C4 ztW@RbcKzcspxHIhN-oL(RUE}uHSYB}YavpeL%>B9ka~mARnLHGUXQqCJBNZddIk#3 zy)LNUDt5g-=gu<WW-N5cF&4RH7>m^>(ZHI9fW`e@`}LTQVdmarh^6%ysx}D2OiClTmo9sWK2B={ zck8VoYVqkxdLukrQX9mUKG0iNJbeJe%$>pz%RYdiYVR@3x({HudcInBFsx<_xy@KiuLQ4{Qy929ZsW0HExm`%55nOE* zQf&-buMSAPG30WEKr4>OM7>I&RmWsQW^!*;^YsT5HTMj~9NjlsE1;c1s*NG*S&Y;h zLoWLRT5&|y>i&RM9g_{|58d6)@zL30c*FQdV|oCgx<<#b!==WX{F|N zA(cjOwN*&9F=XA(k$PjuX>X$e~=eYx*MLU=M!-5;v1a0bqP3G)wZ`>I7^Gc z7L~<78->MyFI`9c!eAG@%5QgVvz{X+$uZ=j2vEgQY*mR~vvijj1um+9!Ztbu3SIRI6qsjTh=GlM zfz};lr7B0S$GP*20vAO3QR87MH%y$}N%Jp-*f$Vyd?UY~R483itifWkKNj6zp= dMuD4oM!_3-Mxj~vtd-3#mc6;IzF78u{|9~n@nHY} delta 2260 zcmY*aTTEP46wSrU%rMLYnW8mqc~b_a5MHGeZJ1Qh7Gp~ELz7xNn$-BAhWH_md4S-M zKuZ~B*)7XCiy?d5;&ev{wVErQ<-Tg)J@kZE#X2gN{uXMPA0`=iUutw#_M(oJ@P{%cnxT6;IE_hHFbgK-- z7+P>M+=z})DMy+0c&LzBT`*0mL@}F?JDP)=VR^i&^T-t|iX(fd&NC5{(T!IlGO2CE zsZ3@|yHw-cfSIopd*Xwl;zqcZ*rIMwb;>&tbYVT|mRZ~>u201UX>wrf{sR~sw!#$d@ry_*cBk+M0uSNDqjP}kCa2|07ekC~~)PTG{^VzIK-&i%CpQu3g z5Hs99TP}PbM$eT=d~>);GT(TInS@r!emTnc^a$hOiJWBY{FWx=_+uG48jxJup^|mPwXDyGcgNB+DS2WDrg= zC|}G-GLj6!g^Z*>=FXDkuU*_mY+!(L!~;dr6KzBZqFajFK9&t5n44-^@g>xFU}DD&zpaZG&m5*|gD_KC0K>VAQ@ z%9VI>6XY`o+1pdqc;jOWy4Q5Pa+w_?@<1=~2SH(VrV)2vYV)vRhbGw!@u|Tpg9a~u{ zHf`cmIkI5f3R_T+cp8COT@2X*`itc!y2Owj;?g}U@NCTgkSlE(41K~WPWfHm+xecJ zH1Ah?nuhORySyvTA-ancBPy$Si|i9!ZsIvbCEg<86~;(VD@Yy9)?RWO!A|Y4;rD5c z#_5N(TBEH%a~P{WZFfqz&sbBR6>EN*anh$9Gv;$N-PqZs*;B&3V>NDdYrAPvF}AN; zjJkHsk=kj}nzJd;n#K%)RT%S+YIR1<<632^>oM&oo%gP{YfZ+BZ)s1eWLrw4e5X@+ HRhs=@b-&?3 diff --git a/dnn/src/common/convolution.cpp b/dnn/src/common/convolution.cpp index 04934ac39..f0aa61d85 100644 --- a/dnn/src/common/convolution.cpp +++ b/dnn/src/common/convolution.cpp @@ -553,7 +553,10 @@ void ConvolutionBase::check_or_deduce_dtype_fwd(DType src, dst.valid() && (dst.enumv() == src.enumv() || ((dst.enumv() == DTypeEnum::QuantizedS4 || dst.enumv() == DTypeEnum::Quantized4Asymm) && - src.enumv() == DTypeEnum::QuantizedS8)); + src.enumv() == DTypeEnum::QuantizedS8) || + ((src.enumv() == DTypeEnum::QuantizedS4 || + src.enumv() == DTypeEnum::Quantized4Asymm) && + dst.enumv() == DTypeEnum::QuantizedS8)); if (cond_dst) { supported_dst_dtype.push_back(dst); } diff --git a/dnn/src/cuda/conv_bias/algo.cpp b/dnn/src/cuda/conv_bias/algo.cpp index 6c687dfec..4475c0718 100644 --- a/dnn/src/cuda/conv_bias/algo.cpp +++ b/dnn/src/cuda/conv_bias/algo.cpp @@ -71,6 +71,9 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() { for (auto&& algo : int8_nchw32_imma) { all_algos.push_back(&algo); } + for (auto&& algo : int8_nhwc_imma) { + all_algos.push_back(&algo); + } for (auto&& algo : int4_int4_nchw64_imma) { all_algos.push_back(&algo); } @@ -236,7 +239,21 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() { int8_nchw32_imma.emplace_back( AlgoParam{32, 128, 32, 32, 64, 32, 8, 8, 16, 1}); } - + { + using AlgoParam = AlgoInt8NHWCIMMAImplicitGemm::AlgoParam; + int8_nhwc_imma.emplace_back( + AlgoParam{64, 16, 32, 64, 16, 32, 8, 8, 16, 2, 16}); + int8_nhwc_imma.emplace_back( + AlgoParam{64, 16, 32, 64, 16, 32, 8, 8, 16, 2, 8}); + int8_nhwc_imma.emplace_back( + AlgoParam{64, 16, 32, 64, 16, 32, 8, 8, 16, 2, 4}); + int8_nhwc_imma.emplace_back( + AlgoParam{128, 32, 32, 64, 32, 32, 8, 8, 16, 1, 16}); + int8_nhwc_imma.emplace_back( + AlgoParam{128, 32, 32, 64, 32, 32, 8, 8, 16, 1, 8}); + int8_nhwc_imma.emplace_back( + AlgoParam{128, 32, 32, 64, 32, 32, 8, 8, 16, 1, 4}); + } { using AlgoParam = AlgoInt4Int4NCHW64IMMAImplicitGemm::AlgoParam; int4_int4_nchw64_imma.emplace_back( @@ -261,6 +278,12 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() { } { using AlgoParam = AlgoInt4Int4NHWCIMMAImplicitGemm::AlgoParam; + int4_int4_nhwc_imma.emplace_back( + AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 32}); + int4_int4_nhwc_imma.emplace_back( + AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 16}); + int4_int4_nhwc_imma.emplace_back( + AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 8}); int4_int4_nhwc_imma.emplace_back( AlgoParam{128, 32, 64, 64, 32, 64, 8, 8, 32, 1, 32}); int4_int4_nhwc_imma.emplace_back( @@ -276,6 +299,12 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() { } { using AlgoParam = AlgoUInt4Int4NHWCIMMAImplicitGemm::AlgoParam; + uint4_int4_nhwc_imma.emplace_back( + AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 32}); + uint4_int4_nhwc_imma.emplace_back( + AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 16}); + uint4_int4_nhwc_imma.emplace_back( + AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 8}); uint4_int4_nhwc_imma.emplace_back( AlgoParam{128, 32, 64, 64, 32, 64, 8, 8, 32, 1, 32}); uint4_int4_nhwc_imma.emplace_back( diff --git a/dnn/src/cuda/conv_bias/algo.h b/dnn/src/cuda/conv_bias/algo.h index 2682391bc..90e0dc7b1 100644 --- a/dnn/src/cuda/conv_bias/algo.h +++ b/dnn/src/cuda/conv_bias/algo.h @@ -72,6 +72,7 @@ public: CUDA_IMPLICIT_GEMM_REORDER_FILTER_CHWN4_IMMA_INT8, CUDA_IMPLICIT_GEMM_UNROLL_WIDTH_CHWN4_IMMA_INT8, CUDA_IMPLICIT_GEMM_IMMA_NCHW32_INT8, + CUDA_IMPLICIT_GEMM_IMMA_NHWC_INT8, CUDA_IMPLICIT_GEMM_IMMA_NCHW64_INT4_INT4, CUDA_IMPLICIT_GEMM_IMMA_NCHW64_UINT4_INT4, CUDA_IMPLICIT_GEMM_IMMA_NHWC_INT4_INT4, @@ -524,6 +525,7 @@ public: * + * +--- AlgoInt8NCHW4DotProdImplicitGemm * +--- AlgoInt8NCHW32IMMAImplicitGemm + * +--- AlgoInt8NHWCIMMAImplicitGemm * + * +--- AlgoInt4NCHW64IMMAImplicitGemmBase * +----+--- AlgoInt4Int4NCHW64IMMAImplicitGemm @@ -582,7 +584,7 @@ public: // operation (cutlass kernel) from the global OperationTable const cutlass::library::Operation* get_cutlass_conv_op( const SizeArgs& args, ConvOperator conv_op, ConvType conv_type, - bool load_from_const, bool without_shared_load) const; + bool use_conv_filter_unity_opt, bool without_shared_load) const; // execute the cutlass kernel found by get_cutlass_conv_op. we give // subclasses full freedom to decide where and how these arguments are @@ -829,6 +831,47 @@ private: std::string m_name; }; +class ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm final + : public AlgoCutlassConvolutionBase { +public: + AlgoInt8NHWCIMMAImplicitGemm(AlgoParam algo_param) + : AlgoCutlassConvolutionBase(algo_param) { + m_name = ConvBias::algo_name( + ssprintf("INT8_NHWC_IMMA_IMPLICIT_GEMM_%s", + to_string(m_algo_param).c_str()), + ConvBias::DirectParam{}); + } + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + const char* name() const override { return m_name.c_str(); } + AlgoAttribute attribute() const override { + return AlgoAttribute::REPRODUCIBLE; + } + static std::string to_string(AlgoParam algo_param); + size_t get_preprocess_workspace_in_bytes( + const SizeArgs& args) const override; + SmallVector deduce_preprocessed_filter_layout( + const SizeArgs& args) const override; + void exec_preprocess(const ExecArgs& args) const override; + MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_IMMA_NHWC_INT8) + + std::string param() const override { + std::string ret; + serialize_write_pod(m_algo_param, ret); + return ret; + } + +private: + std::tuple get_constants( + const ExecArgs& args) const; + + void reorder_filter(const ExecArgs& args, int interleaved, + void* reordered_filter) const; + + std::string m_name; +}; + class ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase : public AlgoCutlassConvolutionBase { public: @@ -1087,6 +1130,7 @@ public: #endif #if CUDA_VERSION >= 10020 std::vector int8_nchw32_imma; + std::vector int8_nhwc_imma; std::vector int4_int4_nchw64_imma; std::vector uint4_int4_nchw64_imma; std::vector int4_int4_nhwc_imma; diff --git a/dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp b/dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp index 4c4f04a26..096686d13 100644 --- a/dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp +++ b/dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp @@ -140,6 +140,11 @@ LayoutPack get_layout_pack(const param::ConvBias::Format format, LayoutTypeID::kTensorNC64HW64}; case Format::NHWC: switch (access_type) { + case 4: + return {LayoutTypeID::kTensorNHWC, + LayoutTypeID::kTensorNC4HW4, + LayoutTypeID::kTensorNHWC, + LayoutTypeID::kTensorNHWC}; case 8: return {LayoutTypeID::kTensorNHWC, LayoutTypeID::kTensorNC8HW8, @@ -192,12 +197,18 @@ EpilogueType get_epilogue_type(const param::ConvBias::NonlineMode mode, const Operation* ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_conv_op( const SizeArgs& args, ConvOperator conv_op, ConvType conv_type, - bool load_from_const, bool without_shared_load) const { - using Format = param::ConvBias::Format; + bool use_conv_filter_unity_opt, bool without_shared_load) const { auto&& param = args.opr->param(); auto layouts = get_layout_pack(param.format, m_algo_param.access_size); - auto epilogue_type = get_epilogue_type(param.nonlineMode, - param.format != Format::NCHW4_NCHW); + auto epilogue_type = get_epilogue_type( + param.nonlineMode, + args.dst_layout->dtype.enumv() != DTypeEnum::Float32); + + cutlass::conv::SpecialOptimizeDesc special_optimization = + (use_conv_filter_unity_opt) + ? cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY + : cutlass::conv::SpecialOptimizeDesc::NONE; + ConvolutionKey key{convert_conv_op(conv_op), convert_dtype(args.src_layout->dtype.enumv()), layouts.src, @@ -219,7 +230,7 @@ ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_conv_op( m_algo_param.instruction_k, epilogue_type, m_algo_param.stage, - load_from_const, + special_optimization, without_shared_load}; return Singleton::get().operation_table.find_op(key); diff --git a/dnn/src/cuda/conv_bias/cutlass_reorder_filter.cu b/dnn/src/cuda/conv_bias/cutlass_reorder_filter.cu index 2f296aca3..c5ca513c3 100644 --- a/dnn/src/cuda/conv_bias/cutlass_reorder_filter.cu +++ b/dnn/src/cuda/conv_bias/cutlass_reorder_filter.cu @@ -144,28 +144,48 @@ void megdnn::cuda::cutlass_wrapper::reorder_ncxhwx_imma_filter( IC, FH, FW, trans_oc); after_kernel_launch(); } - -template +template void megdnn::cuda::cutlass_wrapper::reorder_nhwc_imma_filter( int8_t* dst_filter, const int8_t* src_filter, uint32_t OC, uint32_t IC, - uint32_t FH, uint32_t FW, bool trans_oc, uint32_t oc_interleaved, - cudaStream_t stream) { - static constexpr uint32_t elements_per_access = alignbits / size_bits; - uint32_t nr_threads = - query_blocksize_for_kernel(reinterpret_cast( - reorder_nhwc_imma_filter_kernel)); + uint32_t FH, uint32_t FW, bool trans_oc, uint32_t alignbits, + uint32_t interleaved, cudaStream_t stream) { + const uint32_t elements_per_access = alignbits / size_bits; + + void (*kern)(int8_t* __restrict__, const int8_t* __restrict__, uint32_t, + uint32_t, uint32_t, uint32_t, bool); + kern = nullptr; + + auto get_kern = [&kern](const uint32_t alignbits, + const uint32_t interleaved) { +#define DISPATCH_KERNEL(alignbits_, interleaved_) \ + if (alignbits == alignbits_ && interleaved == interleaved_) { \ + kern = reorder_nhwc_imma_filter_kernel; \ + return; \ + } + DISPATCH_KERNEL(128, 16); + DISPATCH_KERNEL(64, 16); + DISPATCH_KERNEL(32, 16); + DISPATCH_KERNEL(128, 32); + DISPATCH_KERNEL(64, 32); + DISPATCH_KERNEL(32, 32); + DISPATCH_KERNEL(128, 64); + DISPATCH_KERNEL(64, 64); + DISPATCH_KERNEL(32, 64); + +#undef DISPATCH_KERNEL + }; + + get_kern(alignbits, interleaved); + + uint32_t nr_threads = query_blocksize_for_kernel(kern); uint32_t vthreads = DIVUP(OC * IC * FH * FW, elements_per_access); nr_threads = std::min(nr_threads, vthreads); uint32_t nr_blocks = DIVUP(vthreads, nr_threads); - if (oc_interleaved == 32) { - reorder_nhwc_imma_filter_kernel - <<>>( - dst_filter, src_filter, OC, IC, FH, FW, trans_oc); - } else { - reorder_nhwc_imma_filter_kernel - <<>>( - dst_filter, src_filter, OC, IC, FH, FW, trans_oc); - } + + kern<<>>(dst_filter, src_filter, OC, IC, + FH, FW, trans_oc); + after_kernel_launch(); } @@ -180,15 +200,14 @@ INST(8, 32) INST(4, 64) #undef INST -#define INST(_size_bits, _alignbits) \ - template void megdnn::cuda::cutlass_wrapper::reorder_nhwc_imma_filter< \ - _size_bits, _alignbits>( \ - int8_t * dst_filter, const int8_t* src_filter, uint32_t OC, \ - uint32_t IC, uint32_t FH, uint32_t FW, bool trans_oc, \ - uint32_t oc_interleaved, cudaStream_t stream); -INST(4, 32) -INST(4, 64) -INST(4, 128) +#define INST(_size_bits) \ + template void \ + megdnn::cuda::cutlass_wrapper::reorder_nhwc_imma_filter<_size_bits>( \ + int8_t * dst_filter, const int8_t* src_filter, uint32_t OC, \ + uint32_t IC, uint32_t FH, uint32_t FW, bool trans_oc, \ + uint32_t alignbits, uint32_t interleaved, cudaStream_t stream); +INST(4) +INST(8) #undef INST // vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/cutlass_reorder_filter.cuh b/dnn/src/cuda/conv_bias/cutlass_reorder_filter.cuh index dc46f9c60..03f74e902 100644 --- a/dnn/src/cuda/conv_bias/cutlass_reorder_filter.cuh +++ b/dnn/src/cuda/conv_bias/cutlass_reorder_filter.cuh @@ -23,11 +23,11 @@ void reorder_ncxhwx_imma_filter(int8_t* dst_filter, const int8_t* src_filter, uint32_t FW, bool trans_oc, cudaStream_t stream); -template +template void reorder_nhwc_imma_filter(int8_t* dst_filter, const int8_t* src_filter, uint32_t OC, uint32_t IC, uint32_t FH, - uint32_t FW, bool trans_oc, - uint32_t oc_interleaved, cudaStream_t stream); + uint32_t FW, bool trans_oc, uint32_t alignbits, + uint32_t interleaved, cudaStream_t stream); } // namespace cutlass_wrapper } // namespace cuda } // namespace megdnn diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp index cbe3bc5d4..8b5f1cda0 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp @@ -68,13 +68,27 @@ ConvBiasForwardImpl::AlgoInt4Int4NHWCIMMAImplicitGemm::get_constants( args.filter_layout->dtype.param().scale, bias_scale = args.bias_layout->dtype.param().scale, - dst_scale = args.dst_layout->dtype.param().scale; + dst_scale; + + if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4) { + dst_scale = args.dst_layout->dtype.param().scale; + } else { // DTypeEnum::QuantizedS8 + megdnn_assert(args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8); + dst_scale = args.dst_layout->dtype.param().scale; + } float alpha = src_scale * filter_scale / dst_scale, beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f, theta = 0.f; if (args.z_layout->ndim > 0) { - float z_scale = args.z_layout->dtype.param().scale; + float z_scale; + if (args.z_layout->dtype.enumv() == DTypeEnum::QuantizedS4) { + z_scale = args.z_layout->dtype.param().scale; + } else { // DTypeEnum::QuantizedS8 + megdnn_assert(args.z_layout->dtype.enumv() == + DTypeEnum::QuantizedS8); + z_scale = args.z_layout->dtype.param().scale; + } gamma = z_scale / dst_scale; } diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp index 5f4f11ef4..40bb0e82f 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp @@ -76,6 +76,14 @@ bool ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::is_available( if (fh * fw > kMaxFilterPixels) return false; + bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); + bool without_shared_load = true; + const auto* op = get_cutlass_conv_op( + args, ConvOperator::kFprop, ConvType::kConvolution, + use_conv_filter_unity_opt, without_shared_load); + if (op == nullptr) + return false; + return true; } @@ -110,7 +118,7 @@ void ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::exec( float dst_scale = 0.f; float threshold = 0.f; uint8_t src_zero = 0; - bool load_from_const = !(fh == 1 && fw == 1); + bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); bool without_shared_load = true; if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { @@ -126,7 +134,7 @@ void ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::exec( const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop, ConvType::kConvolution, - load_from_const, without_shared_load); + use_conv_filter_unity_opt, without_shared_load); execute_cutlass_conv_op(op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr, z_ptr, args.dst_tensor->raw_ptr, nullptr, n, hi, wi, diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp index 353e26d99..cf53fd39c 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp @@ -56,8 +56,11 @@ bool ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::is_available( if (args.src_layout->dtype.enumv() != src_dtype() || args.filter_layout->dtype.enumv() != DTypeEnum::QuantizedS4 || - args.bias_layout->dtype.enumv() != DTypeEnum::QuantizedS32 || - args.dst_layout->dtype.enumv() != src_dtype()) + args.bias_layout->dtype.enumv() != DTypeEnum::QuantizedS32) + return false; + + if (!(args.dst_layout->dtype.enumv() == src_dtype() || + args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8)) return false; // uint4 do not support H_SWISH activition @@ -83,6 +86,16 @@ bool ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::is_available( if ((co % 8 != 0) || (ci % m_algo_param.access_size != 0)) return false; + bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); + bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) && + (m_algo_param.threadblock_n == 32 || + m_algo_param.threadblock_n == 64)); + const auto* op = get_cutlass_conv_op( + args, ConvOperator::kFprop, ConvType::kConvolution, + use_conv_filter_unity_opt, without_shared_load); + if (op == nullptr) + return false; + return true; } @@ -117,26 +130,31 @@ void ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::exec( float dst_scale = 0.f; float threshold = 0.f; uint8_t src_zero = 0; - bool load_from_const = !(fh == 1 && fw == 1); + bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) && (m_algo_param.threadblock_n == 32 || m_algo_param.threadblock_n == 64)); + if (args.src_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { + src_zero = args.src_layout->dtype.param() + .zero_point; + } + if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { dst_scale = args.dst_layout->dtype.param().scale; - src_zero = args.src_layout->dtype.param() - .zero_point; - } else { // DTypeEnum::QuantizedS4 + } else if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4) { dst_scale = args.dst_layout->dtype.param().scale; + } else { // DTypeEnum::QuantizedS8 + dst_scale = args.dst_layout->dtype.param().scale; } cudaStream_t stream = cuda_stream(args.opr->handle()); const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop, ConvType::kConvolution, - load_from_const, without_shared_load); + use_conv_filter_unity_opt, without_shared_load); execute_cutlass_conv_op(op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr, z_ptr, args.dst_tensor->raw_ptr, nullptr, n, hi, wi, @@ -166,29 +184,18 @@ void ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::reorder_filter( cudaStream_t stream = cuda_stream(args.opr->handle()); // reformat filter from nhwc to ncxhwx and reorder oc - // use trans_oc threadblock_n must be 32 or 64 + // use trans_oc threadblock_n must be 32 or 64 and src dtype == dest dtype bool trans_oc = ((co % m_algo_param.threadblock_n == 0) && (m_algo_param.threadblock_n == 32 || m_algo_param.threadblock_n == 64)); - uint32_t oc_iterleave = (m_algo_param.threadblock_n == 64) ? 64 : 32; - - if (iterleaved == 8) { - cutlass_wrapper::reorder_nhwc_imma_filter<4, 32>( - reinterpret_cast(reordered_filter), - reinterpret_cast(args.filter_tensor->raw_ptr), co, ci, - fh, fw, trans_oc, oc_iterleave, stream); - } else if (iterleaved == 16) { - cutlass_wrapper::reorder_nhwc_imma_filter<4, 64>( - reinterpret_cast(reordered_filter), - reinterpret_cast(args.filter_tensor->raw_ptr), co, ci, - fh, fw, trans_oc, oc_iterleave, stream); - } else { - megdnn_assert(iterleaved == 32); - cutlass_wrapper::reorder_nhwc_imma_filter<4, 128>( - reinterpret_cast(reordered_filter), - reinterpret_cast(args.filter_tensor->raw_ptr), co, ci, - fh, fw, trans_oc, oc_iterleave, stream); - } + uint32_t oc_iterleaved = (m_algo_param.threadblock_n == 64) ? 64 : 32; + + uint32_t alignbits = iterleaved * 4; + + cutlass_wrapper::reorder_nhwc_imma_filter<4>( + reinterpret_cast(reordered_filter), + reinterpret_cast(args.filter_tensor->raw_ptr), co, ci, fh, + fw, trans_oc, alignbits, oc_iterleaved, stream); } #endif diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp index 4eda5cd70..f182f00bd 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp @@ -77,6 +77,14 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::is_available( // FIXME: too large filter size is not supported now size_t kMaxFilterPixels = 848 / (2 * m_algo_param.warp_k / 32) - 2; available &= fh * fw <= kMaxFilterPixels; + + bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); + bool without_shared_load = (param.format == Format::NCHW32); + const auto* op = get_cutlass_conv_op( + args, ConvOperator::kFprop, ConvType::kConvolution, + use_conv_filter_unity_opt, without_shared_load); + available &= (op != nullptr); + return available; } @@ -155,12 +163,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec( gamma = z_scale / dst_scale; } float delta = 0.f, theta = 0.f, threshold = 0.f; - bool load_from_const = !(fh == 1 && fw == 1); + bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); bool without_shared_load = (param.format == Format::NCHW32); const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop, ConvType::kConvolution, - load_from_const, without_shared_load); + use_conv_filter_unity_opt, without_shared_load); execute_cutlass_conv_op( op, args.src_tensor->raw_ptr, filter_ptr, args.bias_tensor->raw_ptr, diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp index 348c122d4..83a0c341b 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp @@ -98,7 +98,14 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available( // FIXME: too large filter size is not supported now size_t kMaxFilterPixels = 848 / (2 * m_algo_param.warp_k / 4) - 2; available &= fh * fw <= kMaxFilterPixels; - ; + + bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); + bool without_shared_load = false; + const auto* op = get_cutlass_conv_op( + args, ConvOperator::kFprop, ConvType::kConvolution, + use_conv_filter_unity_opt, without_shared_load); + available &= (op != nullptr); + return available; } @@ -213,12 +220,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( } } float threshold = 0.f; - bool load_from_const = !(fh == 1 && fw == 1); + bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); bool without_shared_load = false; - const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop, - ConvType::kConvolution, - load_from_const, without_shared_load); + const auto* op = get_cutlass_conv_op( + args, ConvOperator::kFprop, ConvType::kConvolution, + use_conv_filter_unity_opt, without_shared_load); execute_cutlass_conv_op( op, args.src_tensor->raw_ptr, filter_ptr, args.bias_tensor->raw_ptr, diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp new file mode 100644 index 000000000..8e35d1e9c --- /dev/null +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp @@ -0,0 +1,294 @@ +/** + * \file dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#include "src/common/conv_bias.h" +#include "src/cuda/conv_bias/algo.h" +#include "src/cuda/conv_bias/cutlass_reorder_filter.cuh" +#include "src/cuda/convolution_helper/parameter.cuh" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +#if CUDA_VERSION >= 10020 +bool ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::is_available( + const SizeArgs& args) const { + if (args.bias_layout->ndim <= 0) + return false; + + using Param = param::ConvBias; + using Format = Param::Format; + using Sparse = Param::Sparse; + using Mode = Param::Mode; + using NonlineMode = megdnn::param::ConvBias::NonlineMode; + + auto&& param = args.opr->param(); + + if (!check_bias_share_in_channel(*(args.bias_layout), param.format)) + return false; + + if (param.format != Format::NHWC || param.sparse != Sparse::DENSE || + param.mode != Mode::CROSS_CORRELATION) + return false; + + if (param.nonlineMode != NonlineMode::IDENTITY && + param.nonlineMode != NonlineMode::RELU && + param.nonlineMode != NonlineMode::H_SWISH) + return false; + + if (args.src_layout->dtype.enumv() != DTypeEnum::QuantizedS8 || + args.filter_layout->dtype.enumv() != DTypeEnum::QuantizedS8) + return false; + + auto dst_dtype = args.dst_layout->dtype.enumv(); + + if (!(dst_dtype == DTypeEnum::QuantizedS8 || + dst_dtype == DTypeEnum::QuantizedS4 || + dst_dtype == DTypeEnum::Quantized4Asymm || + dst_dtype == DTypeEnum::Float32)) + return false; + + if (!(args.bias_layout->dtype.enumv() == DTypeEnum::QuantizedS32 || + (args.bias_layout->dtype.enumv() == DTypeEnum::Float32 && + dst_dtype == DTypeEnum::Float32))) + return false; + + if (!is_compute_capability_required(7, 5)) + return false; + + size_t co = args.filter_layout->operator[](0), + ci = args.filter_layout->operator[](3), + fh = args.filter_layout->operator[](1), + fw = args.filter_layout->operator[](2); + + // param buffer size is 4K, use 3.4K to store precomputed offset + size_t kMaxFilterPixels = + 848 / (m_algo_param.warp_k / m_algo_param.access_size) - 1; + if (fh * fw > kMaxFilterPixels) + return false; + // co should be aligned with 4, and ci should be aligned with + // algo_param.access_size + if ((co % 4 != 0) || (ci % m_algo_param.access_size != 0)) + return false; + + bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); + bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) && + (m_algo_param.threadblock_n == 16 || + (m_algo_param.threadblock_n == 32 && + dst_dtype != DTypeEnum::Float32))); + const auto* op = get_cutlass_conv_op( + args, ConvOperator::kFprop, ConvType::kConvolution, + use_conv_filter_unity_opt, without_shared_load); + if (op == nullptr) + return false; + + return true; +} + +size_t +ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::get_workspace_in_bytes( + const SizeArgs& args) const { + if (args.preprocessed_filter) { + return 0; + } else { + return args.filter_layout->span().dist_byte(); + } +} + +size_t ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm:: + get_preprocess_workspace_in_bytes(const SizeArgs& args) const { + return 0; +} + +SmallVector ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm:: + deduce_preprocessed_filter_layout(const SizeArgs& args) const { + return {args.filter_layout->collapse_contiguous()}; +} + +void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::exec_preprocess( + const ExecArgs& args) const { + void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr; + reorder_filter(args, m_algo_param.access_size, filter_ptr); +} + +std::tuple +ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::get_constants( + const ExecArgs& args) const { + float src_scale = args.src_layout->dtype.param().scale, + filter_scale = + args.filter_layout->dtype.param().scale, + bias_scale = 1.f, dst_scale; + + if (args.bias_layout->dtype.enumv() == DTypeEnum::QuantizedS32) { + bias_scale = args.bias_layout->dtype.param().scale; + } + + uint8_t dst_zero = 0; + + if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8) { + dst_scale = args.dst_layout->dtype.param().scale; + } else if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4) { + dst_scale = args.dst_layout->dtype.param().scale; + } else if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { + dst_scale = + args.dst_layout->dtype.param().scale; + dst_zero = args.dst_layout->dtype.param() + .zero_point; + } else { // DTypeEnum::Float32 + megdnn_assert(args.dst_layout->dtype.enumv() == DTypeEnum::Float32); + dst_scale = 1.f; + } + + float alpha = src_scale * filter_scale / dst_scale, + beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f, + theta = dst_zero; + + if (args.z_layout->ndim > 0) { + float z_scale; + if (args.z_layout->dtype.enumv() == DTypeEnum::QuantizedS8) { + z_scale = args.z_layout->dtype.param().scale; + gamma = z_scale / dst_scale; + } else if (args.z_layout->dtype.enumv() == DTypeEnum::QuantizedS4) { + z_scale = args.z_layout->dtype.param().scale; + gamma = z_scale / dst_scale; + } else if (args.z_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { + z_scale = + args.z_layout->dtype.param().scale; + uint8_t z_zero = + args.z_layout->dtype.param() + .zero_point; + gamma = z_scale / dst_scale; + delta = -z_zero * gamma; + } else { // DTypeEnum::Float32 + megdnn_assert(args.z_layout->dtype.enumv() == DTypeEnum::Float32); + gamma = 1.f; + } + } + + if (args.opr->param().nonlineMode == + param::ConvBias::NonlineMode::IDENTITY) { + delta += theta; + theta = 0.f; + } + + return {alpha, beta, gamma, delta, theta}; +} + +void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::exec( + const ExecArgs& args) const { + auto&& param = args.opr->param(); + auto&& fm = args.filter_meta; + size_t n = args.src_layout->operator[](0), + ci = args.src_layout->operator[](3), + hi = args.src_layout->operator[](1), + wi = args.src_layout->operator[](2); + size_t co = args.dst_layout->operator[](3), + ho = args.dst_layout->operator[](1), + wo = args.dst_layout->operator[](2); + UNPACK_CONV_PARAMETER(fm, param); + MARK_USED_VAR + + void* filter_ptr = nullptr; + void* bias_ptr = nullptr; + void* z_ptr = nullptr; + + if (args.preprocessed_filter) { + filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr; + } else { + filter_ptr = reinterpret_cast(args.workspace.raw_ptr); + reorder_filter(args, m_algo_param.access_size, filter_ptr); + } + bias_ptr = args.bias_tensor->raw_ptr; + + if (args.z_layout->ndim > 0) + z_ptr = args.z_tensor->raw_ptr; + + // \note these constants of cutlass epilogue will be passed to method + // `execute_cutlass_conv_op` by pointer and interpreted as ElementCompute*, + // a different dtype here results in undefined epilogue behaviors + float alpha, beta, gamma, delta, theta; + std::tie(alpha, beta, gamma, delta, theta) = get_constants(args); + + float dst_scale = 1.f; + float threshold = 0.f; + bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); + + auto dst_dtype = args.dst_layout->dtype.enumv(); + + bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) && + (m_algo_param.threadblock_n == 16 || + (m_algo_param.threadblock_n == 32 && + dst_dtype != DTypeEnum::Float32))); + + if (dst_dtype == DTypeEnum::QuantizedS8) { // DTypeEnum::QuantizedS8 + dst_scale = args.dst_layout->dtype.param().scale; + } else if (dst_dtype == DTypeEnum::QuantizedS4) { + dst_scale = args.dst_layout->dtype.param().scale; + } else if (dst_dtype == DTypeEnum::Quantized4Asymm) { + dst_scale = + args.dst_layout->dtype.param().scale; + } else { // DTypeEnum::Float32 + dst_scale = 1.f; + } + + cudaStream_t stream = cuda_stream(args.opr->handle()); + + const auto* op = get_cutlass_conv_op( + args, ConvOperator::kFprop, ConvType::kConvolution, + use_conv_filter_unity_opt, without_shared_load); + + execute_cutlass_conv_op(op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr, + z_ptr, args.dst_tensor->raw_ptr, nullptr, n, hi, wi, + ci, co, fh, fw, ho, wo, ph, pw, sh, sw, dh, dw, + &alpha, &beta, &gamma, &delta, &theta, &threshold, + &dst_scale, stream); + + after_kernel_launch(); +} + +std::string ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::to_string( + AlgoParam algo_param) { + return ssprintf("%dX%dX%d_%dX%dX%d_%d_%d", algo_param.threadblock_m, + algo_param.threadblock_n, algo_param.threadblock_k, + algo_param.warp_m, algo_param.warp_n, algo_param.warp_k, + algo_param.stage, algo_param.access_size); +} + +void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::reorder_filter( + const ExecArgs& args, const int iterleaved, + void* reordered_filter) const { + size_t co = args.filter_layout->operator[](0), + ci = args.filter_layout->operator[](3), + fh = args.filter_layout->operator[](1), + fw = args.filter_layout->operator[](2); + + cudaStream_t stream = cuda_stream(args.opr->handle()); + + // reformat filter from nhwc to ncxhwx and reorder oc + // use trans_oc threadblock_n must be 16 or 32 and src dtype == dest dtype + bool trans_oc = ((co % m_algo_param.threadblock_n == 0) && + (m_algo_param.threadblock_n == 16 || + (m_algo_param.threadblock_n == 32 && + args.dst_layout->dtype.enumv() != DTypeEnum::Float32))); + uint32_t oc_iterleaved = (m_algo_param.threadblock_n == 32) ? 32 : 16; + + uint32_t alignbits = iterleaved * 8; + + cutlass_wrapper::reorder_nhwc_imma_filter<8>( + reinterpret_cast(reordered_filter), + reinterpret_cast(args.filter_tensor->raw_ptr), co, ci, fh, + fw, trans_oc, alignbits, oc_iterleaved, stream); +} +#endif + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp index d271449e2..73bb4b170 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp @@ -102,22 +102,41 @@ ConvBiasForwardImpl::AlgoUInt4Int4NHWCIMMAImplicitGemm::get_constants( args.filter_layout->dtype.param().scale, bias_scale = args.bias_layout->dtype.param().scale, - dst_scale = - args.dst_layout->dtype.param().scale; + dst_scale; + + uint8_t dst_zero = 0; + + if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { + dst_scale = + args.dst_layout->dtype.param().scale; + + dst_zero = args.dst_layout->dtype.param() + .zero_point; + } else { // DTypeEnum::QuantizedS8 + megdnn_assert(args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8); + dst_scale = args.dst_layout->dtype.param().scale; + } - uint8_t dst_zero = - args.dst_layout->dtype.param().zero_point; float alpha = src_scale * filter_scale / dst_scale, beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f, theta = dst_zero; if (args.z_layout->ndim > 0) { - float z_scale = - args.z_layout->dtype.param().scale; - gamma = z_scale / dst_scale; - uint8_t z_zero = - args.z_layout->dtype.param().zero_point; - delta = -z_zero * gamma; + float z_scale; + if (args.z_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { + z_scale = + args.z_layout->dtype.param().scale; + uint8_t z_zero = + args.z_layout->dtype.param() + .zero_point; + gamma = z_scale / dst_scale; + delta = -z_zero * gamma; + } else { // DTypeEnum::QuantizedS8 + megdnn_assert(args.z_layout->dtype.enumv() == + DTypeEnum::QuantizedS8); + z_scale = args.z_layout->dtype.param().scale; + gamma = z_scale / dst_scale; + } } // identity epilogue has no theta: diff --git a/dnn/src/cuda/conv_bias/opr_impl.h b/dnn/src/cuda/conv_bias/opr_impl.h index 4af7896b7..d6ea710ed 100644 --- a/dnn/src/cuda/conv_bias/opr_impl.h +++ b/dnn/src/cuda/conv_bias/opr_impl.h @@ -65,6 +65,7 @@ public: class AlgoInt8CHWN4IMMAImplicitGemmReorderFilter; class AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth; class AlgoInt8NCHW32IMMAImplicitGemm; + class AlgoInt8NHWCIMMAImplicitGemm; class AlgoInt4NCHW64IMMAImplicitGemmBase; class AlgoInt4Int4NCHW64IMMAImplicitGemm; class AlgoUInt4Int4NCHW64IMMAImplicitGemm; diff --git a/dnn/src/cuda/convolution/backward_data/algo.h b/dnn/src/cuda/convolution/backward_data/algo.h index e9b354547..f87479e5b 100644 --- a/dnn/src/cuda/convolution/backward_data/algo.h +++ b/dnn/src/cuda/convolution/backward_data/algo.h @@ -275,6 +275,7 @@ public: private: WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, const SizeArgs& args) const; + const void* get_available_op(const SizeArgs& args) const; AlgoParam m_algo_param; std::string m_name; }; @@ -295,6 +296,7 @@ public: private: WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, const SizeArgs& args) const; + const void* get_available_op(const SizeArgs& args) const; }; class ConvolutionBackwardDataImpl::AlgoPack : NonCopyableObj { diff --git a/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp b/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp index c4f631e6e..fc4107dbf 100644 --- a/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp +++ b/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp @@ -20,6 +20,43 @@ using namespace megdnn; using namespace cuda; +const void* +ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::get_available_op( + const SizeArgs& args) const { + using namespace cutlass::library; + auto&& fm = args.filter_meta; + size_t sh = fm.stride[0], sw = fm.stride[1]; + cutlass::conv::SpecialOptimizeDesc special_optimization = + (sh == 2 && sw == 2) ? cutlass::conv::SpecialOptimizeDesc:: + DECONV_DOUBLE_UPSAMPLING + : cutlass::conv::SpecialOptimizeDesc::NONE; + ConvolutionKey key{ + cutlass::conv::Operator::kDgrad, + NumericTypeID::kS8, + LayoutTypeID::kTensorNC4HW4, + NumericTypeID::kS8, + LayoutTypeID::kTensorK4RSC4, + NumericTypeID::kS8, + LayoutTypeID::kTensorNC4HW4, + NumericTypeID::kS32, + LayoutTypeID::kTensorNC4HW4, + cutlass::conv::ConvType::kConvolution, + m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k, + m_algo_param.warp_m, + m_algo_param.warp_n, + m_algo_param.warp_k, + 1, + 1, + 4, + cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp, + m_algo_param.stage, + special_optimization, + false}; + return (void*)Singleton::get().operation_table.find_op(key); +} + bool ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm:: is_available(const SizeArgs& args) const { auto&& fm = args.filter_meta; @@ -51,6 +88,7 @@ bool ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm:: // FIXME: too large filter size is not supported now available &= fm.spatial[0] * fm.spatial[1] <= (uint32_t)(848 / (2 * m_algo_param.warp_k / 4) - 2); + available &= (get_available_op(args) != nullptr); // only support sm_61 or later, platform should have fast native int8 // support available &= is_compute_capability_required(6, 1); @@ -105,40 +143,14 @@ void ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( args.grad_layout->dtype.param().scale; // \note these constants of cutlass epilogue will be passed to struct - // `ConvolutionArguments` by pointer and interpreted as ElementCompute*, a - // different dtype here results in undefined epilogue behaviors + // `ConvolutionArguments` by pointer and interpreted as ElementCompute*, + // a different dtype here results in undefined epilogue behaviors float alpha = diff_scale * filter_scale / grad_scale, beta = 0.f, gamma = 0.f, delta = 0.f; using namespace cutlass::library; - // only use 16x64x8_16x64x8_2stages impl - ConvolutionKey key{ - cutlass::conv::Operator::kDgrad, - NumericTypeID::kS8, - LayoutTypeID::kTensorNC4HW4, - NumericTypeID::kS8, - LayoutTypeID::kTensorK4RSC4, - NumericTypeID::kS8, - LayoutTypeID::kTensorNC4HW4, - NumericTypeID::kS32, - LayoutTypeID::kTensorNC4HW4, - cutlass::conv::ConvType::kConvolution, - m_algo_param.threadblock_m, - m_algo_param.threadblock_n, - m_algo_param.threadblock_k, - m_algo_param.warp_m, - m_algo_param.warp_n, - m_algo_param.warp_k, - 1, - 1, - 4, - cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp, - m_algo_param.stage, - true, - false}; - - const Operation* op = Singleton::get().operation_table.find_op(key); + const Operation* op = (const Operation*)get_available_op(args); // gcc prints warnings when size_t values are implicitly narrowed to int cutlass::conv::Conv2dProblemSize problem_size{ @@ -167,7 +179,6 @@ void ConvolutionBackwardDataImpl::AlgoPack::fill_int8_dp4a_algos() { int8_nchw4_dotprod.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2}); int8_nchw4_dotprod.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1}); int8_nchw4_dotprod.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2}); - int8_nchw4_dotprod.emplace_back(AlgoParam{64, 128, 32, 64, 32, 32, 2}); } // vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp b/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp index f45188028..50ebde323 100644 --- a/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp +++ b/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp @@ -19,6 +19,44 @@ using namespace megdnn; using namespace cuda; +const void* +ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::get_available_op( + const SizeArgs& args) const { + using namespace cutlass::library; + auto&& fm = args.filter_meta; + size_t sh = fm.stride[0], sw = fm.stride[1]; + cutlass::conv::SpecialOptimizeDesc special_optimization = + (sh == 2 && sw == 2) ? cutlass::conv::SpecialOptimizeDesc:: + DECONV_DOUBLE_UPSAMPLING + : cutlass::conv::SpecialOptimizeDesc::NONE; + // only use 16x64x8_16x64x8_2stages impl + ConvolutionKey key{ + cutlass::conv::Operator::kDgrad, + NumericTypeID::kS8, + LayoutTypeID::kTensorNC4HW4, + NumericTypeID::kS8, + LayoutTypeID::kTensorK4RSC4, + NumericTypeID::kS8, + LayoutTypeID::kTensorNC4HW4, + NumericTypeID::kS32, + LayoutTypeID::kTensorNC4HW4, + cutlass::conv::ConvType::kConvolution, + 16, + 64, + 8, + 16, + 64, + 8, + 1, + 1, + 4, + cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp, + 2, + special_optimization, + false}; + return (void*)Singleton::get().operation_table.find_op(key); +} + bool ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::is_available( const SizeArgs& args) const { auto&& fm = args.filter_meta; @@ -52,6 +90,9 @@ bool ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::is_available( available &= (fm.dilation[0] == 1 && fm.dilation[1] == 1); // FIXME: too large filter size is not supported now available &= fm.spatial[0] * fm.spatial[1] <= (848 / (2 * 8 / 4) - 2); + + available &= (get_available_op(args) != nullptr); + // only support sm_61 or later, platform should have fast native int8 // support available &= is_compute_capability_required(6, 1); @@ -138,33 +179,7 @@ void ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::exec( using namespace cutlass::library; - // only use 16x64x8_16x64x8_2stages impl - ConvolutionKey key{ - cutlass::conv::Operator::kDgrad, - NumericTypeID::kS8, - LayoutTypeID::kTensorNC4HW4, - NumericTypeID::kS8, - LayoutTypeID::kTensorK4RSC4, - NumericTypeID::kS8, - LayoutTypeID::kTensorNC4HW4, - NumericTypeID::kS32, - LayoutTypeID::kTensorNC4HW4, - cutlass::conv::ConvType::kConvolution, - 16, - 64, - 8, - 16, - 64, - 8, - 1, - 1, - 4, - cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp, - 2, - true, - false}; - - const Operation* op = Singleton::get().operation_table.find_op(key); + const Operation* op = (const Operation*)get_available_op(args); // gcc prints warnings when size_t values are implicitly narrowed to int cutlass::conv::Conv2dProblemSize problem_size{ diff --git a/dnn/src/cuda/cutlass/convolution_operation.h b/dnn/src/cuda/cutlass/convolution_operation.h index 663bf8b9c..c2405b8b6 100644 --- a/dnn/src/cuda/cutlass/convolution_operation.h +++ b/dnn/src/cuda/cutlass/convolution_operation.h @@ -119,8 +119,8 @@ public: m_description.threadblock_swizzle = ThreadblockSwizzleMap< typename Operator::ThreadblockSwizzle>::kId; - m_description.need_load_from_const_mem = - Operator::kNeedLoadFromConstMem; + m_description.special_optimization = + Operator::kSpecialOpt; m_description.gemm_mode = Operator::kGemmMode; m_description.without_shared_load = Operator::kWithoutSharedLoad; } diff --git a/dnn/src/cuda/cutlass/library.h b/dnn/src/cuda/cutlass/library.h index 9907fad1b..b9dd76a28 100644 --- a/dnn/src/cuda/cutlass/library.h +++ b/dnn/src/cuda/cutlass/library.h @@ -487,7 +487,7 @@ struct ConvolutionDescription : public OperationDescription { ThreadblockSwizzleID threadblock_swizzle; - bool need_load_from_const_mem; + conv::SpecialOptimizeDesc special_optimization; conv::ImplicitGemmMode gemm_mode; bool without_shared_load; }; diff --git a/dnn/src/cuda/cutlass/operation_table.cpp b/dnn/src/cuda/cutlass/operation_table.cpp index 166559180..2da08ea35 100644 --- a/dnn/src/cuda/cutlass/operation_table.cpp +++ b/dnn/src/cuda/cutlass/operation_table.cpp @@ -124,7 +124,7 @@ ConvolutionKey get_convolution_key_from_desc( key.epilogue_type = desc.epilogue_type; key.stages = desc.tile_description.threadblock_stages; - key.need_load_from_const_mem = desc.need_load_from_const_mem; + key.special_optimization = desc.special_optimization; key.without_shared_load = desc.without_shared_load; return key; @@ -156,23 +156,25 @@ void OperationTable::append(Manifest const& manifest) { ///////////////////////////////////////////////////////////////////////////////////////////////// Operation const* OperationTable::find_op(GemmKey const& key) const { - megdnn_assert(gemm_operations.count(key) > 0, - "key not found in cutlass operation table"); - auto const& ops = gemm_operations.at(key); - megdnn_assert(ops.size() == 1, "exactly one kernel expected, got %zu", - ops.size()); - return ops[0]; + if (gemm_operations.count(key)) { + auto const& ops = gemm_operations.at(key); + megdnn_assert(ops.size() == 1, "exactly one kernel expected, got %zu", + ops.size()); + return ops[0]; + } + return nullptr; } ///////////////////////////////////////////////////////////////////////////////////////////////// Operation const* OperationTable::find_op(ConvolutionKey const& key) const { - megdnn_assert(convolution_operations.count(key) > 0, - "key not found in cutlass operation table"); - auto const& ops = convolution_operations.at(key); - megdnn_assert(ops.size() == 1, "exactly one kernel expected, got %zu", - ops.size()); - return ops[0]; + if (convolution_operations.count(key) > 0) { + auto const& ops = convolution_operations.at(key); + megdnn_assert(ops.size() == 1, "exactly one kernel expected, got %zu", + ops.size()); + return ops[0]; + } + return nullptr; } ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/dnn/src/cuda/cutlass/operation_table.h b/dnn/src/cuda/cutlass/operation_table.h index 420e1e55c..1fef2ff78 100644 --- a/dnn/src/cuda/cutlass/operation_table.h +++ b/dnn/src/cuda/cutlass/operation_table.h @@ -211,7 +211,7 @@ struct ConvolutionKey { epilogue::EpilogueType epilogue_type; int stages; - bool need_load_from_const_mem; + conv::SpecialOptimizeDesc special_optimization; bool without_shared_load; inline bool operator==(ConvolutionKey const& rhs) const { @@ -234,7 +234,7 @@ struct ConvolutionKey { (instruction_shape_n == rhs.instruction_shape_n) && (instruction_shape_k == rhs.instruction_shape_k) && (epilogue_type == rhs.epilogue_type) && (stages == rhs.stages) && - (need_load_from_const_mem == rhs.need_load_from_const_mem) && + (special_optimization == rhs.special_optimization) && (without_shared_load == rhs.without_shared_load); } @@ -270,8 +270,8 @@ struct ConvolutionKey { "\n instruction_shape: " + instruction_shape_str + "\n epilogue_type: " + to_string(epilogue_type) + "\n stages: " + std::to_string(stages) + - "\n need_load_from_const_mem: " + - to_string(need_load_from_const_mem) + + "\n special_optimization: " + + to_string(special_optimization) + "\n without_shared_load: " + to_string(without_shared_load) + "\n}"; } @@ -308,8 +308,8 @@ struct ConvolutionKeyHasher { sizeof(key.instruction_shape_k)) .update(&key.epilogue_type, sizeof(key.epilogue_type)) .update(&key.stages, sizeof(key.stages)) - .update(&key.need_load_from_const_mem, - sizeof(key.need_load_from_const_mem)) + .update(&key.special_optimization, + sizeof(key.special_optimization)) .update(&key.without_shared_load, sizeof(key.without_shared_load)) .digest(); diff --git a/dnn/src/cuda/cutlass/util.cu b/dnn/src/cuda/cutlass/util.cu index 11e1d3014..6efcfa980 100644 --- a/dnn/src/cuda/cutlass/util.cu +++ b/dnn/src/cuda/cutlass/util.cu @@ -1566,6 +1566,35 @@ char const* to_string(MathOperationID math_op, bool pretty) { /////////////////////////////////////////////////////////////////////////////////////////////////// +static struct { + char const* text; + char const* pretty; + conv::SpecialOptimizeDesc enumerant; +} SpecialOptimizeDesc_enumerants[] = { + {"none_special_opt", "NoneSpecialOpt", conv::SpecialOptimizeDesc::NONE}, + {"conv_filter_unity", "ConvFilterUnity", + conv::SpecialOptimizeDesc::CONV_FILTER_UNITY}, + {"deconv_double_upsampling", "DeconvDoubleUpsampling", + conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING}, +}; + +/// Converts an SpecialOptimizeDesc enumerant to a string +char const* to_string(conv::SpecialOptimizeDesc special_opt, bool pretty) { + for (auto const& possible : SpecialOptimizeDesc_enumerants) { + if (special_opt == possible.enumerant) { + if (pretty) { + return possible.pretty; + } else { + return possible.text; + } + } + } + + return pretty ? "Invalid" : "invalid"; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + static struct { char const* text; char const* pretty; diff --git a/dnn/src/cuda/cutlass/util.h b/dnn/src/cuda/cutlass/util.h index cb6e6f999..da36393e2 100644 --- a/dnn/src/cuda/cutlass/util.h +++ b/dnn/src/cuda/cutlass/util.h @@ -207,6 +207,10 @@ char const* to_string(bool val, bool pretty = false); /// Converts a MathOperationID enumerant to a string char const* to_string(MathOperationID math_op, bool pretty = false); +/// Converts a SpecialOptimizeDesc enumerant to a string +char const* to_string(conv::SpecialOptimizeDesc special_opt, + bool pretty = false); + /// Converts an ImplicitGemmMode enumerant to a string char const* to_string(conv::ImplicitGemmMode mode, bool pretty = false); diff --git a/dnn/src/cuda/matrix_mul/algos.h b/dnn/src/cuda/matrix_mul/algos.h index aa2807ebc..df2c23cc5 100644 --- a/dnn/src/cuda/matrix_mul/algos.h +++ b/dnn/src/cuda/matrix_mul/algos.h @@ -235,6 +235,7 @@ public: m_name{ssprintf("CUTLASS_FLOAT32_SIMT_%s", m_algo_param.to_string().c_str())} {} bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; const char* name() const override { return m_name.c_str(); } AlgoAttribute attribute() const override { @@ -260,6 +261,7 @@ private: void do_exec(const ExecArgs& args) const override; int min_alignment_requirement() const override { return 1; } std::string m_name; + const void* get_available_op(const SizeArgs& args) const; }; class MatrixMulForwardImpl::AlgoFloat32SIMTSplitK final @@ -270,6 +272,7 @@ public: m_name{ssprintf("CUTLASS_FLOAT32_SIMT_SPLIT_K_%s", m_algo_param.to_string().c_str())} {} bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; const char* name() const override { return m_name.c_str(); } AlgoAttribute attribute() const override { @@ -297,6 +300,7 @@ private: void do_exec(const ExecArgs& args) const override; int min_alignment_requirement() const override { return 1; } std::string m_name; + const void* get_available_op(const SizeArgs& args) const; }; class MatrixMulForwardImpl::AlgoFloat32SIMTGemvBatchedStrided final diff --git a/dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp b/dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp index 7278202d2..96d35303a 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp +++ b/dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp @@ -19,6 +19,39 @@ using namespace megdnn; using namespace cuda; +const void* MatrixMulForwardImpl::AlgoFloat32SIMT::get_available_op( + const SizeArgs& args) const { + using namespace cutlass::library; + auto&& param = args.opr->param(); + auto layoutA = param.transposeA ? LayoutTypeID::kColumnMajor + : LayoutTypeID::kRowMajor; + auto layoutB = param.transposeB ? LayoutTypeID::kColumnMajor + : LayoutTypeID::kRowMajor; + + int alignment = min_alignment_requirement(); + GemmKey key{NumericTypeID::kF32, + layoutA, + NumericTypeID::kF32, + layoutB, + NumericTypeID::kF32, + LayoutTypeID::kRowMajor, + NumericTypeID::kF32, + m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k, + m_algo_param.warp_m, + m_algo_param.warp_n, + m_algo_param.warp_k, + 1, + 1, + 1, + 2, + alignment, + alignment, + SplitKMode::kNone}; + return (void*)Singleton::get().operation_table.find_op(key); +} + bool MatrixMulForwardImpl::AlgoFloat32SIMT::is_available( const SizeArgs& args) const { bool available = @@ -34,6 +67,8 @@ bool MatrixMulForwardImpl::AlgoFloat32SIMT::is_available( m_algo_param.threadblock_n <= y_grid_limit); + available &= (get_available_op(args) != nullptr); + return available; } @@ -61,34 +96,7 @@ void MatrixMulForwardImpl::AlgoFloat32SIMT::do_exec( using namespace cutlass::library; - auto layoutA = param.transposeA ? LayoutTypeID::kColumnMajor - : LayoutTypeID::kRowMajor; - auto layoutB = param.transposeB ? LayoutTypeID::kColumnMajor - : LayoutTypeID::kRowMajor; - - int alignment = min_alignment_requirement(); - GemmKey key{NumericTypeID::kF32, - layoutA, - NumericTypeID::kF32, - layoutB, - NumericTypeID::kF32, - LayoutTypeID::kRowMajor, - NumericTypeID::kF32, - m_algo_param.threadblock_m, - m_algo_param.threadblock_n, - m_algo_param.threadblock_k, - m_algo_param.warp_m, - m_algo_param.warp_n, - m_algo_param.warp_k, - 1, - 1, - 1, - 2, - alignment, - alignment, - SplitKMode::kNone}; - - const Operation* op = Singleton::get().operation_table.find_op(key); + const Operation* op = (const Operation*)get_available_op(args); GemmArguments gemm_args{problem_size, args.tensor_a.raw_ptr, diff --git a/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp b/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp index dfbc3a069..d2128a1ac 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp +++ b/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp @@ -19,6 +19,39 @@ using namespace megdnn; using namespace cuda; +const void* MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::get_available_op( + const SizeArgs& args) const { + using namespace cutlass::library; + auto&& param = args.opr->param(); + auto layoutA = param.transposeA ? LayoutTypeID::kColumnMajor + : LayoutTypeID::kRowMajor; + auto layoutB = param.transposeB ? LayoutTypeID::kColumnMajor + : LayoutTypeID::kRowMajor; + + int alignment = min_alignment_requirement(); + GemmKey key{NumericTypeID::kF32, + layoutA, + NumericTypeID::kF32, + layoutB, + NumericTypeID::kF32, + LayoutTypeID::kRowMajor, + NumericTypeID::kF32, + m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k, + m_algo_param.warp_m, + m_algo_param.warp_n, + m_algo_param.warp_k, + 1, + 1, + 1, + 2, + alignment, + alignment, + SplitKMode::kParallel}; + return (void*)Singleton::get().operation_table.find_op(key); +} + bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available( const SizeArgs& args) const { auto&& param = args.opr->param(); @@ -35,6 +68,8 @@ bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available( available &= ((m + m_algo_param.threadblock_m - 1) / m_algo_param.threadblock_m <= y_grid_limit); + available &= (get_available_op(args) != nullptr); + return available; } @@ -66,35 +101,7 @@ void MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::do_exec( float alpha = 1.f, beta = 0.f; using namespace cutlass::library; - - auto layoutA = param.transposeA ? LayoutTypeID::kColumnMajor - : LayoutTypeID::kRowMajor; - auto layoutB = param.transposeB ? LayoutTypeID::kColumnMajor - : LayoutTypeID::kRowMajor; - - int alignment = min_alignment_requirement(); - GemmKey key{NumericTypeID::kF32, - layoutA, - NumericTypeID::kF32, - layoutB, - NumericTypeID::kF32, - LayoutTypeID::kRowMajor, - NumericTypeID::kF32, - m_algo_param.threadblock_m, - m_algo_param.threadblock_n, - m_algo_param.threadblock_k, - m_algo_param.warp_m, - m_algo_param.warp_n, - m_algo_param.warp_k, - 1, - 1, - 1, - 2, - alignment, - alignment, - SplitKMode::kParallel}; - - Operation const* op = Singleton::get().operation_table.find_op(key); + const Operation* op = (const Operation*)get_available_op(args); GemmArguments gemm_args{problem_size, args.tensor_a.raw_ptr, diff --git a/dnn/test/cuda/conv_bias_int8.cpp b/dnn/test/cuda/conv_bias_int8.cpp index 8f2da45ce..aaabe442e 100644 --- a/dnn/test/cuda/conv_bias_int8.cpp +++ b/dnn/test/cuda/conv_bias_int8.cpp @@ -882,6 +882,125 @@ TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_IMMA) { ConvBias::DirectParam{}); check(algo); } + +TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC) { + require_compute_capability(7, 5); + Checker checker(handle_cuda()); + auto check = [&checker](const std::string& algo) { + checker.set_before_exec_callback( + conv_bias::ConvBiasAlgoChecker(algo.c_str())); + UniformIntRNG rng{-8, 8}; + UniformIntRNG bias_rng{-50, 50}; + checker.set_rng(0, &rng) + .set_rng(1, &rng) + .set_rng(2, &bias_rng) + .set_rng(3, &rng) + .set_dtype(0, dtype::QuantizedS8{1.2f}) + .set_dtype(1, dtype::QuantizedS8{1.3f}) + .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f}) + .set_dtype(3, dtype::QuantizedS8{19.990229f}) + .set_dtype(4, dtype::QuantizedS8{19.990228f}) + .set_epsilon(1e-3); + param::ConvBias param; + param.pad_h = param.pad_w = 1; + param.stride_h = param.stride_w = 1; + param.format = param::ConvBias::Format::NHWC; + checker.set_param(param).execs( + {{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}}); + param.pad_h = param.pad_w = 0; + param.nonlineMode = param::ConvBias::NonlineMode::RELU; + checker.set_param(param).execs( + {{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}}); + }; + std::string algo = ConvBias::algo_name( + "INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16", + ConvBias::DirectParam{}); + check(algo); + algo = ConvBias::algo_name( + "INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16", + ConvBias::DirectParam{}); + check(algo); +} + +TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC_UINT4_WEIGHT_PREPROCESS) { + require_compute_capability(7, 5); + Checker> checker( + handle_cuda()); + auto check = [&checker](const std::string& algo) { + checker.set_before_exec_callback( + conv_bias::ConvBiasAlgoChecker(algo.c_str())); + UniformIntRNG rng{-8, 8}; + UniformIntRNG bias_rng{-50, 50}; + UniformIntRNG rng_u4{0, 15}; + checker.set_rng(0, &rng) + .set_rng(1, &rng) + .set_rng(2, &bias_rng) + .set_rng(3, &rng_u4) + .set_dtype(0, dtype::QuantizedS8{0.2f}) + .set_dtype(1, dtype::QuantizedS8{0.3f}) + .set_dtype(2, dtype::QuantizedS32{0.2f * 0.3f}) + .set_dtype(3, dtype::Quantized4Asymm{0.5f, 8}) + .set_dtype(4, dtype::Quantized4Asymm{0.5f, 4}) + .set_epsilon(1 + 1e-3); + param::ConvBias param; + param.pad_h = param.pad_w = 1; + param.stride_h = param.stride_w = 1; + param.format = param::ConvBias::Format::NHWC; + checker.set_param(param).execs( + {{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}}); + param.pad_h = param.pad_w = 0; + param.nonlineMode = param::ConvBias::NonlineMode::RELU; + checker.set_param(param).execs( + {{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}}); + }; + std::string algo = ConvBias::algo_name( + "INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16", + ConvBias::DirectParam{}); + check(algo); + algo = ConvBias::algo_name( + "INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16", + ConvBias::DirectParam{}); + check(algo); +} + +TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC_FLOAT) { + require_compute_capability(7, 5); + Checker checker(handle_cuda()); + auto check = [&checker](const std::string& algo) { + checker.set_before_exec_callback( + conv_bias::ConvBiasAlgoChecker(algo.c_str())); + UniformIntRNG rng{-8, 8}; + UniformFloatRNG float_rng{-50, 50}; + checker.set_rng(0, &rng) + .set_rng(1, &rng) + .set_rng(2, &float_rng) + .set_rng(3, &float_rng) + .set_dtype(0, dtype::QuantizedS8(1.9980618f)) + .set_dtype(1, dtype::QuantizedS8(1.9980927f)) + .set_dtype(2, dtype::Float32()) + .set_dtype(3, dtype::Float32()) + .set_dtype(4, dtype::Float32()); + param::ConvBias param; + param.pad_h = param.pad_w = 1; + param.stride_h = param.stride_w = 1; + param.format = param::ConvBias::Format::NHWC; + checker.set_param(param).execs( + {{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}}); + param.pad_h = param.pad_w = 0; + param.nonlineMode = param::ConvBias::NonlineMode::RELU; + checker.set_param(param).execs( + {{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}}); + }; + std::string algo = ConvBias::algo_name( + "INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16", + ConvBias::DirectParam{}); + check(algo); + algo = ConvBias::algo_name( + "INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16", + ConvBias::DirectParam{}); + check(algo); +} + #endif TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW4_NCHW) { @@ -969,7 +1088,7 @@ TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_NCHW4) { checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker< ConvBiasForward>( ConvBias::algo_name( - "INT8_NCHW32_IMMA_IMPLICIT_GEMM_128X128X64_64X64X64_2", + "INT8_NCHW32_IMMA_IMPLICIT_GEMM_32X128X32_32X64X32_1", ConvBias::DirectParam{}) .c_str())); checker.set_dtype(0, dtype::QuantizedS8(1.9980618f)) @@ -1109,6 +1228,16 @@ TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32) { "DIRECT:INT8_NCHW32_IMMA_IMPLICIT_GEMM", param::ConvBias::Format::NCHW32); } + +TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NHWC) { + require_compute_capability(7, 5); + benchmark_target_algo_with_cudnn_tsc( + handle_cuda(), get_det_first_bench_args(16), + dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f}, + dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f}, + "DIRECT:INT8_NHWC_IMMA_IMPLICIT_GEMM", + param::ConvBias::Format::NHWC); +} #endif TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW4) { diff --git a/dnn/test/cuda/conv_test_utils.cpp b/dnn/test/cuda/conv_test_utils.cpp index 387e97052..91cac30c1 100644 --- a/dnn/test/cuda/conv_test_utils.cpp +++ b/dnn/test/cuda/conv_test_utils.cpp @@ -102,9 +102,7 @@ std::vector get_det_first_bench_args(size_t batch) { args.emplace_back(BenchArgs{batch, 16, 384, 640, 16, 3, 1}); args.emplace_back(BenchArgs{batch, 16, 384, 640, 32, 3, 2}); args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 3, 1}); - args.emplace_back(BenchArgs{batch, 32, 384, 640, 64, 3, 2}); args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 1, 1}); - args.emplace_back(BenchArgs{batch, 32, 384, 640, 64, 1, 2}); return args; } @@ -333,6 +331,9 @@ void benchmark_target_algo_with_cudnn_tsc( .reshape({shape[0], shape[1] / 4, 4, shape[2], shape[3]}) .dimshuffle({1, 3, 4, 0, 2})); + } else if (format == Format::NHWC) { + ret = static_cast( + TensorLayout{shape, dtype}.dimshuffle({0, 2, 3, 1})); } return ret; }; @@ -363,6 +364,9 @@ void benchmark_target_algo_with_cudnn_tsc( if ((format == Format::CHWN4 || format == Format::NCHW4) && (arg.ci % 16 != 0)) continue; + // skip testcase which cannot enable nhwc tensorcore + if ((format == Format::NHWC) && (arg.ci % 4 != 0 || arg.co % 4 != 0)) + continue; Format format_cudnn = arg.ci % 32 == 0 && arg.co % 32 == 0 ? Format::NCHW32 : Format::NCHW4; diff --git a/dnn/test/cuda/convolution.cpp b/dnn/test/cuda/convolution.cpp index bef4bfd5a..b4172536a 100644 --- a/dnn/test/cuda/convolution.cpp +++ b/dnn/test/cuda/convolution.cpp @@ -327,7 +327,6 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NCHW4_DP4A) { all_params.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2}); all_params.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1}); all_params.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2}); - all_params.emplace_back(AlgoParam{64, 128, 32, 64, 32, 32, 2}); for (auto algo_param : all_params) { Checker checker(handle_cuda()); -- GitLab