diff --git a/dnn/scripts/cutlass_generator/BUILD b/dnn/scripts/cutlass_generator/BUILD index d90fc43844b40bf75703b3f9b49ebaa3e304d23b..98ff589ad9c32db9de307cf8d39ffaa08b5d3d66 100644 --- a/dnn/scripts/cutlass_generator/BUILD +++ b/dnn/scripts/cutlass_generator/BUILD @@ -19,6 +19,7 @@ genrule( CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_dgrad --type tensorop884 $(@D) CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_wgrad --type simt $(@D) CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_wgrad --type tensorop884 $(@D) + CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations rrconv2d_wgrad --type simt $(@D) """, tools = ["//brain/megbrain/dnn/scripts/cutlass_generator:generator.py"], visibility = ["//visibility:public"], diff --git a/dnn/scripts/cutlass_generator/conv2d_operation.py b/dnn/scripts/cutlass_generator/conv2d_operation.py index 129baa6279b350c307b2f74661ae23f98b3d978c..4c5795f1f6480d60fafb33488c861b666e65c923 100644 --- a/dnn/scripts/cutlass_generator/conv2d_operation.py +++ b/dnn/scripts/cutlass_generator/conv2d_operation.py @@ -35,6 +35,8 @@ class Conv2dOperation: without_shared_load=False, required_cuda_ver_major=9, required_cuda_ver_minor=2, + rin=None, + rout=None, ): self.operation_kind = OperationKind.Conv2d @@ -54,6 +56,8 @@ class Conv2dOperation: self.without_shared_load = without_shared_load self.required_cuda_ver_major = required_cuda_ver_major self.required_cuda_ver_minor = required_cuda_ver_minor + self.rin = rin + self.rout = rout # def accumulator_type(self): @@ -95,6 +99,8 @@ class Conv2dOperation: conv_type_name = "" if self.conv_type == ConvType.DepthwiseConvolution: conv_type_name = "dw" + elif self.conv_type == ConvType.RegionRestrictedConvolution: + conv_type_name = "rr" return "%s%s%s%s%s%s%s_%s" % ( ShortDataTypeNames[self.accumulator_type()], @@ -125,6 +131,9 @@ class Conv2dOperation: elif self.src.element == self.flt.element: extended_name = "${core_name}_${element_src}" + if 
self.rin != None: + extended_name += "_${element_rin}" + extended_name = SubstituteTemplate( extended_name, { @@ -132,6 +141,7 @@ class Conv2dOperation: "element_flt": DataTypeNames[self.flt.element], "element_dst": DataTypeNames[self.dst.element], "core_name": self.core_name(), + "element_rin": DataTypeNames[self.rin.element], }, ) @@ -512,6 +522,115 @@ using Convolution_${operation_name} = return SubstituteTemplate(self.template, values) +class EmitRegionRestrictedConvolutionBackwardFilterInstance: + def __init__(self): + self.template = """ +// kernel instance "${operation_name}" generated by cutlass generator +using Convolution_${operation_name} = + typename cutlass::conv::device::RegionRestrictedConvolutionBackwardFilter< + ${element_src}, + ${layout_src}, + ${element_diff}, + ${layout_diff}, + ${element_src_mask}, + ${layout_src_mask}, + ${element_output_mask}, + ${layout_output_mask}, + ${element_grad}, + ${layout_grad}, + ${element_accumulator}, + ${conv_type}, + ${opcode_class}, + ${arch}, + cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>, + cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>, + cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>, + ${epilogue_functor}< + ${element_grad}, + ${epilogue_vector_length}, + ${element_accumulator}, + ${element_epilogue} + >, + ${swizzling_functor}, + ${stages}, + ${alignment_src}, + ${alignment_diff}, + ${alignment_src_mask}, + ${alignment_output_mask}, + ${special_optimization}, + ${math_operator}, + ${implicit_gemm_mode}>; +""" + + def emit(self, operation): + + warp_shape = [ + int( + operation.tile_description.threadblock_shape[idx] + / operation.tile_description.warp_count[idx] + ) + for idx in range(3) + ] + + epilogue_vector_length = int( + min(operation.dst.alignment * DataTypeSize[operation.dst.element], 128) + / DataTypeSize[operation.dst.element] + ) + + values = { + "operation_name": 
operation.procedural_name(), + "conv_type": ConvTypeTag[operation.conv_type], + "element_src": DataTypeTag[operation.src.element], + "layout_src": LayoutTag[operation.src.layout], + "element_diff": DataTypeTag[operation.flt.element], + "layout_diff": LayoutTag[operation.flt.layout], + "element_src_mask": DataTypeTag[operation.rin.element], + "layout_src_mask": LayoutTag[operation.rin.layout], + "element_output_mask": DataTypeTag[operation.rout.element], + "layout_output_mask": LayoutTag[operation.rout.layout], + "element_grad": DataTypeTag[operation.dst.element], + "layout_grad": LayoutTag[operation.dst.layout], + "element_accumulator": DataTypeTag[operation.accumulator_type()], + "opcode_class": OpcodeClassTag[ + operation.tile_description.math_instruction.opcode_class + ], + "arch": "cutlass::arch::Sm%d" % operation.arch, + "threadblock_shape_m": str(operation.tile_description.threadblock_shape[0]), + "threadblock_shape_n": str(operation.tile_description.threadblock_shape[1]), + "threadblock_shape_k": str(operation.tile_description.threadblock_shape[2]), + "warp_shape_m": str(warp_shape[0]), + "warp_shape_n": str(warp_shape[1]), + "warp_shape_k": str(warp_shape[2]), + "instruction_shape_m": str( + operation.tile_description.math_instruction.instruction_shape[0] + ), + "instruction_shape_n": str( + operation.tile_description.math_instruction.instruction_shape[1] + ), + "instruction_shape_k": str( + operation.tile_description.math_instruction.instruction_shape[2] + ), + "epilogue_vector_length": str(epilogue_vector_length), + "epilogue_functor": EpilogueFunctorTag[operation.epilogue_functor], + "element_epilogue": str(DataTypeTag[operation.element_epilogue]), + "swizzling_functor": SwizzlingFunctorTag[operation.swizzling_functor], + "stages": str(operation.tile_description.stages), + "alignment_src": str(operation.src.alignment), + "alignment_diff": str(operation.flt.alignment), + "alignment_src_mask": str(operation.rin.alignment), + "alignment_output_mask": 
str(operation.rout.alignment), + "special_optimization": SpecialOptimizeDescTag[ + operation.special_optimization + ], + "math_operator": MathOperationTag[ + operation.tile_description.math_instruction.math_operation + ], + "implicit_gemm_mode": ImplicitGemmModeTag[operation.implicit_gemm_mode], + } + + return SubstituteTemplate(self.template, values) + + ################################################################################################### # # Generator functions for all layouts @@ -540,7 +659,10 @@ def GenerateConv2d( operations = [] element_epilogue = DataType.f32 - if conv_type == ConvType.DepthwiseConvolution: + if ( + conv_type == ConvType.DepthwiseConvolution + or conv_type == ConvType.RegionRestrictedConvolution + ): if conv_kind == ConvKind.Fprop: swizzling_functor = SwizzlingFunctor.DepthwiseConvolutionFprop elif conv_kind == ConvKind.Dgrad: @@ -680,6 +802,16 @@ def GenerateConv2d( flt_layout, int(flt_align / DataTypeSize[tile.math_instruction.element_a]), ) + rin = TensorDescription( + tile.math_instruction.element_rin, + src_layout, + int(src_align / DataTypeSize[tile.math_instruction.element_rin]), + ) + rout = TensorDescription( + tile.math_instruction.element_rout, + dst_layout, + int(dst_align / DataTypeSize[tile.math_instruction.element_rout]), + ) bias = TensorDescription( bias_type, dst_layout, max(1, int(32 / DataTypeSize[bias_type])) ) @@ -704,6 +836,8 @@ def GenerateConv2d( without_shared_load, required_cuda_ver_major, required_cuda_ver_minor, + rin, + rout, ) operations.append(new_operation) if use_special_optimization != SpecialOptimizeDesc.NoneSpecialOpt: @@ -724,6 +858,8 @@ def GenerateConv2d( without_shared_load, required_cuda_ver_major, required_cuda_ver_minor, + rin, + rout, ) operations.append(new_operation) return operations @@ -955,5 +1091,89 @@ void initialize_${operation_name}(Manifest &manifest) { self.kernel_file.close() +class EmitRegionRestrictedConvSingleKernelWrapper: + def __init__(self, kernel_path, operation, 
short_path=False): + self.kernel_path = kernel_path + self.operation = operation + self.short_path = short_path + + # Now only support wgrad + assert self.operation.conv_kind == ConvKind.Wgrad + self.instance_emitter = EmitRegionRestrictedConvolutionBackwardFilterInstance() + self.convolution_name = "RegionRestrictedConvolutionBackwardFilterOperation" + + self.header_template = """ +#if __CUDACC_VER_MAJOR__ > ${required_cuda_ver_major} || (__CUDACC_VER_MAJOR__ == ${required_cuda_ver_major} && __CUDACC_VER_MINOR__ >= ${required_cuda_ver_minor}) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" + +#include "cutlass/convolution/device/convolution.h" + +#include "src/cuda/cutlass/manifest.h" +#include "src/cuda/cutlass/convolution_operation.h" +""" + self.instance_template = """ +${operation_instance} +""" + + self.manifest_template = """ +namespace cutlass { +namespace library { + +void initialize_${operation_name}(Manifest &manifest) { + manifest.append(new ${convolution_name}( + "${operation_name}" + )); +} + +} // namespace library +} // namespace cutlass +""" + + self.epilogue_template = """ +#pragma GCC diagnostic pop +#endif +""" + + # + def __enter__(self): + if self.short_path: + self.kernel_path = os.path.join(self.kernel_path, "%s.cu" % GlobalCnt.cnt) + GlobalCnt.cnt += 1 + else: + self.kernel_path = os.path.join( + self.kernel_path, "%s.cu" % self.operation.procedural_name() + ) + self.kernel_file = open(self.kernel_path, "w") + return self + + # + def emit(self): + self.kernel_file.write( + SubstituteTemplate( + self.instance_template, + {"operation_instance": self.instance_emitter.emit(self.operation)}, + ) + ) + + # emit manifest helper + manifest = SubstituteTemplate( + self.manifest_template, + { + "operation_name": 
self.operation.procedural_name(), + "convolution_name": self.convolution_name, + }, + ) + self.kernel_file.write(manifest) + + # + def __exit__(self, exception_type, exception_value, traceback): + self.kernel_file.close() + + ################################################################################################### ################################################################################################### diff --git a/dnn/scripts/cutlass_generator/gen_list.py b/dnn/scripts/cutlass_generator/gen_list.py index f652a4a03494ef6ea87c62021d5b3b7c49bdf05d..917aa6864760db78300c92fa03f775e55f9905e7 100644 --- a/dnn/scripts/cutlass_generator/gen_list.py +++ b/dnn/scripts/cutlass_generator/gen_list.py @@ -64,4 +64,5 @@ if __name__ == "__main__": write_merge_file_name(f, "dwconv2d_dgrad", "tensorop884", 4) write_merge_file_name(f, "dwconv2d_wgrad", "simt", 2) write_merge_file_name(f, "dwconv2d_wgrad", "tensorop884", 4) + write_merge_file_name(f, "rrconv2d_wgrad", "simt", 2) f.write("]") diff --git a/dnn/scripts/cutlass_generator/generator.py b/dnn/scripts/cutlass_generator/generator.py index 931108e2d38bf4e919e394263f9f3a57b9ecaf0d..06e482819e63c10024816067cf85aa8dfb25f01d 100644 --- a/dnn/scripts/cutlass_generator/generator.py +++ b/dnn/scripts/cutlass_generator/generator.py @@ -1260,6 +1260,218 @@ def GenerateDwconv2d_Simt(args, conv_kind): return operations +def GenerateRegionRestrictedconv2d_Simt(args, conv_kind): + ################################################################################ + # warps per threadblock + ################################################################################ + warpsPerThreadblocks = [] + for warpsPerThreadblock0 in warpsPerThreadblockEdge: + for warpsPerThreadblock1 in warpsPerThreadblockEdge: + if ( + warpsPerThreadblock0 / warpsPerThreadblock1 <= warpsPerThreadblockRatio + and warpsPerThreadblock1 / warpsPerThreadblock0 + <= warpsPerThreadblockRatio + and warpsPerThreadblock0 * warpsPerThreadblock1 + <= 
warpsPerThreadblockMax + ): + warpsPerThreadblocks.append( + [warpsPerThreadblock0, warpsPerThreadblock1] + ) + + ################################################################################ + # warp shapes + ################################################################################ + warpNumThreads = 32 + warpShapes = [] + for warp0 in warpShapeEdges: + for warp1 in warpShapeEdges: + if ( + warp0 / warp1 <= warpShapeRatio + and warp1 / warp0 <= warpShapeRatio + and warp0 * warp1 <= warpShapeMax + and warp0 * warp1 > warpShapeMin + ): + warpShapes.append([warp0, warp1]) + + # sgemm + ( + precisionType, + precisionBits, + threadblockMaxElements, + threadblockTilesL0, + ) = precisions["s"] + + layouts = [(LayoutType.TensorNCHW, LayoutType.TensorNCHW)] + + math_instructions = [ + MathInstruction( + [1, 1, 1], + DataType.f32, + DataType.f32, + DataType.f32, + OpcodeClass.Simt, + MathOperation.multiply_add, + DataType.s32, + DataType.s32, + ), + MathInstruction( + [1, 1, 1], + DataType.f32, + DataType.f32, + DataType.f32, + OpcodeClass.Simt, + MathOperation.multiply_add, + DataType.s8, + DataType.s8, + ), + ] + + min_cc = 50 + max_cc = 1024 + + dst_layouts = [LayoutType.TensorNCHW] + + dst_types = [DataType.f32] + + if conv_kind == ConvKind.Wgrad: + alignment_constraints = [32] + else: + alignment_constraints = [128, 32] + + operations = [] + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([128, 128, 8], 1, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 8], 1, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([64, 128, 8], 1, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 32, 8], 1, [2, 1, 1], math_inst, min_cc, max_cc), + TileDescription([32, 128, 8], 1, [1, 2, 1], math_inst, min_cc, max_cc), + TileDescription([64, 64, 8], 1, [2, 1, 1], math_inst, min_cc, max_cc), + TileDescription([32, 64, 8], 1, [1, 1, 1], math_inst, min_cc, max_cc), + TileDescription([64, 32, 8], 1, [1, 1, 1], 
math_inst, min_cc, max_cc), + TileDescription([32, 32, 8], 1, [1, 1, 1], math_inst, min_cc, max_cc), + ] + for warpsPerThreadblock in warpsPerThreadblocks: + for warpShape in warpShapes: + warpThreadsM = 0 + if warpShape[0] > warpShape[1]: + warpThreadsM = 8 + else: + warpThreadsM = 4 + warpThreadsN = warpNumThreads / warpThreadsM + + # skip shapes with conflicting rectangularity + # they are unlikely to be fastest + blockG = warpsPerThreadblock[0] > warpsPerThreadblock[1] + blockL = warpsPerThreadblock[0] < warpsPerThreadblock[1] + warpG = warpShape[0] > warpShape[1] + warpL = warpShape[0] < warpShape[1] + + blockG2 = warpsPerThreadblock[0] > warpsPerThreadblock[1] * 2 + blockL2 = warpsPerThreadblock[0] * 2 < warpsPerThreadblock[1] + warpG2 = warpShape[0] > warpShape[1] * 2 + warpL2 = warpShape[0] * 2 < warpShape[1] + + if blockG2 and warpL: + continue + if blockL2 and warpG: + continue + if warpG2 and blockL: + continue + if warpL2 and blockG: + continue + + # check threadblock ratios and max + threadblockTile = [ + warpShape[0] * warpsPerThreadblock[0], + warpShape[1] * warpsPerThreadblock[1], + ] + if threadblockTile[0] * threadblockTile[1] > threadblockMaxElements: + continue + if threadblockTile[0] > threadblockEdgeMax: + continue + if threadblockTile[1] > threadblockEdgeMax: + continue + totalThreads = ( + warpNumThreads * warpsPerThreadblock[0] * warpsPerThreadblock[1] + ) + + # calculate unroll + # ensure that every iteration at least a full load of A,B are done + unrollMin = 8 + unrollMin0 = totalThreads // threadblockTile[0] + unrollMin1 = totalThreads // threadblockTile[1] + unroll = max(unrollMin, unrollMin0, unrollMin1) + + threadTileM = warpShape[0] // warpThreadsM + threadTileN = warpShape[1] // warpThreadsN + if threadTileM < 2 or threadTileN < 2: + continue + if threadTileM * threadTileN * precisionBits > 8 * 8 * 32: + continue + + # epilogue currently only supports N < WarpNumThreads + if threadblockTile[1] < warpNumThreads: + continue + + # 
limit smem + smemBitsA = threadblockTile[0] * unroll * 2 * precisionBits + smemBitsB = threadblockTile[1] * unroll * 2 * precisionBits + smemKBytes = (smemBitsA + smemBitsB) / 8 / 1024 + if smemKBytes > 48: + continue + + tile = TileDescription( + [threadblockTile[0], threadblockTile[1], unroll], + 1, + [ + threadblockTile[0] // warpShape[0], + threadblockTile[1] // warpShape[1], + 1, + ], + math_inst, + min_cc, + max_cc, + ) + + def filter(t: TileDescription) -> bool: + nonlocal tile + return ( + t.threadblock_shape[0] == tile.threadblock_shape[0] + and t.threadblock_shape[1] == tile.threadblock_shape[1] + and t.threadblock_shape[2] == tile.threadblock_shape[2] + and t.warp_count[0] == tile.warp_count[0] + and t.warp_count[1] == tile.warp_count[1] + and t.warp_count[2] == tile.warp_count[2] + and t.stages == tile.stages + ) + + if not any(t for t in tile_descriptions if filter(t)): + continue + + for layout in layouts: + for dst_type, dst_layout in zip(dst_types, dst_layouts): + for alignment_src in alignment_constraints: + operations += GenerateConv2d( + ConvType.RegionRestrictedConvolution, + conv_kind, + [tile], + layout[0], + layout[1], + dst_layout, + dst_type, + min_cc, + alignment_src, + 32, + 32, + SpecialOptimizeDesc.NoneSpecialOpt, + ImplicitGemmMode.GemmNT + if conv_kind == ConvKind.Wgrad + else ImplicitGemmMode.GemmTN, + ) + return operations + + # def GenerateDwconv2d_TensorOp_884(args, conv_kind): layouts = [(LayoutType.TensorNCHW, LayoutType.TensorNCHW)] @@ -1644,6 +1856,14 @@ def GenerateDwconv2dWgradOperations(args): return GenerateDwconv2d_TensorOp_884(args, ConvKind.Wgrad) +def GenerateRegionRestrictedconv2dWgradOperations(args): + assert args.type == "simt", ( + "operation RegionRestrictedconv2d wgrad only support" + "simt. 
(got:{})".format(args.type)
+    )
+    return GenerateRegionRestrictedconv2d_Simt(args, ConvKind.Wgrad)
+
+
 def GenerateGemmOperations(args):
     if args.type == "tensorop884":
         return GeneratesGemm_TensorOp_884(args)
@@ -1698,6 +1918,8 @@ def ConcatFile(
     sub_string_1 = sub_string_2 = "simt"
     if "dwconv2d_" in operations:
         filtered_operations = operations[:2] + operations[9:]
+    # NOTE: must be `elif`, not `if` — "conv2d" is a substring of "dwconv2d",
+    # so a bare `if` here would let dwconv2d_* fall into the "conv2d" branch
+    # below and clobber filtered_operations with "cutlass".
+    elif "rrconv2d_" in operations:
+        filtered_operations = operations[:2] + operations[9:]
     elif ("conv2d" in operations) or ("deconv" in operations):
         filtered_operations = "cutlass"
     else:
@@ -1893,6 +2115,7 @@ if __name__ == "__main__":
             "dwconv2d_fprop",
             "dwconv2d_dgrad",
             "dwconv2d_wgrad",
+            "rrconv2d_wgrad",
         ],
         required=True,
         help="Specifies the operation to generate (gemm, gemv, conv2d, deconv, dwconv2d_fprop, dwconv2d_dgrad, dwconv2d_wgrad)",
@@ -1928,9 +2151,11 @@ if __name__ == "__main__":
         operations = GenerateDwconv2dFpropOperations(args)
     elif args.operations == "dwconv2d_dgrad":
         operations = GenerateDwconv2dDgradOperations(args)
-    else:
-        assert args.operations == "dwconv2d_wgrad", "invalid operation"
+    elif args.operations == "dwconv2d_wgrad":
         operations = GenerateDwconv2dWgradOperations(args)
+    else:
+        assert args.operations == "rrconv2d_wgrad", "invalid operation"
+        operations = GenerateRegionRestrictedconv2dWgradOperations(args)

     if (
         args.operations == "conv2d"
@@ -1974,6 +2199,42 @@ if __name__ == "__main__":
                 required_cuda_ver_minor,
                 epilogue,
             )
+    elif args.operations == "rrconv2d_wgrad":
+        for operation in operations:
+            with EmitRegionRestrictedConvSingleKernelWrapper(
+                args.output, operation, short_path
+            ) as emitter:
+                emitter.emit()
+        head = EmitRegionRestrictedConvSingleKernelWrapper(
+            args.output, operations[0], short_path
+        ).header_template
+        required_cuda_ver_major = operations[0].required_cuda_ver_major
+        required_cuda_ver_minor = operations[0].required_cuda_ver_minor
+        epilogue = EmitRegionRestrictedConvSingleKernelWrapper(
+            args.output, operations[0], short_path
+
).epilogue_template + if "tensorop" in args.type: + ConcatFile( + 4, + args.output, + args.operations, + args.type, + head, + required_cuda_ver_major, + required_cuda_ver_minor, + epilogue, + ) + else: + ConcatFile( + 2, + args.output, + args.operations, + args.type, + head, + required_cuda_ver_major, + required_cuda_ver_minor, + epilogue, + ) elif args.operations == "gemm": for operation in operations: with EmitGemmSingleKernelWrapper( diff --git a/dnn/scripts/cutlass_generator/library.py b/dnn/scripts/cutlass_generator/library.py index ea26b2ea69ca9c50dedffee9f6e67f610076b684..203d0ee2f02389e69557dd11507dc6bf6d104a9a 100644 --- a/dnn/scripts/cutlass_generator/library.py +++ b/dnn/scripts/cutlass_generator/library.py @@ -532,6 +532,7 @@ class ConvType(enum.Enum): Local = enum_auto() LocalShare = enum_auto() DepthwiseConvolution = enum_auto() + RegionRestrictedConvolution = enum_auto() ConvTypeTag = { @@ -540,6 +541,8 @@ ConvTypeTag = { ConvType.Local: "cutlass::conv::ConvType::kLocal", ConvType.LocalShare: "cutlass::conv::ConvType::kLocalShare", ConvType.DepthwiseConvolution: "cutlass::conv::ConvType::kDepthwiseConvolution", + # RegionRestrictedConvolution using the same conv type with Depthwise + ConvType.RegionRestrictedConvolution: "cutlass::conv::ConvType::kDepthwiseConvolution", } # @@ -640,6 +643,8 @@ class MathInstruction: element_accumulator, opcode_class, math_operation=MathOperation.multiply_add, + element_rin=DataType.s32, + element_rout=DataType.s32, ): self.instruction_shape = instruction_shape self.element_a = element_a @@ -647,6 +652,8 @@ class MathInstruction: self.element_accumulator = element_accumulator self.opcode_class = opcode_class self.math_operation = math_operation + self.element_rin = element_rin + self.element_rout = element_rout # diff --git a/dnn/scripts/cutlass_generator/list.bzl b/dnn/scripts/cutlass_generator/list.bzl index 04e92260d36900fa2dab5cd811018e96263e01b1..4c315b9383e7db30b5ebaa9a327950683a518744 100644 Binary files 
a/dnn/scripts/cutlass_generator/list.bzl and b/dnn/scripts/cutlass_generator/list.bzl differ diff --git a/dnn/src/CMakeLists.txt b/dnn/src/CMakeLists.txt index d37f377cd55521ba3ef40a4071a31688c1fd7024..d4b79a7d60de2e755a84fd2d876df61494d446b3 100644 --- a/dnn/src/CMakeLists.txt +++ b/dnn/src/CMakeLists.txt @@ -188,6 +188,7 @@ if(MGE_WITH_CUDA) gen_cutlass_kimpl(dwconv2d_dgrad tensorop884 CUTLASS_SOURCES) gen_cutlass_kimpl(dwconv2d_wgrad simt CUTLASS_SOURCES) gen_cutlass_kimpl(dwconv2d_wgrad tensorop884 CUTLASS_SOURCES) + gen_cutlass_kimpl(rrconv2d_wgrad simt CUTLASS_SOURCES) list(PREPEND CUSOURCES ${CUTLASS_SOURCES}) # Compile the following file first, the priority_compile_opr.txt is generated by diff --git a/dnn/src/cuda/cutlass/convolution_operation.h b/dnn/src/cuda/cutlass/convolution_operation.h index 0004bd9396ca8128b47dc9469f1600828976cf02..5873b0329a9c6b20f2998f2ed62e699fab56a155 100644 --- a/dnn/src/cuda/cutlass/convolution_operation.h +++ b/dnn/src/cuda/cutlass/convolution_operation.h @@ -452,6 +452,86 @@ public: /////////////////////////////////////////////////////////////////////////////////////////////////// +template +class RegionRestrictedConvolutionBackwardFilterOperation + : public ConvolutionBackwardFilterOperationBase { +public: + using Operator = Operator_; + using ElementSrc = typename Operator::ElementSrc; + using LayoutSrc = typename Operator::LayoutSrc; + using ElementDiff = typename Operator::ElementDiff; + using LayoutDiff = typename Operator::LayoutDiff; + using ElementGrad = typename Operator::ElementGrad; + using LayoutGrad = typename Operator::LayoutGrad; + using ElementAccumulator = typename Operator::ElementAccumulator; + using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute; + + using OperatorArguments = typename Operator::Arguments; + + using ElementRin = typename Operator::ElementMaskInput; + using LayoutRin = typename Operator::LayoutMaskInput; + using ElementRout = typename Operator::ElementMaskOutput; + 
using LayoutRout = typename Operator::LayoutMaskOutput; + + RegionRestrictedConvolutionBackwardFilterOperation( + char const* name = "unknown_gemm") + : ConvolutionBackwardFilterOperationBase(name) { + /// rin in description -> rin in C++ template + this->m_description.rin = make_TensorDescription( + Operator::kAlignmentMaskInput); + /// rout in description -> rout in C++ template + this->m_description.rout = make_TensorDescription( + Operator::kAlignmentMaskOutput); + this->m_description.without_shared_load = false; + } + + virtual Status run( + void const* arguments_ptr, void* device_workspace = nullptr, + cudaStream_t stream = nullptr) const { + cutlass::conv::Operator conv_op = this->m_description.conv_op; + ConvolutionArguments const* conv_args = + reinterpret_cast(arguments_ptr); + const auto& ps = conv_args->problem_size; + + OperatorArguments args; + args.problem_size = ps; + /// src in convolution arguments -> ref_src + args.ref_src = { + static_cast(const_cast(conv_args->src)), + LayoutSrc::packed(implicit_gemm_tensor_b_extent(conv_op, ps))}; + /// filter in convolution arguments -> ref_diff + args.ref_diff = { + static_cast(const_cast(conv_args->filter)), + LayoutDiff::packed(implicit_gemm_tensor_a_extent(conv_op, ps))}; + /// dst in convolution arguments -> ref_grad + args.ref_grad = { + static_cast(conv_args->dst), + LayoutGrad::packed(implicit_gemm_tensor_c_extent(conv_op, ps))}; + /// rin in convolution arguments -> ref_mask_input + args.ref_mask_input = { + static_cast(const_cast(conv_args->rin)), + LayoutRin::packed(implicit_gemm_tensor_rin_extent(conv_op, ps))}; + /// rout in convolution arguments -> ref_mask_output + args.ref_mask_output = { + static_cast(const_cast(conv_args->rout)), + LayoutRout::packed(implicit_gemm_tensor_rout_extent(conv_op, ps))}; + + args.output_op = init_epilogue_param().get( + conv_args); + + Operator op; + Status status = op.initialize(args, device_workspace); + + if (status != Status::kSuccess) { + return status; + } + 
+ return op.run(stream); + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace library } // namespace cutlass diff --git a/dnn/src/cuda/cutlass/initialize_all.cu b/dnn/src/cuda/cutlass/initialize_all.cu index 68e8c1f951a61e49dab5388c3c46bd1967c4ab57..53db35d4ba145f164d132fcc6793ce3d70293eae 100644 --- a/dnn/src/cuda/cutlass/initialize_all.cu +++ b/dnn/src/cuda/cutlass/initialize_all.cu @@ -50,6 +50,7 @@ void initialize_all_deconv_simt_operations(Manifest& manifest); void initialize_all_dwconv2d_fprop_simt_operations(Manifest& manifest); void initialize_all_dwconv2d_dgrad_simt_operations(Manifest& manifest); void initialize_all_dwconv2d_wgrad_simt_operations(Manifest& manifest); +void initialize_all_rrconv2d_wgrad_simt_operations(Manifest& manifest); #if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED) && CUTLASS_ARCH_MMA_SM70_SUPPORTED void initialize_all_gemm_tensorop884_operations(Manifest& manifest); void initialize_all_dwconv2d_fprop_tensorop884_operations(Manifest& manifest); @@ -70,6 +71,7 @@ void initialize_all(Manifest& manifest) { initialize_all_dwconv2d_fprop_simt_operations(manifest); initialize_all_dwconv2d_dgrad_simt_operations(manifest); initialize_all_dwconv2d_wgrad_simt_operations(manifest); + initialize_all_rrconv2d_wgrad_simt_operations(manifest); #if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED) && CUTLASS_ARCH_MMA_SM70_SUPPORTED initialize_all_gemm_tensorop884_operations(manifest); initialize_all_dwconv2d_fprop_tensorop884_operations(manifest); diff --git a/dnn/src/cuda/cutlass/library.h b/dnn/src/cuda/cutlass/library.h index ad217e93158c38806163a75471cfa1dd6131832a..61a531b6169b8d94d84706875114d5983fe6558d 100644 --- a/dnn/src/cuda/cutlass/library.h +++ b/dnn/src/cuda/cutlass/library.h @@ -471,6 +471,10 @@ struct ConvolutionDescription : public OperationDescription { conv::SpecialOptimizeDesc special_optimization; conv::ImplicitGemmMode gemm_mode; bool without_shared_load; + + // only 
used by rrconv + TensorDescription rin; + TensorDescription rout; }; ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -499,6 +503,10 @@ struct ConvolutionArguments { /// Host pointer to extra param struct void const* extra_param; + + // only used by rrconv, default: nullptr + void const* rin = nullptr; + void const* rout = nullptr; }; ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/dnn/src/cuda/cutlass/operation_table.cpp b/dnn/src/cuda/cutlass/operation_table.cpp index d60371958a7023723bd6127d0a11d27cca254f19..495b9edd62793cf8da3250021d2701f510ba77fd 100644 --- a/dnn/src/cuda/cutlass/operation_table.cpp +++ b/dnn/src/cuda/cutlass/operation_table.cpp @@ -118,6 +118,11 @@ ConvolutionKey get_convolution_key_from_desc(const ConvolutionDescription& desc) key.alignment_filter = desc.filter.alignment; key.without_shared_load = desc.without_shared_load; + key.element_rin = desc.rin.element; + key.layout_rin = desc.rin.layout; + key.element_rout = desc.rout.element; + key.layout_rout = desc.rout.layout; + return key; } diff --git a/dnn/src/cuda/cutlass/operation_table.h b/dnn/src/cuda/cutlass/operation_table.h index 6582460c7d40d7b817f719571586881621a8d2f8..1604b93a517d064744c64589b920f6c6e3d77a88 100644 --- a/dnn/src/cuda/cutlass/operation_table.h +++ b/dnn/src/cuda/cutlass/operation_table.h @@ -201,6 +201,12 @@ struct ConvolutionKey { bool without_shared_load; + // only used by rrconv + library::NumericTypeID element_rin = library::NumericTypeID::kInvalid; + library::LayoutTypeID layout_rin = library::LayoutTypeID::kInvalid; + library::NumericTypeID element_rout = library::NumericTypeID::kInvalid; + library::LayoutTypeID layout_rout = library::LayoutTypeID::kInvalid; + inline bool operator==(ConvolutionKey const& rhs) const { return (conv_op == rhs.conv_op) && (element_src == rhs.element_src) && (layout_src == rhs.layout_src) && @@ -223,7 +229,9 @@ struct 
ConvolutionKey { (special_optimization == rhs.special_optimization) && (alignment_src == rhs.alignment_src) && (alignment_filter == rhs.alignment_filter) && - (without_shared_load == rhs.without_shared_load); + (without_shared_load == rhs.without_shared_load) && + (element_rin == rhs.element_rin) && (layout_rin == rhs.layout_rin) && + (element_rout == rhs.element_rout) && (layout_rout == rhs.layout_rout); } inline bool operator!=(ConvolutionKey const& rhs) const { return !(*this == rhs); } @@ -260,7 +268,11 @@ struct ConvolutionKey { "\n special_optimization: " + to_string(special_optimization) + "\n alignment_src: " + std::to_string(alignment_src) + "\n alignment_filter: " + std::to_string(alignment_filter) + - "\n without_shared_load: " + to_string(without_shared_load) + "\n}"; + "\n without_shared_load: " + to_string(without_shared_load) + + "\n element_rin: " + to_string(element_rin) + + "\n layout_rin: " + to_string(layout_rin) + + "\n element_rout: " + to_string(element_rout) + + "\n layout_rout: " + to_string(layout_rout) + "\n}"; } }; @@ -293,6 +305,10 @@ struct ConvolutionKeyHasher { .update(&key.alignment_src, sizeof(key.alignment_src)) .update(&key.alignment_filter, sizeof(key.alignment_filter)) .update(&key.without_shared_load, sizeof(key.without_shared_load)) + .update(&key.element_rin, sizeof(key.element_rin)) + .update(&key.layout_rin, sizeof(key.layout_rin)) + .update(&key.element_rout, sizeof(key.element_rout)) + .update(&key.layout_rout, sizeof(key.layout_rout)) .digest(); } }; diff --git a/dnn/src/cuda/region_restricted_convolution/opr_impl.cpp b/dnn/src/cuda/region_restricted_convolution/opr_impl.cpp index 5f574b1b84a30bb0de5d4f4d5fde6ebcf9b07b2e..1b3d7398bb862a6e8f1170cefa1500402b71a41c 100644 --- a/dnn/src/cuda/region_restricted_convolution/opr_impl.cpp +++ b/dnn/src/cuda/region_restricted_convolution/opr_impl.cpp @@ -1,4 +1,5 @@ #include "src/cuda/region_restricted_convolution/opr_impl.h" +#include "src/cuda/cutlass/singleton.h" #include 
"src/cuda/region_restricted_convolution/chanwise/depthwise_large_filter.cuh" #include "src/cuda/region_restricted_convolution/chanwise/kern.cuh" #include "src/cuda/utils.h" @@ -6,6 +7,7 @@ using namespace megdnn; using namespace cuda; using namespace region_restricted_convolution; +using namespace cutlass::library; /* ============== RegionRestrictedConvolutionForwardImpl ============== */ void RegionRestrictedConvolutionForwardImpl::exec( @@ -113,7 +115,137 @@ size_t RegionRestrictedConvolutionBackwardFilterImpl::get_workspace_in_bytes( void RegionRestrictedConvolutionBackwardFilterImpl::exec( _megdnn_tensor_in src, _megdnn_tensor_in diff, _megdnn_tensor_in rin, _megdnn_tensor_in rout, _megdnn_tensor_out grad, _megdnn_workspace workspace) { - megdnn_throw("Region Restricted Conv BackwardFilter unimplemented"); + auto fm = check_exec( + src.layout, diff.layout, rin.layout, rout.layout, grad.layout, + workspace.size); + + megdnn_assert( + fm.group > 1 && src.layout.dtype.category() == DTypeCategory::FLOAT && + param().compute_mode == Param::ComputeMode::DEFAULT && + fm.spatial_ndim == 2 && fm.icpg == 1 && fm.ocpg == 1 && + fm.dilation[0] == 1 && fm.dilation[1] == 1 && !fm.should_flip && + param().stride_h == 1 && param().stride_w == 1); + + int hi = src.layout.operator[](2), wi = src.layout.operator[](3); + int n = diff.layout.operator[](0), ho = diff.layout.operator[](2), + wo = diff.layout.operator[](3); + int co = fm.group, ci = co, groups = co; + int fh = fm.spatial[0], fw = fm.spatial[1]; + int sh = fm.stride[0], sw = fm.stride[1]; + int ph = fm.padding[0], pw = fm.padding[1]; + int dh = 0, dw = 0; + + // check if channelwise convolution + megdnn_assert(fm.icpg == 1 && fm.ocpg == 1); + auto stream = cuda_stream(handle()); + + float alpha = 1.f; + float beta = 0.f; + + ConvolutionKey key; + + int threadblock_shape_n = 128; + int warp_shape_m = 32; + int warp_shape_n = 64; + if (grad.layout.operator[](3) % 8 < 4) { + threadblock_shape_n = 64; + warp_shape_m = 64; 
+ warp_shape_n = 32; + } + + if (rin.layout.dtype == dtype::Int32() && rout.layout.dtype == dtype::Int32()) { + key = { + cutlass::conv::Operator::kWgrad, + NumericTypeID::kF32, + LayoutTypeID::kTensorNCHW, + NumericTypeID::kF32, + LayoutTypeID::kTensorNCHW, + NumericTypeID::kF32, + LayoutTypeID::kTensorNCHW, + NumericTypeID::kF32, + LayoutTypeID::kTensorNCHW, + NumericTypeID::kF32, + cutlass::conv::ConvType::kDepthwiseConvolution, + 128, + threadblock_shape_n, + 8, + warp_shape_m, + warp_shape_n, + 8, + 1, + 1, + 1, + cutlass::epilogue::EpilogueType::kLinearCombination, + 1, + cutlass::conv::SpecialOptimizeDesc::NONE, + 1, + 1, + false, + NumericTypeID::kS32, + LayoutTypeID::kTensorNCHW, + NumericTypeID::kS32, + LayoutTypeID::kTensorNCHW, + }; + } else if ( + rin.layout.dtype == dtype::Uint8() && rout.layout.dtype == dtype::Uint8()) { + key = { + cutlass::conv::Operator::kWgrad, + NumericTypeID::kF32, + LayoutTypeID::kTensorNCHW, + NumericTypeID::kF32, + LayoutTypeID::kTensorNCHW, + NumericTypeID::kF32, + LayoutTypeID::kTensorNCHW, + NumericTypeID::kF32, + LayoutTypeID::kTensorNCHW, + NumericTypeID::kF32, + cutlass::conv::ConvType::kDepthwiseConvolution, + 128, + threadblock_shape_n, + 8, + warp_shape_m, + warp_shape_n, + 8, + 1, + 1, + 1, + cutlass::epilogue::EpilogueType::kLinearCombination, + 1, + cutlass::conv::SpecialOptimizeDesc::NONE, + 1, + 1, + false, + NumericTypeID::kS8, + LayoutTypeID::kTensorNCHW, + NumericTypeID::kS8, + LayoutTypeID::kTensorNCHW, + }; + } else { + megdnn_throw(ssprintf( + "don't support region restricted type rin: %s, rout: %s", + rin.layout.dtype.name(), rout.layout.dtype.name()) + .c_str()); + } + + const Operation* op = + (const Operation*)Singleton::get().operation_table.find_op(key); + + cutlass::conv::Conv2dProblemSize problem_size{ + n, hi, wi, ci, co, fh, fw, ho, + wo, ph, pw, sh, sw, dh, dw, cutlass::conv::Mode::kCrossCorrelation, + 1, // split k slices, always 1 + groups, // groups + }; + + 
cutlass::library::ConvolutionArguments conv_args{ + problem_size, src.raw_ptr(), diff.raw_ptr(), nullptr, + nullptr, grad.raw_ptr(), &alpha, &beta, + nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, rin.raw_ptr(), rout.raw_ptr()}; + + cutlass_check(op->run(&conv_args, nullptr, stream)); + + after_kernel_launch(); } // vim: syntax=cpp.doxygen diff --git a/dnn/test/cuda/region_restricted_convolution.cpp b/dnn/test/cuda/region_restricted_convolution.cpp index ab03cdf2922f04f3d64af902de027afb0803543e..1e87f474dd24b0e6cd528f84da8f6fa3b000cedd 100644 --- a/dnn/test/cuda/region_restricted_convolution.cpp +++ b/dnn/test/cuda/region_restricted_convolution.cpp @@ -465,6 +465,206 @@ TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_UINT8) { run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10); } +TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_FILTER_FP32) { + require_compute_capability(7, 5); + + Benchmarker bencher(handle_cuda()); + + bencher.set_display(false); + bencher.set_before_exec_callback(AlgoChecker( + "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_128X128X8_32X64X8_2stage")); + + Benchmarker rr_bencher(handle_cuda()); + rr_bencher.set_display(false); + + ConvolutionBackwardFilter::Param param; + param.format = ConvolutionBackwardFilter::Param::Format::NCHW; + param.sparse = ConvolutionBackwardFilter::Param::Sparse::GROUP; + + RegionRestrictedConvolutionBackwardFilter::Param rr_param; + rr_param.format = RegionRestrictedConvolutionBackwardFilter::Param::Format::NCHW; + rr_param.sparse = RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP; + + UniformIntRNG r_rng{1, 3}; + + auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh, + size_t fw, size_t sh, size_t sw, size_t nr_times) { + param.pad_h = fh / 2; + param.pad_w = fw / 2; + param.stride_h = sh; + param.stride_w = sw; + + rr_param.pad_h = fh / 2; + rr_param.pad_w = fw / 2; + rr_param.stride_h = sh; + rr_param.stride_w = sw; + + bencher.set_param(param) + 
.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_dtype(2, dtype::Float32()) + .set_dtype(4, dtype::Float32()); + bencher.proxy()->target_execution_policy = {}; + bencher.set_times(nr_times); + + rr_bencher.set_param(rr_param) + .set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_dtype(2, dtype::Int32()) + .set_dtype(3, dtype::Int32()); + rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng); + rr_bencher.set_times(nr_times); + + size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h); + size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w); + TensorShape src{batch, g, hi, wi}, diff{batch, g, ho, wo}, rin{batch, hi, wi}, + rout{batch, ho, wo}, grad{g, 1, 1, fh, fw}; + + float bandwith = static_cast( + src.total_nr_elems() + diff.total_nr_elems() + + grad.total_nr_elems()) / + (1024 * 1024 * 1024) * 1e3; + + float rr_bandwith = static_cast( + src.total_nr_elems() + diff.total_nr_elems() + + rin.total_nr_elems() + rout.total_nr_elems() + + grad.total_nr_elems()) / + (1024 * 1024 * 1024) * 1e3; + + auto time_in_ms = bencher.execs({src, diff, grad}) / nr_times; + auto ops = 2.0 * batch * g * hi * wi * fh * fw / (time_in_ms * 1e-3) * 1e-12; + + auto rr_time_in_ms = rr_bencher.execs({src, diff, rin, rout, grad}) / nr_times; + auto rr_ops = + 2.0 * batch * g * hi * wi * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12; + printf("[DGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: " + "src=%s, " + "diff=%s, grad=%s\n" + "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n" + "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n", + src.to_string().c_str(), diff.to_string().c_str(), + grad.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops, + bandwith * 4 / time_in_ms, rr_bandwith * 4 / rr_time_in_ms, + time_in_ms / rr_time_in_ms); + }; + + run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000); + run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000); + run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000); + 
run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000); + run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000); + run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000); + run_bench(64, 384, 32, 32, 15, 15, 1, 1, 1000); + run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000); + run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000); + run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000); + run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000); + run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000); + run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000); + run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000); + run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000); +} + +TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_FILTER_FP32_RINT8) { + require_compute_capability(7, 5); + + Benchmarker bencher(handle_cuda()); + + bencher.set_display(false); + bencher.set_before_exec_callback(AlgoChecker( + "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_128X128X8_32X64X8_2stage")); + + Benchmarker rr_bencher(handle_cuda()); + rr_bencher.set_display(false); + + ConvolutionBackwardFilter::Param param; + param.format = ConvolutionBackwardFilter::Param::Format::NCHW; + param.sparse = ConvolutionBackwardFilter::Param::Sparse::GROUP; + + RegionRestrictedConvolutionBackwardFilter::Param rr_param; + rr_param.format = RegionRestrictedConvolutionBackwardFilter::Param::Format::NCHW; + rr_param.sparse = RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP; + + UniformIntRNG r_rng{1, 3}; + + auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh, + size_t fw, size_t sh, size_t sw, size_t nr_times) { + param.pad_h = fh / 2; + param.pad_w = fw / 2; + param.stride_h = sh; + param.stride_w = sw; + + rr_param.pad_h = fh / 2; + rr_param.pad_w = fw / 2; + rr_param.stride_h = sh; + rr_param.stride_w = sw; + + bencher.set_param(param) + .set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_dtype(2, dtype::Float32()) + .set_dtype(4, dtype::Float32()); + bencher.proxy()->target_execution_policy = {}; + 
bencher.set_times(nr_times); + + rr_bencher.set_param(rr_param) + .set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_dtype(2, dtype::Uint8()) + .set_dtype(3, dtype::Uint8()); + rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng); + rr_bencher.set_times(nr_times); + + size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h); + size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w); + TensorShape src{batch, g, hi, wi}, diff{batch, g, ho, wo}, rin{batch, hi, wi}, + rout{batch, ho, wo}, grad{g, 1, 1, fh, fw}; + + float bandwith = static_cast( + src.total_nr_elems() + diff.total_nr_elems() + + grad.total_nr_elems()) / + (1024 * 1024 * 1024) * 1e3; + + float rr_bandwith = static_cast( + src.total_nr_elems() + diff.total_nr_elems() + + rin.total_nr_elems() + rout.total_nr_elems() + + grad.total_nr_elems()) / + (1024 * 1024 * 1024) * 1e3; + + auto time_in_ms = bencher.execs({src, diff, grad}) / nr_times; + auto ops = 2.0 * batch * g * hi * wi * fh * fw / (time_in_ms * 1e-3) * 1e-12; + + auto rr_time_in_ms = rr_bencher.execs({src, diff, rin, rout, grad}) / nr_times; + auto rr_ops = + 2.0 * batch * g * hi * wi * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12; + printf("[DGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: " + "src=%s, " + "diff=%s, grad=%s\n" + "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n" + "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n", + src.to_string().c_str(), diff.to_string().c_str(), + grad.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops, + bandwith * 4 / time_in_ms, rr_bandwith * 4 / rr_time_in_ms, + time_in_ms / rr_time_in_ms); + }; + + run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000); + run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000); + run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000); + run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000); + run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000); + run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000); + run_bench(64, 384, 32, 32, 15, 15, 1, 1, 
1000); + run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000); + run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000); + run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000); + run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000); + run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000); + run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000); + run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000); + run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000); +} + #endif TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_DATA_FP32) { @@ -585,6 +785,125 @@ TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_DATA_FP32_RIN_EQ_ROUT) { } } +TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_FILTER_FP32) { + Checker checker(handle_cuda()); + + for (auto dt : std::vector{dtype::Int32(), dtype::Uint8()}) { + auto run = [&checker, &dt]( + size_t n, size_t g, size_t ih, size_t fh, size_t padding, + size_t stride) { + RegionRestrictedConvolutionBackwardFilter::Param cur_param; + cur_param.mode = RegionRestrictedConvolutionBackwardFilter::Param::Mode:: + CROSS_CORRELATION; + cur_param.compute_mode = RegionRestrictedConvolutionBackwardFilter::Param:: + ComputeMode::DEFAULT; + cur_param.sparse = + RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP; + checker.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_dtype(2, dt) + .set_dtype(3, dt); + float scale = 64.f / sqrt(fh * fh); + UniformFloatRNG rng(scale, 2 * scale); + UniformIntRNG r_rng{1, 2}; + checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng( + 3, &r_rng); + cur_param.pad_h = cur_param.pad_w = padding; + cur_param.stride_h = cur_param.stride_w = stride; + + size_t oh = (ih + 2 * padding - fh + 1) / stride; + checker.set_param(cur_param).execs({ + {n, g * 1, ih, ih}, // src + {n, g * 1, oh, oh}, // diff + {n, ih, ih}, // rin + {n, oh, oh}, // rout + {g, 1, 1, fh, fh} // grad + }); + }; + if (dt == dtype::Int32()) { + run(4, 8, 32, 5, 5 / 2, 1); + run(1, 2, 2, 2, 0, 1); + run(1, 2, 3, 3, 0, 1); + run(1, 2, 4, 4, 0, 1); + run(1, 2, 5, 5, 0, 1); + run(1, 2, 6, 6, 
0, 1); + run(1, 2, 7, 7, 0, 1); + } + run(4, 8, 32, 7, 7 / 2, 1); + run(4, 8, 32, 9, 9 / 2, 1); + run(4, 8, 32, 11, 11 / 2, 1); + run(4, 8, 32, 13, 13 / 2, 1); + run(4, 8, 32, 15, 15 / 2, 1); + run(4, 8, 32, 17, 17 / 2, 1); + run(4, 8, 32, 19, 19 / 2, 1); + run(4, 8, 32, 21, 21 / 2, 1); + run(4, 8, 32, 23, 23 / 2, 1); + run(4, 8, 32, 25, 25 / 2, 1); + run(4, 8, 32, 27, 27 / 2, 1); + run(4, 8, 32, 29, 29 / 2, 1); + run(4, 8, 32, 31, 31 / 2, 1); + } +} + +TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_FILTER_FP32_RIN_EQ_ROUT) { + Checker checker(handle_cuda()); + + for (auto dt : std::vector{dtype::Int32(), dtype::Uint8()}) { + auto run = [&checker, &dt]( + size_t n, size_t g, size_t ih, size_t fh, size_t padding, + size_t stride) { + RegionRestrictedConvolutionBackwardFilter::Param cur_param; + cur_param.mode = RegionRestrictedConvolutionBackwardFilter::Param::Mode:: + CROSS_CORRELATION; + cur_param.compute_mode = RegionRestrictedConvolutionBackwardFilter::Param:: + ComputeMode::DEFAULT; + cur_param.sparse = + RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP; + checker.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_dtype(2, dt) + .set_dtype(3, dt); + float scale = 64.f / sqrt(fh * fh); + UniformFloatRNG rng(scale, 2 * scale); + UniformIntRNG r_rng{1, 1}; + checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng( + 3, &r_rng); + cur_param.pad_h = cur_param.pad_w = padding; + cur_param.stride_h = cur_param.stride_w = stride; + + size_t oh = (ih + 2 * padding - fh + 1) / stride; + checker.set_param(cur_param).execs({ + {n, g * 1, ih, ih}, // src + {n, g * 1, oh, oh}, // diff + {n, ih, ih}, // rin + {n, oh, oh}, // rout + {g, 1, 1, fh, fh} // grad + }); + }; + if (dt == dtype::Int32()) { + run(4, 8, 32, 5, 5 / 2, 1); + run(1, 2, 2, 2, 0, 1); + run(1, 2, 3, 3, 0, 1); + run(1, 2, 4, 4, 0, 1); + run(1, 2, 5, 5, 0, 1); + run(1, 2, 6, 6, 0, 1); + run(1, 2, 7, 7, 0, 1); + } + run(4, 8, 32, 7, 7 / 2, 1); + run(4, 8, 32, 9, 9 / 2, 
1); + run(4, 8, 32, 11, 11 / 2, 1); + run(4, 8, 32, 13, 13 / 2, 1); + run(4, 8, 32, 15, 15 / 2, 1); + run(4, 8, 32, 17, 17 / 2, 1); + run(4, 8, 32, 19, 19 / 2, 1); + run(4, 8, 32, 21, 21 / 2, 1); + run(4, 8, 32, 23, 23 / 2, 1); + run(4, 8, 32, 25, 25 / 2, 1); + run(4, 8, 32, 27, 27 / 2, 1); + run(4, 8, 32, 29, 29 / 2, 1); + run(4, 8, 32, 31, 31 / 2, 1); + } +} } // namespace test } // namespace megdnn