Commit 81065cf0 authored by Megvii Engine Team

build(mgb/cutlass): merge partial headers

GitOrigin-RevId: 1bc2af604bea52159f8bfed7adcd2049bb900287
Parent: d610c987
@@ -213,7 +213,7 @@ class EmitConv2dInstance:
     def __init__(self):
         self.template = """
 // kernel instance "${operation_name}" generated by cutlass generator
-using Convolution =
+using Convolution_${operation_name} =
     typename cutlass::conv::device::Convolution<
         ${element_src},
         ${layout_src},
@@ -317,7 +317,7 @@ class EmitDeconvInstance:
     def __init__(self):
         self.template = """
 // kernel instance "${operation_name}" generated by cutlass generator
-using Convolution =
+using Convolution_${operation_name} =
     typename cutlass::conv::device::Deconvolution<
         ${element_src},
         ${layout_src},
@@ -419,7 +419,7 @@ class EmitConvolutionBackwardFilterInstance:
     def __init__(self):
         self.template = """
 // kernel instance "${operation_name}" generated by cutlass generator
-using Convolution =
+using Convolution_${operation_name} =
     typename cutlass::conv::device::ConvolutionBackwardFilter<
         ${element_src},
         ${layout_src},
@@ -905,7 +905,7 @@ namespace cutlass {
 namespace library {

 void initialize_${operation_name}(Manifest &manifest) {
-    manifest.append(new ${convolution_name}<Convolution>(
+    manifest.append(new ${convolution_name}<Convolution_${operation_name}>(
         "${operation_name}"
     ));
 }
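The rename in these templates is what makes header merging possible: once many generated kernels are concatenated into one .cu file, a bare "using Convolution = ..." alias would be redefined by every kernel after the first. Below is a minimal sketch of the substitution, using the stdlib string.Template as a stand-in for the generator's own SubstituteTemplate helper (the operation name is made up):

    from string import Template

    # Expand the suffixed alias for one kernel instance.
    instance = Template(
        "using Convolution_${operation_name} =\n"
        "    typename cutlass::conv::device::Convolution<...>;"
    )
    print(instance.substitute(operation_name="cutlass_simt_s8_ifprop_64x64"))
    # Kernels merged into one file now declare Convolution_<name_a>,
    # Convolution_<name_b>, ... instead of colliding on "Convolution".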
@@ -929,19 +929,6 @@ void initialize_${operation_name}(Manifest &manifest) {
             self.kernel_path, "%s.cu" % self.operation.procedural_name()
         )
         self.kernel_file = open(self.kernel_path, "w")
-        self.kernel_file.write(
-            SubstituteTemplate(
-                self.header_template,
-                {
-                    "required_cuda_ver_major": str(
-                        self.operation.required_cuda_ver_major
-                    ),
-                    "required_cuda_ver_minor": str(
-                        self.operation.required_cuda_ver_minor
-                    ),
-                },
-            )
-        )
         return self

     #
@@ -965,7 +952,6 @@ void initialize_${operation_name}(Manifest &manifest) {
     #
     def __exit__(self, exception_type, exception_value, traceback):
-        self.kernel_file.write(self.epilogue_template)
         self.kernel_file.close()
@@ -1347,19 +1347,6 @@ void initialize_${operation_name}(Manifest &manifest) {
             self.kernel_path, "%s.cu" % self.operation.procedural_name()
         )
         self.kernel_file = open(self.kernel_path, "w")
-        self.kernel_file.write(
-            SubstituteTemplate(
-                self.header_template,
-                {
-                    "required_cuda_ver_major": str(
-                        self.operation.required_cuda_ver_major
-                    ),
-                    "required_cuda_ver_minor": str(
-                        self.operation.required_cuda_ver_minor
-                    ),
-                },
-            )
-        )
         return self

     #
@@ -1379,7 +1366,6 @@ void initialize_${operation_name}(Manifest &manifest) {
     #
     def __exit__(self, exception_type, exception_value, traceback):
-        self.kernel_file.write(self.epilogue_template)
         self.kernel_file.close()
@@ -1435,20 +1421,6 @@ ${operation_instance}
             self.kernel_path, "%s.cu" % self.operation.procedural_name()
         )
         self.kernel_file = open(self.kernel_path, "w")
-        self.kernel_file.write(
-            SubstituteTemplate(
-                self.header_template,
-                {
-                    "wrapper_path": self.wrapper_path,
-                    "required_cuda_ver_major": str(
-                        self.operation.required_cuda_ver_major
-                    ),
-                    "required_cuda_ver_minor": str(
-                        self.operation.required_cuda_ver_minor
-                    ),
-                },
-            )
-        )
         return self

     #
@@ -1468,7 +1440,6 @@ ${operation_instance}
     #
     def __exit__(self, exception_type, exception_value, traceback):
-        self.kernel_file.write(self.epilogue_template)
         self.kernel_file.close()
@@ -35,24 +35,31 @@ def write_op_list(f, gen_op, gen_type):
     if gen_op != "gemv":
         f.write(' "all_%s_%s_operations.cu",\n' % (gen_op, gen_type))


+# Write down a list of merged filenames
+def write_merge_file_name(f, gen_op, gen_type):
+    f.write(' "{}_{}_1.cu",\n'.format(gen_op, gen_type))
+    f.write(' "{}_{}_2.cu",\n'.format(gen_op, gen_type))
+    if gen_op != "gemv":
+        f.write(' "all_{}_{}_operations.cu",\n'.format(gen_op, gen_type))
+
+
 if __name__ == "__main__":
     with open("list.bzl", "w") as f:
         f.write("# Generated by dnn/scripts/cutlass_generator/gen_list.py\n\n")
         f.write("cutlass_gen_list = [\n")
         write_op_list(f, "gemm", "simt")
         write_op_list(f, "gemm", "tensorop1688")
         write_op_list(f, "gemm", "tensorop884")
         write_op_list(f, "gemv", "simt")
         write_op_list(f, "deconv", "simt")
         write_op_list(f, "deconv", "tensorop8816")
         write_op_list(f, "conv2d", "simt")
         write_op_list(f, "conv2d", "tensorop8816")
         write_op_list(f, "conv2d", "tensorop8832")
         write_op_list(f, "dwconv2d_fprop", "simt")
         write_op_list(f, "dwconv2d_fprop", "tensorop884")
         write_op_list(f, "dwconv2d_dgrad", "simt")
         write_op_list(f, "dwconv2d_dgrad", "tensorop884")
         write_op_list(f, "dwconv2d_wgrad", "simt")
         write_op_list(f, "dwconv2d_wgrad", "tensorop884")
+        write_merge_file_name(f, "gemm", "simt")
+        write_merge_file_name(f, "gemm", "tensorop1688")
+        write_merge_file_name(f, "gemm", "tensorop884")
+        write_merge_file_name(f, "gemv", "simt")
+        write_merge_file_name(f, "deconv", "simt")
+        write_merge_file_name(f, "deconv", "tensorop8816")
+        write_merge_file_name(f, "conv2d", "simt")
+        write_merge_file_name(f, "conv2d", "tensorop8816")
+        write_merge_file_name(f, "conv2d", "tensorop8832")
+        write_merge_file_name(f, "dwconv2d_fprop", "simt")
+        write_merge_file_name(f, "dwconv2d_fprop", "tensorop884")
+        write_merge_file_name(f, "dwconv2d_dgrad", "simt")
+        write_merge_file_name(f, "dwconv2d_dgrad", "tensorop884")
+        write_merge_file_name(f, "dwconv2d_wgrad", "simt")
+        write_merge_file_name(f, "dwconv2d_wgrad", "tensorop884")
         f.write("]")
@@ -9,7 +9,7 @@
 import os.path
 import shutil
 import argparse
 import platform
 import string

 from library import *
 from manifest import *
@@ -1657,6 +1657,108 @@ def GenerateGemvOperations(args):
     return GenerateGemv_Simt(args)


+# Merge all per-kernel .cu files for one (operation, type) pair, e.g.
+# ("conv2d", "tensorop8816"), into two partial files
+# <operation>_<type>_1.cu and <operation>_<type>_2.cu, each carrying the
+# shared header exactly once.
+def concat_file(
+    file_path: str,
+    file_name_first: str,
+    file_name_last: str,
+    head: str,
+    required_cuda_ver_major: str,
+    required_cuda_ver_minor: str,
+    epilogue: str,
+    wrapper_path=None,
+):
+    filenames = os.listdir(file_path)
+    file1 = open("{}/{}_{}_1.cu".format(file_path, file_name_first, file_name_last), "w")
+    file2 = open("{}/{}_{}_2.cu".format(file_path, file_name_first, file_name_last), "w")
+
+    # Write the merged header once per output file.
+    values = {
+        "required_cuda_ver_major": str(required_cuda_ver_major),
+        "required_cuda_ver_minor": str(required_cuda_ver_minor),
+    }
+    if wrapper_path is not None:
+        values["wrapper_path"] = wrapper_path
+    header = SubstituteTemplate(head, values)
+    file1.write(header)
+    file2.write(header)
+
+    # Derive the substrings used to match the generated filenames,
+    # e.g. "tensorop8816" -> "tensorop" and "8816".
+    if "tensorop" in file_name_last:
+        sub_string_1 = "tensorop"
+        sub_string_2 = file_name_last[8:]
+    else:
+        sub_string_1 = sub_string_2 = "simt"
+    # Generated filenames do not spell out the operation verbatim:
+    # "dwconv2d_fprop" kernels use the short prefix "dwfprop", while plain
+    # conv2d and deconv kernels are emitted as "cutlass_*" files.
+    if "dwconv2d_" in file_name_first:
+        file_name_first = file_name_first[:2] + file_name_first[9:]
+    elif ("conv2d" in file_name_first) or ("deconv" in file_name_first):
+        file_name_first = "cutlass"
+
+    flag = 0
+    for filename in filenames:
+        name_match = (
+            file_name_first in filename
+            and sub_string_1 in filename
+            and sub_string_2 in filename
+        )
+        if "all_" in filename or not (name_match or filename[0].isdigit()):
+            continue
+        # Split: files up to len(filenames)/2 go to file1, the rest to file2.
+        flag += 1
+        filepath = file_path + "/" + filename
+        target = file1 if flag <= len(filenames) / 2 else file2
+        with open(filepath) as src:
+            target.write(src.read())
+        os.remove(filepath)
+        file1.write("\n")
+        file2.write("\n")
+
+    file1.write(epilogue)
+    file2.write(epilogue)
+    file1.close()
+    file2.close()


 ###################################################################################################
 ###################################################################################################
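The string slicing in concat_file is terse; a short sketch of the values it computes, with example inputs:

    # "tensorop8816" matches files containing both "tensorop" and "8816".
    file_name_last = "tensorop8816"
    assert ("tensorop", file_name_last[8:]) == ("tensorop", "8816")

    # "dwconv2d_fprop" -> "dw" + "fprop" == "dwfprop", the short prefix of
    # emitted depthwise kernel files; conv2d/deconv files start with "cutlass".
    name = "dwconv2d_fprop"
    assert name[:2] + name[9:] == "dwfprop"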
@@ -1727,18 +1829,33 @@ if __name__ == "__main__":
                 args.output, operation, short_path
             ) as emitter:
                 emitter.emit()
+        head = EmitConvSingleKernelWrapper(args.output, operations[0], short_path).header_template
+        required_cuda_ver_major = operations[0].required_cuda_ver_major
+        required_cuda_ver_minor = operations[0].required_cuda_ver_minor
+        epilogue = EmitConvSingleKernelWrapper(args.output, operations[0], short_path).epilogue_template
+        concat_file(args.output, args.operations, args.type, head, required_cuda_ver_major, required_cuda_ver_minor, epilogue)
     elif args.operations == "gemm":
         for operation in operations:
             with EmitGemmSingleKernelWrapper(
                 args.output, operation, short_path
             ) as emitter:
                 emitter.emit()
+        head = EmitGemmSingleKernelWrapper(args.output, operations[0], short_path).header_template
+        required_cuda_ver_major = operations[0].required_cuda_ver_major
+        required_cuda_ver_minor = operations[0].required_cuda_ver_minor
+        epilogue = EmitGemmSingleKernelWrapper(args.output, operations[0], short_path).epilogue_template
+        concat_file(args.output, args.operations, args.type, head, required_cuda_ver_major, required_cuda_ver_minor, epilogue)
     elif args.operations == "gemv":
         for operation in operations:
             with EmitGemvSingleKernelWrapper(
                 args.output, operation, gemv_wrapper_path, short_path
             ) as emitter:
                 emitter.emit()
+        head = EmitGemvSingleKernelWrapper(args.output, operations[0], gemv_wrapper_path, short_path).header_template
+        required_cuda_ver_major = operations[0].required_cuda_ver_major
+        required_cuda_ver_minor = operations[0].required_cuda_ver_minor
+        epilogue = EmitGemvSingleKernelWrapper(args.output, operations[0], gemv_wrapper_path, short_path).epilogue_template
+        concat_file(args.output, args.operations, args.type, head, required_cuda_ver_major, required_cuda_ver_minor, epilogue, wrapper_path=gemv_wrapper_path)

     if args.operations != "gemv":
         GenerateManifest(args, operations, args.output)
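Tying it together: after the per-kernel emitters run, the driver calls concat_file once per invocation. A hedged sketch of the equivalent direct call for the conv2d branch (argument values are illustrative; args, operations, and short_path come from the surrounding driver):

    wrapper = EmitConvSingleKernelWrapper(args.output, operations[0], short_path)
    concat_file(
        args.output,              # directory holding the per-kernel .cu files
        "conv2d",                 # args.operations
        "tensorop8816",           # args.type
        wrapper.header_template,
        operations[0].required_cuda_ver_major,
        operations[0].required_cuda_ver_minor,
        wrapper.epilogue_template,
    )
    # Produces conv2d_tensorop8816_1.cu and conv2d_tensorop8816_2.cu in args.output.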
#pragma once
#include "cutlass/gemm/kernel/default_gemv.h"
#include "cutlass/gemm/kernel/gemv_batched_strided.h"
#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh"