diff --git a/python/akg/__init__.py b/python/akg/__init__.py index a1a01629513df28ee61b81094268ea948190b4a0..8665dc9e645191eae2afe1f38a4f1d7063ff046f 100644 --- a/python/akg/__init__.py +++ b/python/akg/__init__.py @@ -81,5 +81,6 @@ from .autodiff import get_variables from .autodiff import register_variables from .lang.cce.te_compute.common import fargmax, fargmin, mad from . import lang +from .utils.dump_cuda_meta import dump_cuda_meta __all__ = ["differentiate"] diff --git a/python/akg/composite/build_module.py b/python/akg/composite/build_module.py index 3f725fcf99f98bcc0c83fb9f4587da59b2c80e35..cc4723aff231f21788bae9a5d7e430b80ef1a6dd 100644 --- a/python/akg/composite/build_module.py +++ b/python/akg/composite/build_module.py @@ -22,7 +22,6 @@ from akg import tvm from akg.tvm import _api_internal from .repository import __all__ as repository import topi -from akg.utils import dump_cuda_meta def generate_trait(desc): """ generate trait of kernel description """ @@ -181,5 +180,4 @@ def build_cuda(outputs, args, sch_name, kernel_name): dump_ir = os.getenv('MS_AKG_DUMP_IR') == "on" with tvm.build_config(dump_pass_ir = dump_ir): mod = akg.build(s, list(args), "cuda", name = kernel_name) - dump_cuda_meta.dump(mod, kernel_name, s, list(args)) return mod diff --git a/python/akg/utils/dump_cuda_meta.py b/python/akg/utils/dump_cuda_meta.py index e518b89bb985e6fc71cea1a8d37feefdcc6ace0e..da365d375c6cabf607ecf17bd0ef0c942fed1fdc 100644 --- a/python/akg/utils/dump_cuda_meta.py +++ b/python/akg/utils/dump_cuda_meta.py @@ -20,93 +20,66 @@ import fcntl import hashlib import akg.tvm -def get_dim(dim, axis=True): - """get dim info""" - dims_str = { - "grid_dim0": "// attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = ", - "grid_dim1": "// attr [iter_var(blockIdx.y, , blockIdx.y)] thread_extent = ", - "grid_dim2": "// attr [iter_var(blockIdx.z, , blockIdx.z)] thread_extent = ", - "block_dim0": "// attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = ", - 
"block_dim1": "// attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = ", - "block_dim2": "// attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent = " - } - dim_to_axis = { - "grid_dim0": '"blockIdx.x" : ', - "grid_dim1": '"blockIdx.y" : ', - "grid_dim2": '"blockIdx.z" : ', - "block_dim0": '"threadIdx.x" : ', - "block_dim1": '"threadIdx.y" : ', - "block_dim2": '"threadIdx.z" : ' - } - if axis: - return dim_to_axis.get(dim) - return dims_str.get(dim) - - -def parse_params(file, dim, ir): - """parse parameters""" - dim_str = get_dim(dim, axis=False) - pos = ir.find(dim_str) - if pos != -1: - index = pos + len(dim_str) - param_temp = get_dim(dim) - - while ir[index].isdigit(): - param_temp += ir[index] - index += 1 - file.write(param_temp + ",\n") - else: - param_temp = get_dim(dim) + '1' - file.write(param_temp + ",\n") -def save_gpu_params(s, args, kernel_info): - """save gpu parameters""" - ptx_code = kernel_info[0] - file_name = kernel_info[1] - kernel_name = kernel_info[2] +@akg.tvm.register_func +def dump_cuda_meta(code, ptx, thread_info): + """ + Function for dumping cuda meta. - - dump_ir = os.getenv('MS_AKG_DUMP_IR') == "on" - if dump_ir: - schedule_path = os.path.realpath(kernel_name) - all_passes = os.listdir(schedule_path) - for cur_pass in all_passes: - if cur_pass.startswith("00_"): - with open(schedule_path + '/' + cur_pass, "r") as file: - ir = file.read() - break - else: - ir = str(akg.tvm.lower(s, args, simple_mode=True)) - file_path = os.path.realpath(file_name) - if os.path.exists(file_path): - os.remove(file_path) + Args: + code: gpu code. + ptx: ptx code. + thread_info: thread info, written to json file. 
+ """ + # kernel name + kernel_name = code.split("_kernel")[0].split(" ")[-1] + # sha256 of ptx sha256 = hashlib.sha256() - sha256.update(ptx_code.encode("utf-8")) + sha256.update(ptx.encode("utf-8")) hash_str = sha256.hexdigest() - with os.fdopen(os.open(file_path, os.O_WRONLY | os.O_CREAT, 0o400), 'w') as fo: - fo.write("{\n") - fo.write('"kernelName" : ' + '"' + kernel_name + "_kernel0" + '",\n') - parse_params(fo, "grid_dim0", ir) - parse_params(fo, "grid_dim1", ir) - parse_params(fo, "grid_dim2", ir) - parse_params(fo, "block_dim0", ir) - parse_params(fo, "block_dim1", ir) - parse_params(fo, "block_dim2", ir) - fo.write('"sha256" : ' + '"' + hash_str + '"\n') - fo.write("}\n") -def dump(mod, kernel_name, sch, args): + # thread info + thread_info_dict = { + "blockIdx.x": "1", + "blockIdx.y": "1", + "blockIdx.z": "1", + "threadIdx.x": "1", + "threadIdx.y": "1", + "threadIdx.z": "1" + } + for thread_tag in thread_info_dict.keys(): + if thread_tag in thread_info: + if isinstance(thread_info[thread_tag], int): + thread_info_dict[thread_tag] = str(thread_info[thread_tag]) + elif isinstance(thread_info[thread_tag], akg.tvm.expr.IntImm): + thread_info_dict[thread_tag] = str(thread_info[thread_tag].value) + meta_path = "./cuda_meta_" + str(os.getpid()) + "/" cuda_path = os.path.realpath(meta_path) if not os.path.isdir(cuda_path): os.makedirs(cuda_path) + + # save ptx file to cuda meta ptx_file = os.path.realpath(meta_path + kernel_name + ".ptx") with open(ptx_file, "at") as f: fcntl.flock(f.fileno(), fcntl.LOCK_EX) f.seek(0, 2) if f.tell() == 0: - ptx_code = mod.imported_modules[0].get_source('ptx') - f.write(ptx_code) - param_path = os.path.realpath(meta_path + kernel_name + '.json') - save_gpu_params(sch, args, (ptx_code, param_path, kernel_name)) + f.write(ptx) + + # save json file to cuda meta + json_file = os.path.realpath(meta_path + kernel_name + ".json") + if os.path.exists(json_file): + os.remove(json_file) + with os.fdopen(os.open(json_file, os.O_WRONLY | 
os.O_CREAT, 0o400), 'w') as fo: + fo.write("{\n") + fo.write('"kernelName" : ' + '"' + kernel_name + "_kernel0" + '",\n') + fo.write('"blockIdx.x" : ' + thread_info_dict["blockIdx.x"] + ',\n') + fo.write('"blockIdx.y" : ' + thread_info_dict["blockIdx.y"] + ',\n') + fo.write('"blockIdx.z" : ' + thread_info_dict["blockIdx.z"] + ',\n') + fo.write('"threadIdx.x" : ' + thread_info_dict["threadIdx.x"] + ',\n') + fo.write('"threadIdx.y" : ' + thread_info_dict["threadIdx.y"] + ',\n') + fo.write('"threadIdx.z" : ' + thread_info_dict["threadIdx.z"] + ',\n') + fo.write('"sha256" : ' + '"' + hash_str + '"\n') + fo.write("}\n") diff --git a/python/akg/utils/kernel_exec.py b/python/akg/utils/kernel_exec.py index 9167cff1bba45b82efd2d0f954973f37f6e35f5d..ce570985f3b56f975462504b52dc48df29da5a0d 100644 --- a/python/akg/utils/kernel_exec.py +++ b/python/akg/utils/kernel_exec.py @@ -42,7 +42,6 @@ from akg.utils import format_transform as ft_util from akg.utils import custom_tiling as ct_util from akg.utils import validation_check as vc_util from akg.utils.dsl_create import TensorUtils -from akg.utils import dump_cuda_meta sh = logging.StreamHandler(sys.stdout) logging.getLogger().addHandler(sh) @@ -746,7 +745,6 @@ def op_build(op_func, input_shapes, input_types, op_attrs=None, kernel_name="", with akg.tvm.build_config(dump_pass_ir=dump_ir): mod = akg.build(s, op_var, "cuda", shape_var, name=kernel_name, attrs=attrs, polyhedral=polyhedral, binds=binds) - dump_cuda_meta.dump(mod, kernel_name, s, op_var) if dump_code: source_code = mod.imported_modules[0].get_source() create_code(kernel_name, "./", source_code, "CUDA") diff --git a/third_party/incubator-tvm/src/codegen/opt/build_cuda_on.cc b/third_party/incubator-tvm/src/codegen/opt/build_cuda_on.cc index 95ee4cf5be6cbbe324dbac8f6ccc43730e1bea04..6a451bc0fab4cad9bcc3b75b7e1cedacd15d662c 100644 --- a/third_party/incubator-tvm/src/codegen/opt/build_cuda_on.cc +++ b/third_party/incubator-tvm/src/codegen/opt/build_cuda_on.cc @@ -23,6 
+23,12 @@
 *
 * \file build_cuda.cc
 */
+
+/*
+ * 2020.8.14 - Get thread info inside BuildCUDA function,
+ * enable dump cuda meta.
+ */
+
 #if defined(__linux__)
 #include <sys/stat.h>
 #endif
@@ -133,8 +139,18 @@ runtime::Module BuildCUDA(Array<LoweredFunc> funcs) {
   CodeGenCUDA cg;
   cg.Init(output_ssa);
+  Map<std::string, Expr> thread_info;
   for (LoweredFunc f : funcs) {
     cg.AddFunction(f);
+
+    for (const auto &axis : f->thread_axis) {
+      auto thread_tag = axis->thread_tag;
+      auto node = axis->dom.get();
+      if (node != nullptr) {
+        CHECK(axis->dom->extent.as<IntImm>());
+        thread_info.Set(thread_tag, axis->dom->extent);
+      }
+    }
   }
   std::string code = cg.Finish();
@@ -151,6 +167,11 @@ runtime::Module BuildCUDA(Array<LoweredFunc> funcs) {
   } else {
     ptx = NVRTCCompile(code, cg.need_include_path());
   }
+
+  if (const auto* f = Registry::Get("dump_cuda_meta")) {
+    (*f)(code, ptx, thread_info);
+  }
+
   return CUDAModuleCreate(ptx, fmt, ExtractFuncInfo(funcs), code);
 }