!73 fix matmul tuning and support all space tuning.

Merge pull request !73 from chenlei_autodiff/matmul_tiling

!73 fix matmul tuning and support all space tuning.
Merge pull request !73 from chenlei_autodiff/matmul_tiling
41107c86 · mindspore-ci-bot · Gitee · 94ebe03e · 3713f94c · 41107c86
8 changed files
--- a/tests/fuzz/tune/autotuning/job.py
+++ b/tests/fuzz/tune/autotuning/job.py
@@ -15,22 +15,23 @@
 """AutoTuning job"""
 import os
 import json
+import time
 import datetime
 import importlib
 import logging
+import subprocess
 import numpy as np
 from collections import namedtuple
 from akg import composite
 from akg.utils import kernel_exec as utils
 from autotuning.runner import KernelRunner, error_time_list, error_time_string
-from autotuning.tuner import ModelBasedTuner
+from autotuning.tuner import ModelBasedTuner, Tuner
 from autotuning.type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc
 from autotuning.space_generators import get_space
 from autotuning.space import ListConfigSpace
 from autotuning.test_data_generators import gen_data

-logging.basicConfig(level=logging.DEBUG,
-                    format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
+logging.basicConfig(level=logging.DEBUG)

 logger = logging.getLogger('fuzz.tune.autotuning.job')

@@ -92,11 +93,16 @@ def launch_json(debug_mode: bool = True, save_res: bool = False, json_input_dir=
        if save_res:
            save_tuning_result(key, "json", None, index_table, tuner)

-def jobs(op_type: str = 'add', desc=None, debug_mode: bool = True,
-         save_res: bool = False, insert_key='', conf_of_set_dim=""):
+def jobs(op_type: str = 'add', desc=None, debug_mode: bool = True, save_res: bool = False,
+         all_space: bool = True, insert_key='', conf_of_set_dim=""):
    """AutoTuning jobs"""
    iter_times = [3, 3, 3] if debug_mode else [80, 160, 320]
+    time_start_get_space = time.time()
    index_table, space, key, expect, input_for_mod = get_space(op_type, desc)
+    if all_space:
+        iter_times = [space.length, space.length, space.length]
+    time_end_get_space = time.time()
+    print("get space time: ", time_end_get_space - time_start_get_space)
    print('space size:', space.length)
    print('index table:', index_table)
    key = key if insert_key == '' else insert_key
@@ -121,12 +127,18 @@ def jobs(op_type: str = 'add', desc=None, debug_mode: bool = True,
    # available device numbers, normally is 8 or 1
    available_device_numbers = utils.get_available_devices_num()

-    tuner = ModelBasedTuner(runner, index_table, space,
-                            n_parallel=available_device_numbers if is_truly_profiling else 1,
-                            plan_size=64, pre_model=None)
+    time_start_tuning = time.time()
+    if all_space:
+        tuner = Tuner(runner, index_table, space, n_parallel=available_device_numbers)
+    else:
+        tuner = ModelBasedTuner(runner, index_table, space,
+                                n_parallel=available_device_numbers if is_truly_profiling else 1,
+                                plan_size=64, pre_model=None)
    least_try_times = iter_times[0 if space.length < 10 ** 4 else 1 if space.length < 10 ** 5 else 2]
    tuner.tune(least_try_times, output_file=op_type + ".log")

+    time_end_tuning = time.time()
+    print("tuning time: ", time_end_tuning - time_start_tuning)
    print_tuning_result(op_type, space, index_table, tuner, key)

    if save_res:
@@ -231,46 +243,48 @@ def load_json_configs(op_type):
            return {}
    return {}

-def read_shapes_from_file(debug_mode, save_res, conf_of_set_dim, op_type):
+def read_shapes_from_file(debug_mode, save_res, all_space, conf_of_set_dim, op_type):
    """read tuning shapes from file"""
    file = importlib.import_module('autotuning.shapes.' + op_type)
    shapes = file.shapes
    for _, shp in enumerate(shapes):
-        do_profiling(shp, debug_mode, save_res, op_type, conf_of_set_dim)
+        do_profiling(shp, debug_mode, save_res, all_space, op_type, conf_of_set_dim)

-def do_profiling(shp, debug_mode, save_res, op_type, conf_of_set_dim=None):
+def do_profiling(shp, debug_mode, save_res, all_space, op_type, conf_of_set_dim=None):
    """do profiling"""
+    # remove undeleted JOB files for previous shapes
+    subprocess.run("rm -rf /var/log/npu/profiling/JOB*", shell=True)
    if op_type == 'matmul':
        key = shp[2][0:-1]
        logger.debug("start profiling: [%s]", str(key))
        desc = MatmulCubeDesc(*key)
-        jobs(op_type, desc, debug_mode, save_res, key.__str__(), conf_of_set_dim)
+        jobs(op_type, desc, debug_mode, save_res, all_space, key.__str__(), conf_of_set_dim)
        logger.debug("end profiling: [%s]", str(key))
    elif op_type.startswith('conv_backprop'):
        key = shp[2]
        logger.debug("start profiling: [%s]", str(key))
        desc = ConvBackpropDesc(*key)
-        jobs(op_type, desc, debug_mode, save_res, key.__str__(), conf_of_set_dim)
+        jobs(op_type, desc, debug_mode, save_res, all_space, key.__str__(), conf_of_set_dim)
        logger.debug("end profiling: [%s]", str(key))
    elif op_type.startswith('conv'):
        key = shp[2]
        logger.debug("start profiling: [%s]", str(key))
        desc = ConvDesc(*key)
-        jobs(op_type, desc, debug_mode, save_res, key.__str__(), conf_of_set_dim)
+        jobs(op_type, desc, debug_mode, save_res, all_space, key.__str__(), conf_of_set_dim)
        logger.debug("end profiling: [%s]", str(key))
    else:
        key = shp
        logger.debug("start profiling: [%s]", str(key))
        desc = key
-        jobs(op_type, desc, debug_mode, save_res, conf_of_set_dim=conf_of_set_dim)
+        jobs(op_type, desc, debug_mode, save_res, all_space, conf_of_set_dim=conf_of_set_dim)
        logger.debug("end profiling: [%s]", str(key))

-def launch(op_type, debug_mode, save_res=False, desc=None):
+def launch(op_type, debug_mode, save_res=False, desc=None, all_space=False):
    # get the existed tiling
    conf_of_set_dim = load_json_configs(op_type)

    if desc is None:
-        read_shapes_from_file(debug_mode, save_res, conf_of_set_dim, op_type)
+        read_shapes_from_file(debug_mode, save_res, all_space, conf_of_set_dim, op_type)
    else:
        shp = desc
-        do_profiling(shp, debug_mode, save_res, op_type)
+        do_profiling(shp, debug_mode, save_res, all_space, op_type)
--- a/tests/fuzz/tune/autotuning/kernel_compiler.py
+++ b/tests/fuzz/tune/autotuning/kernel_compiler.py
@@ -115,7 +115,8 @@ def gen_kernel_matmul_cube(op_desc: MatmulCubeDesc, _, index_table,
        attrs = {'dim': dim_info, 'bypass': config.bypass}
    return matmul_run.matmul_compile(op_desc.x_shape, op_desc.y_shape, op_desc.bias, op_desc.left_format,
                                     op_desc.right_format, op_desc.out_format, op_desc.adj_x, op_desc.adj_y,
-                                     op_desc.dtype, op_desc.out_dtype, kernel_name, attrs, gen_tiling_spaces)
+                                     op_desc.dtype, op_desc.bias_dtype, op_desc.out_dtype, kernel_name,
+                                     attrs, tuning=gen_tiling_spaces)


 def gen_kernel_conv_backprop_input(op_desc: ConvBackpropDesc, _, index_table, config: ConvBackpropInputConfig = None,

--- a/tests/fuzz/tune/autotuning/runner.py
+++ b/tests/fuzz/tune/autotuning/runner.py
@@ -18,6 +18,7 @@ import multiprocessing
 import logging
 import os
 import subprocess
+import time
 from typing import NamedTuple
 import numpy as np
 from akg import composite
@@ -86,8 +87,10 @@ class KernelRunner:

    def run_one_kernel(self, run_times, idx, config, best_time=np.inf, is_auto=False):
        """Compile and execute a config of the operator on device"""
+        time_one_kernel_start = time.time()
        logger.debug('compile %dth kernel', idx)
        try:
+            time_start_build = time.time()
            if self.op_type == "json":
                if is_auto:
                    mod = composite.build(self.op_desc)
@@ -105,6 +108,8 @@ class KernelRunner:
            else:
                mod = compile_kernel(self.op_type, self.op_desc, self.input_shape, self._index_table,
                                     None if is_auto else config.input, idx)
+            time_end_build = time.time()
+            logger.debug("build module time: %f", time_end_build - time_start_build)
            logger.debug('finished compile %dth kernel', idx)
        except BaseException as e:
            logger.debug("Compile Failed: [%s] : %s", "origin" if is_auto else str(config.input), str(e))
@@ -127,6 +132,7 @@ class KernelRunner:
            for _ in range(self.repeat_times):
                stat_info = {}
                try:
+                    time_start_launch = time.time()
                    if self.mod_output_param is not None:
                        output, stat_info = utils.mod_launch(mod, list(self.input), self.mod_output_param,
                                                             tuning=True, device_id=device_id)
@@ -144,18 +150,24 @@ class KernelRunner:
                                stat_info['run_time'] = precision_error_time
                                logger.debug("Precision Error: [%s]",
                                             "origin" if config is None else str(config.input))
+                    time_end_launch = time.time()
+                    logger.debug("mod launch time: %f", time_end_launch - time_start_launch)
                except BaseException as e:
                    logger.debug("Run Failed: [%s] : %s", str(config.input), str(e))
                    stat_info['run_time'] = run_failed_time
                run_times[idx] = np.minimum(run_times[idx], stat_info['run_time'])
        finally:
            logger.debug('end of %dth kernel', idx)
+            time_one_kernel_end = time.time()
+            logger.debug('run one kernel time: %f', time_one_kernel_end - time_one_kernel_start)
        return

-    def run(self, configs, best_time=np.inf, is_auto_set_dim=False):
+    def run(self, configs, best_time=np.inf, is_auto_set_dim=False, all_space=False):
        """Compile and execute a batch config of the operator on device"""
        start = time.time()
+        logger.setLevel(logging.DEBUG)
        logger.debug("gen cce kernels batch: %d kernels", len(configs))
+        subprocess.run("rm -rf ./jobs/JOB*", shell=True)
        process_jobs = []
        run_times = multiprocessing.Manager().list(np.full((len(configs),), compile_fail_time))
        for idx, config in enumerate(configs):
@@ -173,6 +185,8 @@ class KernelRunner:
                run_times[idx] = timeout_time
                p.terminate()

+        process_end = time.time()
+        logger.debug("process time: %f", process_end - start)
        # clean the profiling directory
        tune_device = int(os.environ['DEVICE_ID'])
        tune_num = int(os.environ['DEVICE_TOTAL_NUM'])
@@ -206,6 +220,7 @@ class KernelRunner:
                job_file = p[0].decode('utf8').strip().split('/')[-2]
                subprocess.run("rm -rf ./jobs/%s" % job_file, shell=True)
        end = time.time()
+        logger.debug("run kernels time: %f", end - start)
        self.run_kernel_time += end - start

        for idx, config in enumerate(configs):

--- a/tests/fuzz/tune/autotuning/space.py
+++ b/tests/fuzz/tune/autotuning/space.py
@@ -161,6 +161,9 @@ class ListConfigSpace(ConfigSpace):
        """reset fetch state"""
        self.__fetch_pool = [i for i in range(len(self._configs))]

+    def fetch_scope(self, start, end):
+        self.__fetch_pool = [i for i in range(start, end)]
+
    def has_next(self) -> bool:
        return len(self.__fetch_pool) > 0

@@ -172,6 +175,12 @@ class ListConfigSpace(ConfigSpace):
        self.__fetch_pool.pop()
        return ret

+    def fetch_next_index(self) -> int:
+        """fetch next index of config"""
+        idx = len(self.__fetch_pool) - 1 + self.__fetch_pool[0]
+        self.__fetch_pool.pop()
+        return idx
+
    def fetch_config(self) -> ConfigEntity:
        """fetch a random config"""
        return self.get(self.fetch_index())

--- a/tests/fuzz/tune/autotuning/test_data_generators.py
+++ b/tests/fuzz/tune/autotuning/test_data_generators.py
@@ -107,10 +107,10 @@ def _gen_data_matmul_cube(op_desc: MatmulCubeDesc):
    _, _, _, out_shape, k = matmul_run.get_converted_shapes(m, n, k, batch_tuple, op_desc.adj_x, op_desc.adj_y,
                                                            op_desc.bias, op_desc.left_format, op_desc.right_format,
                                                            op_desc.out_format)
-    m_x, m_y, bench_mark, bias_data = matmul_run.matmul_data(batch_tuple, m, k, n, op_desc.dtype, op_desc.out_dtype,
-                                                             op_desc.bias, op_desc.adj_x, op_desc.adj_y,
-                                                             op_desc.left_format, op_desc.right_format,
-                                                             op_desc.out_format)
+    m_x, m_y, bench_mark, bias_data = matmul_run.matmul_data(batch_tuple, m, k, n, op_desc.dtype, op_desc.bias_dtype,
+                                                             op_desc.out_dtype, op_desc.bias, op_desc.adj_x,
+                                                             op_desc.adj_y, op_desc.left_format,
+                                                             op_desc.right_format, op_desc.out_format)

    out_data = np.full(out_shape, np.nan, op_desc.out_dtype)


--- a/tests/fuzz/tune/autotuning/tuner.py
+++ b/tests/fuzz/tune/autotuning/tuner.py
@@ -93,7 +93,7 @@ class Tuner:
        print('tuning time:', self._tuning_time, 'secs')

    def next_batch(self, batch_size: int, is_add_visited=True):
-        """extract next batch"""
+        """extract next batch with xgboost model"""
        ret = []
        counter = 0
        if not is_add_visited:
@@ -116,6 +116,17 @@ class Tuner:
            counter += 1
        return ret

+    def next_config(self, batch_size: int):
+        """extract next config orderly"""
+        ret = []
+        counter = 0
+        while counter < batch_size and self._space.has_next():
+            index = self._space.fetch_next_index()
+            ret.append(self._space.get(index))
+            self._visited.add(index)
+            counter += 1
+        return ret
+
    def export_configs(self, configs: list, output_file: str, append: bool = True, desc=""):
        """export configs"""
        mode = "a" if append else "w"
@@ -158,13 +169,13 @@ class Tuner:
        while i < least_try_times:
            if not self._space.has_next():
                break
-            configs = self.next_batch(min(self._n_parallel, least_try_times - i))
+            configs = self.next_config(min(self._n_parallel, least_try_times - i))
            run_times = self._runner.run(configs, self._best_time)
            results = []
            for idx, conf in enumerate(configs):
                results.append((conf.input_id, run_times[idx]))
                # keep best config
-                if self.best_time < run_times[idx]:
+                if self.best_time > run_times[idx]:
                    self._best_time = run_times[idx]
                    self._best_iter = i + idx
                    self._best_config = conf
@@ -224,6 +235,7 @@ class ModelBasedTuner(Tuner):
        self.__least_try_times = least_try_times
        self.__early_stopping = early_stopping

+        logger.setLevel(logging.DEBUG)
        old_level = logger.level
        i = 0
        error_ct = 0

--- a/tests/fuzz/tune/autotuning/type_definitions.py
+++ b/tests/fuzz/tune/autotuning/type_definitions.py
@@ -21,7 +21,7 @@ ConvDesc = namedtuple("ConvDesc", ['fmap_shape', 'filter_shape', 'pad', 'stride'
 ConvBackpropDesc = namedtuple("ConvBackpropDesc", ['fmap_shape', 'filter_shape', 'pad', 'stride', 'dilation'])

 MatmulCubeDesc = namedtuple("MatmulCubeDesc", ["x_shape", "y_shape", "bias", "left_format", "right_format",
-                                               "out_format", "adj_x", "adj_y", "dtype", "out_dtype"])
+                                               "out_format", "adj_x", "adj_y", "dtype", "bias_dtype", "out_dtype"])

 # config param definitions
 ConvConfig = namedtuple('ConvConfig', ['tile_h', 'tile_co', 'tile_m', 'tile_k', 'tile_n', 'tile_w', 'bypass'])

--- a/tests/fuzz/tune/test.py
+++ b/tests/fuzz/tune/test.py
@@ -13,11 +13,16 @@
 # limitations under the License.

 """test"""
+import time
 from autotuning.job import launch
 from test_run.sub_run import sub_execute

+time_start = time.time()
 op_type_ = 'sub'
 debug_mode_ = True
 save_res_ = True
+all_space_ = False
 desc_ = ('024_sub_64_16_128_128_64_16_128_128_fp16', sub_execute, [(64, 16, 128, 128), (64, 16, 128, 1), 'float16'])
-launch(op_type=op_type_, debug_mode=debug_mode_, save_res=save_res_, desc=desc_)
+launch(op_type=op_type_, debug_mode=debug_mode_, save_res=save_res_, desc=desc_, all_space=all_space_)
+time_end = time.time()
+print("launch time: ", time_end - time_start)