Commit 3713f94c authored by chenlei_autodiff

fix matmul tuning and support all space tuning.

Parent 6335daad
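In brief: this commit threads a new all_space flag from launch() down through jobs() and do_profiling(), adds sequential-iteration helpers to ListConfigSpace and Tuner, routes the new bias_dtype field of MatmulCubeDesc through kernel generation and test-data generation, and fixes an inverted best-time comparison in the tune loop. A minimal sketch of driving the new mode, mirroring the test script at the bottom of this diff (argument values are illustrative, not from the commit):

from autotuning.job import launch

# all_space=True sizes the try budget to the whole config space and picks
# the sequential Tuner instead of the XGBoost-based ModelBasedTuner
# (see the dispatch added in jobs() below).
launch(op_type='matmul', debug_mode=False, save_res=True,
       desc=None, all_space=True)  # desc=None: read shapes from autotuning.shapes.matmul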
@@ -15,22 +15,23 @@
"""AutoTuning job"""
import os
import json
import time
import datetime
import importlib
import logging
import subprocess
import numpy as np
from collections import namedtuple
from akg import composite
from akg.utils import kernel_exec as utils
from autotuning.runner import KernelRunner, error_time_list, error_time_string
from autotuning.tuner import ModelBasedTuner
from autotuning.tuner import ModelBasedTuner, Tuner
from autotuning.type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc
from autotuning.space_generators import get_space
from autotuning.space import ListConfigSpace
from autotuning.test_data_generators import gen_data
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('fuzz.tune.autotuning.job')
@@ -92,11 +93,16 @@ def launch_json(debug_mode: bool = True, save_res: bool = False, json_input_dir=
if save_res:
save_tuning_result(key, "json", None, index_table, tuner)
def jobs(op_type: str = 'add', desc=None, debug_mode: bool = True,
save_res: bool = False, insert_key='', conf_of_set_dim=""):
def jobs(op_type: str = 'add', desc=None, debug_mode: bool = True, save_res: bool = False,
all_space: bool = True, insert_key='', conf_of_set_dim=""):
"""AutoTuning jobs"""
iter_times = [3, 3, 3] if debug_mode else [80, 160, 320]
time_start_get_space = time.time()
index_table, space, key, expect, input_for_mod = get_space(op_type, desc)
if all_space:
iter_times = [space.length, space.length, space.length]
time_end_get_space = time.time()
print("get space time: ", time_end_get_space - time_start_get_space)
print('space size:', space.length)
print('index table:', index_table)
key = key if insert_key == '' else insert_key
@@ -121,12 +127,18 @@ def jobs(op_type: str = 'add', desc=None, debug_mode: bool = True,
# number of available devices, normally 8 or 1
available_device_numbers = utils.get_available_devices_num()
tuner = ModelBasedTuner(runner, index_table, space,
n_parallel=available_device_numbers if is_truly_profiling else 1,
plan_size=64, pre_model=None)
time_start_tuning = time.time()
if all_space:
tuner = Tuner(runner, index_table, space, n_parallel=available_device_numbers)
else:
tuner = ModelBasedTuner(runner, index_table, space,
n_parallel=available_device_numbers if is_truly_profiling else 1,
plan_size=64, pre_model=None)
least_try_times = iter_times[0 if space.length < 10 ** 4 else 1 if space.length < 10 ** 5 else 2]
tuner.tune(least_try_times, output_file=op_type + ".log")
time_end_tuning = time.time()
print("tuning time: ", time_end_tuning - time_start_tuning)
print_tuning_result(op_type, space, index_table, tuner, key)
if save_res:
@@ -231,46 +243,48 @@ def load_json_configs(op_type):
return {}
return {}
def read_shapes_from_file(debug_mode, save_res, conf_of_set_dim, op_type):
def read_shapes_from_file(debug_mode, save_res, all_space, conf_of_set_dim, op_type):
"""read tuning shapes from file"""
file = importlib.import_module('autotuning.shapes.' + op_type)
shapes = file.shapes
for _, shp in enumerate(shapes):
do_profiling(shp, debug_mode, save_res, op_type, conf_of_set_dim)
do_profiling(shp, debug_mode, save_res, all_space, op_type, conf_of_set_dim)
def do_profiling(shp, debug_mode, save_res, op_type, conf_of_set_dim=None):
def do_profiling(shp, debug_mode, save_res, all_space, op_type, conf_of_set_dim=None):
"""do profiling"""
# remove leftover JOB files from previous shapes
subprocess.run("rm -rf /var/log/npu/profiling/JOB*", shell=True)
if op_type == 'matmul':
key = shp[2][0:-1]
logger.debug("start profiling: [%s]", str(key))
desc = MatmulCubeDesc(*key)
jobs(op_type, desc, debug_mode, save_res, key.__str__(), conf_of_set_dim)
jobs(op_type, desc, debug_mode, save_res, all_space, key.__str__(), conf_of_set_dim)
logger.debug("end profiling: [%s]", str(key))
elif op_type.startswith('conv_backprop'):
key = shp[2]
logger.debug("start profiling: [%s]", str(key))
desc = ConvBackpropDesc(*key)
jobs(op_type, desc, debug_mode, save_res, key.__str__(), conf_of_set_dim)
jobs(op_type, desc, debug_mode, save_res, all_space, key.__str__(), conf_of_set_dim)
logger.debug("end profiling: [%s]", str(key))
elif op_type.startswith('conv'):
key = shp[2]
logger.debug("start profiling: [%s]", str(key))
desc = ConvDesc(*key)
jobs(op_type, desc, debug_mode, save_res, key.__str__(), conf_of_set_dim)
jobs(op_type, desc, debug_mode, save_res, all_space, key.__str__(), conf_of_set_dim)
logger.debug("end profiling: [%s]", str(key))
else:
key = shp
logger.debug("start profiling: [%s]", str(key))
desc = key
jobs(op_type, desc, debug_mode, save_res, conf_of_set_dim=conf_of_set_dim)
jobs(op_type, desc, debug_mode, save_res, all_space, conf_of_set_dim=conf_of_set_dim)
logger.debug("end profiling: [%s]", str(key))
def launch(op_type, debug_mode, save_res=False, desc=None):
def launch(op_type, debug_mode, save_res=False, desc=None, all_space=False):
# get the existing tiling
conf_of_set_dim = load_json_configs(op_type)
if desc is None:
read_shapes_from_file(debug_mode, save_res, conf_of_set_dim, op_type)
read_shapes_from_file(debug_mode, save_res, all_space, conf_of_set_dim, op_type)
else:
shp = desc
do_profiling(shp, debug_mode, save_res, op_type)
do_profiling(shp, debug_mode, save_res, all_space, op_type)
@@ -115,7 +115,8 @@ def gen_kernel_matmul_cube(op_desc: MatmulCubeDesc, _, index_table,
attrs = {'dim': dim_info, 'bypass': config.bypass}
return matmul_run.matmul_compile(op_desc.x_shape, op_desc.y_shape, op_desc.bias, op_desc.left_format,
op_desc.right_format, op_desc.out_format, op_desc.adj_x, op_desc.adj_y,
op_desc.dtype, op_desc.out_dtype, kernel_name, attrs, gen_tiling_spaces)
op_desc.dtype, op_desc.bias_dtype, op_desc.out_dtype, kernel_name,
attrs, tuning=gen_tiling_spaces)
def gen_kernel_conv_backprop_input(op_desc: ConvBackpropDesc, _, index_table, config: ConvBackpropInputConfig = None,
......
@@ -18,6 +18,7 @@ import multiprocessing
import logging
import os
import subprocess
import time
from typing import NamedTuple
import numpy as np
from akg import composite
@@ -86,8 +87,10 @@ class KernelRunner:
def run_one_kernel(self, run_times, idx, config, best_time=np.inf, is_auto=False):
"""Compile and execute a config of the operator on device"""
time_one_kernel_start = time.time()
logger.debug('compile %dth kernel', idx)
try:
time_start_build = time.time()
if self.op_type == "json":
if is_auto:
mod = composite.build(self.op_desc)
@@ -105,6 +108,8 @@
else:
mod = compile_kernel(self.op_type, self.op_desc, self.input_shape, self._index_table,
None if is_auto else config.input, idx)
time_end_build = time.time()
logger.debug("build module time: %f", time_end_build - time_start_build)
logger.debug('finished compile %dth kernel', idx)
except BaseException as e:
logger.debug("Compile Failed: [%s] : %s", "origin" if is_auto else str(config.input), str(e))
@@ -127,6 +132,7 @@
for _ in range(self.repeat_times):
stat_info = {}
try:
time_start_launch = time.time()
if self.mod_output_param is not None:
output, stat_info = utils.mod_launch(mod, list(self.input), self.mod_output_param,
tuning=True, device_id=device_id)
@@ -144,18 +150,24 @@
stat_info['run_time'] = precision_error_time
logger.debug("Precision Error: [%s]",
"origin" if config is None else str(config.input))
time_end_launch = time.time()
logger.debug("mod launch time: %f", time_end_launch - time_start_launch)
except BaseException as e:
logger.debug("Run Failed: [%s] : %s", str(config.input), str(e))
stat_info['run_time'] = run_failed_time
run_times[idx] = np.minimum(run_times[idx], stat_info['run_time'])
finally:
logger.debug('end of %dth kernel', idx)
time_one_kernel_end = time.time()
logger.debug('run one kernel time: %f', time_one_kernel_end - time_one_kernel_start)
return
def run(self, configs, best_time=np.inf, is_auto_set_dim=False):
def run(self, configs, best_time=np.inf, is_auto_set_dim=False, all_space=False):
"""Compile and execute a batch config of the operator on device"""
start = time.time()
logger.setLevel(logging.DEBUG)
logger.debug("gen cce kernels batch: %d kernels", len(configs))
subprocess.run("rm -rf ./jobs/JOB*", shell=True)
process_jobs = []
run_times = multiprocessing.Manager().list(np.full((len(configs),), compile_fail_time))
for idx, config in enumerate(configs):
@@ -173,6 +185,8 @@
run_times[idx] = timeout_time
p.terminate()
process_end = time.time()
logger.debug("process time: %f", process_end - start)
# clean the profiling directory
tune_device = int(os.environ['DEVICE_ID'])
tune_num = int(os.environ['DEVICE_TOTAL_NUM'])
@@ -206,6 +220,7 @@
job_file = p[0].decode('utf8').strip().split('/')[-2]
subprocess.run("rm -rf ./jobs/%s" % job_file, shell=True)
end = time.time()
logger.debug("run kernels time: %f", end - start)
self.run_kernel_time += end - start
for idx, config in enumerate(configs):
......
@@ -161,6 +161,9 @@ class ListConfigSpace(ConfigSpace):
"""reset fetch state"""
self.__fetch_pool = [i for i in range(len(self._configs))]
def fetch_scope(self, start, end):
"""set the fetch pool to the config indices in [start, end)"""
self.__fetch_pool = [i for i in range(start, end)]
def has_next(self) -> bool:
return len(self.__fetch_pool) > 0
@@ -172,6 +175,12 @@
self.__fetch_pool.pop()
return ret
def fetch_next_index(self) -> int:
"""fetch next index of config"""
# for a contiguous pool (as set up by reset or fetch_scope), this is
# its last element: __fetch_pool[0] + len - 1; pop() then removes it
idx = len(self.__fetch_pool) - 1 + self.__fetch_pool[0]
self.__fetch_pool.pop()
return idx
def fetch_config(self) -> ConfigEntity:
"""fetch a random config"""
return self.get(self.fetch_index())
......
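A usage sketch of the two ListConfigSpace helpers added above (assuming a populated space object; note that fetch_next_index pops from the tail of the pool, so indices come back highest first):

# walk the configs with indices in [0, 10), highest index first
space.fetch_scope(0, 10)
while space.has_next():
    idx = space.fetch_next_index()
    config = space.get(idx)
    # ... compile and measure `config` via KernelRunner ...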
@@ -107,10 +107,10 @@ def _gen_data_matmul_cube(op_desc: MatmulCubeDesc):
_, _, _, out_shape, k = matmul_run.get_converted_shapes(m, n, k, batch_tuple, op_desc.adj_x, op_desc.adj_y,
op_desc.bias, op_desc.left_format, op_desc.right_format,
op_desc.out_format)
m_x, m_y, bench_mark, bias_data = matmul_run.matmul_data(batch_tuple, m, k, n, op_desc.dtype, op_desc.out_dtype,
op_desc.bias, op_desc.adj_x, op_desc.adj_y,
op_desc.left_format, op_desc.right_format,
op_desc.out_format)
m_x, m_y, bench_mark, bias_data = matmul_run.matmul_data(batch_tuple, m, k, n, op_desc.dtype, op_desc.bias_dtype,
op_desc.out_dtype, op_desc.bias, op_desc.adj_x,
op_desc.adj_y, op_desc.left_format,
op_desc.right_format, op_desc.out_format)
out_data = np.full(out_shape, np.nan, op_desc.out_dtype)
......
@@ -93,7 +93,7 @@ class Tuner:
print('tuning time:', self._tuning_time, 'secs')
def next_batch(self, batch_size: int, is_add_visited=True):
"""extract next batch"""
"""extract next batch with xgboost model"""
ret = []
counter = 0
if not is_add_visited:
@@ -116,6 +116,17 @@
counter += 1
return ret
def next_config(self, batch_size: int):
"""extract next config orderly"""
ret = []
counter = 0
while counter < batch_size and self._space.has_next():
index = self._space.fetch_next_index()
ret.append(self._space.get(index))
self._visited.add(index)
counter += 1
return ret
def export_configs(self, configs: list, output_file: str, append: bool = True, desc=""):
"""export configs"""
mode = "a" if append else "w"
@@ -158,13 +169,13 @@
while i < least_try_times:
if not self._space.has_next():
break
configs = self.next_batch(min(self._n_parallel, least_try_times - i))
configs = self.next_config(min(self._n_parallel, least_try_times - i))
run_times = self._runner.run(configs, self._best_time)
results = []
for idx, conf in enumerate(configs):
results.append((conf.input_id, run_times[idx]))
# keep best config
if self.best_time < run_times[idx]:
if self.best_time > run_times[idx]:
self._best_time = run_times[idx]
self._best_iter = i + idx
self._best_config = conf
@@ -224,6 +235,7 @@
self.__least_try_times = least_try_times
self.__early_stopping = early_stopping
logger.setLevel(logging.DEBUG)
old_level = logger.level
i = 0
error_ct = 0
......
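Taken together, next_config() and the tune() changes above turn the all-space path into a plain sequential sweep. A compressed, illustrative rendering (names follow the diff; logging, visited bookkeeping, and error paths are omitted):

def tune_all_space(tuner, runner, space, least_try_times, n_parallel):
    best_time, best_config = float('inf'), None
    i = 0
    while i < least_try_times and space.has_next():
        configs = tuner.next_config(min(n_parallel, least_try_times - i))
        run_times = runner.run(configs, best_time)
        for idx, conf in enumerate(configs):
            if run_times[idx] < best_time:  # the comparison direction this commit fixes
                best_time, best_config = run_times[idx], conf
        i += len(configs)
    return best_time, best_config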
@@ -21,7 +21,7 @@ ConvDesc = namedtuple("ConvDesc", ['fmap_shape', 'filter_shape', 'pad', 'stride'
ConvBackpropDesc = namedtuple("ConvBackpropDesc", ['fmap_shape', 'filter_shape', 'pad', 'stride', 'dilation'])
MatmulCubeDesc = namedtuple("MatmulCubeDesc", ["x_shape", "y_shape", "bias", "left_format", "right_format",
"out_format", "adj_x", "adj_y", "dtype", "out_dtype"])
"out_format", "adj_x", "adj_y", "dtype", "bias_dtype", "out_dtype"])
# config param definitions
ConvConfig = namedtuple('ConvConfig', ['tile_h', 'tile_co', 'tile_m', 'tile_k', 'tile_n', 'tile_w', 'bypass'])
......
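MatmulCubeDesc now carries the bias dtype explicitly, and do_profiling() builds one of these from each shape entry. A hypothetical instance showing where the new bias_dtype slot sits (shapes and format strings are illustrative):

from autotuning.type_definitions import MatmulCubeDesc

desc = MatmulCubeDesc(x_shape=(256, 256), y_shape=(256, 256), bias=0,
                      left_format='zZ', right_format='nZ', out_format='zN',
                      adj_x=False, adj_y=False,
                      dtype='float16', bias_dtype='float16', out_dtype='float16')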
@@ -13,11 +13,16 @@
# limitations under the License.
"""test"""
import time
from autotuning.job import launch
from test_run.sub_run import sub_execute
time_start = time.time()
op_type_ = 'sub'
debug_mode_ = True
save_res_ = True
all_space_ = False
desc_ = ('024_sub_64_16_128_128_64_16_128_128_fp16', sub_execute, [(64, 16, 128, 128), (64, 16, 128, 1), 'float16'])
launch(op_type=op_type_, debug_mode=debug_mode_, save_res=save_res_, desc=desc_)
launch(op_type=op_type_, debug_mode=debug_mode_, save_res=save_res_, desc=desc_, all_space=all_space_)
time_end = time.time()
print("launch time: ", time_end - time_start)