Commit 37c1726f authored by Megvii Engine Team

refactor(sdk): refactor load and run with new framework

GitOrigin-RevId: b092699dee49eab068e262327b078ce157e36f26
Parent b75658c8
......@@ -74,7 +74,6 @@ option(MGE_ENABLE_EXCEPTIONS "Build with exceptions" ON)
option(MGE_WITH_TEST "Enable test for MegEngine." OFF)
option(MGE_WITH_DISTRIBUTED "Build with distributed support" ON)
option(MGE_BUILD_IMPERATIVE_RT "Build _imperative_rt Python Module " ON)
option(MGE_BUILD_SDK "Build load_and_run" ON)
option(MGE_INFERENCE_ONLY "Build inference only library." OFF)
option(MGE_WITH_MKLDNN "Enable Intel MKL_DNN support," ON)
option(MGE_WITH_ROCM "Enable ROCM support" OFF)
......@@ -542,6 +541,8 @@ if(MGE_WITH_TEST)
include(cmake/gtest.cmake)
endif()
include(cmake/gflags.cmake)
if(MGE_BUILD_IMPERATIVE_RT)
set(CMAKE_CXX_STANDARD 17)
endif()
......@@ -1147,10 +1148,6 @@ endif()
add_subdirectory(src)
if(MGE_BUILD_SDK)
add_subdirectory(sdk/load-and-run)
endif()
if(MGE_BUILD_IMPERATIVE_RT)
add_subdirectory(imperative)
message(STATUS "Enable imperative python wrapper runtime")
......
add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/gflags ${CMAKE_CURRENT_BINARY_DIR}/gflags)
\ No newline at end of file
......@@ -150,6 +150,9 @@ if(MGE_WITH_TEST)
add_subdirectory(test)
endif()
#load_and_run
add_subdirectory(load_and_run)
# tools and example
add_executable(rc4_encryptor tools/rc4_encrypt.cpp)
......
load("//brain/megbrain/lite:flags.bzl","pthread_select")
cc_library(
name = "mgblar",
copts = ["-std=c++14"],
srcs = glob(["src/**/*.cpp"], exclude = ["src/main.cpp"]),
hdrs = glob(["src/**/*.h"]),
includes = ["src"],
features = if_opt([
"no_exceptions",
"no_rtti",
]),
defines = [
"LITE_BUILD_WITH_MGE=1",
],
deps = ["//brain/megbrain/lite:lite_static_test"]+
pthread_select(
["@com_github_gflags_gflags//:gflags_nothreads"],
["//external:gflags"]
),
alwayslink = 1,
visibility = ["//visibility:public"],
)
cc_megvii_binary(
name = "load_and_run",
copts = ["-std=c++14"],
srcs = ["src/main.cpp"],
features = if_opt([
"no_exceptions",
"no_rtti",
]),
internal_deps = [":mgblar"],
visibility = ["//visibility:public"],
)
# build load_and_run for lite
include_directories(PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lite/load_and_run/src>)
file (GLOB_RECURSE SOURCES ./*.cpp)
add_executable (load_and_run ${SOURCES})
target_link_libraries(load_and_run lite_static)
target_link_libraries(load_and_run megbrain)
target_link_libraries(load_and_run gflags)
if(LITE_BUILD_WITH_RKNPU)
#rknn sdk1.0.0 depend on libc++_shared, use gold to remove NEEDED so symbol check
target_link_options(load_and_run PRIVATE "-fuse-ld=gold")
endif()
if(MGE_WITH_ROCM)
# FIXME: hip obj can not find cpp obj only through lite_static
target_link_libraries(load_and_run megdnn)
endif()
if(UNIX)
if(APPLE OR ANDROID)
target_link_libraries(load_and_run dl)
else()
target_link_libraries(load_and_run dl rt)
endif()
endif()
install (TARGETS load_and_run EXPORT ${LITE_EXPORT_TARGETS} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
\ No newline at end of file
#!/usr/bin/env mdl
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from megskull.graph import NodeFilter, FpropEnv
from megskull.opr.all import AssertEqual, DataProvider, BatchNormalization
from megskull.utils.logconf import get_logger
from meghair.utils import io
import megbrain as mgb
import argparse
import struct
import re
import os
import numpy as np
import cv2
logger = get_logger(__name__)
def auto_reformat_image(args, path, data, dst_shape):
"""reformat image to target shape
:param data: image data as numpy array
:param dst_shape: target shape
"""
dim3_format = False # required input format does not contain batch
hwc_format = False # required input format is NHWC
if len(dst_shape) == 3:
dst_shape = (1, ) + dst_shape
dim3_format = True
assert len(dst_shape) == 4, 'bad dst_shape: {}'.format(dst_shape)
chl = dst_shape[1]
if chl in [1, 3]:
n, c, h, w = dst_shape
dst_shape = (n, h, w, c)
else:
chl = dst_shape[3]
assert chl in [1, 3], (
'can not infer input format from shape: {}'.format(dst_shape))
hwc_format = True
# dst_shape has now been normalized to NHWC format
if args.resize_input:
h, w = dst_shape[1:3]
data = cv2.resize(data, (w, h))
logger.info('input {} resized to {}'.format(path, data.shape))
if chl == 1:
data = cv2.cvtColor(data, cv2.COLOR_BGR2GRAY)
data = data[:, :, np.newaxis]
assert data.ndim == 3
data = data[np.newaxis]
# data normalized to NHWC format
if not hwc_format:
data = np.transpose(data, (0, 3, 1, 2))
if dim3_format:
data = np.squeeze(data, 0)
return data
def read_input_data(args, dst_shape, dtype, path, repeat):
def check_shape_equal(dst_shape, data_shape):
assert len(data_shape) == len(dst_shape) , (
'input/data shapes mismatch: {} vs {}'.format(
dst_shape, data_shape))
if data_shape[1:] != dst_shape[1:]:
logger.warning('dst_shape is {}; data_shape is {}'.format(
dst_shape, data_shape))
if path.startswith('#'):
assert not args.resize_input
assert not args.input_transform
spec = path
m = re.match(
r'^#rand\(([-0-9.]*)\s*,\s*([-0-9.]*)\s*(,[^\)]+)?\)$', spec)
assert m, 'bad spec {}'.format(spec)
rng_min = float(m.group(1))
rng_max = float(m.group(2))
if m.group(3):
shape_str = m.group(3)
try:
shape = shape_str[1:].split(',')
if shape[-1].strip() == '...':
shape = shape[:-1]
shape.extend(list(dst_shape[len(shape):]))
data_shape = tuple(map(int, shape))
except ValueError as e:
raise ValueError('bad spec {}: {}'.format(spec, e.args))
else:
data_shape = dst_shape
check_shape_equal(dst_shape, data_shape)
return np.random.uniform(rng_min, rng_max, data_shape).astype(dtype)
# try to load image
data = cv2.imread(path, cv2.IMREAD_COLOR)
if data is None:
assert not args.resize_input
data = io.load(path)
assert isinstance(data, np.ndarray)
else:
# load image succeeds, so we expect input format is image format
data = auto_reformat_image(args, path, data, dst_shape)
data = np.repeat(data, repeat, axis=0)
if repeat > 1:
logger.info('repeat input for {} times, data shape is {}'.format(
repeat, data.shape))
check_shape_equal(dst_shape, data.shape)
if args.input_transform:
data = eval(args.input_transform, {'data': data, 'np': np})
return data
def gen_one_testcase(args, inputs, spec):
paths = spec.split(';')
if len(paths) != len(inputs):
if len(paths) == 1 and paths[0].startswith('#'):
paths = ['{}:{}'.format(name, paths[0]) for name in inputs.keys()]
assert len(paths) == len(inputs), (
'required inputs: {}; data paths: {}'.format(inputs.keys(), paths))
if len(paths) == 1 and ':' not in paths[0]:
paths[0] = next(iter(inputs.keys())) + ':' + paths[0]
ret = {}
for path in paths:
var, path = path.split(':')
if args.repeat:
repeat = args.repeat
else:
repeat = 1
ret[var] = read_input_data(args, inputs[var].imm_shape,
inputs[var].dtype, path, repeat)
return ret
def make_feeds(args):
outputs = io.load_network(args.input).outputs
if not args.no_assert:
env = FpropEnv(verbose_fprop=False)
# set flag so ExternCOprPlaceholder produce expected output
env.flags.user['extern_c_opr_eval'] = True
func = env.comp_graph.compile(None, [mgb.copy_output(env.get_mgbvar(i))
for i in outputs])
def expect_name(var): return 'expect:{}'.format(var.name)
nf = NodeFilter.make_all_deps(*outputs)
inputs = {i.name: i for i in nf.data_provider()}
if args.init_bn:
for i in nf:
if isinstance(i, BatchNormalization):
if i._iter.get_value() == 0:
i._iter.set_value(1)
i._variance.set_value(np.ones(i._variance.shape))
testcases = []
np.set_printoptions(precision=2, threshold=4, suppress=True)
data_list = []
for item in args.data:
if item.startswith('@'):
with open(item[1:], 'r') as f:
data_list.extend([ line.rstrip() for line in f if line.rstrip() != ''])
else:
data_list.append(item)
for inp_spec in data_list:
cur_testcase = gen_one_testcase(args, inputs, inp_spec)
assert len(cur_testcase) == len(inputs), (
'required inputs: {}; given data: {}'.format(
inputs.keys(), cur_testcase.keys()))
if not args.no_assert:
outputs_get = func(**cur_testcase)
for var, val in zip(outputs, outputs_get):
cur_testcase[expect_name(var)] = val
logger.info(
'generate test groundtruth: var={} shape={} range=({}, {})'
' mean={} var={}'.format(
var, val.shape, val.min(), val.max(),
np.mean(val), np.var(val)))
testcases.append(cur_testcase)
logger.info('add testcase: \n {}'.format(
'\n '.join('{}: shape={} dtype={} range=({:.2f},{:.2f}) '
'mean={:.2f} sd={:.2f}'.format(
k, v.shape, v.dtype, v.min(), v.max(), np.mean(v),
np.std(v))
for k, v in sorted(cur_testcase.items()))))
if not args.no_assert:
def expect_shp(var):
ret = var.partial_shape.determined_shape
if ret:
return ret
return testcases[0][expect_name(var)].shape
verbose = not args.silent
outputs = [AssertEqual(DataProvider(expect_name(i), expect_shp(i),
dtype=i.dtype,
comp_node=i.comp_node),
i, verbose=verbose, maxerr=args.maxerr)
for i in outputs]
return {'outputs': outputs, 'testcases': testcases}
def optimize_for_inference(args, outputs):
args_map = {
'enable_io16xc32': 'f16_io_f32_comp',
'enable_ioc16': 'f16_io_comp',
'enable_hwcd4': 'use_nhwcd4',
'enable_nchw4': 'use_nchw4',
'enable_nchw88': 'use_nchw88',
'enable_nchw44': 'use_nchw44',
'enable_nchw44_dot': 'use_nchw44_dot',
'enable_nchw32': 'use_nchw32',
'enable_chwn4': 'use_chwn4',
'enable_fuse_conv_bias_nonlinearity': 'fuse_conv_bias_nonlinearity',
'enable_fuse_conv_bias_with_z': 'fuse_conv_bias_with_z',
'enable_nchw64': 'use_nchw64',
'enable_fuse_preprocess': 'fuse_preprocess',
}
kwargs = {}
for k, v in args_map.items():
if getattr(args, k):
assert args.optimize_for_inference, (
'optimize_for_inference should be set when {} is given'.format(
k))
kwargs[v] = True
if args.optimize_for_inference:
return mgb.optimize_for_inference(outputs, **kwargs)
return outputs
def main():
parser = argparse.ArgumentParser(
description='Pack computing graph, input values and expected output '
'values into one file for checking correctness. README.md gives more '
'details on the usage',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('input', help='input file; see README for details')
parser.add_argument('-o', '--output', help='output file', required=True)
parser.add_argument('--init-bn', action='store_true',
help='initialize untrained batch-normalization, to '
'avoid NaN or Inf results')
parser.add_argument(
'-d', '--data', default=[], action='append',
help='Give input test data when the input file is a network; '
'the current network output will be used as groundtruth. '
'The format is var0:file0;var1:file1... to specify data files for '
'input vars. It can also be #rand(min,max,shape...) for generating '
'random input data, for example, #rand(0,255), '
'#rand(0,255,1,3,224,224) or #rand(0, 255, 1, ...) where `...` means '
'the remaining part of the original shape. '
'If the shape is not specified, the shape of '
'corresponding DataProvider in the network will be used. '
'If there is only one input var, its name can be omitted. '
'Each data file can either be an image which can be loaded by opencv, '
'or a pickled numpy.ndarray. '
'This option can be given multiple times to add multiple testcases. '
' *NOTE* '
'If you start the data with the letter @, the rest should be a '
'filename, and each line in the file should be a single datum in '
'the format described above. '
)
parser.add_argument(
'--repeat', type=int, default=1,
help='Specify how many times the input image is repeated. '
'Useful when running benchmark for batch size other than one. '
'Has no effect on randomly generated input data.')
parser.add_argument('--silent', action='store_true',
help='set verbose to False in AssertEqual opr')
parser.add_argument('--optimize-for-inference', action='store_true',
help='enable optimization for inference')
parser.add_argument('--no-assert', action='store_true',
help='do not insert AssertEqual opr to check result; '
'this option is useful for benchmarking')
parser.add_argument('--maxerr', type=float, default=AssertEqual.maxerr,
help='max error for AssertEqual check during runtime')
parser.add_argument('--resize-input', action='store_true',
help='resize input image to fit input var shape')
parser.add_argument('--input-transform',
help='a python expression to transform the input data. '
'Example: data / np.std(data)')
parser.add_argument('--discard-var-name', action='store_true',
help='discard variable and param names in the '
'generated output')
parser.add_argument('--output-strip-info', action='store_true',
help='output code strip information')
parser.add_argument('--enable-io16xc32', action='store_true',
help='transform the mode to float16 io float32 compute')
parser.add_argument('--enable-ioc16', action='store_true',
help='transform the dtype of the model to float16 io '
'and compute')
parser.add_argument('--enable-fuse-conv-bias-nonlinearity',
action='store_true',
help='fuse convolution bias and nonlinearity opr to a '
'conv_bias opr and compute')
parser.add_argument('--enable-hwcd4', action='store_true',
help='transform the model format from NCHW to NHWCD4 '
'for inference; you may need to disable CUDA and set '
'MGB_USE_MEGDNN_DBG=2')
parser.add_argument('--enable-nchw4', action='store_true',
help='transform the model format from NCHW to NCHW4 '
'for inference')
parser.add_argument('--enable-nchw88', action='store_true',
help='transform the model format from NCHW to NCHW88 '
'for inference')
parser.add_argument('--enable-nchw44', action='store_true',
help='transform the model format from NCHW to NCHW44 '
'for inference')
parser.add_argument('--enable-nchw44-dot', action='store_true',
help='transform the model format from NCHW to NCHW44_DOT '
'for optimizing armv8.2 dot in inference')
parser.add_argument('--enable-chwn4', action='store_true',
help='transform the model format to CHWN4 '
'for inference, mainly used for nvidia tensorcore')
parser.add_argument('--enable-nchw32', action='store_true',
help='transform the model format from NCHW4 to NCHW32 '
'for inference on NVIDIA TensorCore')
parser.add_argument('--enable-nchw64', action='store_true',
help='transform the model format from NCHW to NCHW64 '
'for inference on Nvidia GPU')
parser.add_argument('--enable-fuse-conv-bias-with-z', action='store_true',
help='fuse conv_bias with z input for inference on '
'nvidia GPU (this optimization pass will result in mismatch '
'of the precision of output of training and inference)')
parser.add_argument('--enable-fuse-preprocess', action='store_true',
help='fuse astype/pad_channel/dimshuffle and other oprs '
'from the h2d opr')
args = parser.parse_args()
if args.data:
feeds = make_feeds(args)
else:
feeds = io.load(args.input)
assert isinstance(feeds, dict) and feeds['testcases'], (
'testcases can not be empty')
env = FpropEnv(verbose_fprop=False)
outputs = feeds['outputs']
output_mgbvars = list(map(env.get_mgbvar, outputs))
output_mgbvars = optimize_for_inference(args, output_mgbvars)
inputs = sorted(((i.name, i.dtype) for i in
NodeFilter.make_all_deps(*outputs).data_provider()))
if args.discard_var_name:
sereg_kwargs = dict(keep_var_name=0, keep_param_name=False)
else:
sereg_kwargs = dict(keep_var_name=2, keep_param_name=True)
with open(args.output, 'wb') as fout:
fout.write(b'mgbtest0')
fout.write(struct.pack('I', len(feeds['testcases'])))
stat = mgb.serialize_comp_graph_to_file(
args.output, output_mgbvars, append=True,
output_strip_info=args.output_strip_info,
**sereg_kwargs)
logger.info('graph dump sizes: tot_size={:.3f}KiB overhead={:.3f}KiB'.
format(stat.tot_bytes / 1024,
(stat.tot_bytes - stat.tensor_value_bytes) / 1024))
for testcase in feeds['testcases']:
assert isinstance(testcase, dict)
cg = mgb.comp_graph()
cn = mgb.comp_node('cpux')
output_mgbvars = []
for name, dtype in inputs:
output_mgbvars.append(cg.make_shared(cn, value=testcase.pop(name),
dtype=dtype))
assert not testcase, 'extra inputs provided in testcase: {}'.format(
testcase.keys())
mgb.serialize_comp_graph_to_file(
args.output,
output_mgbvars,
append=True,
output_strip_info=args.output_strip_info,
append_json=True)
if __name__ == '__main__':
main()
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import argparse
import os
import re
import struct
import cv2
import numpy as np
import megengine as mge
import megengine.core._imperative_rt as rt
import megengine.core.tensor.megbrain_graph as G
from megengine import tensor
from megengine.core._imperative_rt.core2 import apply
from megengine.core.ops import builtin
from megengine.utils import comp_graph_tools as cgtools
logger = mge.get_logger(__name__)
def auto_reformat_image(args, path, data, dst_shape):
"""reformat image to target shape
:param data: image data as numpy array
:param dst_shape: target shape
"""
dim3_format = False # required input format does not contain batch
hwc_format = False # required input format is NHWC
if not dst_shape: # input tensor shape is not predefined
if len(data.shape) == 2:
chl = 1
h = data.shape[0]
w = data.shape[1]
else:
assert len(data.shape) == 3, "Input image must be of dimension 2 or 3"
h, w, chl = data.shape
dst_shape = (1, chl, h, w)
if len(dst_shape) == 3:
dst_shape = (1,) + dst_shape
dim3_format = True
assert len(dst_shape) == 4, "bad dst_shape: {}".format(dst_shape)
chl = dst_shape[1]
if chl in [1, 3]:
n, c, h, w = dst_shape
dst_shape = (n, h, w, c)
else:
chl = dst_shape[3]
assert chl in [1, 3], "can not infer input format from shape: {}".format(
dst_shape
)
hwc_format = True
# dst_shape has now been normalized to NHWC format
if args.resize_input:
h, w = dst_shape[1:3]
data = cv2.resize(data, (w, h))
logger.info("input {} resized to {}".format(path, data.shape))
if chl == 1:
data = cv2.cvtColor(data, cv2.COLOR_BGR2GRAY)
data = data[:, :, np.newaxis]
assert data.ndim == 3
data = data[np.newaxis]
# data normalized to NHWC format
if not hwc_format:
data = np.transpose(data, (0, 3, 1, 2))
if dim3_format:
data = np.squeeze(data, 0)
return data
def read_input_data(args, dst_shape, dtype, path, repeat):
def check_shape_equal(dst_shape, data_shape):
if len(dst_shape):
assert len(data_shape) == len(
dst_shape
), "input/data shapes mismatch: {} vs {}".format(dst_shape, data_shape)
if data_shape[1:] != dst_shape[1:]:
logger.warning(
"dst_shape is {}; data_shape is {}".format(dst_shape, data_shape)
)
if path.startswith("#"):
assert not args.resize_input
assert not args.input_transform
spec = path
m = re.match(r"^#rand\(([-0-9.]*)\s*,\s*([-0-9.]*)\s*(,[^\)]+)?\)$", spec)
assert m, "bad spec {}".format(spec)
rng_min = float(m.group(1))
rng_max = float(m.group(2))
if m.group(3):
shape_str = m.group(3)
try:
shape = shape_str[1:].split(",")
if shape[-1].strip() == "...":
shape = shape[:-1]
shape.extend(list(dst_shape[len(shape) :]))
data_shape = tuple(map(int, shape))
except ValueError as e:
raise ValueError("bad spec {}: {}".format(spec, e.args))
else:
data_shape = dst_shape
check_shape_equal(dst_shape, data_shape)
return np.random.uniform(rng_min, rng_max, data_shape).astype(dtype)
# try to load image
data = cv2.imread(path, cv2.IMREAD_COLOR)
if data is None:
assert not args.resize_input
data = np.load(path)
assert isinstance(data, np.ndarray)
else:
# load image succeeds, so we expect input format is image format
data = auto_reformat_image(args, path, data, dst_shape)
data = np.repeat(data, repeat, axis=0)
if repeat > 1:
logger.info(
"repeat input for {} times, data shape is {}".format(repeat, data.shape)
)
check_shape_equal(dst_shape, data.shape)
if args.input_transform:
data = eval(args.input_transform, {"data": data, "np": np})
return data
def gen_one_testcase(args, inputs, spec):
paths = spec.split(";")
if len(paths) != len(inputs):
if len(paths) == 1 and paths[0].startswith("#"):
paths = ["{}:{}".format(name, paths[0]) for name in inputs.keys()]
assert len(paths) == len(inputs), "required inputs: {}; data paths: {}".format(
inputs.keys(), paths
)
if len(paths) == 1 and ":" not in paths[0]:
paths[0] = next(iter(inputs.keys())) + ":" + paths[0]
ret = {}
for path in paths:
var, path = path.split(":")
if args.repeat:
repeat = args.repeat
else:
repeat = 1
ret[var] = read_input_data(
args, inputs[var].shape, inputs[var].dtype, path, repeat
)
return ret
def make_feeds(args):
ret = G.load_graph(args.input)
cg_rt, outputs = ret.graph, ret.output_vars_list
inputs = cgtools.get_dep_vars(outputs, "Host2DeviceCopy")
inputs = {i.name: i for i in inputs}
if not args.no_assert:
replace_varmap = {}
inp_map = {}
# replace var use InputNode
for name, var in inputs.items():
inp = G.InputNode(
device="xpux", dtype=var.dtype, shape=var.shape, graph=cg_rt
)
replace_varmap[var] = inp.outputs[0]
inp_map[name] = inp
new = cgtools.replace_vars(outputs, replace_varmap)
if isinstance(new, rt.VarNode):
new = list(new)
output_nodes = [G.OutputNode(var) for var in new]
func = cg_rt.compile([node.outputs[0] for node in output_nodes])
def make_dev_tensor(value, dtype=None, device=None):
return tensor(value, dtype=dtype, device=device)._dev_tensor()
def calculate(*args, **kwargs):
output_val = []
# set inputs value
for name, var in inputs.items():
val = kwargs.pop(name, None)
assert val is not None, "missing input name {}".format(name)
dev_tensor = make_dev_tensor(val, dtype=var.dtype, device="xpux")
inp_map[name].set_value(dev_tensor)
func.execute()
for res in output_nodes:
output_val.append(res.get_value().numpy())
return output_val
def expect_name(var):
return "{}:expect".format(var.name)
testcases = []
np.set_printoptions(precision=2, threshold=4, suppress=True)
data_list = []
for item in args.data:
if item.startswith("@"):
with open(item[1:], "r") as f:
data_list.extend([line.rstrip() for line in f if line.rstrip() != ""])
else:
data_list.append(item)
for inp_spec in data_list:
cur_testcase = gen_one_testcase(args, inputs, inp_spec)
assert len(cur_testcase) == len(
inputs
), "required inputs: {}; given data: {}".format(
inputs.keys(), cur_testcase.keys()
)
if not args.no_assert:
outputs_get = calculate(**cur_testcase)
for var, val in zip(outputs, outputs_get):
cur_testcase[expect_name(var)] = val
logger.info(
"generate test groundtruth: var={} shape={} range=({}, {})"
" mean={} var={}".format(
var, val.shape, val.min(), val.max(), np.mean(val), np.var(val)
)
)
testcases.append(cur_testcase)
logger.info(
"add testcase: \n {}".format(
"\n ".join(
"{}: shape={} dtype={} range=({:.2f},{:.2f}) "
"mean={:.2f} sd={:.2f}".format(
k, v.shape, v.dtype, v.min(), v.max(), np.mean(v), np.std(v)
)
for k, v in sorted(cur_testcase.items())
)
)
)
if not args.no_assert:
def expect_shp(var):
ret = var.shape
if ret:
return ret
return testcases[0][expect_name(var)].shape
def assert_equal(expect, real, **kwargs):
op = builtin.AssertEqual(**kwargs)
(res,) = G.apply_normal_varnode(op, expect, real)
return res
verbose = not args.silent
outputs_new = []
for i in outputs:
device = rt.CompNode("xpux")
dtype = i.dtype
name = expect_name(i)
shape = expect_shp(i)
# make expect output as one input of model.
expect_get = rt.make_h2d(cg_rt, device, dtype, shape, name)
# insert assert opr to check expect and real.
outputs_new.append(
assert_equal(
expect_get,
i,
verbose=verbose,
maxerr=args.maxerr,
)
)
inputs[expect_name(i)] = expect_get
outputs = outputs_new
return {"outputs": outputs, "testcases": testcases}
def optimize_for_inference(args, outputs):
args_list = [
"enable_io16xc32",
"enable_ioc16",
"enable_hwcd4",
"enable_nchw4",
"enable_nchw88",
"enable_nchw44",
"enable_nchw44_dot",
"enable_nchw32",
"enable_chwn4",
"enable_fuse_conv_bias_nonlinearity",
"enable_fuse_conv_bias_with_z",
"enable_fuse_preprocess",
]
kwargs = {}
for k in args_list:
if getattr(args, k):
kwargs[k] = True
if args.optimize_for_inference:
outputs = G.optimize_for_inference(outputs, **kwargs)
return outputs
def main():
parser = argparse.ArgumentParser(
description="Pack computing graph, input values and expected output "
"values into one file for checking correctness. README.md gives more "
"details on the usage",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument("input", help="MegEngine dumped model file")
parser.add_argument("-o", "--output", help="output file", required=True)
parser.add_argument(
"-d",
"--data",
default=[],
action="append",
required=True,
help="Given input test data when input file is a network, "
"and current network output would be used as groundtruth. "
"The format is var0:file0;var1:file1... to specify data files for "
"input vars. It can also be #rand(min,max,shape...) for generating "
"random input data, for example, #rand(0,255), "
"#rand(0,255,1,3,224,224) or #rand(0, 255, 1, ...) where `...` means "
"the remaining part of the original shape. "
"If the shape is not specified, the shape of "
"corresponding input tensors in the network will be used. "
"If there is only one input var, its name can be omitted. "
"Each data file can either be an image which can be loaded by opencv, "
"or a pickled numpy.ndarray. "
"This option can be given multiple times to add multiple testcases. "
" *NOTE* "
"If you start the data with the letter @, the rest should be a "
"filename, and each line in the file should be a single datum in "
"the format described above. ",
)
parser.add_argument(
"--repeat",
type=int,
default=1,
help="Specify how many times the input image is repeated. "
"Useful when running benchmark for batch size other than one. "
"Have no effect on randomly generated input data.",
)
parser.add_argument(
"--silent",
action="store_true",
help="set verbose to False in asserti_equal opr",
)
parser.add_argument(
"--optimize-for-inference",
action="store_true",
help="enable optimization for inference",
)
parser.add_argument(
"--no-assert",
action="store_true",
help="do not insert assert_equal opr to check result; "
"this option is useful for benchmarking",
)
parser.add_argument(
"--maxerr",
type=float,
default=1e-4,
help="max error for assert_equal check during runtime",
)
parser.add_argument(
"--resize-input",
action="store_true",
help="resize input image to fit input var shape",
)
parser.add_argument(
"--input-transform",
help="a python expression to transform the input data. "
"Example: data / np.std(data)",
)
parser.add_argument(
"--discard-var-name",
action="store_true",
help="discard variable and param names in the " "generated output",
)
parser.add_argument(
"--output-strip-info", action="store_true", help="output code strip information"
)
parser.add_argument(
"--enable-io16xc32",
action="store_true",
help="transform the mode to float16 io float32 compute",
)
parser.add_argument(
"--enable-ioc16",
action="store_true",
help="transform the dtype of the model to float16 io " "and compute",
)
parser.add_argument(
"--enable-fuse-conv-bias-nonlinearity",
action="store_true",
help="fuse convolution bias and nonlinearity opr to a "
"conv_bias opr and compute",
)
parser.add_argument(
"--enable-hwcd4",
action="store_true",
help="transform the model format from NCHW to NHWCD4 "
"for inference; you may need to disable CUDA and set "
"MGB_USE_MEGDNN_DBG=2",
)
parser.add_argument(
"--enable-nchw4",
action="store_true",
help="transform the model format from NCHW to NCHW4 " "for inference",
)
parser.add_argument(
"--enable-nchw88",
action="store_true",
help="transform the model format from NCHW to NCHW88 " "for inference",
)
parser.add_argument(
"--enable-nchw44",
action="store_true",
help="transform the model format from NCHW to NCHW44 " "for inference",
)
parser.add_argument(
"--enable-nchw44-dot",
action="store_true",
help="transform the model format from NCHW to NCHW44_DOT "
"for optimizing armv8.2 dot in inference",
)
parser.add_argument(
"--enable-nchw32",
action="store_true",
help="transform the model format from NCHW4 to NCHW32 "
"for inference on nvidia TensoCore",
)
parser.add_argument(
"--enable-chwn4",
action="store_true",
help="transform the model format to CHWN4 "
"for inference, mainly used for nvidia tensorcore",
)
parser.add_argument(
"--enable-fuse-conv-bias-with-z",
action="store_true",
help="fuse conv_bias with z input for inference on "
"nvidia GPU (this optimization pass will result in mismatch "
"of the precision of output of training and inference)",
)
parser.add_argument(
"--enable-fuse-preprocess",
action="store_true",
help="fuse astype\pad_channel\dimshuffle and etc opr "
"from h2d opr",
)
args = parser.parse_args()
feeds = make_feeds(args)
assert isinstance(feeds, dict) and feeds["testcases"], "testcases can not be empty"
output_mgbvars = feeds["outputs"]
output_mgbvars = optimize_for_inference(args, output_mgbvars)
inputs = cgtools.get_dep_vars(output_mgbvars, "Host2DeviceCopy")
inputs = sorted((i.name, i.dtype) for i in inputs)
if args.discard_var_name:
sereg_kwargs = dict(keep_var_name=0, keep_param_name=False)
else:
sereg_kwargs = dict(keep_var_name=2, keep_param_name=True)
strip_info_file = args.output + ".json" if args.output_strip_info else None
with open(args.output, "wb") as fout:
fout.write(b"mgbtest0")
fout.write(struct.pack("I", len(feeds["testcases"])))
dump_content, stat = G.dump_graph(
output_mgbvars,
append_json=True,
strip_info_file=strip_info_file,
**sereg_kwargs,
)
fout.write(dump_content)
logger.info(
"graph dump sizes: tot_size={:.3f}KiB overhead={:.3f}KiB".format(
stat.tot_bytes / 1024, (stat.tot_bytes - stat.tensor_value_bytes) / 1024
)
)
def make_dev_tensor(value, dtype=None, device=None):
return tensor(value, dtype=dtype, device=device)._dev_tensor()
for testcase in feeds["testcases"]:
assert isinstance(testcase, dict)
cg = G.Graph()
output_mgbvars = []
for name, dtype in inputs:
output_mgbvars.append(
cg.make_const(
make_dev_tensor(testcase.pop(name), dtype=dtype, device="cpux")
)
)
assert not testcase, "extra inputs provided in testcase: {}".format(
testcase.keys()
)
with open(args.output, "ab") as fout:
dump_content, _ = G.dump_graph(
output_mgbvars, strip_info_file=strip_info_file, append_json=True
)
fout.write(dump_content)
if __name__ == "__main__":
main()
/**
* \file lite/load_and_run/src/helpers/common.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#pragma once
#include <gflags/gflags.h>
#include <memory>
DECLARE_int32(thread);
namespace lar {
/*!
* \brief: state of model running
*/
enum class RunStage {
BEFORE_MODEL_LOAD = 0,
AFTER_MODEL_LOAD = 1,
BEFORE_OUTSPEC_SET = 2,
//! used for dumping the static memory information svg file
AFTER_OUTSPEC_SET = 3,
//! used by the external c opr library
MODEL_RUNNING = 4,
//! used by the output dumper
AFTER_RUNNING_WAIT = 5,
//! used by the external c opr library
AFTER_RUNNING_ITER = 6,
AFTER_MODEL_RUNNING = 7,
};
/*!
* \brief: type of different model
*/
enum class ModelType {
LITE_MODEL = 0,
MEGDL_MODEL,
UNKNOWN,
};
/*!
* \brief: param for running model
*/
struct RuntimeParam {
RunStage stage = RunStage::AFTER_MODEL_LOAD;
size_t warmup_iter; //! number of warm-up iterations before running the model
size_t run_iter; //! number of iterations when running the model
size_t threads = FLAGS_thread; //! number of threads for running the model
//! (NOTE: different from the multithread device)
size_t testcase_num = 1; //! testcase number for model with testcase
};
/*!
* \brief: layout type for running model optimization
*/
enum class OptLayoutType {
NCHW4 = 1 << 0,
CHWN4 = 1 << 1,
NCHW44 = 1 << 2,
NCHW88 = 1 << 3,
NCHW32 = 1 << 4,
NCHW64 = 1 << 5,
NHWCD4 = 1 << 6,
NCHW44_DOT = 1 << 7
};
} // namespace lar
// vim: syntax=cpp.doxygen
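Note: common.h only DECLAREs the `thread` gflag; the matching DEFINE lives in one of the option sources, which is not shown in this diff. The sketch below is not part of this commit: the flag's default value and help string are assumptions, and `has_layout`/`runtime_param_example` are hypothetical helpers that only illustrate the DECLARE/DEFINE pairing and the bit-flag nature of `OptLayoutType`.

```cpp
// Sketch only, not part of this commit. The DEFINE below stands in for the
// real one that pairs with DECLARE_int32(thread) in common.h.
#include <gflags/gflags.h>
#include "common.h"

DEFINE_int32(thread, 1, "number of threads used to run the model");  // assumed default/help

// OptLayoutType values are bit flags (1 << n), so several layouts can be
// packed into a single mask and tested individually.
inline bool has_layout(uint32_t mask, lar::OptLayoutType type) {
    return (mask & static_cast<uint32_t>(type)) != 0;
}

void runtime_param_example() {
    lar::RuntimeParam param;  // param.threads defaults to FLAGS_thread
    param.warmup_iter = 1;
    param.run_iter = 10;
    uint32_t mask = static_cast<uint32_t>(lar::OptLayoutType::NCHW44) |
                    static_cast<uint32_t>(lar::OptLayoutType::NCHW88);
    bool use_nchw44 = has_layout(mask, lar::OptLayoutType::NCHW44);  // true
    (void)param;
    (void)use_nchw44;
}
```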
/**
* \file lite/load_and_run/src/helpers/data_parser.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "data_parser.h"
#include <sstream>
#include "json_loader.h"
#include "npy.h"
using namespace lar;
/*!
* \brief feed different data to different parsers
* \param path data file path or data string
*/
void DataParser::feed(const std::string& path) {
std::string blob_name = "data", blob_string = path;
size_t sep = path.find(":");
if (sep != std::string::npos) {
blob_name = path.substr(0, sep);
blob_string = path.substr(sep + 1);
}
auto endWith = [blob_string](std::string suffix) -> bool {
return blob_string.rfind(suffix) == (blob_string.length() - suffix.length());
};
if (endWith(".ppm") || endWith(".pgm")) {
parse_image(blob_name, blob_string);
} else if (endWith(".json")) {
parse_json(blob_string);
} else if (endWith(".npy")) {
parse_npy(blob_name, blob_string);
} else {
parse_string(blob_name, blob_string);
}
}
void DataParser::parse_json(const std::string& path) {
mgb::JsonLoader json;
std::shared_ptr<mgb::JsonLoader::Value> root = json.load(path.c_str());
mgb_assert(root != nullptr, "parse json %s fail", path.c_str());
// parse json to data map
const std::string SHAPE = "shape", TYPE = "type", RAW = "raw";
for (auto& item : root->objects()) {
auto&& value = *item.second;
auto&& shape = value[SHAPE];
mgb_assert(shape->is_array());
auto&& type = value[TYPE];
mgb_assert(type->is_str());
auto&& raw = value[RAW];
mgb_assert(raw->is_array());
megdnn::SmallVector<size_t> data_shape;
for (auto&& shape_ptr : shape->array()) {
data_shape.append({static_cast<size_t>(std::round(shape_ptr->number()))});
}
// get type
const std::map<std::string, megdnn::DType> type_map = {
{"float32", mgb::dtype::Float32()}, {"float", mgb::dtype::Float32()},
{"int32", mgb::dtype::Int32()}, {"int", mgb::dtype::Int32()},
{"int8", mgb::dtype::Int8()}, {"uint8", mgb::dtype::Uint8()}};
const std::string& type_str = type->str();
mgb_assert(
type_map.find(type_str) != type_map.end(),
"unknown json data type for --input");
mgb::DType datatype = type_map.at(type_str);
mgb::HostTensorND hv;
hv.comp_node(mgb::CompNode::default_cpu(), true)
.dtype(datatype)
.resize(data_shape);
mgb::dt_byte* raw_ptr = hv.raw_ptr();
size_t elem_size = datatype.size();
// get raw
const size_t array_size = raw->len();
for (size_t idx = 0; idx < array_size; ++idx) {
double tmp = (*raw)[idx]->number();
switch (datatype.enumv()) {
case megdnn::DTypeEnum::Int32: {
int32_t ival = std::round(tmp);
memcpy(((char*)raw_ptr) + idx * elem_size, &ival, elem_size);
} break;
case megdnn::DTypeEnum::Uint8:
case megdnn::DTypeEnum::Int8: {
int8_t cval = std::round(tmp);
memcpy(((char*)raw_ptr) + idx, &cval, sizeof(int8_t));
} break;
case megdnn::DTypeEnum::Float32: {
float fval = tmp;
memcpy(((char*)raw_ptr) + idx * elem_size, &fval, elem_size);
} break;
default:
break;
}
}
inputs.insert(std::make_pair(item.first, std::move(hv)));
}
}
void DataParser::parse_image(const std::string& name, const std::string& path) {
// load binary ppm/pgm
std::ifstream fin;
fin.open(path, std::ifstream::binary | std::ifstream::in);
mgb_assert(fin.is_open(), "open file %s failed for --input", path.c_str());
size_t w = 0, h = 0, channel = 0;
char buf[128] = {0};
fin.getline(buf, 128);
if ('5' == buf[1]) {
channel = 1;
} else if ('6' == buf[1]) {
channel = 3;
} else {
mgb_assert(0, "not a formal ppm/pgm");
}
while (fin.getline(buf, 128)) {
if (buf[0] == '#') {
continue;
}
break;
}
std::stringstream ss;
ss << std::string(buf);
ss >> w;
ss >> h;
mgb_assert(w > 0 and h > 0);
mgb::HostTensorND hv;
hv.comp_node(mgb::CompNode::default_cpu(), true)
.dtype(mgb::dtype::Uint8())
.resize({1, h, w, channel});
fin.read((char*)(hv.raw_ptr()), hv.layout().total_nr_elems());
fin.close();
inputs.insert(std::make_pair(name, std::move(hv)));
}
void DataParser::parse_npy(const std::string& name, const std::string& path) {
std::string type_str;
std::vector<npy::ndarray_len_t> stl_shape;
std::vector<int8_t> raw;
npy::LoadArrayFromNumpy(path, type_str, stl_shape, raw);
megdnn::SmallVector<size_t> shape;
for (auto val : stl_shape) {
shape.append({static_cast<size_t>(val)});
}
const std::map<std::string, megdnn::DType> type_map = {
{"f4", mgb::dtype::Float32()}, {"i4", mgb::dtype::Int32()},
{"i2", mgb::dtype::Int16()}, {"u2", mgb::dtype::Uint16()},
{"i1", mgb::dtype::Int8()}, {"u1", mgb::dtype::Uint8()}};
megdnn::DType hv_type;
for (auto& item : type_map) {
if (type_str.find(item.first) != std::string::npos) {
hv_type = item.second;
break;
}
}
mgb::HostTensorND hv;
hv.comp_node(mgb::CompNode::default_cpu(), true).dtype(hv_type).resize(shape);
mgb::dt_byte* raw_ptr = hv.raw_ptr();
memcpy(raw_ptr, raw.data(), raw.size());
inputs.insert(std::make_pair(name, std::move(hv)));
}
void DataParser::parse_string(const std::string name, const std::string& str) {
// data type
megdnn::DType data_type = mgb::dtype::Int32();
if (str.find(".") != std::string::npos or str.find(".") != std::string::npos) {
data_type = mgb::dtype::Float32();
}
// shape
size_t number_cnt = 0;
std::shared_ptr<Brace> brace_root = std::make_shared<Brace>();
std::shared_ptr<Brace> cur = brace_root;
for (size_t i = 0; i < str.size(); ++i) {
char c = str[i];
if (c == '[') {
std::shared_ptr<Brace> child = std::make_shared<Brace>();
child->parent = cur;
cur->chidren.emplace_back(child);
cur = child;
} else if (c == ']') {
cur = cur->parent.lock();
} else if (c == ',') {
number_cnt++;
}
continue;
}
++number_cnt;
mgb_assert(cur == brace_root, "braces not closed for --input");
megdnn::SmallVector<size_t> shape;
cur = brace_root;
while (not cur->chidren.empty()) {
shape.append({cur->chidren.size()});
number_cnt /= cur->chidren.size();
cur = cur->chidren[0];
}
mgb_assert(number_cnt > 0);
shape.append({number_cnt});
// data
std::string json_arr;
for (size_t i = 0; i < str.size(); ++i) {
char c = str[i];
if (c != '[' and c != ']') {
json_arr += c;
}
}
json_arr = "[" + json_arr + "]";
// reuse json parser to resolve raw data
mgb::JsonLoader json;
std::shared_ptr<mgb::JsonLoader::Value> json_root =
json.load(json_arr.data(), json_arr.size());
mgb_assert(json_root != nullptr, "parse json fail in parse_string");
mgb::HostTensorND hv;
hv.comp_node(mgb::CompNode::default_cpu(), true).dtype(data_type).resize(shape);
mgb::dt_byte* raw_ptr = hv.raw_ptr();
const size_t array_len = json_root->len();
const size_t elem_size = data_type.size();
for (size_t idx = 0; idx < array_len; ++idx) {
double tmp = json_root->array()[idx]->number();
switch (data_type.enumv()) {
case megdnn::DTypeEnum::Int32: {
int32_t ival = std::round(tmp);
memcpy(((char*)raw_ptr) + idx * elem_size, &ival, elem_size);
} break;
case megdnn::DTypeEnum::Float32: {
float fval = tmp;
memcpy(((char*)raw_ptr) + idx * elem_size, &fval, elem_size);
} break;
default:
break;
}
}
inputs.insert(std::make_pair(name, std::move(hv)));
}
/**
* \file lite/load_and_run/src/helpers/data_parser.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include <memory>
#include <unordered_map>
#include <vector>
#include "megbrain/opr/io.h"
namespace lar {
/*!
* \brief data parser for --input
* supports .json|.ppm|.pgm|.npy data and user-defined data strings
* data string format: [0,0,227,227]
*/
struct DataParser {
struct Brace {
std::weak_ptr<Brace> parent;
std::vector<std::shared_ptr<Brace>> chidren;
};
void feed(const std::string& path);
std::unordered_map<std::string, mgb::HostTensorND> inputs;
private:
//! parser for json data
void parse_json(const std::string& path);
//! parser for .ppm .pgm image
void parse_image(const std::string& name, const std::string& path);
//! parser for .npy data
void parse_npy(const std::string& name, const std::string& path);
//! parser for user-defined strings
void parse_string(const std::string name, const std::string& str);
};
} // namespace lar
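For reference, a minimal usage sketch of `DataParser` (not part of this commit): `feed()` splits the `--input` argument on the first `:` into a tensor name and a payload, falls back to the name `data` when no name is given, and picks a parser from the payload suffix. The file paths and tensor names below are illustrative only; the JSON field names follow `parse_json` above.

```cpp
// Usage sketch only; paths and tensor names are illustrative.
#include "data_parser.h"

void feed_inputs_example() {
    lar::DataParser parser;
    parser.feed("data:input0.ppm");    // binary P5/P6 pgm/ppm image
    parser.feed("weight:blob.npy");    // numpy .npy file
    // JSON file: {"var": {"shape": [..], "type": "float32", "raw": [..]}}
    parser.feed("inputs.json");
    parser.feed("idx:[0,0,227,227]");  // inline literal, shape inferred from brackets
    // parsed tensors are collected by name
    const mgb::HostTensorND& data = parser.inputs.at("data");
    (void)data;
}
```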
/**
* \file lite/load_and_run/src/helpers/json_loader.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "json_loader.h"
using namespace mgb;
template <typename T>
T* JsonLoader::Value::safe_cast() {
T* ptr = (T*)(this);
if (nullptr == ptr) {
fprintf(stderr, "cast ptr is null\n");
}
return ptr;
}
std::unique_ptr<JsonLoader::Value>& JsonLoader::Value::operator[](
const std::string& key) {
mgb_assert(Type::OBJECT == m_type);
auto t = safe_cast<JsonLoader::ObjectValue>();
return t->m_obj.at(key);
}
std::unique_ptr<JsonLoader::Value>& JsonLoader::Value::operator[](const size_t index) {
mgb_assert(Type::ARRAY == m_type);
auto t = safe_cast<JsonLoader::ArrayValue>();
return t->m_obj[index];
}
std::map<std::string, std::unique_ptr<JsonLoader::Value>>& JsonLoader::Value::
objects() {
mgb_assert(Type::OBJECT == m_type);
auto t = safe_cast<JsonLoader::ObjectValue>();
return t->m_obj;
}
size_t JsonLoader::Value::len() {
if (Type::ARRAY == m_type) {
auto t = safe_cast<JsonLoader::ArrayValue>();
return t->m_obj.size();
} else if (Type::OBJECT == m_type) {
auto t = safe_cast<JsonLoader::ObjectValue>();
return t->m_obj.size();
}
return 0;
}
megdnn::SmallVector<std::unique_ptr<JsonLoader::Value>>& JsonLoader::Value::array() {
mgb_assert(Type::ARRAY == m_type);
auto t = safe_cast<JsonLoader::ArrayValue>();
return t->m_obj;
}
double JsonLoader::Value::number() {
mgb_assert(Type::NUMBER == m_type);
auto t = safe_cast<JsonLoader::NumberValue>();
return t->value();
}
std::string JsonLoader::Value::str() {
if (Type::STRING == m_type) {
auto t = safe_cast<StringValue>();
return t->value();
}
return std::string();
}
void JsonLoader::expect(char c) {
mgb_assert(c == (*m_buf));
m_buf++;
}
void JsonLoader::skip_whitespace() {
const char* p = m_buf;
while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r') {
++p;
}
m_buf = p;
}
std::unique_ptr<JsonLoader::Value> JsonLoader::parse_object() {
expect('{');
skip_whitespace();
std::unique_ptr<JsonLoader::Value> ret;
JsonLoader::ObjectValue* pObject = new JsonLoader::ObjectValue();
if ('}' == *m_buf) {
m_buf = m_buf + 1;
ret.reset((JsonLoader::Value*)(pObject));
return ret;
}
while (true) {
std::unique_ptr<JsonLoader::Value> key = parse_string();
if (m_state != State::OK) {
return ret;
}
skip_whitespace();
if (':' != (*m_buf)) {
m_state = State::MISS_COLON;
return ret;
}
m_buf++;
skip_whitespace();
std::unique_ptr<JsonLoader::Value> pVal = parse_value();
if (m_state != State::OK) {
return ret;
}
if (pObject->m_obj.find(key->str()) != pObject->m_obj.end()) {
m_state = State::KEY_NOT_UNIQUE;
return ret;
}
pObject->m_obj.insert(std::make_pair(key->str(), std::move(pVal)));
skip_whitespace();
if (',' == (*m_buf)) {
m_buf++;
skip_whitespace();
} else if ('}' == (*m_buf)) {
m_buf++;
break;
} else {
m_state = State::MISS_BRACE;
break;
}
}
ret.reset((JsonLoader::Value*)(pObject));
return ret;
}
std::unique_ptr<JsonLoader::Value> JsonLoader::parse_array() {
expect('[');
skip_whitespace();
std::unique_ptr<JsonLoader::Value> ret;
JsonLoader::ArrayValue* pArray = new JsonLoader::ArrayValue();
if (']' == *m_buf) {
m_buf = m_buf + 1;
ret.reset((JsonLoader::Value*)(pArray));
return ret;
}
while (true) {
std::unique_ptr<JsonLoader::Value> pVal = parse_value();
if (m_state != State::OK) {
mgb_assert(0, "parse value failed during pase array");
return ret;
}
pArray->m_obj.emplace_back(pVal.get());
pVal.release();
skip_whitespace();
if (',' == *m_buf) {
m_buf++;
skip_whitespace();
} else if (']' == *m_buf) {
m_buf++;
break;
} else {
m_state = State::BAD_ARRAY;
return ret;
}
}
ret.reset((JsonLoader::Value*)(pArray));
return ret;
}
std::unique_ptr<JsonLoader::Value> JsonLoader::parse_string() {
expect('\"');
std::unique_ptr<JsonLoader::Value> ret;
JsonLoader::StringValue* pStr = new JsonLoader::StringValue();
const char* p = m_buf;
while (true) {
if (*p == '\"') {
p++;
break;
} else {
pStr->m_value += (*p);
p++;
}
}
m_buf = p;
ret.reset((JsonLoader::Value*)(pStr));
return ret;
}
std::unique_ptr<JsonLoader::Value> JsonLoader::parse_number() {
const char* p = m_buf;
auto loop_digit = [this](const char*& p) {
if (not std::isdigit(*p)) {
m_state = State::BAD_DIGIT;
return;
}
while (std::isdigit(*p)) {
p++;
}
return;
};
if (*p == '-')
p++;
if (*p == '0')
p++;
else {
loop_digit(std::ref(p));
}
if (*p == '.') {
p++;
loop_digit(std::ref(p));
}
if (*p == 'e' || *p == 'E') {
p++;
if (*p == '+' || *p == '-')
p++;
loop_digit(std::ref(p));
}
JsonLoader::NumberValue* pNum = new JsonLoader::NumberValue();
pNum->m_value = strtod(m_buf, nullptr);
m_buf = p;
std::unique_ptr<JsonLoader::Value> ret;
ret.reset((JsonLoader::Value*)(pNum));
return ret;
}
std::unique_ptr<JsonLoader::Value> JsonLoader::parse_value() {
switch (*m_buf) {
case '[':
return parse_array();
case '{':
return parse_object();
case '\"':
return parse_string();
case '\0':
m_state = State::BAD_TYPE;
break;
default:
return parse_number();
}
return nullptr;
}
std::unique_ptr<JsonLoader::Value> JsonLoader::load(
const char* content, const size_t size) {
m_buf = content;
skip_whitespace();
std::unique_ptr<JsonLoader::Value> value = parse_value();
skip_whitespace();
if (m_state != State::OK) {
return nullptr;
}
mgb_assert(size == static_cast<size_t>(m_buf - content));
return value;
}
std::unique_ptr<JsonLoader::Value> JsonLoader::load(const char* path) {
std::unique_ptr<std::FILE, void (*)(std::FILE*)> fin(
std::fopen(path, "rb"), [](std::FILE* fp) { std::fclose(fp); });
mgb_assert(fin.get(), "failed to open %s: %s", path, strerror(errno));
std::fseek(fin.get(), 0, SEEK_END);
const size_t size = ftell(fin.get());
std::fseek(fin.get(), 0, SEEK_SET);
std::unique_ptr<char[]> buf(new char[size]);
auto nr = std::fread(buf.get(), 1, size, fin.get());
mgb_assert(nr == size);
return load(buf.get(), size);
}
/**
* \file lite/load_and_run/src/helpers/json_loader.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include <cctype>
#include <fstream>
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include "megbrain/common.h"
#include "megdnn/thin/small_vector.h"
namespace mgb {
/*!
* \brief JSON format data loader for --input
*/
class JsonLoader {
public:
// base class for different value format
class Value {
protected:
enum struct Type : uint8_t { UNKNOWN, NUMBER, STRING, OBJECT, ARRAY };
Type m_type;
public:
template <typename T>
T* safe_cast();
Value() { m_type = Type::UNKNOWN; }
Value(Type type) : m_type(type) {}
virtual ~Value() {}
bool is_array() { return Type::ARRAY == m_type; }
bool is_object() { return Type::OBJECT == m_type; }
bool is_number() { return Type::NUMBER == m_type; }
bool is_str() { return Type::STRING == m_type; }
std::unique_ptr<Value>& operator[](const std::string& key);
std::unique_ptr<Value>& operator[](const size_t index);
std::map<std::string, std::unique_ptr<Value>>& objects();
size_t len();
megdnn::SmallVector<std::unique_ptr<Value>>& array();
double number();
std::string str();
};
void expect(char c);
void skip_whitespace();
std::unique_ptr<Value> parse_object();
std::unique_ptr<Value> parse_array();
std::unique_ptr<Value> parse_string();
std::unique_ptr<Value> parse_number();
std::unique_ptr<Value> parse_value();
enum struct State : uint8_t {
OK = 0,
BAD_TYPE,
BAD_DIGIT,
BAD_ARRAY,
MISS_COLON,
MISS_BRACE,
KEY_NOT_UNIQUE
};
JsonLoader() { m_state = State::OK; }
std::unique_ptr<Value> load(const char* content, const size_t size);
std::unique_ptr<Value> load(const char* path);
class NumberValue final : public Value {
friend std::unique_ptr<Value> JsonLoader::parse_number();
double m_value;
public:
NumberValue() : Value(Type::NUMBER) {}
double value() { return m_value; }
};
class StringValue final : public Value {
std::string m_value;
public:
StringValue() : Value(Type::STRING) {}
std::string value() { return m_value; }
friend std::unique_ptr<Value> JsonLoader::parse_string();
};
class ArrayValue final : public Value {
megdnn::SmallVector<std::unique_ptr<Value>> m_obj;
public:
ArrayValue() : Value(Type::ARRAY) {}
ArrayValue(ArrayValue& arr) : Value(arr) {
m_obj.clear();
for (auto& item : arr.m_obj) {
m_obj.emplace_back(item.get());
item.release();
}
}
ArrayValue(ArrayValue&& arr) : Value(arr) {
m_obj.clear();
for (auto& item : arr.m_obj) {
m_obj.emplace_back(item.get());
item.release();
}
}
friend std::unique_ptr<Value> JsonLoader::parse_array();
friend std::unique_ptr<JsonLoader::Value>& JsonLoader::Value::operator[](
const size_t index);
friend megdnn::SmallVector<std::unique_ptr<JsonLoader::Value>>& JsonLoader::
Value::array();
friend size_t JsonLoader::Value::len();
};
class ObjectValue final : public Value {
std::map<std::string, std::unique_ptr<Value>> m_obj;
public:
ObjectValue() : Value(Type::OBJECT) {}
ObjectValue(ObjectValue& arr) : Value(arr) {
m_obj.clear();
for (auto itra = arr.m_obj.begin(); itra != arr.m_obj.end(); ++itra) {
m_obj.emplace(std::make_pair(itra->first, std::move(itra->second)));
}
}
ObjectValue(ObjectValue&& arr) : Value(arr) {
m_obj.clear();
for (auto itra = arr.m_obj.begin(); itra != arr.m_obj.end(); ++itra) {
m_obj.emplace(std::make_pair(itra->first, std::move(itra->second)));
}
}
friend std::unique_ptr<Value> JsonLoader::parse_object();
friend std::unique_ptr<JsonLoader::Value>& JsonLoader::Value::operator[](
const std::string&);
friend std::map<std::string, std::unique_ptr<JsonLoader::Value>>& JsonLoader::
Value::objects();
friend size_t JsonLoader::Value::len();
};
private:
const char* m_buf;
State m_state;
};
} // namespace mgb
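A minimal sketch (not part of this commit) of walking a document with `JsonLoader`, using only the interface declared above; the JSON text is a made-up example in the shape/type/raw form that `DataParser::parse_json` consumes, and `walk_json_example` is a hypothetical helper.

```cpp
// Sketch only; the JSON content is an illustrative example.
#include <cstdio>
#include <cstring>
#include "json_loader.h"

void walk_json_example() {
    const char* text =
            R"({"data": {"shape": [1, 3], "type": "float32", "raw": [0.1, 0.2, 0.3]}})";
    mgb::JsonLoader loader;
    auto root = loader.load(text, std::strlen(text));
    mgb_assert(root && root->is_object());
    auto& entry = (*root)["data"];    // lookup by key
    auto& shape = (*entry)["shape"];  // nested lookup
    mgb_assert(shape->is_array());
    for (auto& dim : shape->array()) {
        printf("dim=%zu\n", static_cast<size_t>(dim->number()));
    }
    printf("dtype=%s raw_len=%zu\n", (*entry)["type"]->str().c_str(),
           (*entry)["raw"]->len());
}
```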
This diff is collapsed.
/**
* \file lite/load_and_run/src/helpers/outdumper.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*/
#include "outdumper.h"
#include "megbrain/utils/debug.h"
using namespace lar;
void OutputDumper::set(mgb::SymbolVarArray& symb_var) {
for (auto&& i : symb_var) {
auto&& var = i.node();
DumpInfo info;
info.var_info = mgb::cg::dump_var_info({var});
info.owner_inputs_info = mgb::cg::dump_var_info(var->owner_opr()->input());
info.id = var->id();
m_infos.push_back(info);
}
}
mgb::ComputingGraph::Callback OutputDumper::bind() {
auto& info = m_infos.at(m_bind_id++);
mgb::ComputingGraph::Callback cb = [&info](const mgb::DeviceTensorND& dv) {
info.hv.copy_from(dv);
};
return cb;
}
void OutputDumper::write_to_file() {
if (!dump_file.empty()) {
for (auto&& info : m_infos) {
auto value = mgb::debug::dump_tensor(
info.hv,
mgb::ssprintf(
"var=%s owner_opr_inputs= %s", info.var_info.c_str(),
info.owner_inputs_info.c_str()));
mgb::debug::write_to_file(
mgb::ssprintf(
"%s/run%zu-var %zd", dump_file.c_str(), m_run_id, info.id)
.c_str(),
value);
}
}
m_run_id++;
}
/**
* \file lite/load_and_run/src/helpers/outdumper.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*/
#pragma once
#include "megbrain/serialization/serializer.h"
namespace lar {
/*!
* \brief dumper for model outputs, used for --bin-out-dump
*/
class OutputDumper {
public:
struct DumpInfo {
mgb::HostTensorND hv = {};
std::string var_info;
std::string owner_inputs_info;
size_t id;
};
//! init the dump_file path
OutputDumper(const char* file) { dump_file = file; }
//! set the dump information
void set(mgb::SymbolVarArray& symb_var);
//! callback function for a specified output when compiling the computing graph
mgb::ComputingGraph::Callback bind();
//! write dumped output into dump_file
void write_to_file();
private:
mgb::SmallVector<DumpInfo> m_infos;
size_t m_run_id = 0;
size_t m_bind_id = 0;
std::string dump_file;
};
} // namespace lar
\ No newline at end of file
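How `OutputDumper` is meant to be driven, as a sketch that is not part of this commit: `set()` records the output vars, `bind()` hands out one callback per output for the compiled function's output spec, and `write_to_file()` flushes after each run. The graph/function handling below is schematic and assumes megbrain's standard `ComputingGraph::OutputSpec` of `{SymbolVar, Callback}` pairs; `dump_outputs_example` and the dump directory are hypothetical.

```cpp
// Schematic sketch only; graph setup is simplified.
#include <memory>
#include "outdumper.h"

void dump_outputs_example(
        mgb::SymbolVarArray& outputs, std::shared_ptr<mgb::ComputingGraph> graph) {
    lar::OutputDumper dumper("./dump_dir");  // directory for the run%zu-var files
    dumper.set(outputs);                     // record var and owner-opr info

    mgb::ComputingGraph::OutputSpec out_spec;
    for (auto&& var : outputs) {
        // each output gets its own callback copying the device value back
        out_spec.push_back({var, dumper.bind()});
    }
    auto func = graph->compile(out_spec);
    func->execute();
    func->wait();
    dumper.write_to_file();  // writes run0-var<id> files and bumps the run id
}
```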
/**
* \file lite/load_and_run/src/helpers/text_table.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "text_table.h"
using namespace mgb;
namespace {
inline void mid(std::ostream& os, const std::string& str, size_t max_w) {
size_t l = (max_w - str.length()) / 2 + str.length();
size_t r = max_w - l;
os << std::setw(l) << std::right << str;
if (r > 0)
os << std::setw(r) << ' ';
}
inline size_t char_length(char c) {
return c ? 1 : 0;
}
} // namespace
void TextTable::adjuster_last_row() {
if (m_rows.empty())
return;
auto& row = m_rows.back();
if (row.params.horizontal == 0 or row.params.vertical == 0) {
row.params.corner = 0;
}
if (row.params.horizontal != 0 && row.params.vertical != 0 &&
row.params.corner == 0) {
row.params.corner = row.params.horizontal;
}
}
void TextTable::show(std::ostream& os) {
if (m_rows.empty())
return;
auto last_row = m_rows.front();  // copy: show() must not modify the stored rows
bool first = true;
for (auto& row : m_rows) {
auto& lrow =
(last_row.values.size() * char_length(last_row.params.horizontal)) >
(row.values.size() * char_length(row.params.horizontal))
? last_row
: row;
// line before row
if (lrow.params.horizontal) {
if (not first)
os << std::endl;
os << m_prefix;
if (lrow.params.corner)
os << lrow.params.corner;
size_t skip_size = 0;
// table name
if (first) {
os << m_name;
skip_size = m_name.length();
}
for (size_t i = 0; i < lrow.values.size(); ++i) {
auto max_w = m_cols_max_w.at(i) + m_padding * 2;
if (max_w + char_length(lrow.params.corner) <= skip_size) {
skip_size = skip_size - max_w - char_length(lrow.params.corner);
continue;
}
size_t rest = max_w + char_length(lrow.params.corner) - skip_size;
skip_size = 0;
if (rest > char_length(lrow.params.corner)) {
os << std::string(
rest - char_length(lrow.params.corner),
lrow.params.horizontal);
rest = char_length(lrow.params.corner);
}
if (rest > 0 && lrow.params.corner)
os << lrow.params.corner;
}
} else if (first) {
os << m_prefix << ' ' << m_name;
}
first = false;
os << std::endl << m_prefix;
if (row.params.vertical)
os << row.params.vertical;
// row
for (size_t i = 0; i < row.values.size(); ++i) {
auto& str = row.values.at(i);
auto max_w = m_cols_max_w.at(i) + 2 * m_padding;
if (row.params.align == Align::Mid) {
mid(os, str, max_w);
} else if (row.params.align == Align::Left) {
os << std::setw(max_w) << std::left << str;
} else {
os << std::setw(max_w) << std::right << str;
}
if (row.params.vertical)
os << row.params.vertical;
}
last_row = row;
}
if (last_row.params.horizontal) {
os << std::endl << m_prefix;
if (last_row.params.corner)
os << last_row.params.corner;
for (size_t i = 0; i < last_row.values.size(); ++i) {
auto max_w = m_cols_max_w.at(i);
std::string tmp(max_w + m_padding * 2, last_row.params.horizontal);
os << tmp;
if (last_row.params.corner)
os << last_row.params.corner;
}
}
}
\ No newline at end of file
/**
* \file lite/load_and_run/src/helpers/text_table.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include <array>
#include <iomanip>
#include <ostream>
#include <sstream>
#include <string>
#include <tuple>
#include <type_traits>
#include <vector>
#include "megbrain/common.h"
namespace mgb {
class TextTable {
public:
enum Level { Summary, Detail };
enum class Align : int { Left, Right, Mid };
explicit TextTable(const std::string& table_name) : m_name(table_name) {}
TextTable& horizontal(char c) {
m_row.params.horizontal = c;
return *this;
}
TextTable& vertical(char c) {
m_row.params.vertical = c;
return *this;
}
TextTable& corner(char c) {
m_row.params.corner = c;
return *this;
}
TextTable& align(Align v) {
m_row.params.align = v;
return *this;
}
TextTable& padding(size_t w) {
m_padding = w;
return *this;
}
TextTable& prefix(const std::string& str) {
m_prefix = str;
return *this;
}
template <typename T>
TextTable& add(const T& value) {
m_row.values.emplace_back(value);
if (m_cols_max_w.size() < m_row.values.size()) {
m_cols_max_w.emplace_back(m_row.values.back().length());
} else {
mgb_assert(m_row.values.size() >= 1);
size_t i = m_row.values.size() - 1;
m_cols_max_w[i] = std::max(m_cols_max_w[i], m_row.values.back().length());
}
return *this;
}
template <
typename T,
typename std::enable_if<std::is_floating_point<T>::value, bool>::type = 0>
TextTable& add(const T& value) {
std::stringstream ss;
ss << std::setiosflags(std::ios::fixed) << std::setprecision(2);
ss << value;
m_row.values.emplace_back(ss.str());
if (m_cols_max_w.size() < m_row.values.size()) {
m_cols_max_w.emplace_back(m_row.values.back().length());
} else {
mgb_assert(m_row.values.size() >= 1);
size_t i = m_row.values.size() - 1;
m_cols_max_w[i] = std::max(m_cols_max_w[i], m_row.values.back().length());
}
return *this;
}
template <
typename T,
typename std::enable_if<std::is_integral<T>::value, bool>::type = 0>
TextTable& add(const T& value) {
m_row.values.emplace_back(std::to_string(value));
return *this;
}
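    //! end of row: commit the current row and start a new one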
void eor() {
m_rows.emplace_back(m_row);
adjuster_last_row();
m_row.values.clear();
}
void reset() {
m_row = {};
m_cols_max_w.clear();
m_padding = 0;
m_rows.clear();
}
void show(std::ostream& os);
private:
void adjuster_last_row();
std::string m_name;
std::vector<size_t> m_cols_max_w;
size_t m_padding = 0;
std::string m_prefix = "";
struct Row {
std::vector<std::string> values;
struct Params {
Align align = Align::Left;
char horizontal = '-', vertical = '|', corner = '+';
} params;
};
std::vector<Row> m_rows;
Row m_row;
};
inline std::ostream& operator<<(std::ostream& stream, TextTable& table) {
table.show(stream);
return stream;
}
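//! A minimal usage sketch (illustrative only; names and values are hypothetical):
//!     TextTable table("bench");
//!     table.padding(1).align(TextTable::Align::Mid);
//!     table.add("layer").add("time(ms)").eor();
//!     table.add("conv1").add("0.25").eor();
//!     std::cout << table;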
} // namespace mgb
\ No newline at end of file
/**
* \file lite/load_and_run/src/main.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include <gflags/gflags.h>
#include <string>
#include "strategys/strategy.h"
int main(int argc, char** argv) {
std::string usage = "load_and_run <model_path> [options...]";
if (argc < 2) {
printf("usage: %s\n", usage.c_str());
return -1;
}
gflags::SetUsageMessage(usage);
gflags::SetVersionString("1.0");
gflags::ParseCommandLineFlags(&argc, &argv, true);
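    // gflags removes the parsed flags from argv, so argv[1] is the model path
    // regardless of where the options appear on the command line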
std::string model_path = argv[1];
auto strategy = lar::StrategyBase::create_strategy(model_path);
strategy->run();
gflags::ShutDownCommandLineFlags();
return 0;
}
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file lite/load_and_run/src/models/model.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include "model.h"
#include <iostream>
#include <memory>
#include "model_lite.h"
#include "model_mdl.h"
using namespace lar;
ModelType ModelBase::get_model_type(std::string model_path) {
//! read magic number of dump file
FILE* fin = fopen(model_path.c_str(), "rb");
mgb_assert(fin, "failed to open %s: %s", model_path.c_str(), strerror(errno));
char buf[16];
mgb_assert(fread(buf, 1, 16, fin) == 16, "read model failed");
fclose(fin);
// get model type
// uint32_t MGB_MAGIC = 0x5342474D
std::string tag(buf);
ModelType type;
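    // dumps produced by the MegBrain serializer start with one of the tags
    // checked below; any other header is treated as a lite model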
if (tag.substr(0, 7) == std::string("mgb0001") ||
tag.substr(0, 8) == std::string("mgb0000a") ||
tag.substr(0, 4) == std::string("MGBS") ||
tag.substr(0, 8) == std::string("mgbtest0")) {
type = ModelType::MEGDL_MODEL;
} else {
type = ModelType::LITE_MODEL;
}
return type;
}
std::shared_ptr<ModelBase> ModelBase::create_model(std::string model_path) {
mgb_log_debug("model path %s\n", model_path.c_str());
auto model_type = get_model_type(model_path);
if (ModelType::LITE_MODEL == model_type) {
return std::make_shared<ModelLite>(model_path);
} else if (ModelType::MEGDL_MODEL == model_type) {
if (FLAGS_lite)
return std::make_shared<ModelLite>(model_path);
else
return std::make_shared<ModelMdl>(model_path);
} else {
return nullptr;
}
}
DEFINE_bool(lite, false, "use lite to load and run an mdl model");
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file lite/load_and_run/src/models/model.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#pragma once
#include <gflags/gflags.h>
#include <string>
#include "helpers/common.h"
DECLARE_bool(lite);
namespace lar {
/*!
* \brief: base class of model
*/
class ModelBase {
public:
//! get model type by the magic number in dump file
static ModelType get_model_type(std::string model_path);
//! create model by different model type
static std::shared_ptr<ModelBase> create_model(std::string model_path);
//! type of the model
virtual ModelType type() = 0;
    //! set whether to load the model from shared memory
virtual void set_shared_mem(bool state) = 0;
//! load model interface for load and run strategy
virtual void load_model() = 0;
//! run model interface for load and run strategy
virtual void run_model() = 0;
//! wait asynchronous function interface for load and run strategy
virtual void wait() = 0;
virtual ~ModelBase() = default;
};
} // namespace lar
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file lite/load_and_run/src/models/model_lite.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include "model_lite.h"
#include <gflags/gflags.h>
#include <cstring>
#include "misc.h"
DECLARE_bool(share_param_mem);
using namespace lar;
ModelLite::ModelLite(const std::string& path) : model_path(path) {
LITE_WARN("creat lite model use CPU as default comp node");
};
void ModelLite::load_model() {
m_network = std::make_shared<lite::Network>(config, IO);
if (share_model_mem) {
        //! WARNING: maybe not right to share param memory for this
LITE_WARN("enable share model memory");
FILE* fin = fopen(model_path.c_str(), "rb");
LITE_ASSERT(fin, "failed to open %s: %s", model_path.c_str(), strerror(errno));
fseek(fin, 0, SEEK_END);
size_t size = ftell(fin);
fseek(fin, 0, SEEK_SET);
void* ptr = malloc(size);
std::shared_ptr<void> buf{ptr, free};
auto nr = fread(buf.get(), 1, size, fin);
LITE_ASSERT(nr == size, "read model file failed");
fclose(fin);
m_network->load_model(buf.get(), size);
} else {
m_network->load_model(model_path);
}
}
void ModelLite::run_model() {
m_network->forward();
}
void ModelLite::wait() {
m_network->wait();
}
/**
* \file lite/load_and_run/src/models/model_lite.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#pragma once
#include <string>
#include "helpers/common.h"
#include "helpers/data_parser.h"
#include "lite/network.h"
#include "model.h"
namespace lar {
/*!
* \brief: megengine lite model
*/
class ModelLite : public ModelBase {
public:
using Strategy = LiteAlgoSelectStrategy;
ModelLite(const std::string& path);
//! model type
ModelType type() override { return ModelType::LITE_MODEL; }
//! set to load from shared memory
void set_shared_mem(bool state) override { share_model_mem = state; }
//! load model from dump file
void load_model() override;
//! run model with given runtime parameter
void run_model() override;
//! wait the end of asynchronous function execution
void wait() override;
//! get the network of lite model
std::shared_ptr<lite::Network> get_lite_network() { return m_network; }
//! get the config of lite model
lite::Config& get_config() { return config; }
//! get the networkIO of lite model
lite::NetworkIO& get_networkIO() { return IO; }
//! get the data parser
DataParser& get_input_parser() { return parser; }
//! set the strategy before load model
void set_lite_strategy(Strategy& u_strategy) { m_strategy = u_strategy; }
//! get algo strategy
Strategy& get_lite_strategy() { return m_strategy; }
private:
bool share_model_mem;
std::string model_path;
DataParser parser;
lite::Config config;
lite::NetworkIO IO;
std::shared_ptr<lite::Network> m_network;
Strategy m_strategy;
};
} // namespace lar
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file lite/load_and_run/src/models/model_mdl.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include "model_mdl.h"
#include <gflags/gflags.h>
#include <iostream>
DECLARE_bool(share_param_mem);
using namespace lar;
ModelMdl::ModelMdl(const std::string& path) : model_path(path) {
mgb_log_warn("creat mdl model use XPU as default comp node");
m_load_config.comp_graph = mgb::ComputingGraph::make();
m_load_config.comp_graph->options().graph_opt_level = 0;
testcase_num = 0;
}
void ModelMdl::load_model() {
//! read dump file
if (share_model_mem) {
mgb_log_warn("enable share model memory");
FILE* fin = fopen(model_path.c_str(), "rb");
mgb_assert(fin, "failed to open %s: %s", model_path.c_str(), strerror(errno));
fseek(fin, 0, SEEK_END);
size_t size = ftell(fin);
fseek(fin, 0, SEEK_SET);
void* ptr = malloc(size);
std::shared_ptr<void> buf{ptr, free};
auto nr = fread(buf.get(), 1, size, fin);
mgb_assert(nr == size, "read model file failed");
fclose(fin);
m_model_file = mgb::serialization::InputFile::make_mem_proxy(buf, size);
} else {
m_model_file = mgb::serialization::InputFile::make_fs(model_path.c_str());
}
//! get dump_with_testcase model testcase number
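    //! a model packed by dump_with_testcase.py starts with the 8-byte magic
    //! "mgbtest0" followed by a uint32 testcase count; a plain dump is rewound
    //! and loaded directly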
char magic[8];
m_model_file->read(magic, sizeof(magic));
if (strncmp(magic, "mgbtest0", 8)) {
m_model_file->rewind();
} else {
m_model_file->read(&testcase_num, sizeof(testcase_num));
}
auto format =
mgb::serialization::GraphLoader::identify_graph_dump_format(*m_model_file);
mgb_assert(
format.valid(),
"invalid format, please make sure model is dumped by GraphDumper");
//! load computing graph of model
m_loader = mgb::serialization::GraphLoader::make(
std::move(m_model_file), format.val());
m_load_result = m_loader->load(m_load_config, false);
m_load_config.comp_graph.reset();
// get testcase input generated by dump_with_testcase.py
if (testcase_num) {
for (auto&& i : m_load_result.tensor_map) {
test_input_tensors.emplace_back(i.first, i.second.get());
}
std::sort(test_input_tensors.begin(), test_input_tensors.end());
}
// initialize output callback
for (size_t i = 0; i < m_load_result.output_var_list.size(); i++) {
mgb::ComputingGraph::Callback cb;
m_callbacks.push_back(cb);
}
}
void ModelMdl::make_output_spec() {
for (size_t i = 0; i < m_load_result.output_var_list.size(); i++) {
auto item = m_load_result.output_var_list[i];
m_output_spec.emplace_back(item, std::move(m_callbacks[i]));
}
m_asyc_exec = m_load_result.graph_compile(m_output_spec);
}
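//! rebuild the graph loader on the same input file so that the dump can be
//! loaded again (e.g. to fetch testcase inputs)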
std::shared_ptr<mgb::serialization::GraphLoader>& ModelMdl::reset_loader() {
m_loader = mgb::serialization::GraphLoader::make(
m_loader->reset_file(), m_loader->format());
return m_loader;
}
void ModelMdl::run_model() {
mgb_assert(
m_asyc_exec != nullptr,
"empty asychronous function to execute after graph compiled");
m_asyc_exec->execute();
}
void ModelMdl::wait() {
m_asyc_exec->wait();
}
/**
* \file lite/load_and_run/src/models/model_mdl.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#pragma once
#include <string>
#include "megbrain/opr/search_policy/algo_chooser_helper.h"
#include "megbrain/plugin/opr_io_dump.h"
#include "megbrain/serialization/extern_c_opr.h"
#include "megbrain/serialization/serializer.h"
#include "megbrain/utils/debug.h"
#include "megbrain/plugin/num_range_checker.h"
#include "megbrain/plugin/profiler.h"
#include "helpers/common.h"
#include "helpers/data_parser.h"
#include "model.h"
namespace lar {
class ModelMdl : public ModelBase {
public:
using Strategy = mgb::opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
//! interface implement of ModelBase
ModelMdl(const std::string& path);
ModelType type() override { return ModelType::MEGDL_MODEL; }
void set_shared_mem(bool state) override { share_model_mem = state; }
void load_model() override;
void make_output_spec();
void run_model() override;
void wait() override;
//! get load result for megDL model
mgb::serialization::GraphLoader::LoadResult& get_mdl_load_result() {
return m_load_result;
}
//! get load config for megDL model
mgb::serialization::GraphLoadConfig& get_mdl_config() { return m_load_config; }
//! reset the graph loader for dump_with_testcase model
std::shared_ptr<mgb::serialization::GraphLoader>& reset_loader();
    //! algo strategy for running model
void set_mdl_strategy(Strategy& u_strategy) { m_strategy = u_strategy; }
Strategy& get_mdl_strategy() { return m_strategy; }
//! get data parser
DataParser& get_input_parser() { return parser; }
uint32_t get_testcase_num() { return testcase_num; }
std::vector<std::pair<std::string, mgb::HostTensorND*>>& get_test_input() {
return test_input_tensors;
}
//! get output specified configuration
mgb::ComputingGraph::OutputSpec& get_output_spec() { return m_output_spec; }
std::unique_ptr<mgb::cg::AsyncExecutable>& get_async_func() { return m_asyc_exec; }
void set_output_callback(std::vector<mgb::ComputingGraph::Callback>& cb) {
mgb_assert(
m_callbacks.size() == cb.size(),
"invalid output callback list to set!!");
for (size_t i = 0; i < cb.size(); i++) {
m_callbacks[i] = cb[i];
}
}
#if MGB_ENABLE_JSON
std::unique_ptr<mgb::GraphProfiler>& get_profiler() { return m_profiler; }
void set_profiler() {
m_profiler =
std::make_unique<mgb::GraphProfiler>(m_load_config.comp_graph.get());
}
#endif
void set_num_range_checker(float range) {
m_num_range_checker = std::make_unique<mgb::NumRangeChecker>(
m_load_config.comp_graph.get(), range);
}
private:
bool share_model_mem;
std::string model_path;
std::unique_ptr<mgb::serialization::InputFile> m_model_file;
mgb::serialization::GraphLoadConfig m_load_config;
mgb::serialization::GraphLoader::LoadResult m_load_result;
std::shared_ptr<mgb::serialization::GraphLoader> m_loader;
std::unique_ptr<mgb::cg::AsyncExecutable> m_asyc_exec;
uint32_t testcase_num;
std::vector<std::pair<std::string, mgb::HostTensorND*>> test_input_tensors;
DataParser parser;
Strategy m_strategy = Strategy::HEURISTIC;
std::vector<mgb::ComputingGraph::Callback> m_callbacks;
mgb::ComputingGraph::OutputSpec m_output_spec;
std::unique_ptr<mgb::NumRangeChecker> m_num_range_checker;
#if MGB_ENABLE_JSON
std::unique_ptr<mgb::GraphProfiler> m_profiler;
#endif
};
} // namespace lar
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file lite/load_and_run/src/options/device_options.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include <iostream>
#include <sstream>
#include "lite/global.h"
#include "megbrain/comp_node_env.h"
#include "misc.h"
#include "device_options.h"
#include "models/model_lite.h"
#include "models/model_mdl.h"
DECLARE_bool(weight_preprocess);
using namespace lar;
/////////////////// XPUDeviceOption //////////////////////
namespace lar {
template <>
void XPUDeviceOption::config_model_internel<ModelLite>(
RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) {
if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) {
if ((enable_cpu) || (enable_cpu_default) || (enable_multithread) ||
(enable_multithread_default)) {
LITE_WARN("using cpu device\n");
model->get_config().device_type = LiteDeviceType::LITE_CPU;
}
#if MGE_WITH_CUDA
if (enable_cuda) {
model->get_config().device_type = LiteDeviceType::LITE_CUDA;
}
#endif
} else if (runtime_param.stage == RunStage::AFTER_MODEL_LOAD) {
auto network = model->get_lite_network();
if (enable_cpu_default) {
LITE_WARN("using cpu default device\n");
lite::Runtime::set_cpu_inplace_mode(network);
}
if (enable_multithread) {
LITE_WARN("using multithread device\n");
lite::Runtime::set_cpu_threads_number(network, thread_num);
}
if (enable_multithread_default) {
LITE_WARN("using multithread default device\n");
lite::Runtime::set_cpu_inplace_mode(network);
lite::Runtime::set_cpu_threads_number(network, thread_num);
}
if (enable_set_core_ids) {
std::string core_str;
for (auto id : core_ids) {
core_str += std::to_string(id) + ",";
}
LITE_WARN("multi thread core ids: %s\n", core_str.c_str());
lite::ThreadAffinityCallback affinity_callback = [&](size_t thread_id) {
mgb::sys::set_cpu_affinity({core_ids[thread_id]});
};
lite::Runtime::set_runtime_thread_affinity(network, affinity_callback);
}
}
}
template <>
void XPUDeviceOption::config_model_internel<ModelMdl>(
RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) {
if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) {
if (enable_cpu) {
mgb_log_warn("using cpu device\n");
model->get_mdl_config().comp_node_mapper = [](mgb::CompNode::Locator& loc) {
loc.type = mgb::CompNode::DeviceType::CPU;
};
}
#if MGE_WITH_CUDA
if (enable_cuda) {
mgb_log_warn("using cuda device\n");
model->get_mdl_config().comp_node_mapper = [](mgb::CompNode::Locator& loc) {
loc.type = mgb::CompNode::DeviceType::CUDA;
};
}
#endif
if (enable_cpu_default) {
mgb_log_warn("using cpu default device\n");
model->get_mdl_config().comp_node_mapper = [](mgb::CompNode::Locator& loc) {
loc.type = mgb::CompNode::DeviceType::CPU;
loc.device = mgb::CompNode::Locator::DEVICE_CPU_DEFAULT;
};
}
if (enable_multithread) {
mgb_log_warn("using multithread device\n");
model->get_mdl_config().comp_node_mapper =
[&](mgb::CompNode::Locator& loc) {
loc.type = mgb::CompNode::DeviceType::MULTITHREAD;
loc.device = 0;
loc.stream = thread_num;
};
}
if (enable_multithread_default) {
mgb_log_warn("using multithread default device\n");
model->get_mdl_config().comp_node_mapper =
[&](mgb::CompNode::Locator& loc) {
loc.type = mgb::CompNode::DeviceType::MULTITHREAD;
loc.device = mgb::CompNode::Locator::DEVICE_MULTITHREAD_DEFAULT;
loc.stream = thread_num;
};
}
if (enable_set_core_ids) {
std::string core_str;
for (auto id : core_ids) {
core_str += std::to_string(id) + ",";
}
mgb_log_warn("set multi thread core ids:%s\n", core_str.c_str());
auto affinity_callback = [&](size_t thread_id) {
mgb::sys::set_cpu_affinity({core_ids[thread_id]});
};
mgb::CompNode::Locator loc;
model->get_mdl_config().comp_node_mapper(loc);
auto comp_node = mgb::CompNode::load(loc);
mgb::CompNodeEnv::from_comp_node(comp_node).cpu_env().set_affinity(
affinity_callback);
}
}
}
} // namespace lar
XPUDeviceOption::XPUDeviceOption() {
m_option_name = "xpu_device";
enable_cpu = FLAGS_cpu;
#if MGE_WITH_CUDA
enable_cuda = FLAGS_cuda;
#endif
enable_cpu_default = FLAGS_cpu_default;
if (FLAGS_multithread >= 0) {
thread_num = FLAGS_multithread;
enable_multithread = true;
}
if (FLAGS_multithread_default >= 0) {
thread_num = FLAGS_multithread_default;
enable_multithread_default = true;
}
if (!FLAGS_multi_thread_core_ids.empty()) {
mgb_assert(enable_multithread, "core ids should be set after --multithread");
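        // e.g. --multithread 4 --multi-thread-core-ids "0,1,2,3" binds each
        // worker thread to the listed core (illustrative values)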
std::stringstream id_stream(FLAGS_multi_thread_core_ids);
std::string id;
size_t thread_cnt = 0;
while (getline(id_stream, id, ',')) {
thread_cnt++;
core_ids.push_back(atoi(id.c_str()));
}
mgb_assert(
thread_cnt == thread_num,
"core ids number should be same with thread number set before");
enable_set_core_ids = true;
}
}
bool XPUDeviceOption::is_valid() {
bool ret = FLAGS_cpu || FLAGS_cpu_default;
#if MGE_WITH_CUDA
ret = ret || FLAGS_cuda;
#endif
ret = ret || FLAGS_multithread >= 0;
ret = ret || FLAGS_multithread_default >= 0;
ret = ret || !FLAGS_multi_thread_core_ids.empty();
return ret;
}
std::shared_ptr<OptionBase> XPUDeviceOption::create_option() {
static std::shared_ptr<lar::XPUDeviceOption> option(new XPUDeviceOption);
if (XPUDeviceOption::is_valid()) {
return std::static_pointer_cast<lar::OptionBase>(option);
} else {
return nullptr;
}
}
void XPUDeviceOption::config_model(
RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) {
CONFIG_MODEL_FUN;
}
///////////////////////// xpu gflags ////////////////////////////
DEFINE_bool(cpu, false, "set CPU device as running device");
#if MGE_WITH_CUDA
DEFINE_bool(cuda, false, "set CUDA device as running device ");
#endif
DEFINE_bool(cpu_default, false, "set running device as CPU device with inplace mode");
DEFINE_int32(multithread, -1, "set multithread device as running device");
DEFINE_int32(
multithread_default, -1,
"set multithread device as running device with inplace mode");
DEFINE_string(multi_thread_core_ids, "", "set multithread core id");
REGIST_OPTION_CREATOR(xpu_device, lar::XPUDeviceOption::create_option);
\ No newline at end of file
/**
* \file lite/load_and_run/src/options/device_options.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#pragma once
#include <gflags/gflags.h>
#include "models/model.h"
#include "option_base.h"
DECLARE_bool(cpu);
#if MGE_WITH_CUDA
DECLARE_bool(cuda);
#endif
DECLARE_bool(cpu_default);
DECLARE_int32(multithread);
DECLARE_int32(multithread_default);
DECLARE_string(multi_thread_core_ids);
namespace lar {
class XPUDeviceOption final : public OptionBase {
public:
static bool is_valid();
static std::shared_ptr<OptionBase> create_option();
void config_model(
RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override;
std::string option_name() const override { return m_option_name; };
private:
XPUDeviceOption();
template <typename ModelImpl>
void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){};
bool enable_cpu;
#if MGE_WITH_CUDA
bool enable_cuda;
#endif
bool enable_cpu_default;
bool enable_multithread;
bool enable_multithread_default;
bool enable_set_core_ids;
size_t thread_num;
std::vector<int> core_ids;
std::string m_option_name;
};
} // namespace lar
\ No newline at end of file
/**
* \file lite/load_and_run/src/options/extern_c_opr_options.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include "extern_c_opr_options.h"
#include "megbrain/utils/debug.h"
#include "misc.h"
#include "models/model_lite.h"
#include "models/model_mdl.h"
namespace lar {
template <>
void COprLibOption::config_model_internel(
RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) {
MGB_MARK_USED_VAR(model);
if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) {
if (!lib_path.empty()) {
lite::set_loader_lib_path(lib_path);
}
if (c_opr_args.is_run_c_opr_with_param) {
LITE_THROW(
"lite model dont't support run with external c opr "
"parmeter");
}
}
}
template <>
void COprLibOption::config_model_internel(
RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) {
if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) {
if (!lib_path.empty()) {
load_lib();
}
if (c_opr_args.is_run_c_opr_with_param) {
mgb_assert(
c_opr_args.is_run_c_opr &&
c_opr_args.copr_param_device_ptr_malloc &&
c_opr_args.copr_param_device_ptr_free &&
c_opr_args.copr_param_device_ptr_h2d,
"--c-opr-lib-with-param need config with --c-opr-lib, also "
"extern c opr loader need implemente "
"copr_param_device_ptr_malloc, copr_param_device_ptr_free "
"and copr_param_device_ptr_h2d symbols");
}
} else if (runtime_param.stage == RunStage::MODEL_RUNNING) {
if (model->get_testcase_num() && c_opr_args.is_run_c_opr_with_param) {
init_extern_param(model);
set_Copr_IO(model);
}
} else if (runtime_param.stage == RunStage::AFTER_RUNNING_ITER) {
if (model->get_testcase_num() && c_opr_args.is_run_c_opr_with_param) {
c_opr_args.copr_param_device_ptr_free(c_opr_param.get());
free(c_opr_param->input);
}
}
}
} // namespace lar
using namespace lar;
MGBDType COprLibOption::dtype_cpp2c(megdnn::DType dtype) {
switch (dtype.enumv()) {
case megdnn::DTypeEnum::Float32:
return MGB_DTYPE_FLOAT32;
case megdnn::DTypeEnum::Int32:
return MGB_DTYPE_INT32;
case megdnn::DTypeEnum::Int16:
return MGB_DTYPE_INT16;
case megdnn::DTypeEnum::Uint8:
return MGB_DTYPE_UINT8;
#if !MEGDNN_DISABLE_FLOAT16
case megdnn::DTypeEnum::Float16:
return MGB_DTYPE_FLOAT16;
#endif
default:
mgb_throw(
mgb::InternalError, "unsupported dtype for extern C API: %s",
dtype.name());
}
}
void COprLibOption::tensor_shape_to_c(
const megdnn::TensorShape& shape, MGBTensorShape& mgb_shape) {
mgb_assert(
shape.ndim <= MGB_TENSOR_MAX_NDIM, "shape ndim too large: %zu", shape.ndim);
mgb_shape.ndim = shape.ndim;
for (size_t i = 0; i < shape.ndim; ++i) {
mgb_shape.shape[i] = shape[i];
}
}
void COprLibOption::init_extern_param(std::shared_ptr<ModelBase> model_ptr) {
auto model = std::static_pointer_cast<ModelMdl>(model_ptr);
auto inp_tensors = model->get_test_input();
c_opr_param = std::make_shared<ExternCOprParam>();
memset(c_opr_param.get(), 0, sizeof(ExternCOprParam));
    //! only inputs are tested in the npu case (outputs are not),
    //! so only the input shape and dtype are initialized here
c_opr_param->nr_input = inp_tensors.size();
c_opr_param->input = (ExternDeviceTensor*)malloc(
sizeof(ExternDeviceTensor) * inp_tensors.size());
memset(c_opr_param->input, 0, sizeof(ExternDeviceTensor) * inp_tensors.size());
//! init input ExternDeviceTensor shape and dtype
for (size_t input_idx = 0; input_idx < inp_tensors.size(); input_idx++) {
auto& mgb_tensor_layout = c_opr_param->input[input_idx].layout;
auto host_tensor_nd_p = inp_tensors[input_idx].second;
mgb_tensor_layout.dtype = dtype_cpp2c(host_tensor_nd_p->dtype());
tensor_shape_to_c(
inp_tensors[input_idx].second->shape(), mgb_tensor_layout.shape);
}
c_opr_param->nr_output = 0;
//! now call copr_param_device_ptr_malloc to malloc
//! device_ptr
c_opr_args.copr_param_device_ptr_malloc(c_opr_param.get());
}
void COprLibOption::load_lib() {
auto handle = dlopen(lib_path.c_str(), RTLD_LAZY);
mgb_assert(handle, "failed to open c opr lib %s: %s", lib_path.c_str(), dlerror());
const char* entry = MGB_C_OPR_INIT_FUNC_STR;
auto func = dlsym(handle, entry);
mgb_assert(func, "can not resolve %s: %s", entry, dlerror());
typedef void (*entry_f_t)(void*);
reinterpret_cast<entry_f_t>(func)(
reinterpret_cast<void*>(&mgb_get_extern_c_opr_api_versioned));
printf("loaded C opr library: %s\n", lib_path.c_str());
entry = "copr_param_device_ptr_malloc";
func = dlsym(handle, entry);
if (func) {
printf("get %s from: %s\n", entry, lib_path.c_str());
c_opr_args.copr_param_device_ptr_malloc =
reinterpret_cast<COprArgs::COPR_PARAM_DEVICE_PTR_MEM_T>(func);
}
entry = "copr_param_device_ptr_free";
func = dlsym(handle, entry);
if (func) {
printf("get %s from: %s\n", entry, lib_path.c_str());
c_opr_args.copr_param_device_ptr_free =
reinterpret_cast<COprArgs::COPR_PARAM_DEVICE_PTR_MEM_T>(func);
}
entry = "copr_param_device_ptr_h2d";
func = dlsym(handle, entry);
if (func) {
printf("get %s from: %s\n", entry, lib_path.c_str());
c_opr_args.copr_param_device_ptr_h2d =
reinterpret_cast<COprArgs::COPR_PARAM_DEVICE_PTR_H2D_T>(func);
}
}
void COprLibOption::set_Copr_IO(std::shared_ptr<ModelBase> model_ptr) {
auto model = std::static_pointer_cast<ModelMdl>(model_ptr);
auto inp_tensors = model->get_test_input();
auto loader = model->reset_loader();
auto testcase = loader->load(model->get_mdl_config(), false);
mgb_assert(testcase.output_var_list.size() == inp_tensors.size());
for (size_t i = 0; i < inp_tensors.size(); ++i) {
auto&& opr = testcase.output_var_list[i]
.node()
->owner_opr()
->cast_final_safe<mgb::opr::SharedDeviceTensor>();
c_opr_args.copr_param_device_ptr_h2d(
c_opr_param.get(), opr.dev_data()->raw_ptr(), i);
}
//! now config c opr dynamic param
config_extern_c_opr_dynamic_param(model->get_async_func(), c_opr_param);
}
COprLibOption::COprLibOption() {
m_option_name = "c_opr_lib";
lib_path = FLAGS_c_opr_lib;
c_opr_args.is_run_c_opr = !lib_path.empty();
c_opr_args.is_run_c_opr_with_param = FLAGS_c_opr_lib_with_param;
}
bool COprLibOption::is_valid() {
return !FLAGS_c_opr_lib.empty() || FLAGS_c_opr_lib_with_param;
}
std::shared_ptr<OptionBase> COprLibOption::create_option() {
static std::shared_ptr<COprLibOption> option(new COprLibOption);
if (COprLibOption::is_valid()) {
return std::static_pointer_cast<OptionBase>(option);
} else {
return nullptr;
}
}
void COprLibOption::config_model(
RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) {
CONFIG_MODEL_FUN;
}
DEFINE_string(
c_opr_lib, "",
"Load external operator library. It must implement "
"MGB_C_OPR_INIT_FUNC_STR as the entry point");
DEFINE_bool(
c_opr_lib_with_param, false,
"Run c opr lib with param, use to benchmark speed and check result, "
"need c opr loader implemente `copr_param_device_ptr_malloc, "
"copr_param_device_ptr_free and copr_param_device_ptr_h2d' symbols");
REGIST_OPTION_CREATOR(c_opr_lib, lar::COprLibOption::create_option);
/**
* \file lite/load_and_run/src/options/extern_c_opr_options.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#pragma once
#include <gflags/gflags.h>
#include "megbrain/graph/extern_copr_api.h"
#include "models/model.h"
#include "option_base.h"
DECLARE_bool(c_opr_lib_with_param);
DECLARE_string(c_opr_lib);
namespace lar {
struct COprArgs {
//! for run c opr
bool is_run_c_opr = false;
bool is_run_c_opr_with_param = false;
typedef void (*COPR_PARAM_DEVICE_PTR_MEM_T)(ExternCOprParam* param);
typedef void (*COPR_PARAM_DEVICE_PTR_H2D_T)(
ExternCOprParam* param, void* host_ptr, size_t extern_device_tensor_id);
COPR_PARAM_DEVICE_PTR_MEM_T copr_param_device_ptr_malloc = nullptr;
COPR_PARAM_DEVICE_PTR_MEM_T copr_param_device_ptr_free = nullptr;
COPR_PARAM_DEVICE_PTR_H2D_T copr_param_device_ptr_h2d = nullptr;
};
class COprLibOption final : public OptionBase {
public:
static bool is_valid();
static std::shared_ptr<OptionBase> create_option();
void config_model(
RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override;
std::string option_name() const override { return m_option_name; };
private:
COprLibOption();
template <typename ModelImpl>
void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){};
void load_lib();
MGBDType dtype_cpp2c(megdnn::DType dtype);
void tensor_shape_to_c(const megdnn::TensorShape& shape, MGBTensorShape& mgb_shape);
void init_extern_param(std::shared_ptr<ModelBase> model);
void set_Copr_IO(std::shared_ptr<ModelBase> model);
std::string m_option_name;
COprArgs c_opr_args;
std::string lib_path;
std::shared_ptr<ExternCOprParam> c_opr_param;
};
} // namespace lar
\ No newline at end of file
/**
* \file lite/load_and_run/src/options/fastrun_options.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include <gflags/gflags.h>
#if defined(_WIN32)
#include <io.h>
#define F_OK 0
#define access(a, b) _access(a, b)
#elif __linux__ || __unix__ || __APPLE__
#include <unistd.h>
#endif
#include "fastrun_options.h"
#include "megbrain/gopt/inference.h"
#include "megbrain/utils/infile_persistent_cache.h"
#include "misc.h"
#include "models/model_lite.h"
#include "models/model_mdl.h"
namespace lar {
template <>
void FastRunOption::config_model_internel<ModelLite>(
RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) {
if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) {
//! set the algo policy before model load
using Strategy = ModelLite::Strategy;
uint32_t strategy = 0;
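        // strategy bits are OR-ed together: full-run uses PROFILE, fast-run uses
        // PROFILE | OPTIMIZED, and everything else falls back to HEURISTIC;
        // REPRODUCIBLE may additionally be OR-ed in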
#if MGB_ENABLE_FASTRUN
if (enable_full_run) {
LITE_WARN("enable full-run strategy for algo profile");
strategy = static_cast<uint32_t>(Strategy::LITE_ALGO_PROFILE) | strategy;
} else if (enable_fast_run) {
LITE_WARN("enable fast-run strategy for algo profile");
strategy = static_cast<uint32_t>(Strategy::LITE_ALGO_PROFILE) |
static_cast<uint32_t>(Strategy::LITE_ALGO_OPTIMIZED) | strategy;
} else {
strategy = static_cast<uint32_t>(Strategy::LITE_ALGO_HEURISTIC) | strategy;
}
#else
strategy = static_cast<uint32_t>(Strategy::LITE_ALGO_HEURISTIC) | strategy;
#endif
if (batch_binary_equal || enable_reproducible) {
LITE_WARN("enable reproducible strategy for algo profile");
if (batch_binary_equal)
strategy = static_cast<uint32_t>(Strategy::LITE_ALGO_REPRODUCIBLE) |
strategy;
}
auto lite_strategy = static_cast<Strategy>(strategy);
model->set_lite_strategy(lite_strategy);
} else if (runtime_param.stage == RunStage::AFTER_MODEL_LOAD) {
auto lite_network = model->get_lite_network();
auto lite_strategy = model->get_lite_strategy();
//! set algo policy for model
lite::Runtime::set_network_algo_policy(
lite_network, lite_strategy, share_batch_size, batch_binary_equal);
if (!m_fast_run_cache.empty()) {
if (!access(m_fast_run_cache.c_str(), F_OK)) {
lite::set_persistent_cache(m_fast_run_cache);
} else {
lite::set_persistent_cache(m_fast_run_cache, true);
}
            //! TODO: this comes from the mdl model settings but has no
            //! matching setting in the lite model
// if (!enable_full_run && !enable_fast_run)
// mgb::gopt::enable_opr_use_profiling_cache_inplace(vars);
}
} else if (runtime_param.stage == RunStage::AFTER_MODEL_RUNNING) {
#if MGB_ENABLE_FASTRUN
//! dump algo cache
if (!m_fast_run_cache.empty()) {
lite::dump_persistent_cache(m_fast_run_cache);
}
#endif
}
}
template <>
void FastRunOption::config_model_internel<ModelMdl>(
RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) {
if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) {
//! set the algo policy before model load
using Strategy = ModelMdl::Strategy;
auto strategy = static_cast<Strategy>(0);
#if MGB_ENABLE_FASTRUN
if (enable_full_run) {
mgb_log_warn("enable full-run strategy for algo profile");
strategy = Strategy::PROFILE | strategy;
} else if (enable_fast_run) {
mgb_log_warn("enable fast-run strategy for algo profile");
strategy = Strategy::PROFILE | Strategy::OPTIMIZED | strategy;
} else {
strategy = Strategy::HEURISTIC | strategy;
}
#else
strategy = Strategy::HEURISTIC | strategy;
#endif
if (batch_binary_equal || enable_reproducible) {
mgb_log_warn("enable reproducible strategy for algo profile");
strategy = Strategy::REPRODUCIBLE | strategy;
}
model->set_mdl_strategy(strategy);
//! set binary_equal_between_batch and shared_batch_size
if (batch_binary_equal) {
mgb_log_warn("enable batch binary equal");
model->get_mdl_config()
.comp_graph->options()
.fast_run_config.binary_equal_between_batch = true;
}
if (share_batch_size > 0) {
mgb_log_warn("set shared shared batch");
model->get_mdl_config()
.comp_graph->options()
.fast_run_config.shared_batch_size = share_batch_size;
}
} else if (runtime_param.stage == RunStage::AFTER_MODEL_LOAD) {
auto vars = model->get_mdl_load_result().output_var_list;
auto strategy = model->get_mdl_strategy();
mgb::gopt::modify_opr_algo_strategy_inplace(vars, strategy);
// set algo cache path
if (!m_fast_run_cache.empty()) {
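            // if the cache file already exists it is loaded, otherwise an empty
            // in-memory cache is used (it can be dumped back after profiling)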
if (!access(m_fast_run_cache.c_str(), F_OK)) {
mgb::PersistentCache::set_impl(
std::make_shared<mgb::InFilePersistentCache>(
m_fast_run_cache.c_str()));
} else {
mgb::PersistentCache::set_impl(
std::make_shared<mgb::InFilePersistentCache>());
}
#if MGB_ENABLE_FASTRUN
if (!enable_full_run && !enable_fast_run)
#endif
mgb::gopt::enable_opr_use_profiling_cache_inplace(vars);
}
} else if (runtime_param.stage == RunStage::AFTER_MODEL_RUNNING) {
#if MGB_ENABLE_FASTRUN
//! dump algo cache
if (!m_fast_run_cache.empty()) {
static_cast<mgb::InFilePersistentCache&>(mgb::PersistentCache::inst())
.dump_cache(m_fast_run_cache.c_str());
}
#endif
}
}
} // namespace lar
using namespace lar;
FastRunOption::FastRunOption() {
m_option_name = "fastrun";
#if MGB_ENABLE_FASTRUN
enable_fast_run = FLAGS_fast_run;
enable_full_run = FLAGS_full_run;
#endif
batch_binary_equal = FLAGS_binary_equal_between_batch;
enable_reproducible = FLAGS_reproducible;
m_fast_run_cache = FLAGS_fast_run_algo_policy;
share_batch_size = FLAGS_fast_run_shared_batch_size;
#if MGB_ENABLE_FASTRUN
    //! when the fastrun cache file path is not empty but the file can't be accessed
if (!m_fast_run_cache.empty() && access(m_fast_run_cache.c_str(), F_OK)) {
mgb_assert(
enable_full_run || enable_fast_run,
"--fast-run or --full-run should be enabled");
}
if (share_batch_size) {
mgb_assert(
enable_full_run || enable_fast_run || !m_fast_run_cache.empty(),
"--fast-run-shared-batch-size should be used with "
"--fast-run|--full-run|--fast-run-algo-policy");
}
#endif
}
bool FastRunOption::is_valid() {
bool ret = false;
#if MGB_ENABLE_FASTRUN
ret = ret || FLAGS_fast_run;
ret = ret || FLAGS_full_run;
#endif
ret = ret || FLAGS_binary_equal_between_batch;
ret = ret || FLAGS_fast_run_shared_batch_size > 0;
ret = ret || FLAGS_reproducible;
ret = ret || FLAGS_fast_run_algo_policy.size() > 0;
return ret;
}
std::shared_ptr<OptionBase> FastRunOption::create_option() {
static std::shared_ptr<FastRunOption> option(new FastRunOption);
if (FastRunOption::is_valid()) {
return std::static_pointer_cast<OptionBase>(option);
} else {
return nullptr;
}
}
void FastRunOption::config_model(
RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) {
CONFIG_MODEL_FUN;
}
#if MGB_ENABLE_FASTRUN
DEFINE_bool(fast_run, false, "whether to use fast-run in model run");
DEFINE_bool(full_run, false, "whether to use full-run in model run");
#endif
DEFINE_bool(
binary_equal_between_batch, false,
"Each batch of output is promised binary equal if each batch of "
"input is binary equal\n Note that if this option is turned on, "
"`--reproducible` will also be turned on.");
DEFINE_bool(
reproducible, false,
"Enable choose algo which is reproducible. It mainly used for "
"cudnn algos.See "
"https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/"
"index.html#reproducibility"
"for more details.");
DEFINE_uint32(fast_run_shared_batch_size, 0, "Set the batch size used during fastrun");
DEFINE_string(fast_run_algo_policy, "", "fast-run cache path.");
REGIST_OPTION_CREATOR(fastrun, lar::FastRunOption::create_option);
\ No newline at end of file
/**
* \file lite/load_and_run/src/options/fastrun_options.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#pragma once
#include <gflags/gflags.h>
#include "models/model.h"
#include "option_base.h"
#if MGB_ENABLE_FASTRUN
DECLARE_bool(fast_run);
DECLARE_bool(full_run);
#endif
DECLARE_bool(reproducible);
DECLARE_bool(binary_equal_between_batch);
DECLARE_uint32(fast_run_shared_batch_size);
DECLARE_string(fast_run_algo_policy);
namespace lar {
class FastRunOption final : public OptionBase {
public:
    //! check the conditions for constructing FastRunOption
static bool is_valid();
    //! create option using conditions from cmdline args
static std::shared_ptr<OptionBase> create_option();
//! configure model for different runtime_param
void config_model(
RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override;
    //! get option name for quick search
std::string option_name() const override { return m_option_name; }
private:
FastRunOption();
//! config template for different model
template <typename ModelImpl>
void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>) {}
#if MGB_ENABLE_FASTRUN
bool enable_fast_run; //! fast run strategy flag
bool enable_full_run; //! full run strategy flag
#endif
    bool batch_binary_equal;      //! fast run strategy setting
bool enable_reproducible; //! enable reproducible strategy
size_t share_batch_size; //! fast run strategy share batch size setting
std::string m_fast_run_cache; //! fast run cache file path
std::string m_option_name; //! option name
};
} // namespace lar
/**
* \file lite/load_and_run/src/options/io_options.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include <map>
#include "helpers/data_parser.h"
#include "misc.h"
#include "models/model_lite.h"
#include "models/model_mdl.h"
#include "io_options.h"
namespace lar {
template <>
void InputOption::config_model_internel<ModelLite>(
RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) {
if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) {
auto parser = model->get_input_parser();
auto io = model->get_networkIO();
for (size_t idx = 0; idx < data_path.size(); ++idx) {
parser.feed(data_path[idx].c_str());
}
auto inputs = parser.inputs;
bool is_host = true;
for (auto& i : inputs) {
io.inputs.push_back({i.first, is_host});
}
} else if (runtime_param.stage == RunStage::AFTER_MODEL_LOAD) {
auto config = model->get_config();
auto parser = model->get_input_parser();
auto network = model->get_lite_network();
        //! data type map from mgb data type to lite data type
std::map<megdnn::DTypeEnum, LiteDataType> type_map = {
{megdnn::DTypeEnum::Float32, LiteDataType::LITE_FLOAT},
{megdnn::DTypeEnum::Int32, LiteDataType::LITE_INT},
{megdnn::DTypeEnum::Int8, LiteDataType::LITE_INT8},
{megdnn::DTypeEnum::Uint8, LiteDataType::LITE_UINT8}};
for (auto& i : parser.inputs) {
//! get tensor information from data parser
auto tensor = i.second;
auto data_type = tensor.dtype();
auto tensor_shape = tensor.shape();
mgb::dt_byte* src = tensor.raw_ptr();
//! set lite layout
lite::Layout layout;
layout.ndim = tensor_shape.ndim;
for (size_t idx = 0; idx < tensor_shape.ndim; idx++) {
layout.shapes[idx] = tensor_shape[idx];
}
layout.data_type = type_map[data_type.enumv()];
//! set network input tensor
std::shared_ptr<lite::Tensor> input_tensor =
network->get_io_tensor(i.first);
input_tensor->reset(src, layout);
}
}
}
template <>
void InputOption::config_model_internel<ModelMdl>(
RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) {
if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) {
auto parser = model->get_input_parser();
for (size_t idx = 0; idx < data_path.size(); ++idx) {
parser.feed(data_path[idx].c_str());
}
} else if (runtime_param.stage == RunStage::AFTER_MODEL_LOAD) {
auto parser = model->get_input_parser();
auto network = model->get_mdl_load_result();
auto tensormap = network.tensor_map;
for (auto& i : parser.inputs) {
mgb_assert(
tensormap.find(i.first) != tensormap.end(),
"can't find tesnor named %s", i.first.c_str());
auto& in = tensormap.find(i.first)->second;
in->copy_from(i.second);
}
}
}
template <>
void IOdumpOption::config_model_internel<ModelLite>(
RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) {
if (runtime_param.stage == RunStage::AFTER_MODEL_LOAD) {
if (enable_io_dump) {
LITE_WARN("enable text io dump");
lite::Runtime::enable_io_txt_dump(model->get_lite_network(), dump_path);
}
if (enable_bin_io_dump) {
LITE_WARN("enable binary io dump");
lite::Runtime::enable_io_bin_dump(model->get_lite_network(), dump_path);
}
        //! FIXME: complete this when the corresponding API is added in lite
if (enable_io_dump_stdout || enable_io_dump_stderr) {
LITE_THROW("lite model don't support the stdout or stderr io dump");
}
if (enable_bin_out_dump) {
LITE_THROW("lite model don't support the binary output dump");
}
if (enable_copy_to_host) {
LITE_WARN("lite model set copy to host defaultly");
}
}
}
template <>
void IOdumpOption::config_model_internel<ModelMdl>(
RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) {
if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) {
if (enable_io_dump) {
mgb_log_warn("enable text io dump");
auto iodump = std::make_unique<mgb::TextOprIODump>(
model->get_mdl_config().comp_graph.get(), dump_path.c_str());
iodump->print_addr(false);
io_dumper = std::move(iodump);
}
if (enable_io_dump_stdout) {
mgb_log_warn("enable text io dump to stdout");
std::shared_ptr<FILE> std_out(stdout, [](FILE*) {});
auto iodump = std::make_unique<mgb::TextOprIODump>(
model->get_mdl_config().comp_graph.get(), std_out);
iodump->print_addr(false);
io_dumper = std::move(iodump);
}
if (enable_io_dump_stderr) {
mgb_log_warn("enable text io dump to stderr");
std::shared_ptr<FILE> std_err(stderr, [](FILE*) {});
auto iodump = std::make_unique<mgb::TextOprIODump>(
model->get_mdl_config().comp_graph.get(), std_err);
iodump->print_addr(false);
io_dumper = std::move(iodump);
}
if (enable_bin_io_dump) {
mgb_log_warn("enable binary io dump");
auto iodump = std::make_unique<mgb::BinaryOprIODump>(
model->get_mdl_config().comp_graph.get(), dump_path);
io_dumper = std::move(iodump);
}
if (enable_bin_out_dump) {
mgb_log_warn("enable binary output dump");
out_dumper = std::make_unique<OutputDumper>(dump_path.c_str());
}
} else if (runtime_param.stage == RunStage::AFTER_MODEL_LOAD) {
if (enable_bin_out_dump) {
auto load_result = model->get_mdl_load_result();
out_dumper->set(load_result.output_var_list);
std::vector<mgb::ComputingGraph::Callback> cb;
for (size_t i = 0; i < load_result.output_var_list.size(); i++) {
cb.push_back(out_dumper->bind());
}
model->set_output_callback(cb);
}
if (enable_copy_to_host) {
auto load_result = model->get_mdl_load_result();
std::vector<mgb::ComputingGraph::Callback> cb;
for (size_t i = 0; i < load_result.output_var_list.size(); i++) {
mgb::HostTensorND val;
auto callback = [val](const mgb::DeviceTensorND& dv) mutable {
val.copy_from(dv);
};
cb.push_back(callback);
}
model->set_output_callback(cb);
}
} else if (runtime_param.stage == RunStage::AFTER_RUNNING_WAIT) {
if (enable_bin_out_dump) {
out_dumper->write_to_file();
}
}
}
} // namespace lar
////////////////////// Input options ////////////////////////
using namespace lar;
InputOption::InputOption() {
m_option_name = "input";
size_t start = 0;
auto end = FLAGS_input.find(";", start);
while (end != std::string::npos) {
std::string path = FLAGS_input.substr(start, end - start);
data_path.emplace_back(path);
start = end + 1;
end = FLAGS_input.find(";", start);
}
data_path.emplace_back(FLAGS_input.substr(start));
}
std::shared_ptr<lar::OptionBase> lar::InputOption::create_option() {
static std::shared_ptr<InputOption> m_option(new InputOption);
if (InputOption::is_valid()) {
return std::static_pointer_cast<OptionBase>(m_option);
} else {
return nullptr;
}
}
void InputOption::config_model(
RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) {
CONFIG_MODEL_FUN;
}
////////////////////// OprIOdump options ////////////////////////
IOdumpOption::IOdumpOption() {
m_option_name = "iodump";
size_t valid_flag = 0;
if (!FLAGS_io_dump.empty()) {
dump_path = FLAGS_io_dump;
enable_io_dump = true;
valid_flag = valid_flag | (1 << 0);
}
if (!FLAGS_bin_io_dump.empty()) {
dump_path = FLAGS_bin_io_dump;
enable_bin_io_dump = true;
valid_flag = valid_flag | (1 << 1);
}
if (!FLAGS_bin_out_dump.empty()) {
dump_path = FLAGS_bin_out_dump;
enable_bin_out_dump = true;
valid_flag = valid_flag | (1 << 2);
}
if (FLAGS_io_dump_stdout) {
enable_io_dump_stdout = FLAGS_io_dump_stdout;
valid_flag = valid_flag | (1 << 3);
}
if (FLAGS_io_dump_stderr) {
enable_io_dump_stderr = FLAGS_io_dump_stderr;
valid_flag = valid_flag | (1 << 4);
}
    // more than one dump option was set (valid_flag has more than one bit set)
    if (valid_flag && (valid_flag & (valid_flag - 1))) {
        mgb_log_warn(
                "ONLY the last io dump option is valid and the others are "
                "skipped!!!");
}
enable_copy_to_host = FLAGS_copy_to_host;
}
bool IOdumpOption::is_valid() {
bool ret = !FLAGS_io_dump.empty();
ret = ret || FLAGS_io_dump_stdout;
ret = ret || FLAGS_io_dump_stderr;
ret = ret || !FLAGS_bin_io_dump.empty();
ret = ret || !FLAGS_bin_out_dump.empty();
ret = ret || FLAGS_copy_to_host;
return ret;
}
std::shared_ptr<OptionBase> IOdumpOption::create_option() {
static std::shared_ptr<IOdumpOption> option(new IOdumpOption);
if (IOdumpOption::is_valid()) {
return std::static_pointer_cast<OptionBase>(option);
} else {
return nullptr;
}
}
void IOdumpOption::config_model(
RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) {
CONFIG_MODEL_FUN;
}
////////////////////// Input gflags ////////////////////////
DEFINE_string(
input, "", "Set up inputs data for model --input [ file_path | data_string]");
////////////////////// OprIOdump gflags ////////////////////////
DEFINE_string(io_dump, "", "set the io dump file path in text format");
DEFINE_bool(io_dump_stdout, false, "dump io opr to stdout in text format");
DEFINE_bool(io_dump_stderr, false, "dump io opr to stderr in text format");
DEFINE_string(bin_io_dump, "", "set the io dump file path in binary format");
DEFINE_string(bin_out_dump, "", "set the out dump file path in binary format");
DEFINE_bool(copy_to_host, false, "copy device data to host");
REGIST_OPTION_CREATOR(input, lar::InputOption::create_option);
REGIST_OPTION_CREATOR(iodump, lar::IOdumpOption::create_option);
/**
* \file lite/load_and_run/src/options/io_options.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#pragma once
#include <gflags/gflags.h>
#include "helpers/outdumper.h"
#include "megbrain/plugin/opr_io_dump.h"
#include "models/model.h"
#include "option_base.h"
DECLARE_string(input);
DECLARE_string(io_dump);
DECLARE_bool(io_dump_stdout);
DECLARE_bool(io_dump_stderr);
DECLARE_string(bin_io_dump);
DECLARE_string(bin_out_dump);
DECLARE_bool(copy_to_host);
namespace lar {
/*!
* \brief: input option for --input set
*/
class InputOption final : public OptionBase {
public:
    //! static function for registering options
static bool is_valid() { return !FLAGS_input.empty(); };
static std::shared_ptr<OptionBase> create_option();
void config_model(
RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override;
//! interface implement from OptionBase
std::string option_name() const override { return m_option_name; };
private:
InputOption();
template <typename ModelImpl>
void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){};
std::string m_option_name;
std::vector<std::string> data_path; // data string or data file path
};
class IOdumpOption : public OptionBase {
public:
static bool is_valid();
static std::shared_ptr<OptionBase> create_option();
    //! config the model; different model types may need different configuration
    //! code, so dispatch to the matching implementation
void config_model(
RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override;
std::string option_name() const override { return m_option_name; };
private:
IOdumpOption();
template <typename ModelImpl>
void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){};
bool enable_io_dump;
bool enable_io_dump_stdout;
bool enable_io_dump_stderr;
bool enable_bin_io_dump;
bool enable_bin_out_dump;
bool enable_copy_to_host;
std::string m_option_name;
std::string dump_path;
std::unique_ptr<mgb::OprIODumpBase> io_dumper;
std::unique_ptr<OutputDumper> out_dumper;
};
} // namespace lar
/**
* \file lite/load_and_run/src/options/layout_options.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include <gflags/gflags.h>
#include "misc.h"
#include "models/model_lite.h"
#include "models/model_mdl.h"
#include "layout_options.h"
namespace lar {
template <>
void LayoutOption::config_model_internel<ModelLite>(
RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) {
if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) {
#define ENABLE_LAYOUT(layout) \
LITE_WARN("enable " #layout " optimization"); \
model->get_config().options.enable_##layout = true; \
break;
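        // note: each ENABLE_LAYOUT expansion ends with a `break`, so at most
        // one case body runs per switch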
switch (option_flag) {
case OptLayoutType::NCHW4:
ENABLE_LAYOUT(nchw4)
case OptLayoutType::CHWN4:
LITE_THROW("lite model unsupport chwn4 layout");
break;
case OptLayoutType::NCHW44:
ENABLE_LAYOUT(nchw44)
case OptLayoutType::NCHW88:
ENABLE_LAYOUT(nchw88)
case OptLayoutType::NCHW32:
ENABLE_LAYOUT(nchw32)
case OptLayoutType::NCHW64:
ENABLE_LAYOUT(nchw64)
case OptLayoutType::NHWCD4:
ENABLE_LAYOUT(nhwcd4)
case OptLayoutType::NCHW44_DOT:
ENABLE_LAYOUT(nchw44_dot)
default:
break;
}
#undef ENABLE_LAYOUT
}
}
template <>
void lar::LayoutOption::config_model_internel<ModelMdl>(
RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) {
if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) {
mgb_log_debug("mdl layout config start");
#define ENABLE_LAYOUT(layout) \
mgb_log_warn("enable " #layout " optimization"); \
model->get_mdl_config().comp_graph->options().graph_opt.enable_##layout(); \
break;
switch (option_flag) {
case OptLayoutType::NCHW4:
ENABLE_LAYOUT(nchw4)
case OptLayoutType::CHWN4:
ENABLE_LAYOUT(chwn4)
case OptLayoutType::NCHW44:
ENABLE_LAYOUT(nchw44)
case OptLayoutType::NCHW88:
ENABLE_LAYOUT(nchw88)
case OptLayoutType::NCHW32:
ENABLE_LAYOUT(nchw32)
case OptLayoutType::NCHW64:
ENABLE_LAYOUT(nchw64)
case OptLayoutType::NHWCD4:
ENABLE_LAYOUT(nhwcd4)
case OptLayoutType::NCHW44_DOT:
ENABLE_LAYOUT(nchw44_dot)
default:
break;
}
mgb_log_debug("mdl layout config end");
#undef ENABLE_LAYOUT
}
}
} // namespace lar
using namespace lar;
OptLayoutType LayoutOption::option_flag;
LayoutOption::LayoutOption() {
m_option_name = "layout";
}
bool LayoutOption::is_valid() {
size_t valid_flag = 0;
if (FLAGS_enable_nchw4) {
valid_flag = valid_flag | (1 << 0);
}
if (FLAGS_enable_chwn4) {
valid_flag = valid_flag | (1 << 1);
}
if (FLAGS_enable_nchw44) {
valid_flag = valid_flag | (1 << 2);
}
if (FLAGS_enable_nchw88) {
valid_flag = valid_flag | (1 << 3);
}
if (FLAGS_enable_nchw32) {
valid_flag = valid_flag | (1 << 4);
}
if (FLAGS_enable_nchw64) {
valid_flag = valid_flag | (1 << 5);
}
if (FLAGS_enable_nhwcd4) {
valid_flag = valid_flag | (1 << 6);
}
if (FLAGS_enable_nchw44_dot) {
valid_flag = valid_flag | (1 << 7);
}
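    // exactly one layout flag may be enabled: valid_flag must be non-zero and a
    // power of two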
bool ret = valid_flag && !(valid_flag & (valid_flag - 1));
if (ret) {
option_flag = static_cast<OptLayoutType>(valid_flag);
} else {
option_flag = static_cast<OptLayoutType>(0);
}
return ret;
};
std::shared_ptr<OptionBase> LayoutOption::create_option() {
static std::shared_ptr<LayoutOption> option(new LayoutOption);
if (LayoutOption::is_valid()) {
return std::static_pointer_cast<OptionBase>(option);
} else {
return nullptr;
}
}
void LayoutOption::config_model(
RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) {
CONFIG_MODEL_FUN;
}
DEFINE_bool(enable_nchw4, false, "enable nchw4 layout optimization!!");
DEFINE_bool(enable_chwn4, false, "enable chwn4 layout optimization!!");
DEFINE_bool(enable_nchw44, false, "enable nchw44 layout optimization!!");
DEFINE_bool(enable_nchw88, false, "enable nchw88 layout optimization!!");
DEFINE_bool(enable_nchw32, false, "enable nchw32 layout optimization!!");
DEFINE_bool(enable_nchw64, false, "enable nchw64 layout optimization!!");
DEFINE_bool(enable_nhwcd4, false, "enable nhwcd4 layout optimization!!");
DEFINE_bool(enable_nchw44_dot, false, "enable nchw44-dot layout optimization!!");
REGIST_OPTION_CREATOR(layout, lar::LayoutOption::create_option);
\ No newline at end of file
/**
* \file lite/load_and_run/src/options/layout_options.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#pragma once
#include <gflags/gflags.h>
#include "helpers/common.h"
#include "models/model.h"
#include "option_base.h"
DECLARE_bool(enable_nchw4);
DECLARE_bool(enable_chwn4);
DECLARE_bool(enable_nchw44);
DECLARE_bool(enable_nchw88);
DECLARE_bool(enable_nchw32);
DECLARE_bool(enable_nchw64);
DECLARE_bool(enable_nhwcd4);
DECLARE_bool(enable_nchw44_dot);
namespace lar {
/*!
* \brief: layout option for optimization
*/
class LayoutOption final : public OptionBase {
public:
    //! check the validity of the option flags
static bool is_valid();
    //! create option when the option is used
static std::shared_ptr<OptionBase> create_option();
    //! config the model, dispatching configuration to the matching model implementation
void config_model(
RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override;
//! get option name
std::string option_name() const override { return m_option_name; };
private:
//! Constructor
LayoutOption();
    //! configuration for different model implementations
template <typename ModelImpl>
void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){};
static OptLayoutType option_flag;
std::string m_option_name;
};
} // namespace lar
\ No newline at end of file
(This diff has been collapsed.)
/**
* \file lite/load_and_run/src/options/optimize_options.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#pragma once
#include <gflags/gflags.h>
#include "helpers/common.h"
#include "models/model.h"
#include "option_base.h"
DECLARE_bool(enable_fuse_preprocess);
DECLARE_bool(weight_preprocess);
DECLARE_bool(enable_fuse_conv_bias_nonlinearity);
DECLARE_bool(enable_fuse_conv_bias_with_z);
DECLARE_bool(const_shape);
DECLARE_bool(fake_first);
DECLARE_bool(no_sanity_check);
DECLARE_bool(record_comp_seq);
DECLARE_bool(record_comp_seq2);
DECLARE_bool(disable_mem_opt);
DECLARE_uint64(workspace_limit);
DECLARE_bool(enable_jit);
#if MGB_ENABLE_TENSOR_RT
DECLARE_bool(tensorrt);
DECLARE_string(tensorrt_cache);
#endif
namespace lar {
///////////////////////// fuse_preprocess optimize options //////////////
class FusePreprocessOption final : public OptionBase {
public:
static bool is_valid();
static std::shared_ptr<OptionBase> create_option();
void config_model(
RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override;
std::string option_name() const override { return m_option_name; };
private:
FusePreprocessOption();
template <typename ModelImpl>
void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){};
std::string m_option_name;
bool enable_fuse_preprocess;
};
///////////////////////// weight preprocess optimize options //////////////
class WeightPreprocessOption final : public OptionBase {
public:
static bool is_valid();
static std::shared_ptr<OptionBase> create_option();
void config_model(
RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override;
std::string option_name() const override { return m_option_name; };
private:
WeightPreprocessOption();
template <typename ModelImpl>
void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){};
std::string m_option_name;
bool weight_preprocess;
};
/////////////// fuse_conv_bias_nonlinearity optimize options ///////////////
class FuseConvBiasNonlinearOption final : public OptionBase {
public:
static bool is_valid();
static std::shared_ptr<OptionBase> create_option();
void config_model(
RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override;
std::string option_name() const override { return m_option_name; };
private:
FuseConvBiasNonlinearOption();
template <typename ModelImpl>
void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){};
std::string m_option_name;
bool enable_fuse_conv_bias_nonlinearity;
};
///////////////////////// fuse_conv_bias_with_z optimize options //////////////
class FuseConvBiasElemwiseAddOption final : public OptionBase {
public:
static bool is_valid();
static std::shared_ptr<OptionBase> create_option();
void config_model(
RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override;
std::string option_name() const override { return m_option_name; };
private:
FuseConvBiasElemwiseAddOption();
template <typename ModelImpl>
void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){};
std::string m_option_name;
bool enable_fuse_conv_bias_with_z;
};
///////////////////////// graph record options ///////////////////////////
class GraphRecordOption final : public OptionBase {
public:
static bool is_valid();
static std::shared_ptr<OptionBase> create_option();
void config_model(
RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override;
std::string option_name() const override { return m_option_name; };
private:
GraphRecordOption();
template <typename ModelImpl>
void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){};
std::string m_option_name;
size_t m_record_comp_seq;
bool const_shape;
bool fake_first;
bool no_sanity_check;
};
///////////////////////// memory optimize options /////////////////////////
class MemoryOptimizeOption final : public OptionBase {
public:
static bool is_valid();
static std::shared_ptr<OptionBase> create_option();
void config_model(
RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override;
std::string option_name() const override { return m_option_name; };
private:
MemoryOptimizeOption();
template <typename ModelImpl>
void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){};
std::string m_option_name;
bool disable_mem_opt;
uint64_t workspace_limit;
};
///////////////////////// other options for optimization /////////////////
class JITOption final : public OptionBase {
public:
static bool is_valid();
static std::shared_ptr<OptionBase> create_option();
void config_model(
RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override;
std::string option_name() const override { return m_option_name; };
private:
JITOption();
template <typename ModelImpl>
void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){};
std::string m_option_name;
bool enable_jit;
};
///////////////////////// TensorRT options for optimization /////////////////
#if MGB_ENABLE_TENSOR_RT
class TensorRTOption final : public OptionBase {
public:
static bool is_valid();
static std::shared_ptr<OptionBase> create_option();
void config_model(
RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override;
std::string option_name() const override { return m_option_name; };
private:
TensorRTOption();
template <typename ModelImpl>
void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){};
std::string m_option_name;
bool enable_tensorrt;
std::string tensorrt_cache;
};
#endif
} // namespace lar
\ No newline at end of file
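The DECLARE_bool() block at the top of optimize_options.h only announces flags whose storage is defined elsewhere (in the matching .cpp via DEFINE_bool), so every translation unit including this header reads the same FLAGS_* globals. A small self-contained sketch of that declare/define split, using an invented flag name:

// declare_define_demo.cpp -- standalone sketch of the gflags DECLARE/DEFINE split
#include <cstdio>
#include <gflags/gflags.h>

// In the real code the DECLARE_bool() lives in the header and the matching
// DEFINE_bool() lives in a .cpp file; both name the same global flag.
DECLARE_bool(enable_demo_fuse);                            // "header" side
DEFINE_bool(enable_demo_fuse, false, "enable demo fuse");  // ".cpp" side

int main(int argc, char** argv) {
    gflags::ParseCommandLineFlags(&argc, &argv, true);
    std::printf("enable_demo_fuse = %d\n", FLAGS_enable_demo_fuse ? 1 : 0);
    return 0;
}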