Commit 71230e9a authored by Megvii Engine Team

feat(lite): open source for lite

GitOrigin-RevId: f442431381cc9eb3bb92eb03f744ba6ffa7e2b64
Parent 5fe789ab
test/resource/input_data.npy filter=lfs diff=lfs merge=lfs -text
test/resource/lite/shufflenet.mge filter=lfs diff=lfs merge=lfs -text
test/resource/lite/shufflenet_crypt_aes.mge filter=lfs diff=lfs merge=lfs -text
test/resource/lite/test_packed_model.lite filter=lfs diff=lfs merge=lfs -text
test/resource/lite/test_packed_model_rc4.lite filter=lfs diff=lfs merge=lfs -text
test/resource/lite/output_data.npy filter=lfs diff=lfs merge=lfs -text
test/resource/lite/model.mgb filter=lfs diff=lfs merge=lfs -text
test/resource/lite/liveness_rgb_nosub128.rknn filter=lfs diff=lfs merge=lfs -text
third_party/librknn_api filter=lfs diff=lfs merge=lfs -text
test/resource/lite/model_atlas.mgb filter=lfs diff=lfs merge=lfs -text
option(LITE_BUILD_WITH_MGE "Build lite with MegEngine." ON)
# config lite_build_config.h.in
set(LITE_WITH_OPENCL ${MGE_WITH_OPENCL})
set(LITE_WITH_CUDA ${MGE_WITH_CUDA})
set(LITE_ENABLE_LOGGING ${MGE_ENABLE_LOGGING})
set(LITE_ENABLE_EXCEPTION ${MGE_ENABLE_EXCEPTIONS})
set(LITE_ASSERT_LOC ${MGB_ASSERT_LOC})
if(NOT MGB_WITH_FLATBUFFERS)
include(../cmake/flatbuffers.cmake)
endif()
file(GLOB_RECURSE SRC_FBS src/**/*.fbs)
build_flatbuffers(
"${SRC_FBS}"
""
lite_fbs_generate
""
"${CMAKE_CURRENT_BINARY_DIR}"
""
""
)
file(GLOB_RECURSE SOURCES_LITE src/*.cpp src/*.cc lite-c/*.cpp)
if(MGE_WITH_MINIMUM_SIZE)
set(LITE_ENABLE_LOGGING OFF)
set(LITE_ENABLE_EXCEPTION OFF)
endif()
# Write out lite_build_config.h
# It defines macros needed by lite
configure_file(src/lite_build_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/genfiles/lite_build_config.h)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/genfiles/lite_build_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
# begin config lite
if(LITE_BUILD_WITH_MGE AND LITE_WITH_CUDA AND NOT WIN32)
# FIXME: third_party cpp_redis does not support building with clang-cl
file(GLOB_RECURSE SOURCES_CPP_REDIS ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/sources/*.cpp)
list(APPEND SOURCES_LITE ${SOURCES_CPP_REDIS})
file(GLOB_RECURSE SOURCES_TACOPIE ${PROJECT_SOURCE_DIR}/third_party/tacopie/sources/*.cpp)
list(APPEND SOURCES_LITE ${SOURCES_TACOPIE})
endif()
add_library(lite_static STATIC ${SOURCES_LITE})
add_dependencies(lite_static lite_fbs_generate)
include_directories($<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/genfiles>)
if(LITE_BUILD_WITH_MGE)
target_link_libraries(lite_static PRIVATE megbrain megdnn ${MGE_CUDA_LIBS})
add_compile_definitions(LITE_BUILD_WITH_MGE=1)
message(STATUS "build lite with MegEngine.")
else()
target_link_libraries(lite_static PUBLIC flatbuffers)
endif()
include_directories(
PUBLIC $<INSTALL_INTERFACE:${CMAKE_INSTALL_PREFIX}/lite/include>
PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lite/include>
PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lite/include/lite>
PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lite/lite-c/include>
PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lite/src>
PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/third_party/Json/include>
)
# end config lite
# define a shared lib
add_library(lite_shared SHARED $<TARGET_OBJECTS:lite_static>)
if(LITE_BUILD_WITH_MGE)
target_link_libraries(lite_shared PRIVATE megbrain megdnn ${MGE_CUDA_LIBS})
endif()
if(ANDROID)
link_libraries(log)
target_link_libraries(lite_static PRIVATE log)
target_link_libraries(lite_shared PRIVATE log)
endif()
if(LITE_BUILD_WITH_MGE AND LITE_WITH_CUDA AND NOT WIN32)
# FIXME: third_party cpp_redis does not support building with clang-cl
target_include_directories(lite_static PRIVATE ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/includes)
target_include_directories(lite_static PRIVATE ${PROJECT_SOURCE_DIR}/third_party/tacopie/includes)
target_include_directories(lite_shared PRIVATE ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/includes)
target_include_directories(lite_shared PRIVATE ${PROJECT_SOURCE_DIR}/third_party/tacopie/includes)
endif()
set(LITE_VERSION_SCRIPT ${PROJECT_SOURCE_DIR}/lite/src/version_lite.ld CACHE INTERNAL "Path to linker version script")
add_custom_target(_lite_version_ld SOURCES ${LITE_VERSION_SCRIPT})
if(NOT MSVC AND NOT WIN32)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden")
endif()
# TODO: implement the version script for other OSes
if (UNIX AND NOT APPLE)
target_link_options(lite_shared PRIVATE -Wl,--version-script=${LITE_VERSION_SCRIPT})
set_target_properties(lite_shared PROPERTIES LINK_DEPENDS ${LITE_VERSION_SCRIPT})
endif()
# config install
install(TARGETS lite_static
LIBRARY DESTINATION lite/lib/${MGE_ARCH}
FRAMEWORK DESTINATION lite/lib/${MGE_ARCH}
ARCHIVE DESTINATION lite/lib/${MGE_ARCH})
install(TARGETS lite_shared
LIBRARY DESTINATION lite/lib/${MGE_ARCH}
FRAMEWORK DESTINATION lite/lib/${MGE_ARCH}
ARCHIVE DESTINATION lite/lib/${MGE_ARCH}
)
install(FILES ${PROJECT_SOURCE_DIR}/lite/include/lite/common_enum_c.h
DESTINATION ${CMAKE_INSTALL_PREFIX}/lite/include/lite-c)
install(DIRECTORY ${PROJECT_SOURCE_DIR}/lite/include
DESTINATION ${CMAKE_INSTALL_PREFIX}/lite FILES_MATCHING PATTERN "*.h")
install(DIRECTORY ${PROJECT_SOURCE_DIR}/lite/lite-c/include
DESTINATION ${CMAKE_INSTALL_PREFIX}/lite FILES_MATCHING PATTERN "*.h")
add_subdirectory(example)
if(MGE_WITH_TEST)
add_subdirectory(test)
endif()
# tools and example
add_executable(rc4_encryptor tools/rc4_encrypt.cpp)
target_link_libraries(rc4_encryptor lite_static)
if(LITE_BUILD_WITH_MGE AND MGE_WITH_ROCM)
# FIXME: hip objects cannot find the cpp objects through lite_static alone
target_link_libraries(rc4_encryptor megdnn)
endif()
target_include_directories(rc4_encryptor PRIVATE
${PROJECT_SOURCE_DIR}/lite/src/decryption)
install (TARGETS rc4_encryptor
EXPORT ${LITE_EXPORT_TARGETS}
RUNTIME DESTINATION lite/tools)
# Lite
Lite is a lightweight wrapper of MegEngine that makes it easy to integrate
MegEngine into a user's SDK.
## bazel build
Lite currently supports both the internal bazel build and the CMake build, and provides C++/C and Python interfaces.
The following shows how to build the lite_shared target with bazel and can serve as a reference for building other targets.
This build depends on the internal bazel setup and megvii3.
### Set up the build environment
The bazel build must be done from the megvii3 workspace.
#### Clone megvii3 and install bazel
```bash
git clone git@git-core.megvii-inc.com:brain-sdk/megvii3.git
./utils/bazel/get_bazel.sh
```
#### Clone megbrain
```
git submodule update brain/megbrain brain/midout
```
### Build the x86 CUDA version
```bash
./bazel build //brain/megbrain/lite:lite_shared --cpu="k8" \
--compiler="gcc7_cuda10" -c opt
```
### Build the x86 CPU version
```bash
./bazel build //brain/megbrain/lite:lite_shared --cpu="k8" \
--compiler="gcc9" -c opt
```
### Build the arm OpenCL version
```bash
./bazel build //brain/megbrain/lite:lite_shared_shared --cpu=android_aarch64 \
-c opt --define enable_opencl=1 --define enable_opencl_search=1
```
### Build the arm OpenCL lite_examples
```bash
bazel-3.0.0-megvii2 build //brain/megbrain/lite:lite_shared_examples \
--cpu=android_aarch64 --define enable_opencl=1 --define enable_opencl_search=1
```
#### How to run the snpe_loader lite_examples: see the wiki below
https://wiki.megvii-inc.com/pages/viewpage.action?pageId=268786906
### Build the armv7 CPU version
```bash
./bazel build //brain/megbrain/lite:lite_shared --cpu=android_armv7 \
-c opt
```
### Build the arm64 CPU version
```bash
./bazel build //brain/megbrain/lite:lite_shared --cpu=android_aarch64 \
-c opt
```
### Build the arm64 CPU v8.2 version
```bash
./bazel build //brain/megbrain/lite:lite_shared --cpu=android_aarch64 \
--copt -march=armv8.2-a+fp16+dotprod -c opt
```
## CMake build
CMake builds are also supported; see scripts/cmake-build/BUILD_README.md. The example below builds a release version with both the MegEngine and RKNPU backends enabled and OpenCL turned on.
```bash
EXTRA_CMAKE_ARGS="-DANDROID_NATIVE_API_LEVEL=24 -DLITE_BUILD_WITH_RKNPU=ON -DMGE_WITH_OPENCL=ON \
-DMGE_OPENCL_SEARCH_ALGO=ON -DCUSTOM_C_OPR_INIT_FUNC=custom_loader_func" ./scripts/cmake-build/cross_build_android_arm_inference.sh
```
* To enable the profiling feature, add --copt -DMGB_ENABLE_JSON=1 when building.
* To enable the fast-run feature, add --copt -DMGB_ENABLE_FASTRUN=1.
* When building for arm64, --copt -mcpu=cortex-a53 can be added for optimization.
### Trimmed build with midout
For how midout-based trimming works, see the midout documentation in megbrain; for the trimming procedure, refer to the trimming guides of MegBrain and MegEngine.
## Models
### Supported models
Lite currently only supports the model format dumped by MegEngine. Loadable model files include the original model file, the original encrypted model, and packed models (encrypted or not). The encryption algorithm and the encryption key can be user-defined and registered into lite; see the encryption/decryption part of the examples for details.
* Unencrypted original model: the model produced by dumping a trained model in the MegEngine environment.
* Encrypted original model: the dumped model encrypted with an encryption algorithm. Lite provides two default encryption tools under tools, aes and rc4, implemented by aes_encrypt.sh and rc4_encrypt.cpp respectively (rc4_encrypt.cpp must be compiled into an executable). A model encrypted this way needs its encryption method configured in Config when it is loaded, as shown in the sketch after this list.
* Packed model: the model structure is described below. An encrypted or unencrypted model from above can be packed together with the json config file defined below into a packed model, using the pack_model_and_info.py tool under tools; see the help output of pack_model_and_info.py for usage.
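A minimal sketch of loading an encrypted bare model, based on the user_cryption examples later in this repository; the model path is a placeholder and "AES_default" is one of the predefined decryption algorithm names documented in the examples:
```cpp
#include <memory>
#include "lite/network.h"

int main() {
    // Tell lite which registered decryption algorithm the bare model was
    // encrypted with; "AES_default" is one of the built-in names.
    lite::Config config;
    config.bare_model_cryption_name = "AES_default";

    auto network = std::make_shared<lite::Network>(config);
    network->load_model("shufflenet_encrypted.mge");  // placeholder path
    return 0;
}
```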
### Model structure
The different model files are distinguished mainly by the model_tag in the packed model file.
* Packed file:
The packing step can be done with the pack_model_and_info.py script, which packs a model info file (any format; JSON is recommended, encrypted or not) together with an encrypted or unencrypted model file, and prepends a Header to help parsing.
* The original file and the original encrypted file have no Header and no model info part; the information needed to load the model can be passed through Config and NetworkIO.
### Header
The Header starts with a fixed plaintext model_tag, currently defined as the string "packed_model". It is followed by information about each part of the model file and the encryption method of each part, so that the corresponding decryption method can be called on each part when the model is loaded, as well as the parsing method of the model information part.
See lite/src/parse_model/pack_model.fbs for the details.
### Info part
The Info part describes the model: information the user cares about, such as the format of the model's input data and the platform the model runs on. It can also be used by the user to check whether the model is being run under the specified conditions. Since different users need different things from this Info part and the information they want to carry cannot be unified, Lite currently allows it to be customized: the user defines the content of their own Info part, specifies the **name of the Info parsing method** in the Header, and registers a parsing function under that name with Lite; in this way a user-defined Info format is supported. Lite also provides a predefined format named "LITE_default" with the corresponding parsing function already implemented. This info is in JSON format and is defined as follows:
```json
{
"name": "shufflenet_test",
"valid": true,
"version": "8.9999.0",
"has_compression": false,
"device": {
"type": "CPU",
"device_id": 0,
"number_threads": 1,
"use_tensor_rt": false,
"enable_inplace_model": false
},
"options":{
"weight_preprocess": false,
"var_sanity_check_first_run": true,
"const_shape": false,
"jit_level": 0,
"record_level": 0
},
"IO":{
"inputs":[
{
"name": "data",
"io_type": "value",
"is_host": true,
"dtype": "float32",
"shape": {
"dim0": 1,
"dim1": 3,
"dim2": 224,
"dim3": 224
}
}
],
"outputs":[
{
"name": "TRUE_DIV(EXP[12065],reduce0[12067])[12077]",
"io_type": "value",
"is_host": true,
"dtype": "float32",
"shape": {
"dim0": 1,
"dim1": 1000,
"dim2": 0,
"dim3": 0
}
}
]
}
}
```
* model_name: the name of this model; the user can check that the correct model is being run by comparing it with the one in the Header.
* valid: whether the settings in this info file affect the model's Config.
* version: the megbrain version the model corresponds to; it is checked when the model is loaded.
* has_compression: whether the tensor data in this model file has been compressed.
* device: currently supported values are "CPU", "CUDA", "OPENCL" and "ATLAS".
* number_threads and is_inplace_model: only take effect when device is CPU.
* IO::inputs::type: value or shape; see include "network.h" for details.
* IO::inputs::is_host: whether the input data comes from the device side or the host side.
* IO::outputs::is_host: whether the output data will be stored on the device side or the host side.
* IO::outputs::shape::dimx: a value of 0 means that dim is invalid.
### Model part
The Model part can be an encrypted or an unencrypted model file.
## Usage
See the documentation in the example directory and the corresponding examples for the full range of usage patterns.
## Tools
Lite currently ships three tools in the tools directory (other megbrain tools are not included):
* pack_model_and_info.py is the model packing tool mentioned above. It is a python script that packs an existing model and a model information file into the format described above. The user can specify the model name, the model encryption method, the info file encryption method, the info parsing method, and so on, for example:
```bash
python3 pack_model_and_info.py --input-model xxx.mge \
--model-name="shufflenet_test" \
--model-cryption="RC4_default" \
--input-info xxx.json \
--info-cryption="RC4_default" \
--info-parser="LITE_default" \
-o xxx.lite
```
* aes_encrypt.sh is an AES encryption script. It encrypts a file with the given key into an AES-encrypted file, where the key is a 32-byte value written as hexadecimal digits.
```bash
aes_encrypt.sh xxx.mdl xxx_encrypted.mdl \
000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F
```
* rc4_encrypt.cpp can be compiled into an RC4 encryption tool. It encrypts a given file with either a specified key or the default key, supports both the rc4 and simple_fast_rc4 methods, and supports custom keys.
* The bazel command to build it for x86 is:
```bash
bazel build //brain/megbrain/lite:rc4_encryptor \
--cpu='k8' --compiler='gcc9'
```
* Encrypt a file (see the help output for detailed usage):
```bash
rc4_encryptor encrypt_predefined_rc4 \
to_be_encrypt.file encrypted.file
```
/**
* \file lite/build_config/lite_build_config.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#ifndef _HEADER_LITE_BUILD_CONFIG
#define _HEADER_LITE_BUILD_CONFIG
#ifndef LITE_ENABLE_LOGGING
#define LITE_ENABLE_LOGGING 1
#endif
#ifndef LITE_ENABLE_EXCEPTION
#if __cpp_exceptions || __EXCEPTIONS || \
(defined(_MSC_VER) && defined(_CPPUNWIND))
#define LITE_ENABLE_EXCEPTION 1
#else
#define LITE_ENABLE_EXCEPTION 0
#endif
#endif
#ifndef LITE_WITH_CUDA
#define LITE_WITH_CUDA 0
#endif
#ifndef LITE_ASSERT_LOC
#define LITE_ASSERT_LOC 1
#endif
#endif // _HEADER_LITE_BUILD_CONFIG
file (GLOB_RECURSE SOURCES ./*.cpp)
add_executable(lite_examples ${SOURCES})
if(LITE_BUILD_WITH_RKNPU)
# rknn sdk 1.0.0 depends on libc++_shared; use gold to remove the NEEDED shared-object symbol check
target_link_options(lite_examples PRIVATE "-fuse-ld=gold")
endif()
target_link_libraries(lite_examples lite_static)
if(LITE_BUILD_WITH_MGE AND MGE_WITH_ROCM)
# FIXME: hip objects cannot find the cpp objects through lite_static alone
target_link_libraries(lite_examples megdnn)
endif()
if(UNIX)
if(APPLE OR ANDROID)
target_link_libraries(lite_examples dl)
else()
target_link_libraries(lite_examples dl rt)
endif()
endif()
install (TARGETS lite_examples
EXPORT ${LITE_EXPORT_TARGETS}
RUNTIME DESTINATION lite/bin)
# add lite_examples_depends_shared for CI check symbol export valid
add_executable(lite_examples_depends_shared ${SOURCES})
if(LITE_BUILD_WITH_RKNPU)
# rknn sdk 1.0.0 depends on libc++_shared; use gold to remove the NEEDED shared-object symbol check
target_link_options(lite_examples_depends_shared PRIVATE "-fuse-ld=gold")
endif()
target_link_libraries(lite_examples_depends_shared lite_shared)
if(UNIX)
if(APPLE OR ANDROID)
target_link_libraries(lite_examples_depends_shared dl)
else()
target_link_libraries(lite_examples_depends_shared dl rt)
endif()
endif()
install (TARGETS lite_examples_depends_shared
EXPORT ${LITE_EXPORT_TARGETS}
RUNTIME DESTINATION lite/bin)
/**
* \file example/example.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#pragma once
#include <lite_build_config.h>
#include "lite/global.h"
#include "lite/network.h"
#include "lite/tensor.h"
#include "npy.h"
#include <string.h>
#include <memory>
#include <unordered_map>
#include <vector>
namespace lite {
namespace example {
void set_cpu_affinity(const std::vector<int>& cpuset);
struct Args {
int args_parse_ret = 0;
std::string example_name;
std::string model_path;
std::string input_path;
std::string output_path;
std::string loader_path;
static Args from_argv(int argc, char** argv);
};
std::shared_ptr<Tensor> parse_npy(
const std::string& path,
LiteBackend backend = LiteBackend::LITE_DEFAULT);
using ExampleFunc = std::function<bool(const Args&)>;
using ExampleFuncMap = std::unordered_map<std::string, ExampleFunc>;
ExampleFuncMap* get_example_function_map();
bool register_example(std::string example_name, const ExampleFunc& function);
template <int>
struct Register;
#if LITE_BUILD_WITH_MGE
#if LITE_WITH_CUDA
bool load_from_path_run_cuda(const Args& args);
#endif
bool basic_load_from_path(const Args& args);
bool basic_load_from_path_with_loader(const Args& args);
bool basic_load_from_memory(const Args& args);
bool cpu_affinity(const Args& args);
bool network_share_same_weights(const Args& args);
bool reset_input(const Args& args);
bool reset_input_output(const Args& args);
bool config_user_allocator(const Args& args);
bool register_cryption_method(const Args& args);
bool update_cryption_key(const Args& args);
bool async_forward(const Args& args);
#if LITE_WITH_CUDA
bool device_input(const Args& args);
bool device_input_output(const Args& args);
bool pinned_host_input(const Args& args);
#endif
#endif
} // namespace example
} // namespace lite
#if LITE_BUILD_WITH_MGE
bool basic_c_interface(const lite::example::Args& args);
bool device_io_c_interface(const lite::example::Args& args);
bool async_c_interface(const lite::example::Args& args);
#endif
#define CONCAT_IMPL(a, b) a##b
#define MACRO_CONCAT(a, b) CONCAT_IMPL(a, b)
#define REGIST_EXAMPLE(name_, func_) \
REGIST_EXAMPLE_WITH_NUM(__COUNTER__, name_, func_)
#define REGIST_EXAMPLE_WITH_NUM(number_, name_, func_) \
template <> \
struct Register<number_> { \
Register() { register_example(name_, func_); } \
}; \
namespace { \
Register<number_> MACRO_CONCAT(example_function_, number_); \
}
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file example/example.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include "lite/global.h"
#include "lite/network.h"
#include "lite/tensor.h"
#include "example.h"
#include "npy.h"
#include <string.h>
#include <map>
#include <memory>
#include <vector>
using namespace lite;
using namespace example;
Args Args::from_argv(int argc, char** argv) {
Args ret;
if (argc < 4) {
printf("usage: lite_examples <example_name> <model file> <input "
"file> <output file>.\n");
printf("*********The output file is optional.*************\n");
printf("The registered examples include:\n");
size_t index = 0;
for (auto it : *get_example_function_map()) {
printf("%zu : %s\n", index, it.first.c_str());
index++;
}
ret.args_parse_ret = -1;
return ret;
}
ret.example_name = argv[1];
ret.model_path = argv[2];
ret.input_path = argv[3];
if (argc > 4) {
ret.output_path = argv[4];
}
if (argc > 5) {
ret.loader_path = argv[5];
}
return ret;
}
ExampleFuncMap* lite::example::get_example_function_map() {
static ExampleFuncMap static_map;
return &static_map;
}
bool lite::example::register_example(std::string example_name,
const ExampleFunc& function) {
auto map = get_example_function_map();
if (map->find(example_name) != map->end()) {
printf("Error!!! This example is already registered\n");
return false;
}
(*map)[example_name] = function;
return true;
}
std::shared_ptr<Tensor> lite::example::parse_npy(const std::string& path,
LiteBackend backend) {
std::string type_str;
std::vector<npy::ndarray_len_t> stl_shape;
std::vector<int8_t> raw;
npy::LoadArrayFromNumpy(path, type_str, stl_shape, raw);
auto lite_tensor =
std::make_shared<Tensor>(backend, LiteDeviceType::LITE_CPU);
Layout layout;
layout.ndim = stl_shape.size();
const std::map<std::string, LiteDataType> type_map = {
{"f4", LiteDataType::LITE_FLOAT},
{"i4", LiteDataType::LITE_INT},
{"i1", LiteDataType::LITE_INT8},
{"u1", LiteDataType::LITE_UINT8}};
layout.shapes[0] = 1;
for (size_t i = 0; i < layout.ndim; i++) {
layout.shapes[i] = static_cast<size_t>(stl_shape[i]);
}
for (auto& item : type_map) {
if (type_str.find(item.first) != std::string::npos) {
layout.data_type = item.second;
break;
}
}
lite_tensor->set_layout(layout);
size_t length = lite_tensor->get_tensor_total_size_in_byte();
void* dest = lite_tensor->get_memory_ptr();
memcpy(dest, raw.data(), length);
//! rknn does not support reshape now
if (layout.ndim == 3) {
lite_tensor->reshape({1, static_cast<int>(layout.shapes[0]),
static_cast<int>(layout.shapes[1]),
static_cast<int>(layout.shapes[2])});
}
return lite_tensor;
}
void lite::example::set_cpu_affinity(const std::vector<int>& cpuset) {
#if defined(__APPLE__) || defined(WIN32)
#pragma message("set_cpu_affinity not enabled on apple and windows platform")
#else
cpu_set_t mask;
CPU_ZERO(&mask);
for (auto i : cpuset) {
CPU_SET(i, &mask);
}
auto err = sched_setaffinity(0, sizeof(mask), &mask);
if (err) {
printf("failed to sched_setaffinity: %s (error ignored)",
strerror(errno));
}
#endif
}
int main(int argc, char** argv) {
set_log_level(LiteLogLevel::WARN);
auto&& args = Args::from_argv(argc, argv);
if (args.args_parse_ret)
return -1;
auto map = get_example_function_map();
auto example = (*map)[args.example_name];
if (example) {
printf("Begin to run %s example.\n", args.example_name.c_str());
return example(args);
} else {
printf("The example of %s is not registed.", args.example_name.c_str());
return -1;
}
}
namespace lite {
namespace example {
#if LITE_BUILD_WITH_MGE
#if LITE_WITH_CUDA
REGIST_EXAMPLE("load_from_path_run_cuda", load_from_path_run_cuda);
#endif
REGIST_EXAMPLE("basic_load_from_path", basic_load_from_path);
REGIST_EXAMPLE("basic_load_from_path_with_loader", basic_load_from_path_with_loader);
REGIST_EXAMPLE("basic_load_from_memory", basic_load_from_memory);
REGIST_EXAMPLE("cpu_affinity", cpu_affinity);
REGIST_EXAMPLE("register_cryption_method", register_cryption_method);
REGIST_EXAMPLE("update_cryption_key", update_cryption_key);
REGIST_EXAMPLE("network_share_same_weights", network_share_same_weights);
REGIST_EXAMPLE("reset_input", reset_input);
REGIST_EXAMPLE("reset_input_output", reset_input_output);
REGIST_EXAMPLE("config_user_allocator", config_user_allocator);
REGIST_EXAMPLE("async_forward", async_forward);
REGIST_EXAMPLE("basic_c_interface", basic_c_interface);
REGIST_EXAMPLE("device_io_c_interface", device_io_c_interface);
REGIST_EXAMPLE("async_c_interface", async_c_interface);
#if LITE_WITH_CUDA
REGIST_EXAMPLE("device_input", device_input);
REGIST_EXAMPLE("device_input_output", device_input_output);
REGIST_EXAMPLE("pinned_host_input", pinned_host_input);
#endif
#endif
} // namespace example
} // namespace lite
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
# Example
This example directory implements a series of examples that call the lite interfaces to run inference; they mainly demonstrate how the different lite interfaces are used to run inference in different situations. All of the examples here use shufflenet for demonstration.
## Building and running the examples with bazel
* Follow the README.md in the top-level directory to set up the megvii3 bazel build environment, then build the CPU version:
```bash
./bazel build //brain/megbrain/lite:lite_examples --cpu="k8" \
--compiler="gcc9" -c opt
```
* At run time, specify the name of the example to run, the model to run, and the input data for the model.
* List all the example names:
```
bazel-bin/brain/megbrain/lite/lite_examples
```
* Run an example; the command below runs basic_load_from_memory:
```
bazel-bin/brain/megbrain/lite/lite_examples \
basic_load_from_memory \
path-to-megbrain/lite/test/resource/lite/shufflenet.mge \
path-to-megbrain/lite/test/resource/lite/input_data.npy
```
## Basic usage
* **Implemented in basic.cpp, including basic_load_from_path and
basic_load_from_memory.**
* These examples use lite to run basic inference. The model is loaded with the default configuration, the input data is copied into the input tensor before forward, and after forward the result is copied from the output tensor into the user's memory. Both the input tensor and the output tensor are obtained from the Network by name, and the layout of the input and output tensors can also be read directly from the corresponding tensor. **The layout of the output tensor is only correct when read after forward has finished.** A minimal sketch of this flow is given below.
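A minimal sketch of this basic flow, assuming a MegEngine-dumped model with a single input and a single output; the function name and parameters are illustrative and the calls follow basic.cpp:
```cpp
#include <cstdio>
#include <cstring>
#include <memory>
#include <string>
#include "lite/network.h"
#include "lite/tensor.h"

// Basic inference: load a model, copy the input data in, run forward, read the output.
bool run_basic(const std::string& model_path, const void* input, size_t input_bytes) {
    auto network = std::make_shared<lite::Network>();
    network->load_model(model_path);

    // Copy user data into the input tensor obtained from the network.
    auto input_tensor = network->get_input_tensor(0);
    std::memcpy(input_tensor->get_memory_ptr(), input, input_bytes);

    network->forward();
    network->wait();

    // Read the output; its layout is only valid after forward has finished.
    auto output_tensor = network->get_output_tensor(0);
    const float* out = static_cast<const float*>(output_tensor->get_memory_ptr());
    size_t out_elems = output_tensor->get_tensor_total_size_in_byte() /
                       output_tensor->get_layout().get_elem_size();
    printf("first output value: %f (%zu elements)\n", out[0], out_elems);
    return true;
}
```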
## User-specified memory for input and output
* **Implemented in reset_io.cpp, including two examples: reset_input and
reset_input_output.**
* These examples show that the input tensor's memory can be memory specified by the user (with the input data already stored in it), and the output tensor can also be user-specified memory, so that after the Network finishes forward the result is written into the specified output memory. This avoids unnecessary memory copies.
* This is done mainly through the tensor's reset interface, which re-points the tensor to new memory and a corresponding layout; if no layout is given, the tensor's original layout is kept.
* **Because the memory is allocated by the user in this approach, the user must know the layout of the input and output tensors in advance and allocate memory according to it. In addition, the lifetime of memory set into a tensor via reset is not managed by the tensor; it is managed by the external user.** A sketch is given below.
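A minimal sketch of redirecting the input and output tensors to user-owned memory with reset, following reset_io.cpp; the 1x1000 float output layout is taken from the shufflenet example and the function signature is illustrative:
```cpp
#include <memory>
#include <string>
#include <vector>
#include "lite/network.h"
#include "lite/tensor.h"

// Run inference reading from and writing into user-owned buffers, avoiding copies.
bool run_with_user_buffers(const std::string& model_path, void* input_data,
                           std::vector<float>& output_buffer) {
    auto network = std::make_shared<lite::Network>();
    network->load_model(model_path);

    // Point the input tensor at the user's buffer; the tensor's own layout is kept.
    auto input_tensor = network->get_input_tensor(0);
    input_tensor->reset(input_data, input_tensor->get_layout());

    // Let the network write the result directly into caller-owned memory.
    output_buffer.resize(1000);
    auto output_tensor = network->get_output_tensor(0);
    output_tensor->reset(output_buffer.data(),
                         lite::Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT});

    network->forward();
    network->wait();
    // output_buffer now holds the result; its lifetime is managed by the caller, not the tensor.
    return true;
}
```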
## Device memory for input and output
* **Implemented in device_io.cpp: the device_input and device_input_output examples.**
* These examples configure the model to run on a device (CUDA) and use device memory allocated in advance by the user as the model's input and output. Whether the input and output live on the device must be specified when the Network is constructed; by default they are on the CPU. Everything else is the same as **user-specified memory for input and output** above.
* The tensor's is_host() interface tells whether a tensor is on the device side or the host side. A sketch is given below.
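A minimal sketch of feeding the network from device memory, following the device_input example; the input name "data" matches the shufflenet model used by the examples and the function signature is illustrative:
```cpp
#include <memory>
#include <string>
#include "lite/network.h"
#include "lite/tensor.h"

// Run a CUDA network whose input tensor is backed by device memory.
bool run_with_device_input(const std::string& model_path,
                           const std::shared_ptr<lite::Tensor>& host_input) {
    // Run the model on CUDA and declare the input "data" as device-side IO.
    lite::Config config{LiteDeviceType::LITE_CUDA};
    lite::NetworkIO network_io;
    bool is_host = false;
    lite::IO device_input{"data", is_host};
    network_io.inputs.push_back(device_input);

    auto network = std::make_shared<lite::Network>(config, network_io);
    network->load_model(model_path);

    // Allocate device memory, copy the host data into it, and hand it to the input tensor.
    auto input_tensor = network->get_input_tensor(0);
    lite::Layout input_layout = input_tensor->get_layout();
    auto tensor_device = lite::Tensor(LiteDeviceType::LITE_CUDA, input_layout);
    tensor_device.copy_from(*host_input);
    input_tensor->reset(tensor_device.get_memory_ptr(), input_layout);

    network->forward();
    network->wait();
    return true;
}
```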
## Pinned host memory as input
* **Implemented in device_io.cpp, function name pinned_host_input.**
* In this example the model runs on a device (CUDA) while the input and output are on the CPU. To speed up the host2device copy, the memory of the input tensor on the CPU is allocated in advance as cuda pinned memory. Currently, when the output tensor is not on the device, it is pinned host memory by default.
* Pinned host memory is allocated by specifying the device, the layout and the is_pinned_host parameter when constructing the tensor; memory allocated this way is pinned host memory.
```C
bool is_pinned_host = true;
auto tensor_pinned_input =
Tensor(LiteDeviceType::LITE_CUDA, input_layout, is_pinned_host);
```
## User-specified memory allocator
* **Implemented in user_allocator.cpp, function name config_user_allocator.**
* This example uses a user-defined CPU memory allocator to show how to set a custom Allocator. A user-defined allocator must inherit from the Allocator base class in lite and implement the allocate and free interfaces. This has currently been verified to be correct on CPU; other devices remain to be tested. A sketch of such an allocator follows the interface below.
* The interface for setting a custom memory allocator is the following Network interface:
```C
Network& set_memory_allocator(std::shared_ptr<Allocator> user_allocator);
```
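A minimal sketch of a custom allocator, modeled on the CheckAllocator in user_allocator.cpp; only the POSIX allocation path is shown and the includes follow what example.h pulls in:
```cpp
#include <cstdio>
#include <cstdlib>
#include <memory>
#include "lite/network.h"
#include "lite/tensor.h"

// An aligned CPU allocator that lite will call for tensor allocations.
class AlignedAllocator : public lite::Allocator {
public:
    void* allocate(LiteDeviceType, int, size_t size, size_t align) override {
        void* ptr = nullptr;
        if (posix_memalign(&ptr, align, size) != 0) {
            printf("failed to allocate %zu bytes with align %zu\n", size, align);
            return nullptr;
        }
        return ptr;
    }
    void free(LiteDeviceType, int, void* ptr) override { ::free(ptr); }
};

// Usage (as in config_user_allocator): install the allocator before loading the model.
// auto network = std::make_shared<lite::Network>();
// lite::Runtime::set_memory_allocator(network, std::make_shared<AlignedAllocator>());
// network->load_model(model_path);
```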
## Multiple Networks sharing the same weights
* **Implemented in network_share_weights.cpp, function name network_share_same_weights.**
* In many cases users want several Networks to share the same weights. Since the weights in a model are read-only, this reduces the runtime memory usage of the model. This example shows how lite implements it: a new Network is created first, and the user can specify a new Config and NetworkIO and other options so that the newly created Network serves a different purpose.
* The interface for loading a new Network from an existing Network is the following Network interface:
```C
static void shared_weight_with_network(
std::shared_ptr<Network> dst_network,
const std::shared_ptr<Network> src_network);
```
* dst_network: the newly loaded Network
* src_network: the old Network that has already been loaded. A usage sketch is given below.
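A minimal sketch of creating a second network that shares the weights of an already-loaded one, following network_share_weights.cpp; the const_shape option is taken from that example and the function name is illustrative:
```cpp
#include <memory>
#include "lite/network.h"

// Build a new network that reuses the read-only weights of loaded_network.
std::shared_ptr<lite::Network> make_weight_sharing_network(
        const std::shared_ptr<lite::Network>& loaded_network) {
    lite::Config config_new;
    config_new.options.const_shape = true;
    lite::NetworkIO network_io_new;
    auto shared_network =
            std::make_shared<lite::Network>(config_new, network_io_new);
    // Share the weights instead of loading the model file again.
    lite::Runtime::shared_weight_with_network(shared_network, loaded_network);
    return shared_network;
}
```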
## CPU core binding
* **Implemented in cpu_affinity.cpp, function name cpu_affinity.**
* This example configures the model to run on multiple CPU threads and then uses the Network's set_runtime_thread_affinity to set a core-binding callback. The callback receives the id of the current thread, and the user decides the concrete binding behavior based on that id; with multiple threads, if the total number of threads is n, the thread with id n-1 is the main thread. A sketch is given below.
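A minimal sketch of binding worker threads to cores, following cpu_affinity.cpp; the thread count of 4 and the core ids are assumptions, and set_cpu_affinity is the helper defined in the examples' example.h:
```cpp
#include <memory>
#include <string>
#include <vector>
#include "../example.h"  // for lite::example::set_cpu_affinity

// Load a model that runs on 4 CPU threads and pin thread i to core_ids[i].
std::shared_ptr<lite::Network> load_with_affinity(const std::string& model_path) {
    auto network = std::make_shared<lite::Network>();
    // As in the cpu_affinity example, the thread number is set before loading.
    lite::Runtime::set_cpu_threads_number(network, 4);
    network->load_model(model_path);

    std::vector<int> core_ids = {0, 1, 2, 3};
    auto affinity = [core_ids](int thread_id) {
        // Called on each worker thread; with n threads, id n-1 is the main thread.
        lite::example::set_cpu_affinity({core_ids[thread_id]});
    };
    lite::Runtime::set_runtime_thread_affinity(network, affinity);
    return network;
}
```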
## Registering a user-defined decryption algorithm and key
* **Implemented in user_cryption.cpp, function names register_cryption_method and update_cryption_key.**
* These two examples use lite's interfaces for custom decryption algorithms and for updating them, loading a model with a user-defined decryption algorithm. A custom decryption method is defined (it does not actually do anything: it XORs the model with the key twice and returns it, which is the original model unchanged) and registered with lite; when the Network is later created, the name of the decryption algorithm is specified in bare_model_cryption_name of its config. The second example shows how to update the key of an algorithm.
lite currently predefines several decryption algorithms:
* AES_default: its key consists of 32 unsigned chars, 0 to 31 by default
* RC4_default: its key consists of 8 unsigned chars made up of a hash key and an enc_key, with the hash key first and the enc_key after it.
* SIMPLE_FAST_RC4_default: its key has the same composition as RC4_default.
The rough naming rule is: the upper-case part at the front is the name of the algorithm, and the lower-case part after '_' denotes the decryption key.
The concrete interfaces are:
```C
bool register_decryption_and_key(std::string decrypt_name,
const DecryptionFunc& func,
const std::vector<uint8_t>& key);
bool update_decryption_or_key(std::string decrypt_name,
const DecryptionFunc& func,
const std::vector<uint8_t>& key);
```
In the register interface all three arguments must be valid values; in update, decrypt_name must be an existing decryption algorithm, and the non-empty parts of func and key are used to update that algorithm. A usage sketch is given below.
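A minimal sketch of registering a custom decryption function and loading a model with it, following user_cryption.cpp; the algorithm name "just_for_test" and the single-byte key {15} are taken from that example:
```cpp
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
#include "lite/global.h"
#include "lite/network.h"

// A do-nothing "decryption": XORing with the key twice returns the model unchanged.
std::vector<uint8_t> decrypt_model(const void* model_mem, size_t size,
                                   const std::vector<uint8_t>& key) {
    const uint8_t* ptr = static_cast<const uint8_t*>(model_mem);
    std::vector<uint8_t> ret(size, 0);
    for (size_t i = 0; i < size; i++) {
        ret[i] = ptr[i] ^ key[0] ^ key[0];
    }
    return ret;
}

bool load_with_custom_cryption(const std::string& model_path) {
    // Register the algorithm under a name, then reference that name in Config.
    lite::register_decryption_and_key("just_for_test", decrypt_model, {15});
    lite::Config config;
    config.bare_model_cryption_name = "just_for_test";
    auto network = std::make_shared<lite::Network>(config);
    network->load_model(model_path);
    return true;
}
```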
## Asynchronous execution mode
* **Implemented in basic.cpp, function name async_forward.**
* By registering an asynchronous callback through the interface, the user switches the Network's forward into asynchronous execution mode. Asynchronous execution is currently only supported on CPU and on CUDA 10.0 and above. In asynchronous inference mode, the main thread can do other work while the worker thread is computing, avoiding long waits; on some single-core processors there is no benefit. A sketch is given below.
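A minimal sketch of running forward asynchronously, following the async_forward example; the busy-wait on a volatile flag mirrors that example and only illustrates the callback (the example also sets options.var_sanity_check_first_run = false in its config before loading):
```cpp
#include <cstdio>
#include <memory>
#include "lite/network.h"

// Run forward asynchronously and poll a flag that the completion callback sets.
bool run_async(const std::shared_ptr<lite::Network>& network) {
    volatile bool finished = false;
    network->set_async_callback([&finished]() {
        // Invoked from the worker thread once forward has completed.
        finished = true;
    });

    network->forward();  // returns immediately in asynchronous mode

    size_t spins = 0;
    while (!finished) {
        spins++;  // the main thread could do useful work here instead of spinning
    }
    printf("forward finished after %zu polls\n", spins);
    return true;
}
```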
## Pure C example
* **Implemented in lite_c_interface.cpp, function names basic_c_interface,
device_io_c_interface and async_c_interface.**
* Lite wraps its C++ interface and exposes a pure C interface. Users who do not depend on Lite at the source level should integrate through the pure C interface.
* All pure C interfaces return an int; if this int is non-zero, an error has occurred and LITE_get_last_error should be called to get the error message.
* All pure C get functions require the caller to define the corresponding object first and pass a pointer to it into the interface; Lite writes the result to the address pointed to. A sketch is given below.
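A minimal sketch of the C API call pattern and its error convention, following lite_c_interface.cpp; the error-checking macro is adapted from that example (printing instead of throwing) and the input name "data" matches the shufflenet model:
```cpp
#include <cstdio>
#include <cstring>
#include "lite-c/global_c.h"
#include "lite-c/network_c.h"
#include "lite-c/tensor_c.h"

// Every C API call returns an int; non-zero means an error whose message can be
// read with LITE_get_last_error().
#define LITE_CAPI_CHECK(_expr)                                 \
    do {                                                       \
        int _ret = (_expr);                                    \
        if (_ret) {                                            \
            printf("lite error: %s\n", LITE_get_last_error()); \
            return false;                                      \
        }                                                      \
    } while (0)

bool run_c_interface(const char* model_path, const void* input, size_t input_bytes) {
    LiteNetwork c_network;
    LITE_CAPI_CHECK(LITE_make_network(&c_network, *default_config(),
                                      *default_network_io()));
    LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, model_path));

    // Get functions write their result through an out-pointer supplied by the caller.
    LiteTensor c_input_tensor;
    LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, "data", LITE_IO, &c_input_tensor));
    void* dst_ptr;
    LITE_CAPI_CHECK(LITE_get_tensor_memory(c_input_tensor, &dst_ptr));
    memcpy(dst_ptr, input, input_bytes);

    LITE_CAPI_CHECK(LITE_forward(c_network));
    LITE_CAPI_CHECK(LITE_wait(c_network));
    return true;
}
```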
/**
* \file example/basic.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include <thread>
#include "../example.h"
#if LITE_BUILD_WITH_MGE
#include <cstdio>
#include "misc.h"
using namespace lite;
using namespace example;
namespace {
void output_info(std::shared_ptr<Network> network, size_t output_size) {
for (size_t index = 0; index < output_size; index++) {
printf("output[%zu] names %s \n", index,
network->get_all_output_name()[index].c_str());
std::shared_ptr<Tensor> output_tensor =
network->get_output_tensor(index);
size_t ndim = output_tensor->get_layout().ndim;
for (size_t i = 0; i < ndim; i++) {
printf("output[%zu] tensor.shape[%zu] %zu \n", index, i,
output_tensor->get_layout().shapes[i]);
}
}
}
void output_data_info(std::shared_ptr<Network> network, size_t output_size) {
for (size_t index = 0; index < output_size; index++) {
auto output_tensor = network->get_output_tensor(index);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
LiteDataType dtype = output_tensor->get_layout().data_type;
float max = -1000.0f;
float min = 1000.0f;
int max_idx = 0;
int min_idx = 0;
float sum = 0.0f;
#define cb(_dtype, _real_dtype) \
case LiteDataType::_dtype: { \
for (size_t i = 0; i < out_length; i++) { \
_real_dtype data = static_cast<_real_dtype*>(out_data)[i]; \
sum += data; \
if (max < data) { \
max = data; \
max_idx = i; \
} \
if (min > data) { \
min = data; \
min_idx = i; \
} \
} \
} break;
switch (dtype) {
cb(LITE_FLOAT, float);
cb(LITE_INT, int);
cb(LITE_INT8, int8_t);
cb(LITE_UINT8, uint8_t);
default:
printf("unknow datatype");
}
printf("output_length %zu index %zu max=%e , max idx=%d, min=%e , min_idx=%d, sum=%e\n",
out_length, index, max, max_idx, min, min_idx, sum);
}
#undef cb
}
} // namespace
#if LITE_WITH_CUDA
bool lite::example::load_from_path_run_cuda(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
set_log_level(LiteLogLevel::DEBUG);
//! config the network running in CUDA device
lite::Config config{false, -1, LiteDeviceType::LITE_CUDA};
//! set NetworkIO
NetworkIO network_io;
std::string input_name = "img0_comp_fullface";
bool is_host = false;
IO device_input{input_name, is_host};
network_io.inputs.push_back(device_input);
//! create and load the network
std::shared_ptr<Network> network =
std::make_shared<Network>(config, network_io);
network->load_model(network_path);
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
Layout input_layout = input_tensor->get_layout();
//! read data from numpy data file
auto src_tensor = parse_npy(input_path);
//! malloc the device memory
auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);
//! copy to the device memory
tensor_device.copy_from(*src_tensor);
//! Now the device memory is filled with user input data, set it to the
//! input tensor
input_tensor->reset(tensor_device.get_memory_ptr(), input_layout);
//! forward
{
lite::Timer ltimer("warmup");
network->forward();
network->wait();
ltimer.print_used_time(0);
}
lite::Timer ltimer("forward_iter");
for (int i = 0; i < 10; i++) {
ltimer.reset_start();
network->forward();
network->wait();
ltimer.print_used_time(i);
}
//! get the output data or read tensor set in network_in
size_t output_size = network->get_all_output_name().size();
output_info(network, output_size);
output_data_info(network, output_size);
return true;
}
#endif
bool lite::example::basic_load_from_path(const Args& args) {
set_log_level(LiteLogLevel::DEBUG);
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>();
network->load_model(network_path);
//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
auto layout = input_tensor->get_layout();
for (size_t i = 0; i < layout.ndim; i++) {
printf("model input shape[%zu]=%zu \n", i, layout.shapes[i]);
}
//! copy or forward data to network
size_t length = input_tensor->get_tensor_total_size_in_byte();
void* dst_ptr = input_tensor->get_memory_ptr();
auto src_tensor = parse_npy(input_path);
auto layout0 = src_tensor->get_layout();
for (size_t i = 0; i < layout0.ndim; i++) {
printf("src shape[%zu]=%zu \n", i, layout0.shapes[i]);
}
void* src = src_tensor->get_memory_ptr();
memcpy(dst_ptr, src, length);
//! forward
{
lite::Timer ltimer("warmup");
network->forward();
network->wait();
ltimer.print_used_time(0);
}
lite::Timer ltimer("forward_iter");
for (int i = 0; i < 10; i++) {
network->forward();
network->wait();
ltimer.print_used_time(i);
}
//! forward
{
lite::Timer ltimer("warmup");
network->forward();
network->wait();
ltimer.print_used_time(0);
}
for (int i = 0; i < 10; i++) {
ltimer.reset_start();
network->forward();
network->wait();
ltimer.print_used_time(i);
}
//! get the output data or read tensor set in network_in
size_t output_size = network->get_all_output_name().size();
output_info(network, output_size);
output_data_info(network, output_size);
return true;
}
bool lite::example::basic_load_from_path_with_loader(const Args& args) {
set_log_level(LiteLogLevel::DEBUG);
lite::set_loader_lib_path(args.loader_path);
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>();
network->load_model(network_path);
//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
auto input_layout = input_tensor->get_layout();
//! copy or forward data to network
auto src_tensor = parse_npy(input_path);
auto src_layout = src_tensor->get_layout();
if (src_layout.ndim != input_layout.ndim) {
printf("src dim is not equal model input dim\n");
}
//! pay attention the input shape can change
for (size_t i = 0; i < input_layout.ndim; i++) {
if (input_layout.shapes[i] != src_layout.shapes[i]) {
printf("src shape not equal input shape");
}
}
input_tensor->set_layout(src_tensor->get_layout());
//! reset or forward data to network
input_tensor->reset(src_tensor->get_memory_ptr(), src_tensor->get_layout());
//! forward
network->forward();
network->wait();
//! forward
{
lite::Timer ltimer("warmup");
network->forward();
network->wait();
ltimer.print_used_time(0);
}
lite::Timer ltimer("forward_iter");
for (int i = 0; i < 10; i++) {
ltimer.reset_start();
network->forward();
network->wait();
ltimer.print_used_time(i);
}
//! get the output data or read tensor set in network_in
size_t output_size = network->get_all_output_name().size();
output_info(network, output_size);
output_data_info(network, output_size);
return true;
}
bool lite::example::basic_load_from_memory(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>();
FILE* fin = fopen(network_path.c_str(), "rb");
if (!fin) {
printf("failed to open %s.", network_path.c_str());
}
fseek(fin, 0, SEEK_END);
size_t size = ftell(fin);
fseek(fin, 0, SEEK_SET);
void* ptr = malloc(size);
std::shared_ptr<void> buf{ptr, ::free};
auto len = fread(buf.get(), 1, size, fin);
if (len < 1) {
printf("read file failed.\n");
}
fclose(fin);
network->load_model(buf.get(), size);
//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
//! copy or forward data to network
size_t length = input_tensor->get_tensor_total_size_in_byte();
void* dst_ptr = input_tensor->get_memory_ptr();
auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
memcpy(dst_ptr, src, length);
//! forward
network->forward();
network->wait();
//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
printf("length=%zu\n", length);
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
bool lite::example::async_forward(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
Config config;
config.options.var_sanity_check_first_run = false;
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>(config);
network->load_model(network_path);
//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
//! copy or forward data to network
size_t length = input_tensor->get_tensor_total_size_in_byte();
void* dst_ptr = input_tensor->get_memory_ptr();
auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
memcpy(dst_ptr, src, length);
//! set async mode and callback
volatile bool finished = false;
network->set_async_callback([&finished]() {
#if !__DEPLOY_ON_XP_SP2__
std::cout << "worker thread_id:" << std::this_thread::get_id()
<< std::endl;
#endif
finished = true;
});
#if !__DEPLOY_ON_XP_SP2__
std::cout << "out thread_id:" << std::this_thread::get_id() << std::endl;
#endif
//! forward
network->forward();
size_t count = 0;
while (finished == false) {
count++;
}
printf("Forward finish, count is %zu\n", count);
//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
printf("length=%zu\n", length);
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file example/cpu_affinity.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include "../example.h"
#if LITE_BUILD_WITH_MGE
using namespace lite;
using namespace example;
bool lite::example::cpu_affinity(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>();
//! run with multi theads
Runtime::set_cpu_threads_number(network, 4);
network->load_model(network_path);
std::vector<int> core_ids = {0, 1, 2, 3};
auto affinity = [core_ids](int id) {
//! add user define affinity function
set_cpu_affinity({core_ids[id]});
printf("set thread id = %d with the affinity of core %d.\n", id,
core_ids[id]);
};
Runtime::set_runtime_thread_affinity(network, affinity);
//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
//! copy or forward data to network
size_t length = input_tensor->get_tensor_total_size_in_byte();
void* dst_ptr = input_tensor->get_memory_ptr();
auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
memcpy(dst_ptr, src, length);
//! forward
network->forward();
network->wait();
//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
printf("length=%zu\n", length);
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file example/device_io.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include <thread>
#include "../example.h"
#if LITE_BUILD_WITH_MGE
using namespace lite;
using namespace example;
#if LITE_WITH_CUDA
bool lite::example::device_input(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! config the network running in CUDA device
lite::Config config{LiteDeviceType::LITE_CUDA};
//! set NetworkIO
NetworkIO network_io;
std::string input_name = "data";
bool is_host = false;
IO device_input{input_name, is_host};
network_io.inputs.push_back(device_input);
//! create and load the network
std::shared_ptr<Network> network =
std::make_shared<Network>(config, network_io);
network->load_model(network_path);
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
Layout input_layout = input_tensor->get_layout();
//! read data from numpy data file
auto src_tensor = parse_npy(input_path);
//! malloc the device memory
auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);
//! copy to the device memory
tensor_device.copy_from(*src_tensor);
//! Now the device memory if filled with user input data, set it to the
//! input tensor
input_tensor->reset(tensor_device.get_memory_ptr(), input_layout);
//! forward
network->forward();
network->wait();
//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
bool lite::example::device_input_output(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! config the network running in CUDA device
lite::Config config{LiteDeviceType::LITE_CUDA};
//! set NetworkIO include input and output
NetworkIO network_io;
std::string input_name = "data";
std::string output_name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]";
bool is_host = false;
IO device_input{input_name, is_host};
IO device_output{output_name, is_host};
network_io.inputs.push_back(device_input);
network_io.outputs.push_back(device_output);
//! create and load the network
std::shared_ptr<Network> network =
std::make_shared<Network>(config, network_io);
network->load_model(network_path);
std::shared_ptr<Tensor> input_tensor_device = network->get_input_tensor(0);
Layout input_layout = input_tensor_device->get_layout();
//! read data from numpy data file
auto src_tensor = parse_npy(input_path);
//! malloc the device memory
auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);
//! copy to the device memory
tensor_device.copy_from(*src_tensor);
//! Now the device memory is filled with user input data, set it to the
//! input tensor
input_tensor_device->reset(tensor_device.get_memory_ptr(), input_layout);
//! forward
network->forward();
network->wait();
//! output is in device, should copy it to host
std::shared_ptr<Tensor> output_tensor_device =
network->get_io_tensor(output_name);
auto output_tensor = std::make_shared<Tensor>();
output_tensor->copy_from(*output_tensor_device);
//! get the output data or read tensor set in network_in
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
bool lite::example::pinned_host_input(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! config the network running in CUDA device
lite::Config config{LiteDeviceType::LITE_CUDA};
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>(config);
network->load_model(network_path);
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
Layout input_layout = input_tensor->get_layout();
//! read data from numpy data file
auto src_tensor = parse_npy(input_path);
//! malloc the pinned host memory
bool is_pinned_host = true;
auto tensor_pinned_input =
Tensor(LiteDeviceType::LITE_CUDA, input_layout, is_pinned_host);
//! copy to the pinned memory
tensor_pinned_input.copy_from(*src_tensor);
//! set the pinned host memory to the network as input
input_tensor->reset(tensor_pinned_input.get_memory_ptr(), input_layout);
//! forward
network->forward();
network->wait();
//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
#endif
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file example/basic_c_interface.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include "../example.h"
#include "misc.h"
#if LITE_BUILD_WITH_MGE
#include "lite-c/global_c.h"
#include "lite-c/network_c.h"
#include "lite-c/tensor_c.h"
#include <thread>
#define LITE_CAPI_CHECK(_expr) \
do { \
int _ret = (_expr); \
if (_ret) { \
LITE_THROW(LITE_get_last_error()); \
} \
} while (0)
bool basic_c_interface(const lite::example::Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! read input data to lite::tensor
auto src_tensor = lite::example::parse_npy(input_path);
void* src_ptr = src_tensor->get_memory_ptr();
//! create and load the network
LiteNetwork c_network;
LITE_CAPI_CHECK(
LITE_make_network(&c_network, *default_config(), *default_network_io()));
LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, network_path.c_str()));
//! set input data to input tensor
LiteTensor c_input_tensor;
LITE_CAPI_CHECK(
LITE_get_io_tensor(c_network, "data", LITE_IO, &c_input_tensor));
void* dst_ptr;
size_t length_in_byte;
LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_input_tensor,
&length_in_byte));
LITE_CAPI_CHECK(LITE_get_tensor_memory(c_input_tensor, &dst_ptr));
//! copy or forward data to network
memcpy(dst_ptr, src_ptr, length_in_byte);
//! forward
LITE_CAPI_CHECK(LITE_forward(c_network));
LITE_CAPI_CHECK(LITE_wait(c_network));
//! get the output data or read tensor data
const char* output_name;
LiteTensor c_output_tensor;
//! get the first output tensor name
LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name));
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO,
&c_output_tensor));
void* output_ptr;
size_t length_output_in_byte;
LITE_CAPI_CHECK(LITE_get_tensor_memory(c_output_tensor, &output_ptr));
LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_output_tensor,
&length_output_in_byte));
size_t out_length = length_output_in_byte / sizeof(float);
printf("length=%zu\n", out_length);
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(output_ptr)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
bool device_io_c_interface(const lite::example::Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! read input data to lite::tensor
auto src_tensor = lite::example::parse_npy(input_path);
void* src_ptr = src_tensor->get_memory_ptr();
size_t length_read_in = src_tensor->get_tensor_total_size_in_byte();
//! create and load the network
LiteNetwork c_network;
LITE_CAPI_CHECK(
LITE_make_network(&c_network, *default_config(), *default_network_io()));
LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, network_path.c_str()));
//! set input data to input tensor
LiteTensor c_input_tensor;
size_t length_tensor_in;
LITE_CAPI_CHECK(
LITE_get_io_tensor(c_network, "data", LITE_IO, &c_input_tensor));
LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_input_tensor,
&length_tensor_in));
if (length_read_in != length_tensor_in) {
LITE_THROW("The input data size is not match the network input tensro "
"size,\n");
}
LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, src_ptr,
length_tensor_in));
//! reset the output tensor memory with user allocated memory
size_t out_length = 1000;
LiteLayout output_layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT};
std::shared_ptr<float> ptr(new float[out_length],
[](float* ptr) { delete[] ptr; });
const char* output_name;
LiteTensor c_output_tensor;
LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name));
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO,
&c_output_tensor));
LITE_CAPI_CHECK(
LITE_reset_tensor(c_output_tensor, output_layout, ptr.get()));
//! forward
LITE_CAPI_CHECK(LITE_forward(c_network));
LITE_CAPI_CHECK(LITE_wait(c_network));
printf("length=%zu\n", out_length);
float max = -1.0f;
float sum = 0.0f;
void* out_data = ptr.get();
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
namespace {
volatile bool finished = false;
int async_callback(void) {
#if !__DEPLOY_ON_XP_SP2__
std::cout << "worker thread_id:" << std::this_thread::get_id() << std::endl;
#endif
finished = true;
return 0;
}
} // namespace
bool async_c_interface(const lite::example::Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! read input data to lite::tensor
auto src_tensor = lite::example::parse_npy(input_path);
void* src_ptr = src_tensor->get_memory_ptr();
LiteNetwork c_network;
LiteConfig config = *default_config();
config.options.var_sanity_check_first_run = false;
LITE_CAPI_CHECK(LITE_make_network(&c_network, config, *default_network_io()));
LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, network_path.c_str()));
//! set input data to input tensor
LiteTensor c_input_tensor;
size_t length_tensor_in;
LITE_CAPI_CHECK(
LITE_get_io_tensor(c_network, "data", LITE_IO, &c_input_tensor));
LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_input_tensor,
&length_tensor_in));
LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, src_ptr,
length_tensor_in));
#if !__DEPLOY_ON_XP_SP2__
std::cout << "user thread_id:" << std::this_thread::get_id() << std::endl;
#endif
LITE_CAPI_CHECK(LITE_set_async_callback(c_network, async_callback));
//! forward
LITE_CAPI_CHECK(LITE_forward(c_network));
size_t count = 0;
while (finished == false) {
count++;
}
printf("The count is %zu\n", count);
finished = false;
//! get the output data or read tensor data
const char* output_name;
LiteTensor c_output_tensor;
//! get the first output tensor name
LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name));
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO,
&c_output_tensor));
void* output_ptr;
size_t length_output_in_byte;
LITE_CAPI_CHECK(LITE_get_tensor_memory(c_output_tensor, &output_ptr));
LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_output_tensor,
&length_output_in_byte));
size_t out_length = length_output_in_byte / sizeof(float);
printf("length=%zu\n", out_length);
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(output_ptr)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file example/network_share_weights.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include "../example.h"
#if LITE_BUILD_WITH_MGE
using namespace lite;
using namespace example;
bool lite::example::network_share_same_weights(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>();
network->load_model(network_path);
//! load a new network from the created network and share the same weights,
Config config_new;
config_new.options.const_shape = true;
NetworkIO network_io_new;
std::shared_ptr<Network> weight_shared_network =
std::make_shared<Network>(config_new, network_io_new);
Runtime::shared_weight_with_network(weight_shared_network, network);
//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
void* dst_ptr = input_tensor->get_memory_ptr();
std::shared_ptr<Tensor> input_tensor2 =
weight_shared_network->get_input_tensor(0);
void* dst_ptr2 = input_tensor2->get_memory_ptr();
//! copy or forward data to network
size_t length = input_tensor->get_tensor_total_size_in_byte();
auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
memcpy(dst_ptr, src, length);
memcpy(dst_ptr2, src, length);
//! forward
network->forward();
network->wait();
weight_shared_network->forward();
weight_shared_network->wait();
//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
std::shared_ptr<Tensor> output_tensor2 =
weight_shared_network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
void* out_data2 = output_tensor2->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
printf("length=%zu\n", length);
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
float data2 = static_cast<float*>(out_data2)[i];
if (data != data2) {
printf("the result between the origin network and weight share "
"netwrok is different.\n");
}
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file example/reset_io.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include "../example.h"
#if LITE_BUILD_WITH_MGE
using namespace lite;
using namespace example;
bool lite::example::reset_input(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
lite::Config config;
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>(config);
network->load_model(network_path);
//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
auto layout = input_tensor->get_layout();
auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
input_tensor->reset(src, layout);
//! forward
network->forward();
network->wait();
//! 6. get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
bool lite::example::reset_input_output(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
lite::Config config;
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>(config);
network->load_model(network_path);
//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
auto layout = input_tensor->get_layout();
auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
input_tensor->reset(src, layout);
//! set output ptr to store the network output
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
auto result_tensor = std::make_shared<Tensor>(
LiteDeviceType::LITE_CPU,
Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT});
void* out_data = result_tensor->get_memory_ptr();
output_tensor->reset(out_data, result_tensor->get_layout());
network->forward();
network->wait();
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < 1000; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file example/user_allocator.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include "../example.h"
#if LITE_BUILD_WITH_MGE
using namespace lite;
using namespace example;
namespace {
class CheckAllocator : public lite::Allocator {
public:
//! allocate memory of size in the given device with the given align
void* allocate(LiteDeviceType, int, size_t size, size_t align) override {
#ifdef WIN32
return _aligned_malloc(size, align);
#elif defined(__ANDROID__) || defined(ANDROID)
return memalign(align, size);
#else
void* ptr = nullptr;
auto err = posix_memalign(&ptr, align, size);
if (err) {
printf("failed to malloc %zu bytes with align %zu\n", size, align);
}
return ptr;
#endif
};
//! free the memory pointed by ptr in the given device
void free(LiteDeviceType, int, void* ptr) override {
#ifdef WIN32
_aligned_free(ptr);
#else
::free(ptr);
#endif
};
};
} // namespace
bool lite::example::config_user_allocator(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
auto allocator = std::make_shared<CheckAllocator>();
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>();
Runtime::set_memory_allocator(network, allocator);
network->load_model(network_path);
//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
//! copy or forward data to network
size_t length = input_tensor->get_tensor_total_size_in_byte();
void* dst_ptr = input_tensor->get_memory_ptr();
auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
memcpy(dst_ptr, src, length);
//! forward
network->forward();
network->wait();
//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
printf("length=%zu\n", length);
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file example/user_cryption.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include "../example.h"
#if LITE_BUILD_WITH_MGE
using namespace lite;
using namespace example;
namespace {
std::vector<uint8_t> decrypt_model(const void* model_mem, size_t size,
const std::vector<uint8_t>& key) {
if (key.size() == 1) {
std::vector<uint8_t> ret(size, 0);
const uint8_t* ptr = static_cast<const uint8_t*>(model_mem);
uint8_t key_data = key[0];
        for (size_t i = 0; i < size; i++) {
            //! XOR with the same key twice is an identity transform, this fake
            //! decryption only demonstrates how a decryption method is used
            ret[i] = ptr[i] ^ key_data ^ key_data;
        }
return ret;
} else {
printf("the user define decrypt method key length is wrong.\n");
return {};
}
}
} // namespace
bool lite::example::register_cryption_method(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! register the decryption method
register_decryption_and_key("just_for_test", decrypt_model, {15});
lite::Config config;
config.bare_model_cryption_name = "just_for_test";
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>(config);
network->load_model(network_path);
//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
auto layout = input_tensor->get_layout();
auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
input_tensor->reset(src, layout);
//! forward
network->forward();
network->wait();
//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
bool lite::example::update_cryption_key(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! update the decryption method key
std::vector<uint8_t> key(32, 0);
for (size_t i = 0; i < 32; i++) {
key[i] = 31 - i;
}
update_decryption_or_key("AES_default", nullptr, key);
lite::Config config;
config.bare_model_cryption_name = "AES_default";
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>(config);
network->load_model(network_path);
//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
auto layout = input_tensor->get_layout();
auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
input_tensor->reset(src, layout);
//! forward
network->forward();
network->wait();
//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
 * \file include/lite/common_enum_c.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#ifndef LITE_COMMON_ENUM_C_H_
#define LITE_COMMON_ENUM_C_H_
/*!
* \brief The log level.
*/
typedef enum LiteLogLevel {
    DEBUG = 0, /*!< The lowest level and most verbose */
    INFO = 1,  /*!< Print information, warnings and errors */
    WARN = 2,  /*!< Print only warnings and errors */
    ERROR = 3, /*!< Print only errors */
} LiteLogLevel;
typedef enum LiteBackend {
LITE_DEFAULT = 0, //! default backend is mge
} LiteBackend;
typedef enum LiteDeviceType {
LITE_CPU = 0,
LITE_CUDA = 1,
LITE_ATLAS = 3,
LITE_NPU = 4,
    //! when the device information is set in the model, set LITE_DEVICE_DEFAULT
    //! in lite
LITE_DEVICE_DEFAULT = 5,
} LiteDeviceType;
typedef enum LiteDataType {
LITE_FLOAT = 0,
LITE_HALF = 1,
LITE_INT = 2,
LITE_INT16 = 3,
LITE_INT8 = 4,
LITE_UINT8 = 5,
LITE_UINT = 6,
LITE_UINT16 = 7,
LITE_INT64 = 8,
} LiteCDataType;
typedef enum LiteTensorPhase {
//! Tensor maybe input or output
LITE_IO = 0,
//! Tensor is input
LITE_INPUT = 1,
//! Tensor is output
LITE_OUTPUT = 2,
} LiteTensorPhase;
/*!
 * \brief the input and output type, include SHAPE and VALUE
 * sometimes the user only needs the shape of the output tensor
*/
typedef enum LiteIOType {
LITE_IO_VALUE = 0,
LITE_IO_SHAPE = 1,
} LiteIOType;
/*!
 * \brief operation algorithm selection strategy type, some operations have
 * multiple algorithms, different algorithms have different attributes;
 * according to the strategy, the best algorithm will be selected.
 *
 * Note: These strategies can be combined
 *
 * 1. LITE_ALGO_HEURISTIC | LITE_ALGO_PROFILE means: if the profile cache is not
 * valid, use heuristic instead
 *
 * 2. LITE_ALGO_HEURISTIC | LITE_ALGO_REPRODUCIBLE means: heuristically choose a
 * reproducible algorithm
 *
 * 3. LITE_ALGO_PROFILE | LITE_ALGO_REPRODUCIBLE means: profile the best
 * algorithm from the reproducible algorithms set
 *
 * 4. LITE_ALGO_PROFILE | LITE_ALGO_OPTIMIZED means: profile the best
 * algorithm from the optimized algorithms, thus profiling will run faster
 *
 * 5. LITE_ALGO_PROFILE | LITE_ALGO_OPTIMIZED | LITE_ALGO_REPRODUCIBLE means:
 * profile the best algorithm from the optimized and reproducible algorithms
*/
typedef enum LiteAlgoSelectStrategy {
LITE_ALGO_HEURISTIC = 1 << 0,
LITE_ALGO_PROFILE = 1 << 1,
LITE_ALGO_REPRODUCIBLE = 1 << 2,
LITE_ALGO_OPTIMIZED = 1 << 3,
} LiteAlgoSelectStrategy;
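/*!
 * A minimal sketch of combining the strategy flags (the consumer, e.g.
 * Runtime::set_network_algo_policy in the C++ API, is assumed here for
 * illustration):
 * \code
 *     LiteAlgoSelectStrategy strategy = (LiteAlgoSelectStrategy)(
 *             LITE_ALGO_PROFILE | LITE_ALGO_REPRODUCIBLE);
 *     //! profile only among the reproducible algorithms
 * \endcode
 */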
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
 * \file include/lite/global.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#pragma once
#include "macro.h"
#include "network.h"
#include <functional>
#include <memory>
#include <vector>
namespace lite {
/**
* \brief Model decryption function
*
 * \param[in] const void* is the encrypted model memory pointer
 * \param[in] size_t the size of the encrypted model memory in bytes
* \param[in] const std::vector<uint8_t>& the decryption key vector
*/
using DecryptionFunc = std::function<std::vector<uint8_t>(
const void*, size_t, const std::vector<uint8_t>&)>;
/**
* \brief register a custom decryption method and key to lite.
*
* \param[in] decrypt_name the name of the decryption, which will act as the
* hash key to find the decryption method.
*
* \param[in] func the decryption function, which will decrypt the model with
* the registered key, return a vector that contain the decrypted model.
*
* \param[in] key the decryption key of the method
*/
LITE_API bool register_decryption_and_key(std::string decrypt_name,
const DecryptionFunc& func,
const std::vector<uint8_t>& key);
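/*!
 * A minimal usage sketch; the method name "my_xor" and the single-byte key
 * are illustrative assumptions:
 * \code
 *     DecryptionFunc func = [](const void* mem, size_t size,
 *                              const std::vector<uint8_t>& key) {
 *         std::vector<uint8_t> ret(size);
 *         const uint8_t* ptr = static_cast<const uint8_t*>(mem);
 *         for (size_t i = 0; i < size; ++i)
 *             ret[i] = ptr[i] ^ key[0];
 *         return ret;
 *     };
 *     register_decryption_and_key("my_xor", func, {0x5a});
 * \endcode
 */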
/**
* \brief update decryption function or key of a custom decryption method.
*
* \param[in] decrypt_name the name of the decryption, which will act as the
* hash key to find the decryption method.
*
* \param[in] func the decryption function, which will decrypt the model with
* the registered key, return a vector that contain the decrypted model. if
* function is nullptr, it will not be updated.
*
* \param[in] key the decryption key of the method, if the size of key is zero,
* it will not be updated
*/
LITE_API bool update_decryption_or_key(std::string decrypt_name,
const DecryptionFunc& func,
const std::vector<uint8_t>& key);
/**
* \brief Model information parse function
*
* \param[in] const void* is the information memory
 * \param[in] size_t the size of the information memory
 * \param[in] const std::string the model name used to check whether the
 * information matches the model
 * \param[in] Config the model config, ParseInfoFunc can fill it with the
 * information in json, the config will influence Network loading later
 * \param[in] NetworkIO the model IO, ParseInfoFunc can fill it with the
 * information in json, the networkio will influence Network forwarding later
 * \param[in] std::unordered_map<std::string, LiteAny>& isolated_config_map, the
 * other config not included in config and networkIO, ParseInfoFunc can fill it
* with the information in json, now support:
* "device_id" : int, default 0
* "number_threads" : size_t, default 1
* "is_inplace_model" : bool, default false
* "use_tensorrt" : bool, default false
*/
using ParseInfoFunc = std::function<bool(
const void*, size_t, const std::string model_name, Config& config,
NetworkIO& network_io,
std::unordered_map<std::string, LiteAny>& isolated_config_map,
std::string& extra_info)>;
/**
* \brief register a custom parser function to lite.
*
* \param[in] info_type the name of the parser function, which will act as the
* hash key to find the parser method.
*
* \param[in] parse_func the parser function, which will parse the given
* information and modify the Network Config and IO.
*
*/
LITE_API bool register_parse_info_func(std::string info_type,
const ParseInfoFunc& parse_func);
/*! \brief Get version
*/
LITE_API void get_version(int& major, int& minor, int& patch);
/*! \brief Set the current log level.
* \param[in] level The new log level
*/
LITE_API void set_log_level(LiteLogLevel level);
/*! \brief Get the current log level.
* \return The current log level
*/
LITE_API LiteLogLevel get_log_level();
/*! \brief Get device count
* \param[in] device_type device type
* \return the device count
*/
LITE_API size_t get_device_count(LiteDeviceType device_type);
/*! \brief try to coalesce all free memory in megengine
*/
LITE_API void try_coalesce_all_free_memory();
/*!
* \brief Set the loader to the lite
 * \param loader_path is the file path of the loader library
*/
LITE_API void set_loader_lib_path(const std::string& loader_path);
/*!
* \brief Set the algo policy cache file for CPU/CUDA ...
 * \param cache_path is the file path which stores the cache
 * \param always_sync whether to sync the cache every time the model runs
*/
LITE_API void set_persistent_cache(const std::string& cache_path,
bool always_sync = false);
/*!
 * \brief dump the PersistentCache policy cache to file, if the network is set
 * to profile when forwarding, the algo policy will be dumped to this file
*/
LITE_API void dump_persistent_cache(const std::string& cache_path);
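/*!
 * A minimal sketch of the persistent cache workflow; the cache path is an
 * illustrative assumption:
 * \code
 *     set_persistent_cache("./algo_cache.bin");
 *     //! ... load the network, enable the profiling strategy and forward ...
 *     dump_persistent_cache("./algo_cache.bin");
 * \endcode
 */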
/*!
* \brief Set the TensorRT engine cache path for serialized prebuilt ICudaEngine
*/
LITE_API void set_tensor_rt_cache(std::string tensorrt_cache_path);
/*!
* \brief dump the TensorRT cache to the file set in set_tensor_rt_cache
*/
LITE_API void dump_tensor_rt_cache();
} // namespace lite
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file include/lite/macro.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#ifndef LITE_MACRO_H_
#define LITE_MACRO_H_
#if defined(_WIN32)
#define LITE_API __declspec(dllexport)
#else
#define LITE_API __attribute__((visibility("default")))
#endif
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
 * \file include/lite/network.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#pragma once
#include "macro.h"
#include "tensor.h"
#include <functional>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
namespace lite {
LITE_API inline LiteAlgoSelectStrategy operator|(LiteAlgoSelectStrategy x,
LiteAlgoSelectStrategy y) {
return static_cast<LiteAlgoSelectStrategy>(static_cast<uint32_t>(x) |
static_cast<uint32_t>(y));
}
/*!
 * \brief the inference options which will be translated to megengine
 *
 * \param weight_preprocess is the option which optimizes the inference
 * performance by preprocessing the const weights
 *
 * \param fuse_preprocess fuse preprocess pattern, like astype + pad_channel +
* dimshuffle
*
* \param fake_next_exec whether only to perform non-computing tasks (like
* memory allocation and queue initialization) for next exec. This would be
* reset to false when the graph is executed.
*
* \param var_sanity_check_first_run Disable var sanity check on the first run.
* Var sanity check is enabled on the first-time execution by default, and can
* be used to find some potential memory access errors in the operator
* implementation.
*
* \param const_shape This can be used to reduce memory usage since some
* static inference data structures can be omitted.
*
* \param force_dynamic_alloc force dynamic memory alloc for all vars
*
* \param force_output_dynamic_alloc force dynamic memory alloc for output vars
* which are used as CallbackCaller input when call compile() function
*
* \param no_profiling_on_shape_change do not re-profile to select best impl
* algo when input shape changes (use previous algo)
*
* \param jit_level Execute supported operators with JIT (support MLIR,
* NVRTC). Can only be used on Nvidia GPUs, this value indicates JIT level:
* 1 for basic elemwise opr;
* 2 for including reduce operator
*
 * \param record_level flag to optimize the inference performance by recording
 * the kernel tasks in the first run, afterwards the inference only needs to
 * execute the recorded tasks.
* level = 0 means the normal inference,
* level = 1 means use record inference,
* level = 2 means record inference with free the extra memory
*
* \param graph_opt_level optimization level:
* 0: disable
* 1: level-1: inplace arith transformations during graph
* construction
* 2: level-2: level-1, plus global optimization before graph
* compiling
* 3: also enable JIT
* <0: corresponding level, with result check for debug
*
* \param async_exec_level exec: dispatch on separate threads for different
* comp_node.
* 0: do not perform async dispatch
* 1: dispatch async if there are more than one comp node with limited queue
* mask 0b10: async if there are multiple comp nodes with
* mask 0b100: always async
*/
struct LITE_API Options {
bool weight_preprocess = false;
bool fuse_preprocess = false;
bool fake_next_exec = false;
bool var_sanity_check_first_run = true;
bool const_shape = false;
bool force_dynamic_alloc = false;
bool force_output_dynamic_alloc = false;
bool no_profiling_on_shape_change = false;
uint8_t jit_level = 0;
uint8_t comp_node_seq_record_level = 0;
uint8_t graph_opt_level = 2;
uint16_t async_exec_level = 1;
//! layout transform options
bool enable_nchw44 = false;
bool enable_nchw44_dot = false;
bool enable_nchw88 = false;
bool enable_nhwcd4 = false;
bool enable_nchw4 = false;
bool enable_nchw32 = false;
bool enable_nchw64 = false;
};
/*!
 * \brief Configuration when loading and compiling the graph
 *
 * \param bare_model_cryption_name is the bare model cryption method name, a
 * bare model does not pack json info inside
 *
 * \param has_compression flag whether the model is compressed, the compression
 * method will be read from the model
*/
struct LITE_API Config {
bool has_compression = false;
int device_id = 0;
LiteDeviceType device_type = LiteDeviceType::LITE_CPU;
LiteBackend backend = LiteBackend::LITE_DEFAULT;
std::string bare_model_cryption_name = {};
Options options = {};
};
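/*!
 * A minimal sketch of building a Config; the chosen field values are
 * illustrative assumptions:
 * \code
 *     Config config;
 *     config.device_type = LiteDeviceType::LITE_CPU;
 *     config.options.weight_preprocess = true;
 *     config.options.comp_node_seq_record_level = 1;
 *     std::shared_ptr<Network> network = std::make_shared<Network>(config);
 * \endcode
 */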
/*!
* \brief config the network input and output item
*
*/
struct LITE_API IO {
//! the tensor name in the graph corresponding to the IO
std::string name;
    //! Used to mark where the input tensor comes from and where the output
    //! copies to, if is_host is true, the input is from host and the output
    //! copies to host, otherwise device. Sometimes the input is from device
    //! and the output does not need copy to host, default is true.
    bool is_host = true;
    //! The IO type, it can be SHAPE or VALUE, when SHAPE is set, the input or
    //! output tensor value is invalid, only shape will be set, default is VALUE
    LiteIOType io_type = LiteIOType::LITE_IO_VALUE;
    //! The layout of the config from user, if another layout is set before
    //! forward or gotten after forward by input tensor reset, this layout will
    //! be bypassed. If no other layout is set before forward, this layout will
    //! work. If this layout is not set, the model will forward with its origin
    //! layout. For output, it will be used to check.
Layout config_layout = {};
};
/*!
* \brief the input and output information when load the network
* the NetworkIO will remain in the network until the network is destroyed
*/
struct LITE_API NetworkIO {
std::vector<IO> inputs = {};
std::vector<IO> outputs = {};
};
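/*!
 * A minimal sketch of configuring NetworkIO; the tensor name "data" is an
 * illustrative assumption:
 * \code
 *     NetworkIO io;
 *     IO input;
 *     input.name = "data";
 *     input.is_host = true;
 *     io.inputs.push_back(input);
 *     std::shared_ptr<Network> network =
 *             std::make_shared<Network>(Config{}, io);
 * \endcode
 */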
/*!
* \brief A user-implemented allocator interface
*/
class LITE_API Allocator {
public:
virtual ~Allocator() = default;
//! allocate memory of size in the given device with the given align
virtual void* allocate(LiteDeviceType device_type, int device_id,
size_t size, size_t align) = 0;
//! free the memory pointed by ptr in the given device
virtual void free(LiteDeviceType device_type, int device_id, void* ptr) = 0;
};
/*!
 * \brief the thread affinity callback type
 * \param thread_id thread_id is a number from 0 to (nr_threads - 1),
* thread_id of (nr_threads - 1) is the main worker thread.
*/
using ThreadAffinityCallback = std::function<void(int thread_id)>;
using AsyncCallback = std::function<void(void)>;
/*!
* \brief the start/finish callback function
 * \param unordered_map map from the io tensor name to a pair consisting of the
 * corresponding user-configured IO and the real input or output tensor.
*/
using StartCallback = std::function<void(
const std::unordered_map<std::string,
std::pair<IO, std::shared_ptr<Tensor>>>&)>;
using FinishCallback = std::function<void(
const std::unordered_map<std::string,
std::pair<IO, std::shared_ptr<Tensor>>>&)>;
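/*!
 * A minimal sketch of a start callback; the body only prints the input
 * tensor names and is an illustrative assumption:
 * \code
 *     StartCallback start_cb =
 *             [](const std::unordered_map<
 *                     std::string,
 *                     std::pair<IO, std::shared_ptr<Tensor>>>& inputs) {
 *                 for (auto&& item : inputs)
 *                     printf("input tensor name: %s\n", item.first.c_str());
 *             };
 *     //! network->set_start_callback(start_cb);
 * \endcode
 */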
/*!
 * \brief The network is constructed from a model, it implements model load,
 * init, forward, and displays some model information
*/
class LITE_API Network {
public:
class NetworkImplBase;
~Network();
Network(const Config& config = {}, const NetworkIO& networkio = {});
Network(const NetworkIO& networkio, const Config& config = {});
    //! load the model from memory
void load_model(void* model_mem, size_t size);
//! load the model from a model path
void load_model(std::string model_path);
    //! only compute the output tensors that the user configured
void compute_only_configured_output();
    //! get the network input and output tensor, the layout of which is
    //! synced from the mge tensor, when the names of an input and an output
    //! tensor are the same, use LiteTensorPhase to separate them
std::shared_ptr<Tensor> get_io_tensor(
std::string io_name,
LiteTensorPhase phase = LiteTensorPhase::LITE_IO);
//! get the network input by index
std::shared_ptr<Tensor> get_input_tensor(size_t index);
//! get the network output tensor by index
std::shared_ptr<Tensor> get_output_tensor(size_t index);
//! set the network forward in async mode and set the async callback
//! function
Network& set_async_callback(const AsyncCallback& async_callback);
    //! set the start forward callback function, which will be executed before
//! forward. this can be used to check network input or dump model inputs
//! for debug
Network& set_start_callback(const StartCallback& start_callback);
    //! set the finish forward callback function, which will be executed after
//! forward. this can be used to dump model outputs for debug
Network& set_finish_callback(const FinishCallback& finish_callback);
//! forward the network with filled input data and fill the output data
//! to the output tensor
void forward();
    //! wait until forward finishes in sync mode
void wait();
//! get the input tensor name in the order in load return
std::string get_input_name(size_t index) const;
//! get the output tensor name in the order in load return
std::string get_output_name(size_t index) const;
//! get all the input tensor name in the order in load return
std::vector<std::string> get_all_input_name() const;
//! get all the output tensor name in the order in load return
std::vector<std::string> get_all_output_name() const;
//! set/get device id, default device id = 0
Network& set_device_id(int device_id);
int get_device_id() const;
//! set/get stream id, default stream id = 0
Network& set_stream_id(int stream_id);
int get_stream_id() const;
//! enable profile the network, a file will be generated
void enable_profile_performance(std::string profile_file_path);
//! get model extra info
const std::string& get_model_extra_info();
//! get device type
LiteDeviceType get_device_type() const;
public:
friend class NetworkHelper;
private:
//! update member from implement
void update_from_implement();
//! decrypt and parse the model file
void prase_model(std::shared_ptr<void> model_data, size_t size);
private:
bool m_loaded = false;
Config m_config;
NetworkIO m_network_io;
std::unique_ptr<NetworkImplBase> m_impl;
std::string m_extra_info;
};
/*********************** MGE special network function ***************/
class LITE_API Runtime {
public:
    //! When device is CPU, this interface will set the to-be-loaded model
    //! to run in multi-thread mode with the given thread number.
static void set_cpu_threads_number(std::shared_ptr<Network> dst_network,
size_t nr_threads);
static size_t get_cpu_threads_number(std::shared_ptr<Network> dst_network);
//! set threads affinity callback;
static void set_runtime_thread_affinity(
std::shared_ptr<Network> network,
const ThreadAffinityCallback& thread_affinity_callback);
    //! Set cpu inplace mode when device is CPU, on some low-computation
    //! devices or single-core devices, this mode will get good performance
static void set_cpu_inplace_mode(std::shared_ptr<Network> dst_network);
static bool is_cpu_inplace_mode(std::shared_ptr<Network> dst_network);
//! Set use tensorrt forward
static void use_tensorrt(std::shared_ptr<Network> dst_network);
//! set opr algorithm selection strategy in the network
//! shared_batch_size: the batch size used by fastrun,
//! Non-zero value means that fastrun use this batch size
//! regardless of the batch size of the model. Zero means
//! fastrun use batch size of the model
    //! binary_equal_between_batch: if the content of each input batch is
    //! binary equal, whether the content of each output batch is promised
    //! to be equal
static void set_network_algo_policy(
std::shared_ptr<Network> dst_network,
LiteAlgoSelectStrategy strategy, uint32_t shared_batch_size = 0,
bool binary_equal_between_batch = false);
//! set workspace_limit for oprs with multiple algorithms, set
//! workspace limitation can save memory but may influence the performance
static void set_network_algo_workspace_limit(
std::shared_ptr<Network> dst_network, size_t workspace_limit);
    //! set the network memory allocator, the allocator is defined by the user
static void set_memory_allocator(std::shared_ptr<Network> dst_network,
std::shared_ptr<Allocator> user_allocator);
    //! share the runtime memory with another network, the weights are not shared
static void share_runtime_memory_with(std::shared_ptr<Network> dst_network,
std::shared_ptr<Network> src_network);
//! Dump input/output values of all internal variables to output
//! file, in txt format
static void enable_io_txt_dump(std::shared_ptr<Network> dst_network,
std::string io_txt_out_file);
//! Dump input/output values of all internal variables to output
//! directory, in binary format
static void enable_io_bin_dump(std::shared_ptr<Network> dst_network,
std::string io_bin_out_dir);
//! load a new network which will share weights with src network
static void shared_weight_with_network(
std::shared_ptr<Network> dst_network,
const std::shared_ptr<Network> src_network);
};
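/*!
 * A minimal sketch of typical Runtime configuration before loading; the
 * thread number and model path are illustrative assumptions:
 * \code
 *     std::shared_ptr<Network> network = std::make_shared<Network>();
 *     Runtime::set_cpu_threads_number(network, 4);
 *     Runtime::set_network_algo_policy(
 *             network, LITE_ALGO_PROFILE | LITE_ALGO_REPRODUCIBLE);
 *     network->load_model("./model.lite");
 * \endcode
 */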
} // namespace lite
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
 * \file include/lite/tensor.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#pragma once
#include "common_enum_c.h"
#include "macro.h"
#include <memory>
#include <unordered_map>
#include <vector>
namespace lite {
/*!
* \brief the simple layout description
*/
struct LITE_API Layout {
static constexpr uint32_t MAXDIM = 7;
size_t shapes[MAXDIM];
size_t ndim = 0;
LiteDataType data_type = LiteDataType::LITE_FLOAT;
    //! get the size in bytes of one element of the layout's data type
    size_t get_elem_size() const;
    //! compare whether the two layouts are equal
bool operator==(const Layout& other) const;
};
/*!
 * \brief wrapper of the MegEngine Tensor
 *
 * The memory is not allocated directly, when get_memory_ptr() is called the
 * memory will be allocated in the tensor implementation, and will be deleted
 * automatically
 *
 * Note: if the tensor memory is set through the reset() interface, the memory
 * is managed by the user, it will not be freed by the tensor
 *
 * If the device or layout is not set, when copying from another source tensor,
 * its device and layout will be copied from the source tensor
 *
 * if is_pinned_host is set, the storage memory of the tensor is pinned memory,
 * this is used to optimize the H2D or D2H memory copy, if the device or layout
 * is not set, when copying from another device (CUDA) tensor, this tensor
 * will be automatically set to a pinned tensor
*/
class LITE_API Tensor {
class TensorImpl;
public:
class TensorImplBase;
Tensor();
Tensor(LiteDeviceType device_type, bool is_pinned_host = false);
Tensor(LiteDeviceType device_type, const Layout& layout,
bool is_pinned_host = false);
Tensor(int device_id, LiteDeviceType device_type, const Layout& layout = {},
bool is_pinned_host = false);
Tensor(int device_id, int stream_id, LiteDeviceType device_type,
bool is_pinned_host = false);
Tensor(LiteBackend backend,
LiteDeviceType device_type = LiteDeviceType::LITE_CPU,
int device_id = 0, const Layout& layout = {},
bool is_pinned_host = false);
~Tensor();
LiteDeviceType get_device_type() const { return m_device_type; };
int get_device_id() const { return m_device_id; };
Layout get_layout() const { return m_layout; };
bool is_pinned_host() const { return m_is_pinned_host; };
//! set layout will change the layout and reallocate memory of the tensor
void set_layout(const Layout& layout);
//! which will trigger memory alloc in tensor implement
void* get_memory_ptr() const;
//! get the memory with the offset describe in idx
void* get_memory_ptr(const std::vector<size_t>& idx) const;
//! get the tensor capacity in byte
size_t get_tensor_total_size_in_byte() const;
//! use the user allocated data to reset the memory of the tensor, the
//! memory will not be managed by the lite, later, the user should delete
//! it.
void reset(void* prepared_data, size_t data_length_in_byte);
//! use the user allocated data and corresponding layout to reset the data
//! and layout of the tensor, the memory will not be managed by lite, later,
//! the user should delete it.
void reset(void* prepared_data, const Layout& layout);
//! reshape the tensor with new shape, keep the data_type the same
void reshape(const std::vector<int>& shape);
//! get a new tensor slice from the origin tensor
std::shared_ptr<Tensor> slice(const std::vector<size_t>& start,
const std::vector<size_t>& end,
const std::vector<size_t>& step = {});
//! set the tensor memory with zero
void fill_zero();
    //! copy tensor from another tensor
    //! Note: the best way for tensor copy is to just set the dst device and
    //! leave the layout empty, when copying, the dst layout will be set the
    //! same as src
void copy_from(const Tensor& src);
//! share memory with other tensor
void share_memory_with(const Tensor& src_tensor);
    //! whether the memory of the tensor is contiguous
    bool is_continue_memory() const;
    //! update the members from the implementation
void update_from_implement();
public:
friend class TensorHelper;
private:
std::shared_ptr<TensorImplBase> m_tensor_impl;
//! flag whether the storage of the tensor is pinned, this is only used
//! when the compnode is not in CPU
bool m_is_pinned_host = false;
int m_device_id = 0;
Layout m_layout;
//! the device of the tensor should not be changed after the tensor has
//! constructed
LiteDeviceType m_device_type = LiteDeviceType::LITE_CPU;
};
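/*!
 * A minimal sketch of creating and filling a host tensor; the shape and the
 * fill value are illustrative assumptions:
 * \code
 *     Layout layout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT};
 *     Tensor tensor(LiteDeviceType::LITE_CPU, layout);
 *     float* ptr = static_cast<float*>(tensor.get_memory_ptr());
 *     size_t nr_elem = tensor.get_tensor_total_size_in_byte() / sizeof(float);
 *     for (size_t i = 0; i < nr_elem; ++i)
 *         ptr[i] = 0.f;
 * \endcode
 */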
/**
 * \brief a class that can hold data of any type, but does not check whether
 * the visiting type is valid
*/
class LITE_API LiteAny {
public:
LiteAny() = default;
template <class T>
LiteAny(T value) : m_holder(new AnyHolder<T>(value)) {
m_is_string = std::is_same<std::string, T>();
}
LiteAny(const LiteAny& any) {
m_holder = any.m_holder->clone();
m_is_string = any.is_string();
}
LiteAny& operator=(const LiteAny& any) {
m_holder = any.m_holder->clone();
m_is_string = any.is_string();
return *this;
}
bool is_string() const { return m_is_string; }
class HolderBase {
public:
virtual ~HolderBase() = default;
virtual std::shared_ptr<HolderBase> clone() = 0;
virtual size_t type_length() const = 0;
};
template<class T>
class AnyHolder : public HolderBase {
public:
AnyHolder(const T value) :
m_value(value) {
}
virtual std::shared_ptr<HolderBase> clone() override {
return std::make_shared<AnyHolder>(m_value);
}
virtual size_t type_length() const override { return sizeof(T); }
public:
T m_value;
};
    //! if the type mismatches, it will throw
void type_missmatch(size_t expect, size_t get) const;
//! only check the storage type and the visit type length, so it's not safe
template <class T>
T unsafe_cast() const {
if (sizeof(T) != m_holder->type_length()) {
type_missmatch(m_holder->type_length(), sizeof(T));
}
return static_cast<LiteAny::AnyHolder<T>*>(m_holder.get())->m_value;
}
//! only check the storage type and the visit type length, so it's not safe
void* cast_void_ptr() const {
return &static_cast<LiteAny::AnyHolder<char>*>(m_holder.get())->m_value;
}
private:
std::shared_ptr<HolderBase> m_holder;
bool m_is_string = false;
};
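/*!
 * A minimal sketch of LiteAny usage; the stored value is an illustrative
 * assumption:
 * \code
 *     LiteAny any_threads(static_cast<size_t>(4));
 *     size_t nr_threads = any_threads.unsafe_cast<size_t>();
 * \endcode
 */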
/*********************** special tensor function ***************/
class LITE_API TensorUtils {
public:
    //! concat all the input tensors into one on the specified dim, the result
    //! tensor resides in dst_device_id of dst_device, if dst_device is
    //! LITE_DEVICE_DEFAULT, the device will be taken from the first tensor
static std::shared_ptr<Tensor> concat(
const std::vector<Tensor>& tensors, int dim,
LiteDeviceType dst_device = LiteDeviceType::LITE_DEVICE_DEFAULT,
int dst_device_id = -1);
};
} // namespace lite
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file lite-c/include/lite-c/global-c.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#ifndef LITE_C_GLOBAL_H_
#define LITE_C_GLOBAL_H_
#include "macro.h"
#include "network_c.h"
#ifdef __cplusplus
extern "C" {
#endif
/*! \brief Get version
*/
LITE_API int LITE_get_version(int* major, int* minor, int* patch);
/*! \brief Get the last error message.
* \return the message pointer
*/
LITE_API const char* LITE_get_last_error();
/*! \brief Get device count
* \param[in] device_type device type
* \return the device count
*/
LITE_API int LITE_get_device_count(LiteDeviceType device_type, size_t* count);
/*! \brief try to coalesce all free memory in megengine
*/
LITE_API int LITE_try_coalesce_all_free_memory();
/**
* \brief Model decryption function
*
 * \param[in] input_data is the encrypted model memory pointer
 * \param[in] input_size the size of the encrypted model memory in bytes
 * \param[in] key_data decryption key data
 * \param[in] key_size the size of decryption key data
 * \param[out] output_data the buffer for the decrypted data, if output_data is
 * nullptr, just query the output memory length, else write the decrypted data
 * to output_data
* \return size of decrypted data
*/
typedef size_t (*LiteDecryptionFunc)(const void* input_data, size_t input_size,
const uint8_t* key_data, size_t key_size,
const void* output_data);
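/**
 * A minimal sketch of a LiteDecryptionFunc implementation (single-byte-cycle
 * XOR, an illustrative assumption); note output_data is declared const void*
 * above, so the sketch casts it before writing:
 * \code
 *     size_t example_decrypt(const void* input_data, size_t input_size,
 *                            const uint8_t* key_data, size_t key_size,
 *                            const void* output_data) {
 *         if (output_data) {
 *             uint8_t* out = (uint8_t*)output_data;
 *             const uint8_t* in = (const uint8_t*)input_data;
 *             for (size_t i = 0; i < input_size; ++i)
 *                 out[i] = in[i] ^ key_data[i % key_size];
 *         }
 *         return input_size;
 *     }
 * \endcode
 */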
/**
* \brief Model information parse function
*
* \param[in] info_data is the information memory
 * \param[in] info_size the size of the information memory
 * \param[in] model_name the model name used to check whether the
 * information matches the model
* \param[in] config the model config, ParseInfoFunc can fill it with the
* information in json, the config will influence Network loading later
* \param[in] network_io the model IO, ParseInfoFunc can fill it with the
* information in json, the networkio will influence Network forwarding later
* \param[in] device_id the address to store device_id, default 0
* \param[in] nr_threads the address to store nr_threads, default 1
 * \param[in] is_inplace_model the address to store is_cpu_inplace_mode,
 * default false
 * \param[in] use_tensorrt the address to store use_tensorrt, default false
*/
typedef int (*LiteParseInfoFunc)(const void* info_data, size_t info_size,
const char* model_name, LiteConfig* config,
LiteNetworkIO* network_io, int* device_id,
size_t* nr_threads, int* is_cpu_inplace_mode,
int* use_tensorrt);
/**
* \brief register a custom decryption method and key to lite.
*
* \param[in] decrypt_name the name of the decryption, which will act as the
* hash key to find the decryption method.
*
* \param[in] func the decryption function, which will decrypt the model with
* the registered key, return a vector that contain the decrypted model.
* \param[in] key_data the decryption key of the method
* \param[in] key_size the size of decryption key
*/
LITE_API int LITE_register_decryption_and_key(const char* decrypt_name,
const LiteDecryptionFunc func,
const uint8_t* key_data,
size_t key_size);
/**
* \brief update decryption function or key of a custom decryption method.
*
* \param[in] decrypt_name the name of the decryption, which will act as the
* hash key to find the decryption method.
*
* \param[in] func the decryption function, which will decrypt the model with
* the registered key, return a vector that contain the decrypted model. if
* function is nullptr, it will not be updated.
*
* \param[in] key the decryption key of the method, if the size of key is zero,
* it will not be updated
*/
LITE_API int LITE_update_decryption_or_key(const char* decrypt_name,
const LiteDecryptionFunc func,
const uint8_t* key_data,
size_t key_size);
/**
* \brief register a custom parser function to lite.
*
* \param[in] info_type the name of the parser function, which will act as the
* hash key to find the parser method.
*
* \param[in] parse_func the parser function, which will parse the given
* information and modify the Network Config and IO.
*
*/
LITE_API int LITE_register_parse_info_func(const char* info_type,
const LiteParseInfoFunc parse_func);
/*!
* \brief Set the loader to the lite
 * \param[in] loader_path is the file path of the loader library
*/
LITE_API int LITE_set_loader_lib_path(const char* loader_path);
/*!
* \brief Set the algo policy cache file for CPU/CUDA ...
 * \param[in] cache_path is the file path which stores the cache
 * \param[in] always_sync whether to sync the cache when the cache is updated
*/
LITE_API int LITE_set_persistent_cache(const char* cache_path, int always_sync);
/*!
 * \brief Set the TensorRT engine cache file path
 * \param[in] cache_path is the file path which stores the cache
*/
LITE_API int LITE_set_tensor_rt_cache(const char* cache_path);
/*! \brief Set the current log level.
* \param[in] level The new log level
*/
LITE_API int LITE_set_log_level(LiteLogLevel level);
/*! \brief Get the current log level.
* \param[in] level The pointer to log level
*/
LITE_API int LITE_get_log_level(LiteLogLevel* level);
/*!
 * \brief dump the algo policy cache to file, if the network is set to profile
 * when forwarding, the algo policy will be dumped to this file
* \param[in] cache_path is the file path which store the cache
*/
LITE_API int LITE_dump_persistent_cache(const char* cache_path);
/*!
* \brief dump the tensorrt policy cache to file
*/
LITE_API int LITE_dump_tensor_rt_cache();
#ifdef __cplusplus
}
#endif
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file lite-c/include/lite-c/network_c.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#ifndef LITE_C_NETWORK_H_
#define LITE_C_NETWORK_H_
#include "tensor_c.h"
#ifdef __cplusplus
extern "C" {
#endif
/*!
 * \brief the inference options which will be translated to megengine
 *
 * \param weight_preprocess is the option which optimizes the inference
 * performance by preprocessing the const weights
 *
 * \param fuse_preprocess fuse preprocess pattern, like astype + pad_channel +
* dimshuffle
*
* \param fake_next_exec whether only to perform non-computing tasks (like
* memory allocation and queue initialization) for next exec. This would be
* reset to false when the graph is executed.
*
* \param var_sanity_check_first_run Disable var sanity check on the first run.
* Var sanity check is enabled on the first-time execution by default, and can
* be used to find some potential memory access errors in the operator
* implementation.
*
* \param const_shape This can be used to reduce memory usage since some
* static inference data structures can be omitted.
*
* \param force_dynamic_alloc force dynamic memory alloc for all vars
*
* \param force_output_dynamic_alloc force dynamic memory alloc for output vars
* which are used as CallbackCaller input when call compile() function
*
* \param no_profiling_on_shape_change do not re-profile to select best impl
* algo when input shape changes (use previous algo)
*
* \param jit_level Execute supported operators with JIT (support MLIR,
* NVRTC). Can only be used on Nvidia GPUs, this value indicates JIT level:
* 1 for basic elemwise opr;
* 2 for including reduce operator
*
 * \param record_level flag to optimize the inference performance by recording
 * the kernel tasks in the first run, afterwards the inference only needs to
 * execute the recorded tasks.
* level = 0 means the normal inference,
* level = 1 means use record inference,
* level = 2 means record inference with free the extra memory
*
* \param graph_opt_level optimization level:
* 0: disable
* 1: level-1: inplace arith transformations during graph
* construction
* 2: level-2: level-1, plus global optimization before graph
* compiling
* 3: also enable JIT
* <0: corresponding level, with result check for debug
*
* \param async_exec_level exec: dispatch on separate threads for different
* comp_node.
* 0: do not perform async dispatch
* 1: dispatch async if there are more than one comp node with limited queue
* mask 0b10: async if there are multiple comp nodes with
* mask 0b100: always async
*/
typedef struct Options {
int weight_preprocess;
int fuse_preprocess;
int fake_next_exec;
int var_sanity_check_first_run;
int const_shape;
int force_dynamic_alloc;
int force_output_dynamic_alloc;
int no_profiling_on_shape_change;
int jit_level;
int comp_node_seq_record_level;
int graph_opt_level;
int async_exec_level;
//! layout transform options
int enable_nchw44;
int enable_nchw44_dot;
int enable_nchw88;
int enable_nhwcd4;
int enable_nchw4;
int enable_nchw32;
int enable_nchw64;
} LiteOptions;
//! define a default Options
extern LITE_API const LiteOptions default_option;
/*!
 * \brief Configuration when loading and compiling the graph
 *
 * \param bare_model_cryption_name is the bare model cryption method name, a
 * bare model does not pack json info inside
 *
 * \param has_compression flag whether the model is compressed, the compression
 * method will be read from the model
*/
typedef struct LiteConfig {
int has_compression;
int device_id;
LiteDeviceType device_type;
LiteBackend backend;
const char* bare_model_cryption_name;
LiteOptions options;
} LiteConfig;
//! get default config
LITE_API LiteConfig* default_config();
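/**
 * A minimal sketch of adjusting the default config; the chosen field values
 * are illustrative assumptions:
 * \code
 *     LiteConfig config = *default_config();
 *     config.device_type = LITE_CPU;
 *     config.options.weight_preprocess = 1;
 * \endcode
 */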
/*!
* \brief config the network input and output item
*
*/
typedef struct LiteIO {
//! the tensor name in the graph corresponding to the IO
const char* name;
    //! Used to mark where the input tensor comes from and where the output
    //! copies to, if is_host is true, the input is from host and the output
    //! copies to host, otherwise device. Sometimes the input is from device
    //! and the output does not need copy to host, default is true.
    int is_host;
    //! The IO type, it can be SHAPE or VALUE, when SHAPE is set, the input or
    //! output tensor value is invalid, only shape will be set, default is VALUE
    LiteIOType io_type;
    //! The layout of the config from user, if another layout is set before
    //! forward or gotten after forward, this layout will be bypassed. If no
    //! other layout is set before forward, this layout will work. If this
    //! layout is not set, the model will forward with its origin layout. For
    //! output, it will be used to check.
LiteLayout config_layout;
} LiteIO;
//! define a default IO
extern LITE_API const LiteIO default_io;
/*!
* \brief the input and output information when load the network
* the NetworkIO will remain in the network until the network is destroyed
*/
typedef struct LiteNetworkIO {
LiteIO* inputs;
LiteIO* outputs;
    size_t input_size;   //!< the number of IOs in inputs
    size_t output_size;  //!< the number of IOs in outputs
} LiteNetworkIO;
//! get default NetworkIO
LITE_API LiteNetworkIO* default_network_io();
/*!
* \brief A user-implemented allocator function
*/
//! allocate memory of size in the given device with the given align
typedef void* (*LiteAllocate)(LiteDeviceType device_type, int device_id,
size_t size, size_t align);
//! free the memory pointed by ptr in the given device
typedef void (*LiteFree)(LiteDeviceType device_type, int device_id, void* ptr);
/*!
 * \brief the thread affinity callback type
 * \param thread_id thread_id is a number from 0 to (nr_threads - 1),
* thread_id of (nr_threads - 1) is the main worker thread.
*/
typedef int (*LiteThreadAffinityCallback)(int thread_id);
typedef int (*LiteAsyncCallback)();
/*!
* \brief the start/finish callback function
 * \param unordered_map map from the io tensor name to a pair consisting of the
 * corresponding user-configured IO and the real input or output tensor.
*/
typedef int (*LiteStartCallback)(const LiteIO* inputs,
const LiteTensor* input_tensors, size_t size);
typedef int (*LiteFinishCallback)(const LiteIO* outputs,
const LiteTensor* output_tensors,
size_t size);
/*!
 * \brief The network is constructed from a model, it implements model load,
 * init, forward, and displays some model information
*/
typedef void* LiteNetwork;
/**
* \brief Create a lite Network object with default config and networkIO.
 * \param[out] network The network pointer
* \return int if the return is not zero, error happened, the error message
* can get by LITE_get_last_error
*/
LITE_API int LITE_make_default_network(LiteNetwork* network);
/**
* \brief Create a lite Network object from the given config and networkIO.
 * \param[in] config The configuration to create the network
 * \param[in] network_io The configuration io to create the network
* \param[out] network The network pointer
*/
LITE_API int LITE_make_network(LiteNetwork* network, const LiteConfig config,
const LiteNetworkIO network_io);
/**
* \brief Create a lite Network object from the given config and networkIO.
 * \param[in] config The configuration to create the network
* \param[out] network The network pointer
*/
LITE_API int LITE_make_network_config(LiteNetwork* network, const LiteConfig config);
/**
 * \brief load the model to network from memory
* \param[in] model_mem The model in memory
* \param[in] size The size of the model memory
* \param[out] network The network to be load model in
*/
LITE_API int LITE_load_model_from_mem(LiteNetwork network, void* model_mem,
size_t size);
/**
 * \brief load the model to network from the given path
* \param[in] model_path The model path
* \param[out] network The network to be load model in
*/
LITE_API int LITE_load_model_from_path(LiteNetwork network,
const char* model_path);
/**
* \brief load a new network which will share weights with src network
* \param[in] origin_network The origin network pointer
* \param[out] network The network pointer
*/
LITE_API int LITE_shared_weight_with_network(LiteNetwork dst_network,
const LiteNetwork src_network);
/**
* \brief Destroy a lite network object.
* \param[in] network The network pointer
* \return int if the return is not zero, error happened, the error message
* can get by LITE_get_last_error
*/
LITE_API int LITE_destroy_network(LiteNetwork network);
/**
* \brief forward the network with filled input data and fill the output data
* to the output tensor
* \param[in] network The loaded model
*/
LITE_API int LITE_forward(const LiteNetwork network);
/**
 * \brief wait until forward finishes in sync mode
* \param[in] network The loaded model
*/
LITE_API int LITE_wait(const LiteNetwork network);
/**
 * \brief get the network input and output tensor, the layout of which is
 * read from the model
* \param[in] network The loaded model
* \param[in] io_name The input or output name
* \param[in] phase The tensor phase
* \param[out] tensor The IO tensor get from the network
*/
LITE_API int LITE_get_io_tensor(LiteNetwork network, const char* io_name,
LiteTensorPhase phase, LiteTensor* tensor);
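/**
 * A minimal sketch of the C API inference flow; the model path and the
 * tensor name "data" are illustrative assumptions, and error codes are not
 * checked here:
 * \code
 *     LiteNetwork network;
 *     LITE_make_default_network(&network);
 *     LITE_load_model_from_path(network, "./model.lite");
 *     LiteTensor input;
 *     LITE_get_io_tensor(network, "data", LITE_INPUT, &input);
 *     // ... fill the input tensor memory ...
 *     LITE_forward(network);
 *     LITE_wait(network);
 *     LITE_destroy_network(network);
 * \endcode
 */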
/**
* \brief get the input tensor name in the order in loaded model
* \param[in] network The loaded model
* \param[in] index The index of input tensor
* \param[out] name The input tensor name
*/
LITE_API int LITE_get_input_name(const LiteNetwork network, size_t index,
const char** name);
/**
* \brief get the output tensor name in the order in loaded model
* \param[in] network The loaded model
* \param[in] index The index of output tensor
* \param[out] name The output tensor name
*/
LITE_API int LITE_get_output_name(const LiteNetwork network, size_t index,
const char** name);
/**
* \brief get all the input tensor name in the order in loaded model
* \param[in] network The loaded model
* \param[in] size The number of the input tensor
* \param[out] name The input tensor names
*/
LITE_API int LITE_get_all_input_name(const LiteNetwork network, size_t* size,
const char** name);
/**
* \brief get all the output tensor name in the order in loaded model
* \param[in] network The loaded model
* \param[in] size The number of output tensor
* \param[out] name The output tensor name
*/
LITE_API int LITE_get_all_output_name(const LiteNetwork network, size_t* size,
const char** name);
/**
* \brief get whether the model is running in cpu inplace mode
* \param[in] network The loaded model
* \param[out] is_cpu_inplace_mode whether is in cpu inplace mode
*/
LITE_API int LITE_is_cpu_inplace_mode(const LiteNetwork network,
int* is_cpu_inplace_mode);
/**
* \brief get the number of thread the network will run with
* \param[in] network The loaded model
* \param[out] nr_threads the thread number when the network running
*/
LITE_API int LITE_get_cpu_threads_number(const LiteNetwork network,
size_t* nr_threads);
/**
* \brief get the device id the network will run with
* \param[in] network The loaded model
* \param[out] device_id the device id of the network will run
*/
LITE_API int LITE_get_device_id(const LiteNetwork network, int* device_id);
/**
* \brief get the stream id the network will run with
* \param[in] network The loaded model
* \param[out] stream_id the stream id of the network will run
*/
LITE_API int LITE_get_stream_id(const LiteNetwork network, int* stream_id);
/**
* \brief get the device type the network will run with
* \param[in] network The loaded model
* \param[out] device_type the device type of the network will run
*/
LITE_API int LITE_get_device_type(const LiteNetwork network,
LiteDeviceType* device_type);
/**
* \brief get the device type the network will run with
* \param[in] network The loaded model
* \param[out] info : the json format memory
* \param[out] info_size: the json format memory size
*/
LITE_API int LITE_get_model_extra_info(const LiteNetwork network,
const char** info, int* info_size);
/**
 * \brief Set cpu inplace mode when device is CPU, on some low-computation
 * devices or single-core devices, this mode will get good performance
* \param[in] network The loaded model
*/
LITE_API int LITE_set_cpu_inplace_mode(LiteNetwork network);
/**
 * \brief When device is CPU, this interface will set the to-be-loaded model
 * to run in multi-thread mode with the given thread number.
* \param[in] network The loaded model
* \param[in] nr_threads The threads number
*/
LITE_API int LITE_set_cpu_threads_number(LiteNetwork network,
size_t nr_threads);
/**
* \brief set device id, default device id = 0
* \param[in] network The loaded model
* \param[in] device_id The device id to be set
*/
LITE_API int LITE_set_device_id(LiteNetwork network, int device_id);
/**
* \brief set stream id, default stream id = 0
* \param[in] network The loaded model
* \param[in] stream_id The stream id to be set
*/
LITE_API int LITE_set_stream_id(LiteNetwork network, int stream_id);
/**
* \brief enable tensorrt
* \param[in] network The loaded model
*/
LITE_API int LITE_use_tensorrt(LiteNetwork network);
/**
* \brief set opr algorithm selection strategy in the network
* \param[in] network The loaded model
* \param[in] select_strategy The operator algorithm selection strategy
*/
LITE_API int LITE_set_network_algo_policy(LiteNetwork network,
LiteAlgoSelectStrategy strategy);
/**
* \brief set opr algorithm selection strategy in the network
* \param[in] network The loaded model
* \param[in] shared_batch_size: the batch size used by fastrun,
* Non-zero value means that fastrun use this batch size
* regardless of the batch size of the model. Zero means
* fastrun use batch size of the model
* \param[in] binary_equal_between_batch: if the content of each input batch is
* binary equal,whether the content of each output batch is
* promised to be equal
*/
LITE_API int LITE_set_network_algo_fastrun_config(
LiteNetwork network, unsigned int shared_batch_size,
int binary_equal_between_batch);
/**
* \brief set workspace_limit for oprs with multiple algorithms, set
* workspace limit can save memory but may influence the performance
* \param[in] network The loaded model
* \param[in] workspace_limit The operator algorithm workspace limit
*/
LITE_API int LITE_set_network_algo_workspace_limit(LiteNetwork network,
size_t workspace_limit);
/**
* \brief set the network forward in async mode and set the async callback
* function
* \param[in] network The loaded model
 * \param[in] async_callback when the network finishes forwarding, the callback
* will be called
*/
LITE_API int LITE_set_async_callback(LiteNetwork network,
const LiteAsyncCallback async_callback);
/**
 * \brief set the start forward callback function, which will be executed before
* forward, this can be used to check network input or dump model inputs
* for debug
* \param[in] network The loaded model
 * \param[in] start_callback when the network starts forwarding, the callback
* will be called
*/
LITE_API int LITE_set_start_callback(LiteNetwork network,
const LiteStartCallback start_callback);
/**
 * \brief set the finish forward callback function, which will be executed after
* forward, this can be used to dump model outputs for debug
* \param[in] network The loaded model
 * \param[in] finish_callback when the network finishes forwarding, the callback
* will be called
*/
LITE_API int LITE_set_finish_callback(LiteNetwork network,
const LiteFinishCallback finish_callback);
/**
* \brief set threads affinity callback
* \param[in] network The loaded model
* \param[in] thread_affinity_callback
*/
LITE_API int LITE_set_runtime_thread_affinity(
LiteNetwork network,
const LiteThreadAffinityCallback thread_affinity_callback);
/**
 * \brief set the network memory allocator, the allocator is defined by the user
* \param[in] network The loaded model
* \param[in] allocate_fun The allocate function of the user defined allocator
* \param[in] free_fun The free function of the user defined allocator
*/
LITE_API int LITE_set_memory_allocator(LiteNetwork network,
const LiteAllocate allocate_fun,
const LiteFree free_fun);
/**
* \brief the dst_network share the runtime memory with src_network
* \param[in] src_network The source network
* \param[in] dst_network The dst network to shared memory with src_network
*/
LITE_API int LITE_share_runtime_memroy(LiteNetwork src_network,
LiteNetwork dst_network);
/**
* \brief enable profile the network, a JSON format file will be generated
* \param[in] network The loaded model
* \param[in] profile_json_file_path The profile result file path
*/
LITE_API int LITE_enable_profile_performance(
LiteNetwork network, const char* profile_json_file_path);
/**
* \brief Dump input/output values of all internal variables to output file,
* in text format
* \param[in] network The loaded model
* \param[in] io_txt_out_file The dumped txt file name
*/
LITE_API int LITE_enable_io_txt_dump(LiteNetwork network,
const char* io_txt_out_file);
/**
* \brief Dump input/output values of all internal variables to output
* directory, in binary format
* \param[in] network The loaded model
* \param[in] io_bin_out_dir The dumped bin file directory
*/
LITE_API int LITE_enable_io_bin_dump(LiteNetwork network,
const char* io_bin_out_dir);
#ifdef __cplusplus
}
#endif
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file lite-c/include/lite-c/tensor_c.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#ifndef LITE_TENSOR_C_H_
#define LITE_TENSOR_C_H_
#include "common_enum_c.h"
#include "macro.h"
#ifdef __cplusplus
extern "C" {
#endif
#include "stddef.h"
#include "stdint.h"
#define LAYOUT_MAX_DIM (7)
/*!
* \brief the simple layout description
*/
typedef struct LiteLayout {
size_t shapes[LAYOUT_MAX_DIM];
size_t ndim;
LiteDataType data_type;
} LiteLayout;
//! define a default LiteLayout
extern LITE_API const LiteLayout default_layout;
/*!
 * \brief wrapper of the MegEngine Tensor
 *
 * if is_pinned_host is set, the storage memory of the tensor is pinned memory,
 * this is used to optimize the H2D or D2H memory copy, if the device or layout
 * is not set, when copying from another device (CUDA, OpenCL) tensor, this
 * tensor will be automatically set to a pinned tensor
*/
typedef struct LiteTensorDesc {
//! flag whether the storage of the tensor is pinned, this is only used when
//! the compnode is not in CPU
int is_pinned_host;
//! the layout of the tensor
LiteLayout layout;
//! the device of the tensor should not be changed after the tensor has
//! constructed
LiteDeviceType device_type;
//! device id of the tensor
int device_id;
} LiteTensorDesc;
//! define a default TensorDesc
extern LITE_API const LiteTensorDesc default_desc;
/*!
* \brief The pointer to a Lite Tensor object
*/
typedef void* LiteTensor;
/**
* \brief Create a lite tensor object from the given describe.
* \param[in] tensor_describe The description to create the Tensor
* \param[out] tensor The Tensor pointer
* \return int if the return is not zero, error happened, the error message
* can get by LITE_get_last_error
*/
LITE_API int LITE_make_tensor(const LiteTensorDesc tensor_describe,
LiteTensor* tensor);
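/**
 * A minimal sketch of creating and releasing a tensor; the shape values are
 * illustrative assumptions:
 * \code
 *     LiteTensorDesc desc = default_desc;
 *     desc.layout.ndim = 2;
 *     desc.layout.shapes[0] = 1;
 *     desc.layout.shapes[1] = 1000;
 *     desc.layout.data_type = LITE_FLOAT;
 *     LiteTensor tensor;
 *     LITE_make_tensor(desc, &tensor);
 *     // ... use the tensor ...
 *     LITE_destroy_tensor(tensor);
 * \endcode
 */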
/**
* \brief Destroy a lite tensor object.
* \param[in] tensor The Tensor pointer
* \return int if the return is not zero, error happened, the error message
* can get by LITE_get_last_error
*/
LITE_API int LITE_destroy_tensor(LiteTensor tensor);
/**
* \brief change the layout of a Tensor object.
* \param[in] tensor The Tensor
* \param[out] layout The Layout to be set to a tensor
*/
LITE_API int LITE_set_tensor_layout(LiteTensor tensor, const LiteLayout layout);
/**
* \brief use the user allocated data to reset the memory of the tensor, the
* memory will not be managed by the lite, later, the user should delete
* it.
* \param[in] tensor The Tensor
 * \param[in] prepared_data The allocated memory which satisfies the Tensor
 * layout
 * \param[in] data_length_in_byte The length of the allocated memory
*/
LITE_API int LITE_reset_tensor_memory(LiteTensor tensor, void* prepared_data,
size_t data_length_in_byte);
/**
 * \brief Use the user-allocated data and the corresponding layout to reset
 * the data and layout of the tensor. The memory will not be managed by lite;
 * the user should free it later.
 * \param[in] tensor The Tensor
 * \param[in] layout The Layout to be set to the tensor
 * \param[in] prepared_data The allocated memory which satisfies the layout to
 * be set
*/
LITE_API int LITE_reset_tensor(LiteTensor tensor, const LiteLayout layout,
void* prepared_data);
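/**
 * Usage sketch (illustrative, not part of the original header): hand a
 * caller-owned buffer to an existing tensor. Lite does not take ownership of
 * prepared_data, so the buffer must stay alive while the tensor uses it and
 * must be freed by the caller afterwards. LITE_FLOAT is assumed from
 * common_enum_c.h.
 * \code
 *     LiteLayout layout = {{1, 8}, 2, LITE_FLOAT};
 *     float* buf = (float*)malloc(8 * sizeof(float));
 *     LITE_reset_tensor(tensor, layout, buf);
 *     // ... run the network / read the results ...
 *     free(buf);  // only after the tensor no longer references it
 * \endcode
 */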
/**
 * \brief Reshape a tensor without changing its memory. The total number of
 * elements in the reshaped tensor must equal that of the origin tensor; the
 * input shape may contain at most one -1 to flag that the dimension can be
 * deduced automatically.
 * \param[in] tensor The Tensor to be reshaped
 * \param[in] shape The user input shape
 * \param[in] size The number of dimensions in shape
*/
LITE_API int LITE_tensor_reshape(LiteTensor tensor, const int* shape, int size);
/**
 * \brief Slice a tensor with the input parameters.
 * \param[in] tensor The Tensor to be sliced
 * \param[in] start The start index of every axis to be sliced
 * \param[in] end The end index of every axis to be sliced
 * \param[in] step The step of every axis to be sliced; if nullptr, the step
 * will be 1
 * \param[in] size The number of axes to be sliced
 * \param[out] slice_tensor The result tensor sliced from the origin tensor
*/
LITE_API int LITE_tensor_slice(const LiteTensor tensor, const size_t* start,
const size_t* end, const size_t* step,
size_t size, LiteTensor* slice_tensor);
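/**
 * Usage sketch (illustrative, not part of the original header): reshape a
 * 16-element tensor to 4x4 (the -1 dimension is deduced), then slice the
 * top-left 2x2 block with the default step of 1.
 * \code
 *     int shape[2] = {-1, 4};
 *     LITE_tensor_reshape(tensor, shape, 2);
 *     size_t start[2] = {0, 0};
 *     size_t end[2] = {2, 2};
 *     LiteTensor sliced = NULL;
 *     LITE_tensor_slice(tensor, start, end, NULL, 2, &sliced);
 * \endcode
 */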
/**
 * \brief Fill the tensor with zeros.
 * \param[in] tensor The Tensor to be memset
*/
LITE_API int LITE_tensor_fill_zero(LiteTensor tensor);
/**
 * \brief Copy a tensor from another tensor.
* \param[out] dst_tensor The Tensor to copy into
* \param[in] src_tensor The Tensor to copy from
*/
LITE_API int LITE_tensor_copy(LiteTensor dst_tensor,
const LiteTensor src_tensor);
/**
 * \brief Share memory from another tensor.
* \param[out] dst_tensor The Tensor to share into
* \param[in] src_tensor The Tensor to be shared
*/
LITE_API int LITE_tensor_share_memory_with(LiteTensor dst_tensor,
const LiteTensor src_tensor);
/**
* \brief get the memory pointer of a Tensor object.
* \param[in] tensor The input Tensor
* \param[out] data a pointer to void pointer
*/
LITE_API int LITE_get_tensor_memory(const LiteTensor tensor, void** data);
/**
* \brief get the memory pointer of a Tensor object.
* \param[in] tensor The input Tensor
* \param[in] index The coordinate in the tensor
 * \param[in] size The length of the coordinate
* \param[out] data a pointer to void pointer
*/
LITE_API int LITE_get_tensor_memory_with_index(const LiteTensor tensor,
const size_t* index, size_t size,
void** data);
/**
* \brief get the tensor capacity in byte of a Tensor object.
* \param[in] tensor The input Tensor
 * \param[out] size a pointer to the returned size
*/
LITE_API int LITE_get_tensor_total_size_in_byte(const LiteTensor tensor,
size_t* size);
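/**
 * Usage sketch (illustrative, not part of the original header): read back the
 * contents of a tensor by querying its raw pointer and byte size. For a
 * non-CPU tensor the pointer may refer to device memory, so a copy into a
 * host tensor via LITE_tensor_copy may be needed first; user_buffer below is
 * a caller-provided host buffer.
 * \code
 *     void* ptr = NULL;
 *     size_t nbytes = 0;
 *     LITE_get_tensor_memory(tensor, &ptr);
 *     LITE_get_tensor_total_size_in_byte(tensor, &nbytes);
 *     memcpy(user_buffer, ptr, nbytes);
 * \endcode
 */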
/**
* \brief get the tensor layout of a Tensor object.
* \param[in] tensor The input Tensor
 * \param[out] layout a pointer that will be written with the layout of the
 * tensor
*/
LITE_API int LITE_get_tensor_layout(const LiteTensor tensor,
LiteLayout* layout);
/**
* \brief get the tensor device of a Tensor object.
* \param[in] tensor The input Tensor
 * \param[out] device_type a pointer that will be written with the device type
 * of the tensor
*/
LITE_API int LITE_get_tensor_device_type(const LiteTensor tensor,
LiteDeviceType* device_type);
/**
* \brief get the tensor device id of a Tensor object.
* \param[in] tensor The input Tensor
 * \param[out] device_id a pointer that will be written with the device id of
 * the tensor
*/
LITE_API int LITE_get_tensor_device_id(const LiteTensor tensor, int* device_id);
/**
 * \brief Query whether the tensor is pinned-host.
 * \param[in] tensor The input Tensor
 * \param[out] is_pinned_host an int pointer that will be written with whether
 * the tensor is pinned host
*/
LITE_API int LITE_is_pinned_host(const LiteTensor tensor, int* is_pinned_host);
/**
 * \brief Query whether the tensor memory is contiguous.
 * \param[in] tensor The input Tensor
 * \param[out] is_continue an int pointer that will be written with whether
 * the tensor memory is contiguous
*/
LITE_API int LITE_is_memory_continue(const LiteTensor tensor, int* is_continue);
/**
 * \brief Concat the input tensors into one big tensor.
 * \param[in] tensors Pointer to the input Tensors
 * \param[in] nr_tensor The number of input Tensors
 * \param[in] dim The dim the concat acts on
 * \param[in] dst_device The device type of the result tensor; when
 * LITE_DEVICE_DEFAULT, the result tensor device type is taken from the first
 * tensor
 * \param[in] device_id The device id of the result tensor; when -1, the
 * result tensor device id is taken from the first tensor
 * \param[out] result_tensor The result tensor after concat
*/
LITE_API int LITE_tensor_concat(LiteTensor* tensors, int nr_tensor, int dim,
LiteDeviceType dst_device, int device_id,
LiteTensor* result_tensor);
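/**
 * Usage sketch (illustrative, not part of the original header): concatenate
 * two tensors along dim 0; LITE_DEVICE_DEFAULT and device_id -1 make the
 * result follow the device of the first input.
 * \code
 *     LiteTensor inputs[2] = {tensor_a, tensor_b};
 *     LiteTensor concated = NULL;
 *     LITE_tensor_concat(inputs, 2, 0, LITE_DEVICE_DEFAULT, -1, &concated);
 * \endcode
 */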
#ifdef __cplusplus
}
#endif
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file lite-c/src/common.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#ifndef LITE_C_COMMON_H_
#define LITE_C_COMMON_H_
#include "../src/misc.h"
#include "lite-c/network_c.h"
#include "lite-c/tensor_c.h"
#include "lite/network.h"
#include <exception>
#include <stdexcept>
//! convert c Layout to lite::Layout
lite::Layout convert_to_layout(const LiteLayout& layout);
//! convert lite::Layout to C Layout
LiteLayout convert_to_clayout(const lite::Layout& layout);
//! convert C config to lite::Config
lite::Config convert_to_lite_config(const LiteConfig c_config);
//! convert C NetworkIO io to lite::NetworkIO
lite::NetworkIO convert_to_lite_io(const LiteNetworkIO c_network_io);
/*!
* \brief handle exception
* \param e the exception
* \return the return value of the error
*/
int LiteHandleException(const std::exception& e);
#if LITE_ENABLE_EXCEPTION
/*! \brief macro to guard a function */
#define LITE_CAPI_BEGIN() try {
/*! \brief every function starts with LITE_CAPI_BEGIN();
* ends with LITE_CAPI_END or LITE_CAPI_END_WITH_STMS
*/
#define LITE_CAPI_END() \
} \
catch (std::exception & _except_) { \
return LiteHandleException(_except_); \
} \
return 0;
#else
/*! \brief macro to guard a function */
#define LITE_CAPI_BEGIN() {
/*! \brief every function starts with LITE_CAPI_BEGIN();
* ends with LITE_CAPI_END or LITE_CAPI_END_WITH_STMS
*/
#define LITE_CAPI_END() \
} \
return 0;
#endif
/*!
 * \brief catch the exception and execute the given statements (_stms)
*/
#define LITE_CAPI_END_WITH_STMS(_stms) \
} \
catch (std::exception & _except_) { \
_stms; \
return LiteHandleException(_except_); \
} \
return 0;
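/*!
 * Typical usage (illustrative sketch; LITE_example_query is not a real API):
 * every C entry point opens with LITE_CAPI_BEGIN(), validates its arguments,
 * and closes with LITE_CAPI_END(), so that C++ exceptions are translated into
 * the non-zero-return / LITE_get_last_error convention.
 *
 *     int LITE_example_query(int* out) {
 *         LITE_CAPI_BEGIN();
 *         LITE_ASSERT(out, "The ptr pass to LITE api is null");
 *         *out = 0;
 *         LITE_CAPI_END();
 *     }
 */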
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
 * \file lite-c/src/global.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include "lite/global.h"
#include "common.h"
#include "lite-c/global_c.h"
#include <exception>
#include <mutex>
namespace {
class ErrorMsg {
public:
std::string& get_error_msg() { return error_msg; }
void set_error_msg(const std::string& msg) { error_msg = msg; }
private:
std::string error_msg;
};
ErrorMsg& get_global_error() {
static thread_local ErrorMsg error_msg;
return error_msg;
}
} // namespace
int LiteHandleException(const std::exception& e) {
get_global_error().set_error_msg(e.what());
return -1;
}
const char* LITE_get_last_error() {
return get_global_error().get_error_msg().c_str();
}
int LITE_get_version(int* major, int* minor, int* patch) {
LITE_ASSERT(major && minor && patch, "The ptr pass to LITE api is null");
lite::get_version(*major, *minor, *patch);
return 0;
}
int LITE_get_device_count(LiteDeviceType device_type, size_t* count) {
LITE_CAPI_BEGIN();
LITE_ASSERT(count, "The ptr pass to LITE api is null");
*count = lite::get_device_count(device_type);
LITE_CAPI_END();
}
int LITE_try_coalesce_all_free_memory() {
LITE_CAPI_BEGIN();
lite::try_coalesce_all_free_memory();
LITE_CAPI_END();
}
int LITE_register_decryption_and_key(const char* decrypt_name,
const LiteDecryptionFunc func,
const uint8_t* key_data, size_t key_size) {
LITE_CAPI_BEGIN();
LITE_ASSERT(decrypt_name && key_data && func,
"The ptr pass to LITE api is null");
std::vector<uint8_t> key;
for (size_t i = 0; i < key_size; i++) {
key.push_back(key_data[i]);
}
auto decrypt_func = [func](const void* input_data, size_t input_size,
const std::vector<uint8_t>& key) {
auto size =
func(input_data, input_size, key.data(), key.size(), nullptr);
std::vector<uint8_t> output(size, 0);
func(input_data, input_size, key.data(), key.size(), output.data());
return output;
};
lite::register_decryption_and_key(decrypt_name, decrypt_func, key);
LITE_CAPI_END();
}
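/* Usage sketch (illustrative): a caller-side decryption callback matching the
 * way `func` is invoked above -- when the output pointer is null it only
 * reports the decrypted size. The exact LiteDecryptionFunc typedef lives in
 * the C global header and is assumed here; the XOR "cipher" is purely for
 * illustration.
 *
 *     size_t my_decrypt(const void* input, size_t input_size,
 *                       const uint8_t* key, size_t key_size, void* output) {
 *         if (!output)
 *             return input_size;  // size-query pass
 *         for (size_t i = 0; i < input_size; ++i)
 *             ((uint8_t*)output)[i] =
 *                     ((const uint8_t*)input)[i] ^ key[i % key_size];
 *         return input_size;
 *     }
 *     // LITE_register_decryption_and_key("my_xor", my_decrypt, key, key_len);
 */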
int LITE_update_decryption_or_key(const char* decrypt_name,
const LiteDecryptionFunc func,
const uint8_t* key_data, size_t key_size) {
LITE_CAPI_BEGIN();
std::vector<uint8_t> key;
for (size_t i = 0; i < key_size; i++) {
key.push_back(key_data[i]);
}
lite::DecryptionFunc decrypt_func = nullptr;
if (func) {
decrypt_func = [func](const void* input_data, size_t input_size,
const std::vector<uint8_t>& key) {
auto size = func(input_data, input_size, key.data(), key.size(),
nullptr);
std::vector<uint8_t> output(size, 0);
func(input_data, input_size, key.data(), key.size(), output.data());
return output;
};
}
lite::update_decryption_or_key(decrypt_name, decrypt_func, key);
LITE_CAPI_END();
}
int LITE_register_parse_info_func(const char* info_type,
const LiteParseInfoFunc parse_func) {
LITE_CAPI_BEGIN();
LITE_ASSERT(info_type && parse_func, "The ptr pass to LITE api is null");
auto lite_func = [parse_func](
const void* info_data, size_t info_size,
const std::string model_name, lite::Config& config,
lite::NetworkIO& network_io,
std::unordered_map<std::string, lite::LiteAny>&
separate_config_map,
std::string& extra_info) {
LITE_MARK_USED_VAR(extra_info);
size_t nr_threads = 1;
int device_id = 0, is_cpu_inplace_mode = false, use_tensorrt = false;
LiteNetworkIO c_io;
LiteConfig c_config;
auto ret = parse_func(info_data, info_size, model_name.c_str(),
&c_config, &c_io, &device_id, &nr_threads,
&is_cpu_inplace_mode, &use_tensorrt);
config = convert_to_lite_config(c_config);
network_io = convert_to_lite_io(c_io);
if (device_id != 0) {
separate_config_map["device_id"] = device_id;
}
if (nr_threads != 1) {
separate_config_map["nr_threads"] = nr_threads;
}
if (is_cpu_inplace_mode != false) {
separate_config_map["is_inplace_mode"] = is_cpu_inplace_mode;
}
if (use_tensorrt != false) {
separate_config_map["use_tensorrt"] = use_tensorrt;
}
return ret;
};
lite::register_parse_info_func(info_type, lite_func);
LITE_CAPI_END();
}
int LITE_set_loader_lib_path(const char* loader_path) {
LITE_CAPI_BEGIN();
LITE_ASSERT(loader_path, "The ptr pass to LITE api is null");
lite::set_loader_lib_path(loader_path);
LITE_CAPI_END();
}
int LITE_set_persistent_cache(const char* cache_path, int always_sync) {
LITE_CAPI_BEGIN();
LITE_ASSERT(cache_path, "The ptr pass to LITE api is null");
lite::set_persistent_cache(cache_path, always_sync);
LITE_CAPI_END();
}
int LITE_set_tensor_rt_cache(const char* cache_path) {
LITE_CAPI_BEGIN();
LITE_ASSERT(cache_path, "The ptr pass to LITE api is null");
lite::set_tensor_rt_cache(cache_path);
LITE_CAPI_END();
}
int LITE_set_log_level(LiteLogLevel level) {
LITE_CAPI_BEGIN();
lite::set_log_level(level);
LITE_CAPI_END();
}
int LITE_get_log_level(LiteLogLevel* level) {
LITE_CAPI_BEGIN();
LITE_ASSERT(level, "The ptr pass to LITE api is null");
*level = lite::get_log_level();
LITE_CAPI_END();
}
int LITE_dump_persistent_cache(const char* cache_path) {
LITE_CAPI_BEGIN();
LITE_ASSERT(cache_path, "The ptr pass to LITE api is null");
lite::dump_persistent_cache(cache_path);
LITE_CAPI_END();
}
int LITE_dump_tensor_rt_cache() {
LITE_CAPI_BEGIN();
lite::dump_tensor_rt_cache();
LITE_CAPI_END();
}
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
# -*- coding: utf-8 -*-
# This file is part of MegEngine, a deep learning framework developed by
# Megvii.
#
# Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
from .base import *
from .global_setting import *
from .network import *
from .struct import *
from .tensor import *
from .utils import *
# -*- coding: utf-8 -*-
# This file is part of MegEngine, a deep learning framework developed by
# Megvii.
#
# Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
from ctypes import *
import numpy as np
from .base import _Ctensor, _lib, _LiteCObjBase
from .network import *
from .struct import LiteDataType, LiteDeviceType, LiteIOType, Structure
from .tensor import *
LiteDecryptionFunc = CFUNCTYPE(
c_size_t, c_void_p, c_size_t, POINTER(c_uint8), c_size_t, c_void_p
)
class _GlobalAPI(_LiteCObjBase):
"""
get the api from the lib
"""
_api_ = [
("LITE_get_device_count", [c_int, POINTER(c_size_t)]),
("LITE_try_coalesce_all_free_memory", []),
(
"LITE_register_decryption_and_key",
[c_char_p, LiteDecryptionFunc, POINTER(c_uint8), c_size_t],
),
(
"LITE_update_decryption_or_key",
[c_char_p, c_void_p, POINTER(c_uint8), c_size_t],
),
("LITE_set_loader_lib_path", [c_char_p]),
("LITE_set_persistent_cache", [c_char_p, c_int]),
        ("LITE_set_tensor_rt_cache", [c_char_p]),
("LITE_dump_persistent_cache", [c_char_p]),
("LITE_dump_tensor_rt_cache", [c_char_p]),
]
def decryption_func(func):
"""the decryption function decorator
:type func: a function accept three array, in_arr, key_arr and out_arr, if out_arr is None, just query the out array lenght in byte
"""
@CFUNCTYPE(c_size_t, c_void_p, c_size_t, POINTER(c_uint8), c_size_t, c_void_p)
def wrapper(c_in_data, in_length, c_key_data, key_length, c_out_data):
in_arr = np.frombuffer(c_in_data, dtype=np.uint8, count=in_length)
key_arr = np.frombuffer(c_key_data, dtype=np.uint8, count=key_length)
if c_out_data:
            out_length = func(in_arr, key_arr, None)
out_arr = np.frombuffer(c_out_data, dtype=np.uint8, count=out_length)
return func(in_arr, key_arr, out_arr)
# just query the output length
else:
return func(in_arr, key_arr, None)
return wrapper
class LiteGlobal(object):
"""
some global config in lite
"""
_api = _GlobalAPI()._lib
@staticmethod
def register_decryption_and_key(decryption_name, decryption_func, key):
c_name = c_char_p(decryption_name.encode("utf-8"))
key_length = len(key)
c_key = (c_uint8 * key_length)(*key)
LiteGlobal._api.LITE_register_decryption_and_key(
c_name, decryption_func, c_key, key_length
)
@staticmethod
def update_decryption_key(decryption_name, key):
c_name = c_char_p(decryption_name.encode("utf-8"))
key_length = len(key)
c_key = (c_uint8 * key_length)(*key)
LiteGlobal._api.LITE_update_decryption_or_key(c_name, None, c_key, key_length)
@staticmethod
def set_loader_lib_path(path):
c_path = c_char_p(path.encode("utf-8"))
LiteGlobal._api.LITE_set_loader_lib_path(c_path)
@staticmethod
def set_persistent_cache(path, always_sync=False):
c_path = c_char_p(path.encode("utf-8"))
LiteGlobal._api.LITE_set_persistent_cache(c_path, always_sync)
@staticmethod
def set_tensorrt_cache(path):
c_path = c_char_p(path.encode("utf-8"))
        LiteGlobal._api.LITE_set_tensor_rt_cache(c_path)
@staticmethod
def dump_persistent_cache(path):
c_path = c_char_p(path.encode("utf-8"))
LiteGlobal._api.LITE_dump_persistent_cache(c_path)
@staticmethod
def dump_tensorrt_cache():
        LiteGlobal._api.LITE_dump_tensor_rt_cache()
@staticmethod
def get_device_count(device_type):
count = c_size_t()
LiteGlobal._api.LITE_get_device_count(device_type, byref(count))
return count.value
@staticmethod
def try_coalesce_all_free_memory():
LiteGlobal._api.LITE_try_coalesce_all_free_memory()