diff --git a/lite/.gitattributes b/lite/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..c268a9ab651063425e1c50893566b10bed7f84fc --- /dev/null +++ b/lite/.gitattributes @@ -0,0 +1,10 @@ +test/resource/input_data.npy filter=lfs diff=lfs merge=lfs -text +test/resource/lite/shufflenet.mge filter=lfs diff=lfs merge=lfs -text +test/resource/lite/shufflenet_crypt_aes.mge filter=lfs diff=lfs merge=lfs -text +test/resource/lite/test_packed_model.lite filter=lfs diff=lfs merge=lfs -text +test/resource/lite/test_packed_model_rc4.lite filter=lfs diff=lfs merge=lfs -text +test/resource/lite/output_data.npy filter=lfs diff=lfs merge=lfs -text +test/resource/lite/model.mgb filter=lfs diff=lfs merge=lfs -text +test/resource/lite/liveness_rgb_nosub128.rknn filter=lfs diff=lfs merge=lfs -text +third_party/librknn_api filter=lfs diff=lfs merge=lfs -text +test/resource/lite/model_atlas.mgb filter=lfs diff=lfs merge=lfs -text diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc61418870f59f4dc75931c6ac86ffdebb644024 --- /dev/null +++ b/lite/CMakeLists.txt @@ -0,0 +1,135 @@ +option(LITE_BUILD_WITH_MGE "Build lite with MegEngine." ON) + +# config lite_build_config.h.in +set(LITE_WITH_OPENCL ${MGE_WITH_OPENCL}) +set(LITE_WITH_CUDA ${MGE_WITH_CUDA}) +set(LITE_ENABLE_LOGGING ${MGE_ENABLE_LOGGING}) +set(LITE_ENABLE_EXCEPTION ${MGE_ENABLE_EXCEPTIONS}) +set(LITE_ASSERT_LOC ${MGB_ASSERT_LOC}) + +if(NOT MGB_WITH_FLATBUFFERS) + include(../cmake/flatbuffers.cmake) +endif() + +file(GLOB_RECURSE SRC_FBS src/**/*.fbs) +build_flatbuffers( + "${SRC_FBS}" + "" + lite_fbs_generate + "" + "${CMAKE_CURRENT_BINARY_DIR}" + "" + "" + ) + +file(GLOB_RECURSE SOURCES_LITE src/*.cpp src/*.cc lite-c/*.cpp) + +if(MGE_WITH_MINIMUM_SIZE) + set(LITE_ENABLE_LOGGING OFF) + set(LITE_ENABLE_EXCEPTION OFF) +endif() + +# Write out lite_build_config.h +# It defines macros needed by lite +configure_file(src/lite_build_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/genfiles/lite_build_config.h) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/genfiles/lite_build_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + +# begin config lite +if(LITE_BUILD_WITH_MGE AND LITE_WITH_CUDA AND NOT WIN32) + # FXIME third_party cpp redis do not support build with clang-cl + file(GLOB_RECURSE SOURCES_CPP_REDIS ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/sources/*.cpp) + list(APPEND SOURCES_LITE ${SOURCES_CPP_REDIS}) + file(GLOB_RECURSE SOURCES_TACOPIE ${PROJECT_SOURCE_DIR}/third_party/tacopie/sources/*.cpp) + list(APPEND SOURCES_LITE ${SOURCES_TACOPIE}) +endif() +add_library(lite_static STATIC ${SOURCES_LITE}) +add_dependencies(lite_static lite_fbs_generate) +include_directories($) + +if(LITE_BUILD_WITH_MGE) + target_link_libraries(lite_static PRIVATE megbrain megdnn ${MGE_CUDA_LIBS}) + add_compile_definitions(LITE_BUILD_WITH_MGE=1) + message(STATUS "build lite with MegEngine.") +else() + target_link_libraries(lite_static PUBLIC flatbuffers) +endif() + +include_directories( + PUBLIC $ + PUBLIC $ + PUBLIC $ + PUBLIC $ + PUBLIC $ + PUBLIC $ + ) +# end config lite + +# define a shared lib +add_library(lite_shared SHARED $) +if(LITE_BUILD_WITH_MGE) + target_link_libraries(lite_shared PRIVATE megbrain megdnn ${MGE_CUDA_LIBS}) +endif() +if(ANDROID) + link_libraries(log) + target_link_libraries(lite_static PRIVATE log) + target_link_libraries(lite_shared PRIVATE log) +endif() + +if(LITE_BUILD_WITH_MGE AND LITE_WITH_CUDA AND NOT WIN32) + # FXIME third_party cpp redis do not 
support build with clang-cl + target_include_directories(lite_static PRIVATE ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/includes) + target_include_directories(lite_static PRIVATE ${PROJECT_SOURCE_DIR}/third_party/tacopie/includes) + target_include_directories(lite_shared PRIVATE ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/includes) + target_include_directories(lite_shared PRIVATE ${PROJECT_SOURCE_DIR}/third_party/tacopie/includes) +endif() +set(LITE_VERSION_SCRIPT ${PROJECT_SOURCE_DIR}/lite/src/version_lite.ld CACHE INTERNAL "Path to linker version script") +add_custom_target(_lite_version_ld SOURCES ${LITE_VERSION_SCRIPT}) +if(NOT MSVC AND NOT WIN32) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden") +endif() +#TODO: implemente version script for other OS +if (UNIX AND NOT APPLE) + target_link_options(lite_shared PRIVATE -Wl,--version-script=${LITE_VERSION_SCRIPT}) + set_target_properties(lite_shared PROPERTIES LINK_DEPENDS ${LITE_VERSION_SCRIPT}) +endif() + +# config install +install(TARGETS lite_static + LIBRARY DESTINATION lite/lib/${MGE_ARCH} + FRAMEWORK DESTINATION lite/lib/${MGE_ARCH} + ARCHIVE DESTINATION lite/lib/${MGE_ARCH}) + +install(TARGETS lite_shared + LIBRARY DESTINATION lite/lib/${MGE_ARCH} + FRAMEWORK DESTINATION lite/lib/${MGE_ARCH} + ARCHIVE DESTINATION lite/lib/${MGE_ARCH} + ) + +install(FILES ${PROJECT_SOURCE_DIR}/lite/include/lite/common_enum_c.h + DESTINATION ${CMAKE_INSTALL_PREFIX}/lite/include/lite-c) + +install(DIRECTORY ${PROJECT_SOURCE_DIR}/lite/include + DESTINATION ${CMAKE_INSTALL_PREFIX}/lite FILES_MATCHING PATTERN "*.h") + +install(DIRECTORY ${PROJECT_SOURCE_DIR}/lite/lite-c/include + DESTINATION ${CMAKE_INSTALL_PREFIX}/lite FILES_MATCHING PATTERN "*.h") + +add_subdirectory(example) +if(MGE_WITH_TEST) + add_subdirectory(test) +endif() + +# tools and example +add_executable(rc4_encryptor tools/rc4_encrypt.cpp) + +target_link_libraries(rc4_encryptor lite_static) +if(LITE_BUILD_WITH_MGE AND MGE_WITH_ROCM) + # FIXME: hip obj can not find cpp obj only through lite_static + target_link_libraries(rc4_encryptor megdnn) +endif() +target_include_directories(rc4_encryptor PRIVATE + {PROJECT_SOURCE_DIR}/lite/src/decryption) +install (TARGETS rc4_encryptor + EXPORT ${LITE_EXPORT_TARGETS} + RUNTIME DESTINATION lite/tools) diff --git a/lite/README.md b/lite/README.md new file mode 100755 index 0000000000000000000000000000000000000000..8720a1057f228b643abcd07c1cc03c0ff526cee2 --- /dev/null +++ b/lite/README.md @@ -0,0 +1,251 @@ +# Lite + +It is a lite warper of MegEngine, to enable MegEngine easy to be integrated in +user's SDK + +## bazel build + +目前支持内部 bazel 和 CMake 编译,支持 C++/C, Python 接口, +下面是 bazel 中 lite_shared 目标的编译,可以作为其他目标的编译的参考, +该编译依赖内部 bazel 编译以及 megvii3。 + +### 配置编译环境 + +需要使用 megvii3 workspace 来完成 bazel 的编译 + +#### Clone megvii3 安装 bazel + +```bash + git clone git@git-core.megvii-inc.com:brain-sdk/megvii3.git + ./utils/bazel/get_bazel.sh +``` + +#### Clone megbrain +``` + git submodule update brain/megbrain brain/midout +``` + +### 编译 x86 CUDA 版本 + +```bash + ./bazel build //brain/megbrain/lite:lite_shared --cpu="k8" \ + --compiler="gcc7_cuda10" -c opt +``` + +### 编译 x86 CPU 版本 + +```bash + ./bazel build //brain/megbrain/lite:lite_shared --cpu="k8" \ + --compiler="gcc9" -c opt +``` + +### 编译 arm OpenCL 版本 + +```bash + ./bazel build //brain/megbrain/lite:lite_shared_shared --cpu=android_aarch64 \ + -c opt --define enable_opencl=1 --define enable_opencl_search=1 +``` +### 编译 arm opencl 
lite_examples +bazel-3.0.0-megvii2 build //brain/megbrain/lite:lite_shared_examples \ +--cpu=android_aarch64 --define enable_opencl=1 --define enable_opencl_search=1 +####如何运行snpe_loder 的lite_exampes 请查看下面的wiki +https://wiki.megvii-inc.com/pages/viewpage.action?pageId=268786906 + +### 编译 armv7 CPU 版本 + +```bash + ./bazel build //brain/megbrain/lite:lite_shared --cpu=android_armv7 \ + -c opt +``` + +### 编译 arm64 CPU 版本 + +```bash + ./bazel build //brain/megbrain/lite:lite_shared --cpu=android_aarch64 \ + -c opt +``` + +### 编译 arm64 CPU v8.2 版本 + +```bash + ./bazel build //brain/megbrain/lite:lite_shared --cpu=android_aarch64 \ + --copt -march=armv8.2-a+fp16+dotprod -c opt +``` + +## 同时支持cmake构建 +cmake构建参考scripts/cmake-build/BUILD_README.md,下面example表示同时支持编译megengine +和RKNPU后端且打开OpenCL的release模式 +```bash +EXTRA_CMAKE_ARGS="-DANDROID_NATIVE_API_LEVEL=24 -DLITE_BUILD_WITH_RKNPU=ON -DMGE_WITH_OPENCL=ON \ +-DMGE_OPENCL_SEARCH_ALGO=ON -DCUSTOM_C_OPR_INIT_FUNC=custom_loader_func" ./scripts/cmake-build/cross_build_android_arm_inference.sh" +``` +* 如果需要支持性能分析的 profile 功能,则需要在编译时候加上 + --copt -DMGB_ENABLE_JSON=1 该参数 +* 如果需要支持 fast-run 功能则需要加上 + --copt -DMGB_ENABLE_FASTRUN=1,开启 fast-run 功能 +* 如果编译 arm64,可以加上 --copt -mcpu=cortex-a53 选项进行优化。 + +### midout 裁减编译 +具体 midout 的裁减原理见 megbrain 中 midout 裁减,裁减方法见 MegBrain +和 MegEngine 的裁减方法 + +## 模型 + +### 支持的模型 + +lite 目前支持只支持 MegEngine dump 的模型格式,可以加载的模型文件包括原始 +的模型文件,原始的加密模型,pack 之后的加密或者非加密模型。加密算法以及 +加密的秘钥可以用户自定义,然后注册到 lite 中,详见 example 中加解密部分。 + +* 原始模型未加密:直接将完成训练的模型在 MegEngine 环境中进行 dump 生成的模型 +* 原始加密模型:将上述 dump 的模型通过加密算法进行加密,lite 提供两种默认 +的加密算法,在 tools 中,分别为 aes 和 rc4. 对应为:aes_encypt.sh 和 +rc4_encrypt.cpp,rc4_encrypt.cpp 需要编译生成可执行文件。这种方式加密的模型在 +加载时候需要在 Config 中配置模型的加密方式。 +* pack 之后的模型:模型结构将在下面介绍,可以将上面加密或者未加密的模型,和下面 +定义的 json config 文件一同打包为一个 pack 之后的模型,可以使用 tools 下面 +的 pack_model_and_info.py 工具中完成,pack_model_and_info.py 的使用详见其中 +的 help 输出。 + +### 模型结构 + +不同的模型文件主要是通过 pack 之后的模型文件中的 model_tag 来区分. 
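Whichever of these model flavors is used, loading looks the same on the API side; only the `Config` changes. Below is a minimal C++ sketch, not part of this patch, that loads a bare model encrypted with the built-in `RC4_default` scheme; the helper name and include set are assumptions based on the examples later in this diff (`user_cryption.cpp` shows the same flow with a user-registered scheme).

```C++
// Minimal sketch (assumed helper, not part of this patch): load a bare model
// that was encrypted with the bundled RC4 tool. For a user-defined scheme,
// call register_decryption_and_key(name, func, key) first and put that name
// into bare_model_cryption_name instead.
#include <memory>
#include <string>

#include "lite/global.h"
#include "lite/network.h"

std::shared_ptr<lite::Network> load_rc4_encrypted_model(const std::string& path) {
    lite::Config config;
    // Name of the decryption scheme; "AES_default" and
    // "SIMPLE_FAST_RC4_default" are the other built-in choices.
    config.bare_model_cryption_name = "RC4_default";
    auto network = std::make_shared<lite::Network>(config);
    network->load_model(path);  // the model is decrypted while it is loaded
    return network;
}
```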
+ +* 打包处理之后的文件: + 模型打包过程可以通过脚本 pack_model_and_json.py 来完成,其将模型info文件( + 可以是任意格式,推荐使用JSON,可以加密也可以不加密)和加密或者未加密的模型文件 + 一同打包在一起,并在文件开头加上 Header 来帮助解析。 +* 原始文件和原始的加密文件没有 Header 和模型 info部分,模型加载需要的信息 + 可以通过 Config 和 NetworkIO 进行传递。 + +### Header + +Header 部分最开始为一个明文固定model_tag,目前定义为"packed_model"字符串, +后面主要包含模型文件各个部分的信息,每个部分的加密方式,load 模型时候可以 +调用相应的解密方法对各个部分进行解密,以及model infomation 部分的解析方法。 +具体细节参考lite/src/parse_model/pack_model.fbs + +### Info部分 + +Info 部分主要用来解释模型,如用户关心的:模型的输入数据的格式,模型运行的平台 +等信息,这部分信息也可以用于用户进行 check 运行的模型是否在指定的条件下运行。 +由于这个 Info 部分不同的用户需求不一致,想传递的信息也无法统一,所以目前 +Lite 中提供自定义的方式,用户可以自定义自己 Info 部分的类容,并在 Header 中 +指定 **Info 解析方式名字** ,并注册以该名字为 key 的解析函数到 Lite 中, +以这样方式来可以实现用户自定义 Info 格式。同时,Lite 中也提供了一套定义好的 +格式,其名字为 "LITE_default",并已经实现了对应的解析函数,该 info +为 JSON 格式,具体内容定义如下: + +```json +{ + "name": "shufflenet_test", + "valid": true, + "version": "8.9999.0", + "has_compression": false, + "device": { + "type": "CPU", + "device_id": 0, + "number_threads": 1, + "use_tensor_rt": false, + "enable_inplace_model": false + }, + "options":{ + "weight_preprocess": false, + "var_sanity_check_first_run": true, + "const_shape": false, + "jit_level": 0, + "record_level": 0 + }, + "IO":{ + "inputs":[ + { + "name": "data", + "io_type": "value", + "is_host": true, + "dtype": "float32", + "shape": { + "dim0": 1, + "dim1": 3, + "dim2": 224, + "dim3": 224 + } + } + ], + "outputs":[ + { + "name": "TRUE_DIV(EXP[12065],reduce0[12067])[12077]", + "io_type": "value", + "is_host": true, + "dtype": "float32", + "shape": { + "dim0": 1, + "dim1": 1000, + "dim2": 0, + "dim3": 0 + } + } + ] + } +} +``` + +* model_name: 指这个模型的名字,用户可以用来验证是否运行了正确的模型, +和 Header 部分中的进行对比 check +* valid: 指在这个 info 文件中的设置是否影响模型的 Config +* version: 指模型对应的 megbrain 的版本号,load 模型时候会进行 check +* has_compression: 标识这个模型文件中 tensor 的数据是否压缩过 +* device: 目前支持字段包括:"CPU","CUDA","OPENCL","ATLAS" +* number_threads 和 is_inplace_model : 只有在 device 为 CPU 的情况下才生效 +* IO::inputs::type: 包括 value,shape,详见 include"network.h" +* IO::inputs::is_host: 值输入数据来自 device 或者来自 host 端 +* IO::outputs::is_host: 值输出数据将保存在 device 或者 host 端 +* IO::outputs::shape::dimx: 如果为0,则便是该 dim 无效 + +### Model部分 + +可以是加密的模型文件或者未加密的模型文件 + +## 使用 + +丰富的使用方法详见文件 example 中文档和对应的 example。 + +## 工具 + +目前 lite 中有三个工具保存在 tools 目录中,其他 megbrain 工具 +没有包含在内,分别为: + +* pack_model_and_info.py 为上面提到的模型打包工具,其为一个 + python 脚本,可以直接用其对已有的模型和模型 information 的文件,按照上面 + 的格式进行打包模型,用户可以指定模型名字,模型加密方式,模型信息 + 文件加密方式,解析方式等,如下: + + ```bash + python3 pack_model_and_info.py --input-model xxx.mge \ + --model-name="shufflenet_test" \ + --model-cryption="RC4_default" \ + --input-info xxx.json \ + --info-cryption="RC4_default" \ + --info-parser="LITE_default" \ + -o xxx.lite + ``` +* aes_encrypt.sh 为一个 aes 加密方式的加密脚本,可以将一个文件, +通过指定的的 key 加密成一个 aes 加密的文件,其中 key 为 32 个字节 +16进制数。 + ```bash + aes_encrypt.sh xxx.mdl xxx_encrypted.mdl \ + 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F + ``` + +* rc4_encypt.cpp 可以被编译成为一个 rc4 加密的工具,这个工具可以通过 + 制定的 key 或者默认的 key 加密制定的文件,支持 rc4 方法和 + simple_fast_rc4 两种方法,支持自定义 key。 + * bazel 编译 x86 命令为: + ```bash + bazel build //brain/megbrain/lite:rc4_encryptor \ + --cpu='k8' --compiler='gcc9' + ``` + * 加密文件,具体用法见 help + ```bash + rc4_encryptor encrypt_predefined_rc4 \ + to_be_encrypt.file encrypted.file + ``` diff --git a/lite/build_config/lite_build_config.h b/lite/build_config/lite_build_config.h new file mode 100644 index 0000000000000000000000000000000000000000..8a606c9fda2f5cf2dc8cfe63cf903da74a4b776e --- /dev/null +++ b/lite/build_config/lite_build_config.h @@ -0,0 +1,32 @@ +/** + * \file 
lite/build_config/lite_build_config.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ +#ifndef _HEADER_LITE_BUILD_CONFIG +#define _HEADER_LITE_BUILD_CONFIG + +#ifndef LITE_ENABLE_LOGGING +#define LITE_ENABLE_LOGGING 1 +#endif + +#ifndef LITE_ENABLE_EXCEPTION +#if __cpp_exceptions || __EXCEPTIONS || \ + (defined(_MSC_VER) && defined(_CPPUNWIND)) +#define LITE_ENABLE_EXCEPTION 1 +#else +#define LITE_ENABLE_EXCEPTION 0 +#endif +#endif + +#ifndef LITE_WITH_CUDA +#define LITE_WITH_CUDA 0 +#endif + +#ifndef LITE_ASSERT_LOC +#define LITE_ASSERT_LOC 1 +#endif +#endif // _HEADER_LITE_BUILD_CONFIG diff --git a/lite/example/CMakeLists.txt b/lite/example/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..9012fded9a2d8bef2b43b0feb07c8d5203628708 --- /dev/null +++ b/lite/example/CMakeLists.txt @@ -0,0 +1,47 @@ +file (GLOB_RECURSE SOURCES ./*.cpp) +add_executable(lite_examples ${SOURCES}) + +if(LITE_BUILD_WITH_RKNPU) + #rknn sdk1.0.0 depend on libc++_shared, use gold to remove NEEDED so symbol check + target_link_options(lite_examples PRIVATE "-fuse-ld=gold") +endif() + +target_link_libraries(lite_examples lite_static) +if(LITE_BUILD_WITH_MGE AND MGE_WITH_ROCM) + # FIXME: hip obj can not find cpp obj only through lite_static + target_link_libraries(lite_examples megdnn) +endif() + +if(UNIX) + if(APPLE OR ANDROID) + target_link_libraries(lite_examples dl) + else() + target_link_libraries(lite_examples dl rt) + endif() +endif() + +install (TARGETS lite_examples + EXPORT ${LITE_EXPORT_TARGETS} + RUNTIME DESTINATION lite/bin) + +# add lite_examples_depends_shared for CI check symbol export valid +add_executable(lite_examples_depends_shared ${SOURCES}) + +if(LITE_BUILD_WITH_RKNPU) + #rknn sdk1.0.0 depend on libc++_shared, use gold to remove NEEDED so symbol check + target_link_options(lite_examples_depends_shared PRIVATE "-fuse-ld=gold") +endif() + +target_link_libraries(lite_examples_depends_shared lite_shared) + +if(UNIX) + if(APPLE OR ANDROID) + target_link_libraries(lite_examples_depends_shared dl) + else() + target_link_libraries(lite_examples_depends_shared dl rt) + endif() +endif() + +install (TARGETS lite_examples_depends_shared + EXPORT ${LITE_EXPORT_TARGETS} + RUNTIME DESTINATION lite/bin) diff --git a/lite/example/example.h b/lite/example/example.h new file mode 100644 index 0000000000000000000000000000000000000000..410ec0e610c6ed4757ada4bfc30bb81402aa300d --- /dev/null +++ b/lite/example/example.h @@ -0,0 +1,101 @@ +/** + * \file example/example.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
+ */ + +#pragma once + +#include + +#include "lite/global.h" +#include "lite/network.h" +#include "lite/tensor.h" + +#include "npy.h" + +#include +#include +#include +#include + +namespace lite { +namespace example { + +void set_cpu_affinity(const std::vector& cpuset); + +struct Args { + int args_parse_ret = 0; + std::string example_name; + std::string model_path; + std::string input_path; + std::string output_path; + std::string loader_path; + static Args from_argv(int argc, char** argv); +}; + +std::shared_ptr parse_npy( + const std::string& path, + LiteBackend backend = LiteBackend::LITE_DEFAULT); + +using ExampleFunc = std::function; +using ExampleFuncMap = std::unordered_map; + +ExampleFuncMap* get_example_function_map(); + +bool register_example(std::string example_name, const ExampleFunc& fuction); + +template +struct Register; + +#if LITE_BUILD_WITH_MGE +#if LITE_WITH_CUDA +bool load_from_path_run_cuda(const Args& args); +#endif +bool basic_load_from_path(const Args& args); +bool basic_load_from_path_with_loader(const Args& args); +bool basic_load_from_memory(const Args& args); +bool cpu_affinity(const Args& args); +bool network_share_same_weights(const Args& args); +bool reset_input(const Args& args); +bool reset_input_output(const Args& args); +bool config_user_allocator(const Args& args); +bool register_cryption_method(const Args& args); +bool update_cryption_key(const Args& args); +bool async_forward(const Args& args); + +#if LITE_WITH_CUDA +bool device_input(const Args& args); +bool device_input_output(const Args& args); +bool pinned_host_input(const Args& args); +#endif +#endif + +} // namespace example +} // namespace lite + +#if LITE_BUILD_WITH_MGE +bool basic_c_interface(const lite::example::Args& args); +bool device_io_c_interface(const lite::example::Args& args); +bool async_c_interface(const lite::example::Args& args); +#endif + +#define CONCAT_IMPL(a, b) a##b +#define MACRO_CONCAT(a, b) CONCAT_IMPL(a, b) + +#define REGIST_EXAMPLE(name_, func_) \ + REGIST_EXAMPLE_WITH_NUM(__COUNTER__, name_, func_) + +#define REGIST_EXAMPLE_WITH_NUM(number_, name_, func_) \ + template <> \ + struct Register { \ + Register() { register_example(name_, func_); } \ + }; \ + namespace { \ + Register MACRO_CONCAT(example_function_, number_); \ + } + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/example/main.cpp b/lite/example/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9e9400083060e11195d22055ad970c75fe384739 --- /dev/null +++ b/lite/example/main.cpp @@ -0,0 +1,172 @@ +/** + * \file example/example.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
+ */ + +#include "lite/global.h" +#include "lite/network.h" +#include "lite/tensor.h" + +#include "example.h" +#include "npy.h" + +#include +#include +#include +#include + +using namespace lite; +using namespace example; + +Args Args::from_argv(int argc, char** argv) { + Args ret; + if (argc < 4) { + printf("usage: lite_examples .\n"); + printf("*********The output file is optional.*************\n"); + printf("The registered examples include:\n"); + size_t index = 0; + for (auto it : *get_example_function_map()) { + printf("%zu : %s\n", index, it.first.c_str()); + index++; + } + ret.args_parse_ret = -1; + return ret; + } + ret.example_name = argv[1]; + ret.model_path = argv[2]; + ret.input_path = argv[3]; + if (argc > 4) { + ret.output_path = argv[4]; + } + if (argc > 5) { + ret.loader_path = argv[5]; + } + return ret; +} + +ExampleFuncMap* lite::example::get_example_function_map() { + static ExampleFuncMap static_map; + return &static_map; +} + +bool lite::example::register_example(std::string example_name, + const ExampleFunc& fuction) { + auto map = get_example_function_map(); + if (map->find(example_name) != map->end()) { + printf("Error!!! This example is registed yet\n"); + return false; + } + (*map)[example_name] = fuction; + return true; +} + +std::shared_ptr lite::example::parse_npy(const std::string& path, + LiteBackend backend) { + std::string type_str; + std::vector stl_shape; + std::vector raw; + npy::LoadArrayFromNumpy(path, type_str, stl_shape, raw); + + auto lite_tensor = + std::make_shared(backend, LiteDeviceType::LITE_CPU); + Layout layout; + layout.ndim = stl_shape.size(); + const std::map type_map = { + {"f4", LiteDataType::LITE_FLOAT}, + {"i4", LiteDataType::LITE_INT}, + {"i1", LiteDataType::LITE_INT8}, + {"u1", LiteDataType::LITE_UINT8}}; + layout.shapes[0] = 1; + for (size_t i = 0; i < layout.ndim; i++) { + layout.shapes[i] = static_cast(stl_shape[i]); + } + + for (auto& item : type_map) { + if (type_str.find(item.first) != std::string::npos) { + layout.data_type = item.second; + break; + } + } + lite_tensor->set_layout(layout); + size_t length = lite_tensor->get_tensor_total_size_in_byte(); + void* dest = lite_tensor->get_memory_ptr(); + memcpy(dest, raw.data(), length); + //! 
rknn not support reshape now + if (layout.ndim == 3) { + lite_tensor->reshape({1, static_cast(layout.shapes[0]), + static_cast(layout.shapes[1]), + static_cast(layout.shapes[2])}); + } + return lite_tensor; +} + +void lite::example::set_cpu_affinity(const std::vector& cpuset) { +#if defined(__APPLE__) || defined(WIN32) +#pragma message("set_cpu_affinity not enabled on apple and windows platform") +#else + cpu_set_t mask; + CPU_ZERO(&mask); + for (auto i : cpuset) { + CPU_SET(i, &mask); + } + auto err = sched_setaffinity(0, sizeof(mask), &mask); + if (err) { + printf("failed to sched_setaffinity: %s (error ignored)", + strerror(errno)); + } +#endif +} + +int main(int argc, char** argv) { + set_log_level(LiteLogLevel::WARN); + auto&& args = Args::from_argv(argc, argv); + if (args.args_parse_ret) + return -1; + auto map = get_example_function_map(); + auto example = (*map)[args.example_name]; + if (example) { + printf("Begin to run %s example.\n", args.example_name.c_str()); + return example(args); + } else { + printf("The example of %s is not registed.", args.example_name.c_str()); + return -1; + } +} +namespace lite { +namespace example { + +#if LITE_BUILD_WITH_MGE +#if LITE_WITH_CUDA +REGIST_EXAMPLE("load_from_path_run_cuda", load_from_path_run_cuda); +#endif +REGIST_EXAMPLE("basic_load_from_path", basic_load_from_path); +REGIST_EXAMPLE("basic_load_from_path_with_loader", basic_load_from_path_with_loader); +REGIST_EXAMPLE("basic_load_from_memory", basic_load_from_memory); +REGIST_EXAMPLE("cpu_affinity", cpu_affinity); +REGIST_EXAMPLE("register_cryption_method", register_cryption_method); +REGIST_EXAMPLE("update_cryption_key", update_cryption_key); +REGIST_EXAMPLE("network_share_same_weights", network_share_same_weights); +REGIST_EXAMPLE("reset_input", reset_input); +REGIST_EXAMPLE("reset_input_output", reset_input_output); +REGIST_EXAMPLE("config_user_allocator", config_user_allocator); +REGIST_EXAMPLE("async_forward", async_forward); + +REGIST_EXAMPLE("basic_c_interface", basic_c_interface); +REGIST_EXAMPLE("device_io_c_interface", device_io_c_interface); +REGIST_EXAMPLE("async_c_interface", async_c_interface); + +#if LITE_WITH_CUDA +REGIST_EXAMPLE("device_input", device_input); +REGIST_EXAMPLE("device_input_output", device_input_output); +REGIST_EXAMPLE("pinned_host_input", pinned_host_input); +#endif +#endif +} // namespace example +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/example/mge/README.md b/lite/example/mge/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f41115d22cadd7b147103015a43566e1e098ee7b --- /dev/null +++ b/lite/example/mge/README.md @@ -0,0 +1,166 @@ +# Example + +在该 example 目录中实现了一系列调用 lite 接口来实现 inference 的例子,主要 +是演示 lite 中不同接口的调用来实现不同情况下的 inference 功能。这里所有的 example +都是使用 shufflenet 来进行演示。 + +## Example bazel 的编译和运行 + +* 参考主目录下面的 README.md 搭建 megvii3 bazel 的编译环境,编译 CPU 版本 +```bash + ./bazel build //brain/megbrain/lite:lite_examples --cpu="k8" \ + --compiler="gcc9" -c opt +``` +* 运行时需要指定运行的具体 example 名字,运行的模型,模型运行的数据 + * 获取所有的 example 名字 +``` + bazel-bin/brain/megbrain/lite/lite_examples +``` + * 运行 example,下面命令运行 basic_load_from_memory +``` + bazel-bin/brain/megbrain/lite/lite_examples \ + basic_load_from_memory \ + path-to-megbrain/lite/test/resource/lite/shufflenet.mge \ + path-to-megbrain/lite/test/resource/lite/input_data.npy +``` + +## basic 使用 + +* **实现在文件 basic.cpp 中, 包括 basic_load_from_path 和 + basic_load_from_memory** + +* 该 example 使用 lite 来完成基本的 inference 功能,load 
模型使用默认的配置, +进行 forward 之前将输入数据 copy 到输入 tensor 中,完成 forward 之后,再将 +数据从输出 tensor 中 copy 到用户的内存中,输入 tensor 和输出 tensor 都是从 +Network 中通过 name 来获取的,输入输出 tensor 的 layout 也可以从对应的 tensor +中直接获取获取,**输出 tensor 的 layout 必须在 forward 完成之后获取才是正确的。** + +## 输入输出指定的内存 + +* **实现在 reset_io.cpp 中,包括两个 example,reset_input 和 reset_input_output +两个 example。** + +* 该 example 中演示输入 tensor 的内存为用户指定的内存(该内存中已经保存好输入 +数据),输出 tensor 也可以是用户指定的内存,这样 Network 完成 Forward 之后就会将数据 +保存在指定的输出内存中。如此减少不必要的 memory copy 的操作。 + +* 主要是通过 tensor 中的 reset 接口,该接口可以重新指定 tensor 的内存和对应的 +layout,如果 layout 没有指定,默认为 tensor 中原来的 layout。 + +* **该方法中由于内存是用户申请,需要用户提前知道输入,输出 tensor 对应的 layout,然后 +根据 layout 来申请内存,另外通过 reset 设置到 tensor 中的内存,生命周期不由 tensor +管理,由外部用户来管理。** + +## 输入输出指定 device 上内存 + +* **实现在 device_io.cpp 中,device_input 和 device_input_output 两个 example。** + +* 该 example 中配置模型运行在 device(CUDA) 上,并且使用用户提前申请的 device 上的内存 +作为模型运行的输入和输出。需要在 Network 构建的时候指定输入输出的在 device 上,不设置默认 +在 CPU 上,其他地方和**输入输出为用户指定的内存**的使用相同 + +* 可以通过 tensor 的 is_host() 接口来判断该 tensor 在 device 端还是 host 端 + +## 申请 pinned host 内存作为输入 + +* **实现在 device_io.cpp 中,函数名字为 pinned_host_input。** + +* 这个 example 中模型运行在 device(CUDA) 上,但是输入输出在 CPU 上,为了加速 host2device 的 +copy,将 CPU 上的 input tensor 的内存指定提前申请为 cuda pinned 内存。目前如果输出 +output tensor 不是 device 上的时候,默认就是 pinned host 的。 + +* 申请 pinned host 内存的方法是:构建 tensor 的时候指定 device,layout,以及 is_host_pinned +参数,这样申请的内存就是 pinned host 的内存。 + + ```C + bool is_pinned_host = true; + auto tensor_pinned_input = + Tensor(LiteDeviceType::LITE_CUDA, input_layout, is_pinned_host); + ``` + +## 用户指定内存分配器 + +* **实现在 user_allocator.cpp 中,函数名为:config_user_allocator。** + +* 这个例子中使用用户自定义的 CPU 内存分配器演示了用户设置自定义的 Allocator 的方法,用户自定义 +内存分配器需要继承自 lite 中的 Allocator 基类,并实现 allocate 和 free 两个接口。目前在 CPU +上验证是正确的,其他设备上有待测试。 + +* 设置自定定义内存分配器的接口为 Network 中如下接口: + ```C + Network& set_memory_allocator(std::shared_ptr user_allocator); + ``` + +## 多个 Network 共享同一份模型 weights + +* **实现在 network_share_weights.cpp 中,函数名为:network_share_same_weights。** + +* 很多情况用户希望多个 Network 共享同一份 weights,因为模型中 weights 是只读的,这样可以节省 +模型的运行时内存使用量。这个例子主要演示了 lite 中如何实现这个功能,首先创建一个新的 Network, +用户可以指定新的 Config 和 NetworkIO 以及其他一些配置,使得新创建出来的 Network 完成不同的 +功能。 + +* 通过已有的 NetWork load 一个新的 Network 的接口为 Network 中如下接口: + ```C + static void shared_weight_with_network( + std::shared_ptr dst_network, + const std::shared_ptr src_network); + ``` + * dst_network: 指新 load 出来的 Network + * src_network:已经 load 的老的 Network + +## CPU 绑核 + +* **实现在 cpu_affinity.cpp 中,函数名为:cpu_affinity。** + +* 该 example 之中指定模型运行在 CPU 多线程上,然后使用 Network 中的 +set_runtime_thread_affinity 来设置绑核回调函数。该回调函数中会传递当前线程的 id 进来,用户可以 +根据该 id 决定具体绑核行为,在多线程中,如果线程总数为 n,则 id 为 n-1 的线程为主线程。 + +## 用户注册自定义解密算法和 key + +* **实现在 user_cryption.cpp 中,函数名为:register_cryption_method 和 update_aes_key 。** + +* 这两个 example 主要使用 lite 自定义解密算法和更新解密算法的接口,实现了使用用户自定的解密算法 +实现模型的 load 操作。在这个 example 中,自定义了一个解密方法,(其实没有做任何事情, +将模型两次异或上 key 之后返回,等于将原始模型直接返回),然后将其注册到 lite 中,后面创建 Network 时候在其 +config 中的 bare_model_cryption_name 指定具体的解密算法名字。在第二个 example 展示了对其 +key 的更新操作。 +目前 lite 里面定义好了几种解密算法: + * AES_default : 其 key 是由 32 个 unsighed char 组成,默认为0到31 + * RC4_default : 其 key 由 hash key 和 enc_key 组成的8个 unsigned char,hash + key 在前,enc_key 在后。 + * SIMPLE_FAST_RC4_default : 其 key 组成同 RC4_default。 +大概命名规则为:前面大写是具体算法的名字,'_'后面的小写,代表解密 key。 +具体的接口为: + ```C + bool register_decryption_and_key(std::string decrypt_name, + const DecryptionFunc& func, + const std::vector& key); + bool update_decryption_or_key(std::string decrypt_name, + const DecryptionFunc& func, + const std::vector& key); + ``` +register 
接口中必须要求三个参数都是正确的值,update中 decrypt_nam 必须为已有的解密算法, +将使用 func 和 key 中不为空的部分对 decrypt_nam 解密算法进行更新 + +## 异步执行模式 + +* **实现在 basic.cpp 中,函数名为:async_forward。** + +* 用户通过接口注册异步回调函数将设置 Network 的 Forward 模式为异步执行模式, +目前异步执行模式只有在 CPU 和 CUDA 10.0 以上才支持,在 inference 时异步模式, +主线程可以在工作线程正在执行计算的同时做一些其他的运算,避免长时间等待,但是 +在一些单核处理器上没有收益。 + +## 纯 C example + +* **实现在 lite_c_interface.cpp,函数名为:basic_c_interface, +device_io_c_interface,async_c_interface** + +* Lite 完成对 C++ 接口的封装,对外暴露了纯 C 的接口,用户如果不是源码依赖 Lite +的情况下,应该使用纯 C 接口来完成集成。 +* 纯 C 的所有接口都是返回一个 int,如果这个 int 的数值不为 0,则又错误产生,需要 +调用 LITE_get_last_error 来获取错误信息。 +* 纯 C 的所有 get 函数都需要先定义一个对应的对象,然后将该对象的指针传递进接口, +Lite 会将结果写入到 对应指针的地址里面。 diff --git a/lite/example/mge/basic.cpp b/lite/example/mge/basic.cpp new file mode 100644 index 0000000000000000000000000000000000000000..986f1fc304f1afff9bab6d04c6f1efdffdd75c62 --- /dev/null +++ b/lite/example/mge/basic.cpp @@ -0,0 +1,370 @@ +/** + * \file example/basic.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include +#include "../example.h" +#if LITE_BUILD_WITH_MGE +#include + +#include "misc.h" + +using namespace lite; +using namespace example; + +namespace { +void output_info(std::shared_ptr network, size_t output_size) { + for (size_t index = 0; index < output_size; index++) { + printf("output[%zu] names %s \n", index, + network->get_all_output_name()[index].c_str()); + std::shared_ptr output_tensor = + network->get_output_tensor(index); + size_t ndim = output_tensor->get_layout().ndim; + for (size_t i = 0; i < ndim; i++) { + printf("output[%zu] tensor.shape[%zu] %zu \n", index, i, + output_tensor->get_layout().shapes[i]); + } + } +} + +void output_data_info(std::shared_ptr network, size_t output_size) { + for (size_t index = 0; index < output_size; index++) { + auto output_tensor = network->get_output_tensor(index); + void* out_data = output_tensor->get_memory_ptr(); + size_t out_length = output_tensor->get_tensor_total_size_in_byte() / + output_tensor->get_layout().get_elem_size(); + LiteDataType dtype = output_tensor->get_layout().data_type; + float max = -1000.0f; + float min = 1000.0f; + int max_idx = 0; + int min_idx = 0; + float sum = 0.0f; +#define cb(_dtype, _real_dtype) \ + case LiteDataType::_dtype: { \ + for (size_t i = 0; i < out_length; i++) { \ + _real_dtype data = static_cast<_real_dtype*>(out_data)[i]; \ + sum += data; \ + if (max < data) { \ + max = data; \ + max_idx = i; \ + } \ + if (min > data) { \ + min = data; \ + min_idx = i; \ + } \ + } \ + } break; + + switch (dtype) { + cb(LITE_FLOAT, float); + cb(LITE_INT, int); + cb(LITE_INT8, int8_t); + cb(LITE_UINT8, uint8_t); + default: + printf("unknow datatype"); + } + printf("output_length %zu index %zu max=%e , max idx=%d, min=%e , min_idx=%d, sum=%e\n", + out_length, index, max, max_idx, min, min_idx, sum); + } +#undef cb +} +} // namespace + +#if LITE_WITH_CUDA +bool lite::example::load_from_path_run_cuda(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + set_log_level(LiteLogLevel::DEBUG); + //! config the network running in CUDA device + lite::Config config{false, -1, LiteDeviceType::LITE_CUDA}; + //! set NetworkIO + NetworkIO network_io; + std::string input_name = "img0_comp_fullface"; + bool is_host = false; + IO device_input{input_name, is_host}; + network_io.inputs.push_back(device_input); + //! 
create and load the network + std::shared_ptr network = + std::make_shared(config, network_io); + network->load_model(network_path); + + std::shared_ptr input_tensor = network->get_input_tensor(0); + Layout input_layout = input_tensor->get_layout(); + + //! read data from numpy data file + auto src_tensor = parse_npy(input_path); + + //! malloc the device memory + auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout); + + //! copy to the device memory + tensor_device.copy_from(*src_tensor); + + //! Now the device memory if filled with user input data, set it to the + //! input tensor + input_tensor->reset(tensor_device.get_memory_ptr(), input_layout); + + //! forward + { + lite::Timer ltimer("warmup"); + network->forward(); + network->wait(); + ltimer.print_used_time(0); + } + lite::Timer ltimer("forward_iter"); + for (int i = 0; i < 10; i++) { + ltimer.reset_start(); + network->forward(); + network->wait(); + ltimer.print_used_time(i); + } + //! get the output data or read tensor set in network_in + size_t output_size = network->get_all_output_name().size(); + output_info(network, output_size); + output_data_info(network, output_size); + return true; +} +#endif +bool lite::example::basic_load_from_path(const Args& args) { + set_log_level(LiteLogLevel::DEBUG); + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! create and load the network + std::shared_ptr network = std::make_shared(); + network->load_model(network_path); + //! set input data to input tensor + std::shared_ptr input_tensor = network->get_input_tensor(0); + + auto layout = input_tensor->get_layout(); + for (size_t i = 0; i < layout.ndim; i++) { + printf("model input shape[%zu]=%zu \n", i, layout.shapes[i]); + } + + //! copy or forward data to network + size_t length = input_tensor->get_tensor_total_size_in_byte(); + void* dst_ptr = input_tensor->get_memory_ptr(); + auto src_tensor = parse_npy(input_path); + auto layout0 = src_tensor->get_layout(); + for (size_t i = 0; i < layout0.ndim; i++) { + printf("src shape[%zu]=%zu \n", i, layout0.shapes[i]); + } + void* src = src_tensor->get_memory_ptr(); + memcpy(dst_ptr, src, length); + + //! forward + { + lite::Timer ltimer("warmup"); + network->forward(); + network->wait(); + ltimer.print_used_time(0); + } + lite::Timer ltimer("forward_iter"); + for (int i = 0; i < 10; i++) { + network->forward(); + network->wait(); + ltimer.print_used_time(i); + } + + //! forward + { + lite::Timer ltimer("warmup"); + network->forward(); + network->wait(); + ltimer.print_used_time(0); + } + for (int i = 0; i < 10; i++) { + ltimer.reset_start(); + network->forward(); + network->wait(); + ltimer.print_used_time(i); + } + + //! get the output data or read tensor set in network_in + size_t output_size = network->get_all_output_name().size(); + output_info(network, output_size); + output_data_info(network, output_size); + return true; +} + +bool lite::example::basic_load_from_path_with_loader(const Args& args) { + set_log_level(LiteLogLevel::DEBUG); + lite::set_loader_lib_path(args.loader_path); + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! create and load the network + std::shared_ptr network = std::make_shared(); + network->load_model(network_path); + + //! set input data to input tensor + std::shared_ptr input_tensor = network->get_input_tensor(0); + + auto input_layout = input_tensor->get_layout(); + + //! 
copy or forward data to network + auto src_tensor = parse_npy(input_path); + auto src_layout = src_tensor->get_layout(); + if (src_layout.ndim != input_layout.ndim) { + printf("src dim is not equal model input dim\n"); + } + //! pay attention the input shape can change + for (size_t i = 0; i < input_layout.ndim; i++) { + if (input_layout.shapes[i] != src_layout.shapes[i]) { + printf("src shape not equal input shape"); + } + } + input_tensor->set_layout(src_tensor->get_layout()); + + //! reset or forward data to network + input_tensor->reset(src_tensor->get_memory_ptr(), src_tensor->get_layout()); + + //! forward + network->forward(); + network->wait(); + + //! forward + { + lite::Timer ltimer("warmup"); + network->forward(); + network->wait(); + ltimer.print_used_time(0); + } + lite::Timer ltimer("forward_iter"); + for (int i = 0; i < 10; i++) { + ltimer.reset_start(); + network->forward(); + network->wait(); + ltimer.print_used_time(i); + } + + //! get the output data or read tensor set in network_in + size_t output_size = network->get_all_output_name().size(); + output_info(network, output_size); + output_data_info(network, output_size); + return true; +} + +bool lite::example::basic_load_from_memory(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! create and load the network + std::shared_ptr network = std::make_shared(); + + FILE* fin = fopen(network_path.c_str(), "rb"); + if (!fin) { + printf("failed to open %s.", network_path.c_str()); + } + + fseek(fin, 0, SEEK_END); + size_t size = ftell(fin); + fseek(fin, 0, SEEK_SET); + void* ptr = malloc(size); + std::shared_ptr buf{ptr, ::free}; + auto len = fread(buf.get(), 1, size, fin); + if (len < 1) { + printf("read file failed.\n"); + } + fclose(fin); + + network->load_model(buf.get(), size); + + //! set input data to input tensor + std::shared_ptr input_tensor = network->get_input_tensor(0); + //! copy or forward data to network + size_t length = input_tensor->get_tensor_total_size_in_byte(); + void* dst_ptr = input_tensor->get_memory_ptr(); + auto src_tensor = parse_npy(input_path); + void* src = src_tensor->get_memory_ptr(); + memcpy(dst_ptr, src, length); + + //! forward + network->forward(); + network->wait(); + + //! get the output data or read tensor set in network_in + std::shared_ptr output_tensor = network->get_output_tensor(0); + void* out_data = output_tensor->get_memory_ptr(); + size_t out_length = output_tensor->get_tensor_total_size_in_byte() / + output_tensor->get_layout().get_elem_size(); + printf("length=%zu\n", length); + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(out_data)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} + +bool lite::example::async_forward(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + Config config; + config.options.var_sanity_check_first_run = false; + + //! create and load the network + std::shared_ptr network = std::make_shared(config); + + network->load_model(network_path); + + //! set input data to input tensor + std::shared_ptr input_tensor = network->get_input_tensor(0); + //! 
copy or forward data to network + size_t length = input_tensor->get_tensor_total_size_in_byte(); + void* dst_ptr = input_tensor->get_memory_ptr(); + auto src_tensor = parse_npy(input_path); + void* src = src_tensor->get_memory_ptr(); + memcpy(dst_ptr, src, length); + + //! set async mode and callback + volatile bool finished = false; + network->set_async_callback([&finished]() { +#if !__DEPLOY_ON_XP_SP2__ + std::cout << "worker thread_id:" << std::this_thread::get_id() + << std::endl; +#endif + finished = true; + }); + +#if !__DEPLOY_ON_XP_SP2__ + std::cout << "out thread_id:" << std::this_thread::get_id() << std::endl; +#endif + + //! forward + network->forward(); + size_t count = 0; + while (finished == false) { + count++; + } + printf("Forward finish, count is %zu\n", count); + + //! get the output data or read tensor set in network_in + std::shared_ptr output_tensor = network->get_output_tensor(0); + void* out_data = output_tensor->get_memory_ptr(); + size_t out_length = output_tensor->get_tensor_total_size_in_byte() / + output_tensor->get_layout().get_elem_size(); + printf("length=%zu\n", length); + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(out_data)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/example/mge/cpu_affinity.cpp b/lite/example/mge/cpu_affinity.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0c740b53327b8e536f0e0b4eadb71b0d99cd1511 --- /dev/null +++ b/lite/example/mge/cpu_affinity.cpp @@ -0,0 +1,69 @@ +/** + * \file example/cpu_affinity.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "../example.h" +#if LITE_BUILD_WITH_MGE + +using namespace lite; +using namespace example; + +bool lite::example::cpu_affinity(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! create and load the network + std::shared_ptr network = std::make_shared(); + + //! run with multi theads + Runtime::set_cpu_threads_number(network, 4); + + network->load_model(network_path); + + std::vector core_ids = {0, 1, 2, 3}; + auto affinity = [core_ids](int id) { + //! add user define affinity function + set_cpu_affinity({core_ids[id]}); + printf("set thread id = %d with the affinity of core %d.\n", id, + core_ids[id]); + }; + Runtime::set_runtime_thread_affinity(network, affinity); + + //! set input data to input tensor + std::shared_ptr input_tensor = network->get_input_tensor(0); + //! copy or forward data to network + size_t length = input_tensor->get_tensor_total_size_in_byte(); + void* dst_ptr = input_tensor->get_memory_ptr(); + auto src_tensor = parse_npy(input_path); + void* src = src_tensor->get_memory_ptr(); + memcpy(dst_ptr, src, length); + + //! forward + network->forward(); + network->wait(); + + //! 
get the output data or read tensor set in network_in + std::shared_ptr output_tensor = network->get_output_tensor(0); + void* out_data = output_tensor->get_memory_ptr(); + size_t out_length = output_tensor->get_tensor_total_size_in_byte() / + output_tensor->get_layout().get_elem_size(); + printf("length=%zu\n", length); + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(out_data)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/example/mge/device_io.cpp b/lite/example/mge/device_io.cpp new file mode 100644 index 0000000000000000000000000000000000000000..321bf388bd15ca377824803d8b4f9541db8780e1 --- /dev/null +++ b/lite/example/mge/device_io.cpp @@ -0,0 +1,189 @@ +/** + * \file example/device_io.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include +#include "../example.h" +#if LITE_BUILD_WITH_MGE + +using namespace lite; +using namespace example; + +#if LITE_WITH_CUDA + +bool lite::example::device_input(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! config the network running in CUDA device + lite::Config config{LiteDeviceType::LITE_CUDA}; + + //! set NetworkIO + NetworkIO network_io; + std::string input_name = "data"; + bool is_host = false; + IO device_input{input_name, is_host}; + network_io.inputs.push_back(device_input); + + //! create and load the network + std::shared_ptr network = + std::make_shared(config, network_io); + network->load_model(network_path); + + std::shared_ptr input_tensor = network->get_input_tensor(0); + Layout input_layout = input_tensor->get_layout(); + + //! read data from numpy data file + auto src_tensor = parse_npy(input_path); + + //! malloc the device memory + auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout); + + //! copy to the device memory + tensor_device.copy_from(*src_tensor); + + //! Now the device memory if filled with user input data, set it to the + //! input tensor + input_tensor->reset(tensor_device.get_memory_ptr(), input_layout); + + //! forward + network->forward(); + network->wait(); + + //! get the output data or read tensor set in network_in + std::shared_ptr output_tensor = network->get_output_tensor(0); + void* out_data = output_tensor->get_memory_ptr(); + size_t out_length = output_tensor->get_tensor_total_size_in_byte() / + output_tensor->get_layout().get_elem_size(); + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(out_data)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} + +bool lite::example::device_input_output(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! config the network running in CUDA device + lite::Config config{LiteDeviceType::LITE_CUDA}; + + //! 
set NetworkIO include input and output + NetworkIO network_io; + std::string input_name = "data"; + std::string output_name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]"; + bool is_host = false; + IO device_input{input_name, is_host}; + IO device_output{output_name, is_host}; + network_io.inputs.push_back(device_input); + network_io.outputs.push_back(device_output); + + //! create and load the network + std::shared_ptr network = + std::make_shared(config, network_io); + network->load_model(network_path); + + std::shared_ptr input_tensor_device = network->get_input_tensor(0); + Layout input_layout = input_tensor_device->get_layout(); + + //! read data from numpy data file + auto src_tensor = parse_npy(input_path); + + //! malloc the device memory + auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout); + + //! copy to the device memory + tensor_device.copy_from(*src_tensor); + + //! Now the device memory is filled with user input data, set it to the + //! input tensor + input_tensor_device->reset(tensor_device.get_memory_ptr(), input_layout); + + //! forward + network->forward(); + network->wait(); + + //! output is in device, should copy it to host + std::shared_ptr output_tensor_device = + network->get_io_tensor(output_name); + + auto output_tensor = std::make_shared(); + output_tensor->copy_from(*output_tensor_device); + + //! get the output data or read tensor set in network_in + void* out_data = output_tensor->get_memory_ptr(); + size_t out_length = output_tensor->get_tensor_total_size_in_byte() / + output_tensor->get_layout().get_elem_size(); + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(out_data)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} + +bool lite::example::pinned_host_input(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! config the network running in CUDA device + lite::Config config{LiteDeviceType::LITE_CUDA}; + + //! create and load the network + std::shared_ptr network = std::make_shared(config); + network->load_model(network_path); + + std::shared_ptr input_tensor = network->get_input_tensor(0); + Layout input_layout = input_tensor->get_layout(); + + //! read data from numpy data file + auto src_tensor = parse_npy(input_path); + //! malloc the pinned host memory + bool is_pinned_host = true; + auto tensor_pinned_input = + Tensor(LiteDeviceType::LITE_CUDA, input_layout, is_pinned_host); + //! copy to the pinned memory + tensor_pinned_input.copy_from(*src_tensor); + //! set the pinned host memory to the network as input + input_tensor->reset(tensor_pinned_input.get_memory_ptr(), input_layout); + + //! forward + network->forward(); + network->wait(); + + //! 
get the output data or read tensor set in network_in + std::shared_ptr output_tensor = network->get_output_tensor(0); + void* out_data = output_tensor->get_memory_ptr(); + size_t out_length = output_tensor->get_tensor_total_size_in_byte() / + output_tensor->get_layout().get_elem_size(); + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(out_data)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} + +#endif +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/example/mge/lite_c_interface.cpp b/lite/example/mge/lite_c_interface.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2a917877a5a213c63e9be48fd16795a60c508878 --- /dev/null +++ b/lite/example/mge/lite_c_interface.cpp @@ -0,0 +1,224 @@ +/** + * \file example/basic_c_interface.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "../example.h" +#include "misc.h" +#if LITE_BUILD_WITH_MGE +#include "lite-c/global_c.h" +#include "lite-c/network_c.h" +#include "lite-c/tensor_c.h" + +#include + +#define LITE_CAPI_CHECK(_expr) \ + do { \ + int _ret = (_expr); \ + if (_ret) { \ + LITE_THROW(LITE_get_last_error()); \ + } \ + } while (0) + +bool basic_c_interface(const lite::example::Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! read input data to lite::tensor + auto src_tensor = lite::example::parse_npy(input_path); + void* src_ptr = src_tensor->get_memory_ptr(); + + //! create and load the network + LiteNetwork c_network; + LITE_CAPI_CHECK( + LITE_make_network(&c_network, *default_config(), *default_network_io())); + + LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, network_path.c_str())); + + //! set input data to input tensor + LiteTensor c_input_tensor; + LITE_CAPI_CHECK( + LITE_get_io_tensor(c_network, "data", LITE_IO, &c_input_tensor)); + void* dst_ptr; + size_t length_in_byte; + LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_input_tensor, + &length_in_byte)); + LITE_CAPI_CHECK(LITE_get_tensor_memory(c_input_tensor, &dst_ptr)); + //! copy or forward data to network + memcpy(dst_ptr, src_ptr, length_in_byte); + + //! forward + LITE_CAPI_CHECK(LITE_forward(c_network)); + LITE_CAPI_CHECK(LITE_wait(c_network)); + + //! get the output data or read tensor data + const char* output_name; + LiteTensor c_output_tensor; + //! get the first output tensor name + LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name)); + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO, + &c_output_tensor)); + void* output_ptr; + size_t length_output_in_byte; + LITE_CAPI_CHECK(LITE_get_tensor_memory(c_output_tensor, &output_ptr)); + LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_output_tensor, + &length_output_in_byte)); + + size_t out_length = length_output_in_byte / sizeof(float); + printf("length=%zu\n", out_length); + + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(output_ptr)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} + +bool device_io_c_interface(const lite::example::Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! 
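//! Each pure C entry point returns an int status code: zero means success,
//! any non-zero value means the call failed and LITE_get_last_error() yields
//! the corresponding message. The LITE_CAPI_CHECK macro defined above turns
//! such failures into exceptions so the example stops at the first error.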
read input data to lite::tensor + auto src_tensor = lite::example::parse_npy(input_path); + void* src_ptr = src_tensor->get_memory_ptr(); + size_t length_read_in = src_tensor->get_tensor_total_size_in_byte(); + + //! create and load the network + LiteNetwork c_network; + LITE_CAPI_CHECK( + LITE_make_network(&c_network, *default_config(), *default_network_io())); + LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, network_path.c_str())); + + //! set input data to input tensor + LiteTensor c_input_tensor; + size_t length_tensor_in; + LITE_CAPI_CHECK( + LITE_get_io_tensor(c_network, "data", LITE_IO, &c_input_tensor)); + LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_input_tensor, + &length_tensor_in)); + if (length_read_in != length_tensor_in) { + LITE_THROW("The input data size is not match the network input tensro " + "size,\n"); + } + LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, src_ptr, + length_tensor_in)); + + //! reset the output tensor memory with user allocated memory + size_t out_length = 1000; + LiteLayout output_layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}; + std::shared_ptr ptr(new float[out_length], + [](float* ptr) { delete[] ptr; }); + const char* output_name; + LiteTensor c_output_tensor; + LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name)); + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO, + &c_output_tensor)); + LITE_CAPI_CHECK( + LITE_reset_tensor(c_output_tensor, output_layout, ptr.get())); + + //! forward + LITE_CAPI_CHECK(LITE_forward(c_network)); + LITE_CAPI_CHECK(LITE_wait(c_network)); + + printf("length=%zu\n", out_length); + + float max = -1.0f; + float sum = 0.0f; + void* out_data = ptr.get(); + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(out_data)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} + +namespace { +volatile bool finished = false; +int async_callback(void) { +#if !__DEPLOY_ON_XP_SP2__ + std::cout << "worker thread_id:" << std::this_thread::get_id() << std::endl; +#endif + finished = true; + return 0; +} +} // namespace + +bool async_c_interface(const lite::example::Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! read input data to lite::tensor + auto src_tensor = lite::example::parse_npy(input_path); + void* src_ptr = src_tensor->get_memory_ptr(); + + LiteNetwork c_network; + LiteConfig config = *default_config(); + config.options.var_sanity_check_first_run = false; + LITE_CAPI_CHECK(LITE_make_network(&c_network, config, *default_network_io())); + LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, network_path.c_str())); + + //! set input data to input tensor + LiteTensor c_input_tensor; + size_t length_tensor_in; + LITE_CAPI_CHECK( + LITE_get_io_tensor(c_network, "data", LITE_IO, &c_input_tensor)); + LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_input_tensor, + &length_tensor_in)); + LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, src_ptr, + length_tensor_in)); + +#if !__DEPLOY_ON_XP_SP2__ + std::cout << "user thread_id:" << std::this_thread::get_id() << std::endl; +#endif + + LITE_CAPI_CHECK(LITE_set_async_callback(c_network, async_callback)); + //! forward + LITE_CAPI_CHECK(LITE_forward(c_network)); + size_t count = 0; + while (finished == false) { + count++; + } + printf("The count is %zu\n", count); + finished = false; + + //! 
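//! The spin loop above merely demonstrates that the calling thread keeps
//! running while the worker thread executes the network; a real application
//! would do useful work here instead of busy-waiting on the flag.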
get the output data or read tensor data + const char* output_name; + LiteTensor c_output_tensor; + //! get the first output tensor name + LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name)); + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO, + &c_output_tensor)); + void* output_ptr; + size_t length_output_in_byte; + LITE_CAPI_CHECK(LITE_get_tensor_memory(c_output_tensor, &output_ptr)); + LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_output_tensor, + &length_output_in_byte)); + + size_t out_length = length_output_in_byte / sizeof(float); + printf("length=%zu\n", out_length); + + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(output_ptr)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} +#endif +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/example/mge/network_share_weights.cpp b/lite/example/mge/network_share_weights.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d2b6e741fffa9c7760453aa5650e79412cc55a57 --- /dev/null +++ b/lite/example/mge/network_share_weights.cpp @@ -0,0 +1,78 @@ +/** + * \file example/network_share_weights.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "../example.h" +#if LITE_BUILD_WITH_MGE + +using namespace lite; +using namespace example; + +bool lite::example::network_share_same_weights(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! create and load the network + std::shared_ptr network = std::make_shared(); + network->load_model(network_path); + + //! load a new network from the created network and share the same weights, + Config config_new; + config_new.options.const_shape = true; + NetworkIO network_io_new; + std::shared_ptr weight_shared_network = + std::make_shared(config_new, network_io_new); + Runtime::shared_weight_with_network(weight_shared_network, network); + + //! set input data to input tensor + std::shared_ptr input_tensor = network->get_input_tensor(0); + void* dst_ptr = input_tensor->get_memory_ptr(); + std::shared_ptr input_tensor2 = + weight_shared_network->get_input_tensor(0); + void* dst_ptr2 = input_tensor2->get_memory_ptr(); + //! copy or forward data to network + size_t length = input_tensor->get_tensor_total_size_in_byte(); + auto src_tensor = parse_npy(input_path); + void* src = src_tensor->get_memory_ptr(); + memcpy(dst_ptr, src, length); + memcpy(dst_ptr2, src, length); + + //! forward + network->forward(); + network->wait(); + + weight_shared_network->forward(); + weight_shared_network->wait(); + + //! 
get the output data or read tensor set in network_in + std::shared_ptr output_tensor = network->get_output_tensor(0); + std::shared_ptr output_tensor2 = + weight_shared_network->get_output_tensor(0); + void* out_data = output_tensor->get_memory_ptr(); + void* out_data2 = output_tensor2->get_memory_ptr(); + size_t out_length = output_tensor->get_tensor_total_size_in_byte() / + output_tensor->get_layout().get_elem_size(); + printf("length=%zu\n", length); + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(out_data)[i]; + float data2 = static_cast(out_data2)[i]; + if (data != data2) { + printf("the result between the origin network and weight share " + "netwrok is different.\n"); + } + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} +#endif +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/example/mge/reset_io.cpp b/lite/example/mge/reset_io.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d95d834ad31b0303a1295eb7bd59637829c794ac --- /dev/null +++ b/lite/example/mge/reset_io.cpp @@ -0,0 +1,95 @@ +/** + * \file example/reset_io.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "../example.h" +#if LITE_BUILD_WITH_MGE + +using namespace lite; +using namespace example; + +bool lite::example::reset_input(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + lite::Config config; + + //! create and load the network + std::shared_ptr network = std::make_shared(config); + network->load_model(network_path); + + //! set input data to input tensor + std::shared_ptr input_tensor = network->get_input_tensor(0); + auto layout = input_tensor->get_layout(); + + auto src_tensor = parse_npy(input_path); + void* src = src_tensor->get_memory_ptr(); + input_tensor->reset(src, layout); + + //! forward + network->forward(); + network->wait(); + + //! 6. get the output data or read tensor set in network_in + std::shared_ptr output_tensor = network->get_output_tensor(0); + void* out_data = output_tensor->get_memory_ptr(); + size_t out_length = output_tensor->get_tensor_total_size_in_byte() / + output_tensor->get_layout().get_elem_size(); + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(out_data)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} + +bool lite::example::reset_input_output(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + lite::Config config; + + //! create and load the network + std::shared_ptr network = std::make_shared(config); + network->load_model(network_path); + + //! set input data to input tensor + std::shared_ptr input_tensor = network->get_input_tensor(0); + auto layout = input_tensor->get_layout(); + + auto src_tensor = parse_npy(input_path); + void* src = src_tensor->get_memory_ptr(); + input_tensor->reset(src, layout); + + //! 
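//! Memory handed to a tensor via reset() stays owned by the caller: the
//! tensor neither frees it nor manages its lifetime, so the buffer must stay
//! valid for as long as forward() reads from or writes into it.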
set output ptr to store the network output + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + network->forward(); + network->wait(); + + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < 1000; i++) { + float data = static_cast(out_data)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} +#endif +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/example/mge/user_allocator.cpp b/lite/example/mge/user_allocator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2fd76e6b6bd1ff8853e3e6d8b19ab3bef42f018b --- /dev/null +++ b/lite/example/mge/user_allocator.cpp @@ -0,0 +1,89 @@ +/** + * \file example/user_allocator.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "../example.h" +#if LITE_BUILD_WITH_MGE +using namespace lite; +using namespace example; + +namespace { +class CheckAllocator : public lite::Allocator { +public: + //! allocate memory of size in the given device with the given align + void* allocate(LiteDeviceType, int, size_t size, size_t align) override { +#ifdef WIN32 + return _aligned_malloc(size, align); +#elif defined(__ANDROID__) || defined(ANDROID) + return memalign(align, size); +#else + void* ptr = nullptr; + auto err = posix_memalign(&ptr, align, size); + if (!err) { + printf("failed to malloc %zu bytes with align %zu", size, align); + } + return ptr; +#endif + }; + + //! free the memory pointed by ptr in the given device + void free(LiteDeviceType, int, void* ptr) override { +#ifdef WIN32 + _aligned_free(ptr); +#else + ::free(ptr); +#endif + }; +}; +} // namespace + +bool lite::example::config_user_allocator(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + auto allocator = std::make_shared(); + + //! create and load the network + std::shared_ptr network = std::make_shared(); + + Runtime::set_memory_allocator(network, allocator); + + network->load_model(network_path); + + //! set input data to input tensor + std::shared_ptr input_tensor = network->get_input_tensor(0); + //! copy or forward data to network + size_t length = input_tensor->get_tensor_total_size_in_byte(); + void* dst_ptr = input_tensor->get_memory_ptr(); + auto src_tensor = parse_npy(input_path); + void* src = src_tensor->get_memory_ptr(); + memcpy(dst_ptr, src, length); + + //! forward + network->forward(); + network->wait(); + + //! 
get the output data or read tensor set in network_in + std::shared_ptr output_tensor = network->get_output_tensor(0); + void* out_data = output_tensor->get_memory_ptr(); + size_t out_length = output_tensor->get_tensor_total_size_in_byte() / + output_tensor->get_layout().get_elem_size(); + printf("length=%zu\n", length); + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(out_data)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} +#endif +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/example/mge/user_cryption.cpp b/lite/example/mge/user_cryption.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1b6f2f3343255d89ab1ad4fda81b70b8691c1652 --- /dev/null +++ b/lite/example/mge/user_cryption.cpp @@ -0,0 +1,122 @@ +/** + * \file example/user_cryption.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "../example.h" +#if LITE_BUILD_WITH_MGE + +using namespace lite; +using namespace example; + +namespace { +std::vector decrypt_model(const void* model_mem, size_t size, + const std::vector& key) { + if (key.size() == 1) { + std::vector ret(size, 0); + const uint8_t* ptr = static_cast(model_mem); + uint8_t key_data = key[0]; + for (size_t i = 0; i < size; i++) { + ret[i] = ptr[i] ^ key_data ^ key_data; + } + return ret; + } else { + printf("the user define decrypt method key length is wrong.\n"); + return {}; + } +} +} // namespace + +bool lite::example::register_cryption_method(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! register the decryption method + register_decryption_and_key("just_for_test", decrypt_model, {15}); + + lite::Config config; + config.bare_model_cryption_name = "just_for_test"; + //! create and load the network + std::shared_ptr network = std::make_shared(config); + network->load_model(network_path); + + //! set input data to input tensor + std::shared_ptr input_tensor = network->get_input_tensor(0); + auto layout = input_tensor->get_layout(); + + auto src_tensor = parse_npy(input_path); + void* src = src_tensor->get_memory_ptr(); + input_tensor->reset(src, layout); + + //! forward + network->forward(); + network->wait(); + + //! get the output data or read tensor set in network_in + std::shared_ptr output_tensor = network->get_output_tensor(0); + void* out_data = output_tensor->get_memory_ptr(); + size_t out_length = output_tensor->get_tensor_total_size_in_byte() / + output_tensor->get_layout().get_elem_size(); + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(out_data)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} + +bool lite::example::update_cryption_key(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! update the decryption method key + std::vector key(32, 0); + for (size_t i = 0; i < 32; i++) { + key[i] = 31 - i; + } + update_decryption_or_key("AES_default", nullptr, key); + + lite::Config config; + config.bare_model_cryption_name = "AES_default"; + //! create and load the network + std::shared_ptr network = std::make_shared(config); + network->load_model(network_path); + + //! 
set input data to input tensor + std::shared_ptr input_tensor = network->get_input_tensor(0); + auto layout = input_tensor->get_layout(); + + auto src_tensor = parse_npy(input_path); + void* src = src_tensor->get_memory_ptr(); + input_tensor->reset(src, layout); + + //! forward + network->forward(); + network->wait(); + + //! get the output data or read tensor set in network_in + std::shared_ptr output_tensor = network->get_output_tensor(0); + void* out_data = output_tensor->get_memory_ptr(); + size_t out_length = output_tensor->get_tensor_total_size_in_byte() / + output_tensor->get_layout().get_elem_size(); + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(out_data)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} +#endif +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/example/npy.h b/lite/example/npy.h new file mode 100644 index 0000000000000000000000000000000000000000..552cda78f7cc203624b58a16ec0213bc65735b60 --- /dev/null +++ b/lite/example/npy.h @@ -0,0 +1,638 @@ +/* + Copyright 2017 Leon Merten Lohse + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#ifndef NPY_H +#define NPY_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace npy { + +/* Compile-time test for byte order. + If your compiler does not define these per default, you may want to define + one of these constants manually. + Defaults to little endian order. */ +#if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN || \ + defined(__BIG_ENDIAN__) || defined(__ARMEB__) || \ + defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(_MIBSEB) || \ + defined(__MIBSEB) || defined(__MIBSEB__) +const bool big_endian = true; +#else +const bool big_endian = false; +#endif + +const char magic_string[] = "\x93NUMPY"; +const size_t magic_string_length = 6; + +const char little_endian_char = '<'; +const char big_endian_char = '>'; +const char no_endian_char = '|'; + +constexpr char host_endian_char = + (big_endian ? 
big_endian_char : little_endian_char); + +/* npy array length */ +typedef unsigned long int ndarray_len_t; + +inline void write_magic(std::ostream& ostream, unsigned char v_major = 1, + unsigned char v_minor = 0) { + ostream.write(magic_string, magic_string_length); + ostream.put(v_major); + ostream.put(v_minor); +} + +inline void read_magic(std::istream& istream, unsigned char& v_major, + unsigned char& v_minor) { + char buf[magic_string_length + 2]; + istream.read(buf, magic_string_length + 2); + + if (!istream) { + fprintf(stderr, "io error: failed reading file"); + } + + if (0 != std::memcmp(buf, magic_string, magic_string_length)) { + fprintf(stderr, "this file does not have a valid npy format."); + } + + v_major = buf[magic_string_length]; + v_minor = buf[magic_string_length + 1]; +} + +// typestring magic +struct Typestring { +private: + char c_endian; + char c_type; + int len; + +public: + inline std::string str() { + const size_t max_buflen = 16; + char buf[max_buflen]; + std::sprintf(buf, "%c%c%u", c_endian, c_type, len); + return std::string(buf); + } + + Typestring(const std::vector&) + : c_endian{host_endian_char}, c_type{'f'}, len{sizeof(float)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, c_type{'f'}, len{sizeof(double)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, + c_type{'f'}, + len{sizeof(long double)} {} + + Typestring(const std::vector&) + : c_endian{no_endian_char}, c_type{'i'}, len{sizeof(char)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(short)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(int)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(long)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(long long)} {} + + Typestring(const std::vector&) + : c_endian{no_endian_char}, + c_type{'u'}, + len{sizeof(unsigned char)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, + c_type{'u'}, + len{sizeof(unsigned short)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, + c_type{'u'}, + len{sizeof(unsigned int)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, + c_type{'u'}, + len{sizeof(unsigned long)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, + c_type{'u'}, + len{sizeof(unsigned long long)} {} + + Typestring(const std::vector>&) + : c_endian{host_endian_char}, + c_type{'c'}, + len{sizeof(std::complex)} {} + Typestring(const std::vector>&) + : c_endian{host_endian_char}, + c_type{'c'}, + len{sizeof(std::complex)} {} + Typestring(const std::vector>&) + : c_endian{host_endian_char}, + c_type{'c'}, + len{sizeof(std::complex)} {} +}; + +inline void parse_typestring(std::string typestring) { + std::regex re("'([<>|])([ifuc])(\\d+)'"); + std::smatch sm; + + std::regex_match(typestring, sm, re); + + if (sm.size() != 4) { + fprintf(stderr, "invalid typestring"); + } +} + +namespace pyparse { + +/** + Removes leading and trailing whitespaces + */ +inline std::string trim(const std::string& str) { + const std::string whitespace = " \t"; + auto begin = str.find_first_not_of(whitespace); + + if (begin == std::string::npos) + return ""; + + auto end = str.find_last_not_of(whitespace); + + return str.substr(begin, end - begin + 1); +} + +inline std::string get_value_from_map(const std::string& mapstr) { + size_t sep_pos = mapstr.find_first_of(":"); + if (sep_pos == std::string::npos) 
+ return ""; + + std::string tmp = mapstr.substr(sep_pos + 1); + return trim(tmp); +} + +/** + Parses the string representation of a Python dict + + The keys need to be known and may not appear anywhere else in the data. + */ +inline std::unordered_map parse_dict( + std::string in, std::vector& keys) { + std::unordered_map map; + + if (keys.size() == 0) + return map; + + in = trim(in); + + // unwrap dictionary + if ((in.front() == '{') && (in.back() == '}')) + in = in.substr(1, in.length() - 2); + else { + fprintf(stderr, "Not a Python dictionary."); + } + + std::vector> positions; + + for (auto const& value : keys) { + size_t pos = in.find("'" + value + "'"); + + if (pos == std::string::npos) { + fprintf(stderr, "Missing %s key.", value.c_str()); + } + + std::pair position_pair{pos, value}; + positions.push_back(position_pair); + } + + // sort by position in dict + std::sort(positions.begin(), positions.end()); + + for (size_t i = 0; i < positions.size(); ++i) { + std::string raw_value; + size_t begin{positions[i].first}; + size_t end{std::string::npos}; + + std::string key = positions[i].second; + + if (i + 1 < positions.size()) + end = positions[i + 1].first; + + raw_value = in.substr(begin, end - begin); + + raw_value = trim(raw_value); + + if (raw_value.back() == ',') + raw_value.pop_back(); + + map[key] = get_value_from_map(raw_value); + } + + return map; +} + +/** + Parses the string representation of a Python boolean + */ +inline bool parse_bool(const std::string& in) { + if (in == "True") + return true; + if (in == "False") + return false; + + fprintf(stderr, "Invalid python boolan."); + return false; +} + +/** + Parses the string representation of a Python str + */ +inline std::string parse_str(const std::string& in) { + if ((in.front() == '\'') && (in.back() == '\'')) + return in.substr(1, in.length() - 2); + + fprintf(stderr, "Invalid python string."); + return ""; +} + +/** + Parses the string represenatation of a Python tuple into a vector of its items + */ +inline std::vector parse_tuple(std::string in) { + std::vector v; + const char seperator = ','; + + in = trim(in); + + if ((in.front() == '(') && (in.back() == ')')) + in = in.substr(1, in.length() - 2); + else { + fprintf(stderr, "Invalid Python tuple."); + } + + std::istringstream iss(in); + + for (std::string token; std::getline(iss, token, seperator);) { + v.push_back(token); + } + + return v; +} + +template +inline std::string write_tuple(const std::vector& v) { + if (v.size() == 0) + return ""; + + std::ostringstream ss; + + if (v.size() == 1) { + ss << "(" << v.front() << ",)"; + } else { + const std::string delimiter = ", "; + // v.size() > 1 + ss << "("; + std::copy(v.begin(), v.end() - 1, + std::ostream_iterator(ss, delimiter.c_str())); + ss << v.back(); + ss << ")"; + } + + return ss.str(); +} + +inline std::string write_boolean(bool b) { + if (b) + return "True"; + else + return "False"; +} + +} // namespace pyparse + +inline void parse_header(std::string header, std::string& descr) { + /* + The first 6 bytes are a magic string: exactly "x93NUMPY". + The next 1 byte is an unsigned byte: the major version number of the file + format, e.g. x01. The next 1 byte is an unsigned byte: the minor version + number of the file format, e.g. x00. Note: the version of the file format + is not tied to the version of the numpy package. The next 2 bytes form a + little-endian unsigned short int: the length of the header data + HEADER_LEN. The next HEADER_LEN bytes form the header data describing the + array's format. 
It is an ASCII string which contains a Python literal + expression of a dictionary. It is terminated by a newline ('n') and + padded with spaces + ('x20') to make the total length of the magic string + 4 + HEADER_LEN be + evenly divisible by 16 for alignment purposes. The dictionary contains + three keys: + + "descr" : dtype.descr + An object that can be passed as an argument to the numpy.dtype() + constructor to create the array's dtype. For repeatability and + readability, this dictionary is formatted using pprint.pformat() so the + keys are in alphabetic order. + */ + + // remove trailing newline + if (header.back() != '\n') + fprintf(stderr, "invalid header"); + header.pop_back(); + + // parse the dictionary + std::vector keys{"descr"}; + auto dict_map = npy::pyparse::parse_dict(header, keys); + + if (dict_map.size() == 0) + fprintf(stderr, "invalid dictionary in header"); + + std::string descr_s = dict_map["descr"]; + parse_typestring(descr_s); + // remove + descr = npy::pyparse::parse_str(descr_s); + return; +} + +inline void parse_header(std::string header, std::string& descr, + bool& fortran_order, + std::vector& shape) { + /* + The first 6 bytes are a magic string: exactly "x93NUMPY". + The next 1 byte is an unsigned byte: the major version number of the file + format, e.g. x01. The next 1 byte is an unsigned byte: the minor version + number of the file format, e.g. x00. Note: the version of the file format + is not tied to the version of the numpy package. The next 2 bytes form a + little-endian unsigned short int: the length of the header data + HEADER_LEN. The next HEADER_LEN bytes form the header data describing the + array's format. It is an ASCII string which contains a Python literal + expression of a dictionary. It is terminated by a newline ('n') and + padded with spaces + ('x20') to make the total length of the magic string + 4 + HEADER_LEN be + evenly divisible by 16 for alignment purposes. The dictionary contains + three keys: + + "descr" : dtype.descr + An object that can be passed as an argument to the numpy.dtype() + constructor to create the array's dtype. "fortran_order" : bool Whether + the array data is Fortran-contiguous or not. Since Fortran-contiguous + arrays are a common form of non-C-contiguity, we allow them to be written + directly to disk for efficiency. "shape" : tuple of int The shape of the + array. For repeatability and readability, this dictionary is formatted + using pprint.pformat() so the keys are in alphabetic order. 
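For example, a version 1.0 header for a C-contiguous float32 array of shape (1, 3, 224, 224) would carry the dictionary literal {'descr': '<f4', 'fortran_order': False, 'shape': (1, 3, 224, 224), } followed by space padding and the terminating newline (an illustrative value for this description, not taken from any file in this repository).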
+ */ + + // remove trailing newline + if (header.back() != '\n') + fprintf(stderr, "invalid header"); + header.pop_back(); + + // parse the dictionary + std::vector keys{"descr", "fortran_order", "shape"}; + auto dict_map = npy::pyparse::parse_dict(header, keys); + + if (dict_map.size() == 0) + fprintf(stderr, "invalid dictionary in header"); + + std::string descr_s = dict_map["descr"]; + std::string fortran_s = dict_map["fortran_order"]; + std::string shape_s = dict_map["shape"]; + + // TODO: extract info from typestring + parse_typestring(descr_s); + // remove + descr = npy::pyparse::parse_str(descr_s); + + // convert literal Python bool to C++ bool + fortran_order = npy::pyparse::parse_bool(fortran_s); + + // parse the shape tuple + auto shape_v = npy::pyparse::parse_tuple(shape_s); + if (shape_v.size() == 0) + fprintf(stderr, "invalid shape tuple in header"); + + for (auto item : shape_v) { + ndarray_len_t dim = static_cast(std::stoul(item)); + shape.push_back(dim); + } +} + +inline std::string write_header_dict(const std::string& descr, + bool fortran_order, + const std::vector& shape) { + std::string s_fortran_order = npy::pyparse::write_boolean(fortran_order); + std::string shape_s = npy::pyparse::write_tuple(shape); + + return "{'descr': '" + descr + "', 'fortran_order': " + s_fortran_order + + ", 'shape': " + shape_s + ", }"; +} + +inline void write_header(std::ostream& out, const std::string& descr, + bool fortran_order, + const std::vector& shape_v) { + std::string header_dict = write_header_dict(descr, fortran_order, shape_v); + + size_t length = magic_string_length + 2 + 2 + header_dict.length() + 1; + + unsigned char version[2] = {1, 0}; + if (length >= 255 * 255) { + length = magic_string_length + 2 + 4 + header_dict.length() + 1; + version[0] = 2; + version[1] = 0; + } + size_t padding_len = 16 - length % 16; + std::string padding(padding_len, ' '); + + // write magic + write_magic(out, version[0], version[1]); + + // write header length + if (version[0] == 1 && version[1] == 0) { + char header_len_le16[2]; + uint16_t header_len = static_cast(header_dict.length() + + padding.length() + 1); + + header_len_le16[0] = (header_len >> 0) & 0xff; + header_len_le16[1] = (header_len >> 8) & 0xff; + out.write(reinterpret_cast(header_len_le16), 2); + } else { + char header_len_le32[4]; + uint32_t header_len = static_cast(header_dict.length() + + padding.length() + 1); + + header_len_le32[0] = (header_len >> 0) & 0xff; + header_len_le32[1] = (header_len >> 8) & 0xff; + header_len_le32[2] = (header_len >> 16) & 0xff; + header_len_le32[3] = (header_len >> 24) & 0xff; + out.write(reinterpret_cast(header_len_le32), 4); + } + + out << header_dict << padding << '\n'; +} + +inline std::string read_header(std::istream& istream) { + // check magic bytes an version number + unsigned char v_major, v_minor; + read_magic(istream, v_major, v_minor); + + uint32_t header_length = 0; + if (v_major == 1 && v_minor == 0) { + char header_len_le16[2]; + istream.read(header_len_le16, 2); + header_length = (header_len_le16[0] << 0) | (header_len_le16[1] << 8); + + if ((magic_string_length + 2 + 2 + header_length) % 16 != 0) { + // TODO: display warning + } + } else if (v_major == 2 && v_minor == 0) { + char header_len_le32[4]; + istream.read(header_len_le32, 4); + + header_length = (header_len_le32[0] << 0) | (header_len_le32[1] << 8) | + (header_len_le32[2] << 16) | (header_len_le32[3] << 24); + + if ((magic_string_length + 2 + 4 + header_length) % 16 != 0) { + // TODO: display warning + } + } else { + 
fprintf(stderr, "unsupported file format version"); + } + + auto buf_v = std::vector(); + buf_v.reserve(header_length); + istream.read(buf_v.data(), header_length); + std::string header(buf_v.data(), header_length); + + return header; +} + +inline ndarray_len_t comp_size(const std::vector& shape) { + ndarray_len_t size = 1; + for (ndarray_len_t i : shape) + size *= i; + + return size; +} + +template +inline void SaveArrayAsNumpy(const std::string& filename, bool fortran_order, + unsigned int n_dims, const unsigned long shape[], + const std::vector& data) { + Typestring typestring_o(data); + std::string typestring = typestring_o.str(); + + std::ofstream stream(filename, std::ofstream::binary); + if (!stream) { + fprintf(stderr, "io error: failed to open a file."); + } + + std::vector shape_v(shape, shape + n_dims); + write_header(stream, typestring, fortran_order, shape_v); + + auto size = static_cast(comp_size(shape_v)); + + stream.write(reinterpret_cast(data.data()), + sizeof(Scalar) * size); +} + +template +inline void LoadArrayFromNumpy(const std::string& filename, + std::vector& shape, + std::vector& data) { + bool fortran_order; + LoadArrayFromNumpy(filename, shape, fortran_order, data); +} + +template +inline void LoadArrayFromNumpy(const std::string& filename, + std::vector& shape, + bool& fortran_order, std::vector& data) { + std::ifstream stream(filename, std::ifstream::binary); + if (!stream) { + fprintf(stderr, "io error: failed to open a file."); + } + + std::string header = read_header(stream); + + // parse header + std::string typestr; + + parse_header(header, typestr, fortran_order, shape); + + // check if the typestring matches the given one + Typestring typestring_o{data}; + std::string expect_typestr = typestring_o.str(); + if (typestr != expect_typestr) { + fprintf(stderr, "formatting error: typestrings not matching"); + } + + // compute the data size based on the shape + auto size = static_cast(comp_size(shape)); + data.resize(size); + + // read the data + stream.read(reinterpret_cast(data.data()), sizeof(Scalar) * size); +} + +inline void LoadArrayFromNumpy(const std::string& filename, + std::string& type_str, + std::vector& shape, + std::vector& data) { + std::ifstream stream(filename, std::ifstream::binary); + if (!stream) { + fprintf(stderr, "io error: failed to open a file."); + } + + std::string header = read_header(stream); + bool fortran_order; + // parse header + parse_header(header, type_str, fortran_order, shape); + + // check if the typestring matches the given one + std::string size_str = type_str.substr(type_str.size() - 1); + size_t elem_size = atoi(size_str.c_str()); + + // compute the data size based on the shape + auto byte_size = elem_size * static_cast(comp_size(shape)); + data.resize(byte_size); + + // read the data + stream.read(reinterpret_cast(data.data()), byte_size); +} + +} // namespace npy + +#endif // NPY_H diff --git a/lite/include/lite/common_enum_c.h b/lite/include/lite/common_enum_c.h new file mode 100644 index 0000000000000000000000000000000000000000..ed4db6c5ef6d658b98d82fa906de4859a59cc91a --- /dev/null +++ b/lite/include/lite/common_enum_c.h @@ -0,0 +1,97 @@ +/** + * \file inlude/lite/common_enum_c.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#ifndef LITE_COMMON_ENUM_C_H_ +#define LITE_COMMON_ENUM_C_H_ + +/*! + * \brief The log level. 
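Before the enum definitions continue, here is a minimal usage sketch for the npy.h helpers above; the file name and array contents are illustrative, and the element-type template parameters (elided in the flattened listing above) are assumed to be deduced from the vectors passed in.

```cpp
#include <vector>
#include "npy.h"  // the vendored header above

int main() {
    // save a 2x3 float32 array in C (row-major) order
    std::vector<float> data{0.f, 1.f, 2.f, 3.f, 4.f, 5.f};
    const unsigned long shape[] = {2, 3};
    npy::SaveArrayAsNumpy("example.npy", /*fortran_order=*/false, 2, shape, data);

    // load it back; the shape vector is filled from the parsed header
    std::vector<npy::ndarray_len_t> loaded_shape;
    std::vector<float> loaded;
    npy::LoadArrayFromNumpy("example.npy", loaded_shape, loaded);
    return loaded.size() == 6 ? 0 : 1;
}
```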
+ */ +typedef enum LiteLogLevel { + DEBUG = 0, /*!< The lowest level and most verbose */ + INFO = 1, /*!< The lowest level and most verbose */ + WARN = 2, /*!< Print only warning and errors */ + ERROR = 3, /*!< Print only errors */ +} LiteLogLevel; + +typedef enum LiteBackend { + LITE_DEFAULT = 0, //! default backend is mge +} LiteBackend; + +typedef enum LiteDeviceType { + LITE_CPU = 0, + LITE_CUDA = 1, + LITE_ATLAS = 3, + LITE_NPU = 4, + //! when the device information is set in model, so set LITE_DEVICE_DEFAULT + //! in lite + LITE_DEVICE_DEFAULT = 5, +} LiteDeviceType; + +typedef enum LiteDataType { + LITE_FLOAT = 0, + LITE_HALF = 1, + LITE_INT = 2, + LITE_INT16 = 3, + LITE_INT8 = 4, + LITE_UINT8 = 5, + LITE_UINT = 6, + LITE_UINT16 = 7, + LITE_INT64 = 8, +} LiteCDataType; + +typedef enum LiteTensorPhase { + //! Tensor maybe input or output + LITE_IO = 0, + //! Tensor is input + LITE_INPUT = 1, + //! Tensor is output + LITE_OUTPUT = 2, +} LiteTensorPhase; + +/*! + * \brief the input and output type, include SHAPE and VALUE + * sometimes user only need the shape of the output tensor + */ +typedef enum LiteIOType { + LITE_IO_VALUE = 0, + LITE_IO_SHAPE = 1, +} LiteIOType; + +/*! + * \brief operation algorithm seletion strategy type, some operations have + * multi algorithms, different algorithm has different attribute, according to + * the strategy, the best algorithm will be selected. + * + * Note: These strategies can be combined + * + * 1. LITE_ALGO_HEURISTIC | LITE_ALGO_PROFILE means: if profile cache not valid, + * use heuristic instead + * + * 2. LITE_ALGO_HEURISTIC | LITE_ALGO_REPRODUCIBLE means: heuristic choice the + * reproducible algo + * + * 3. LITE_ALGO_PROFILE | LITE_ALGO_REPRODUCIBLE means: profile the best + * algorithm from the reproducible algorithms set + * + * 4. LITE_ALGO_PROFILE | LITE_ALGO_OPTIMIZED means: profile the best + * algorithm form the optimzed algorithms, thus profile will process fast + * + * 5. LITE_ALGO_PROFILE | LITE_ALGO_OPTIMIZED | LITE_ALGO_REPRODUCIBLE means: + * profile the best algorithm form the optimzed and reproducible algorithms + */ +typedef enum LiteAlgoSelectStrategy { + LITE_ALGO_HEURISTIC = 1 << 0, + LITE_ALGO_PROFILE = 1 << 1, + LITE_ALGO_REPRODUCIBLE = 1 << 2, + LITE_ALGO_OPTIMIZED = 1 << 3, +} LiteAlgoSelectStrategy; + +#endif +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/include/lite/global.h b/lite/include/lite/global.h new file mode 100644 index 0000000000000000000000000000000000000000..2737f6a38f91f41b8766b14338b4a0b0af71186e --- /dev/null +++ b/lite/include/lite/global.h @@ -0,0 +1,157 @@ +/** + * \file inlude/lite/global.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "macro.h" +#include "network.h" + +#include +#include +#include + +namespace lite { + +/** + * \brief Model decryption function + * + * \param[in] const void* is the decrypted model memory pointer + * \param[in] size_t the size the decrypted model memory in byte + * \param[in] const std::vector& the decryption key vector + */ +using DecryptionFunc = std::function( + const void*, size_t, const std::vector&)>; + +/** + * \brief register a custom decryption method and key to lite. + * + * \param[in] decrypt_name the name of the decryption, which will act as the + * hash key to find the decryption method. 
+ * + * \param[in] func the decryption function, which will decrypt the model with + * the registered key, return a vector that contain the decrypted model. + * + * \param[in] key the decryption key of the method + */ +LITE_API bool register_decryption_and_key(std::string decrypt_name, + const DecryptionFunc& func, + const std::vector& key); + +/** + * \brief update decryption function or key of a custom decryption method. + * + * \param[in] decrypt_name the name of the decryption, which will act as the + * hash key to find the decryption method. + * + * \param[in] func the decryption function, which will decrypt the model with + * the registered key, return a vector that contain the decrypted model. if + * function is nullptr, it will not be updated. + * + * \param[in] key the decryption key of the method, if the size of key is zero, + * it will not be updated + */ +LITE_API bool update_decryption_or_key(std::string decrypt_name, + const DecryptionFunc& func, + const std::vector& key); + +/** + * \brief Model information parse function + * + * \param[in] const void* is the information memory + * \param[in] size_t the size the information memory + * \param[in] const std::string the model name used for check whether the + * infomation match the model + * \param[in] Config the model config, ParseInfoFunc can fill it with the + * information in json, the config will influence Network loading later + * \param[in] NetworkIO the model IO, ParseInfoFunc can fill it with the + * information in json, the networkio will influence Network forwarding later + * \param[in] std::unordered_map& isolated_config_map, the + * other config not inclue in config and networkIO, ParseInfoFunc can fill it + * with the information in json, now support: + * "device_id" : int, default 0 + * "number_threads" : size_t, default 1 + * "is_inplace_model" : bool, default false + * "use_tensorrt" : bool, default false + */ +using ParseInfoFunc = std::function& isolated_config_map, + std::string& extra_info)>; + +/** + * \brief register a custom parser function to lite. + * + * \param[in] info_type the name of the parser function, which will act as the + * hash key to find the parser method. + * + * \param[in] parse_func the parser function, which will parse the given + * information and modify the Network Config and IO. + * + */ +LITE_API bool register_parse_info_func(std::string info_type, + const ParseInfoFunc& parse_func); + +/*! \brief Get version + */ +LITE_API void get_version(int& major, int& minor, int& patch); + +/*! \brief Set the current log level. + * \param[in] level The new log level + */ +LITE_API void set_log_level(LiteLogLevel level); + +/*! \brief Get the current log level. + * \return The current log level + */ +LITE_API LiteLogLevel get_log_level(); + +/*! \brief Get device count + * \param[in] device_type device type + * \return the device count + */ +LITE_API size_t get_device_count(LiteDeviceType device_type); + +/*! \brief try to coalesce all free memory in megenine + */ +LITE_API void try_coalesce_all_free_memory(); + +/*! + * \brief Set the loader to the lite + * \param loader_path is the file path which store the cache + */ +LITE_API void set_loader_lib_path(const std::string& loader_path); + +/*! + * \brief Set the algo policy cache file for CPU/CUDA ... + * \param cache_path is the file path which store the cache + * \param always_sync sync the cache when model run + */ +LITE_API void set_persistent_cache(const std::string& cache_path, + bool always_sync = false); + +/*! 
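A minimal sketch of the process-wide helpers declared above; the cache path is a placeholder chosen for illustration, not a path used anywhere in this repository.

```cpp
#include "lite/global.h"

void configure_lite_runtime() {
    int major = 0, minor = 0, patch = 0;
    lite::get_version(major, minor, patch);   // query the linked lite version
    lite::set_log_level(LiteLogLevel::WARN);  // print only warnings and errors

    size_t cuda_devices = lite::get_device_count(LiteDeviceType::LITE_CUDA);
    (void)cuda_devices;

    // persist fast-run algo selections between runs (path is an assumption)
    lite::set_persistent_cache("./lite_algo_cache.bin", /*always_sync=*/false);
}
```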
+ * \brief dump the PersistentCache policy cache to file, if the network is set + * to profile when forward, though this the algo policy will dump to file + */ +LITE_API void dump_persistent_cache(const std::string& cache_path); + +/*! + * \brief Set the TensorRT engine cache path for serialized prebuilt ICudaEngine + */ +LITE_API void set_tensor_rt_cache(std::string tensorrt_cache_path); + +/*! + * \brief dump the TensorRT cache to the file set in set_tensor_rt_cache + */ +LITE_API void dump_tensor_rt_cache(); + +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/include/lite/macro.h b/lite/include/lite/macro.h new file mode 100644 index 0000000000000000000000000000000000000000..7f3dc91e34acc304f9c7414038c5d0baaa78dba8 --- /dev/null +++ b/lite/include/lite/macro.h @@ -0,0 +1,20 @@ +/** + * \file include/lite/macro.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#ifndef LITE_MACRO_H_ +#define LITE_MACRO_H_ + +#if defined(_WIN32) +#define LITE_API __declspec(dllexport) +#else +#define LITE_API __attribute__((visibility("default"))) +#endif +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/include/lite/network.h b/lite/include/lite/network.h new file mode 100644 index 0000000000000000000000000000000000000000..2082d81489ad8cb930e551efd0232fb54ac226c4 --- /dev/null +++ b/lite/include/lite/network.h @@ -0,0 +1,368 @@ +/** + * \file inlude/lite/network.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "macro.h" +#include "tensor.h" + +#include +#include +#include +#include +#include + +namespace lite { + +LITE_API inline LiteAlgoSelectStrategy operator|(LiteAlgoSelectStrategy x, + LiteAlgoSelectStrategy y) { + return static_cast(static_cast(x) | + static_cast(y)); +} + +/*! + * \brief the inference options which will be translated to megenine + * + * \param weight_preprocess is the option wich optimize the inferece performance + * with preprocess the const weights + * + * \param fuse_preprocess fuse preprocess patten, like astype + pad_channel + + * dimshuffle + * + * \param fake_next_exec whether only to perform non-computing tasks (like + * memory allocation and queue initialization) for next exec. This would be + * reset to false when the graph is executed. + * + * \param var_sanity_check_first_run Disable var sanity check on the first run. + * Var sanity check is enabled on the first-time execution by default, and can + * be used to find some potential memory access errors in the operator + * implementation. + * + * \param const_shape This can be used to reduce memory usage since some + * static inference data structures can be omitted. + * + * \param force_dynamic_alloc force dynamic memory alloc for all vars + * + * \param force_output_dynamic_alloc force dynamic memory alloc for output vars + * which are used as CallbackCaller input when call compile() function + * + * \param no_profiling_on_shape_change do not re-profile to select best impl + * algo when input shape changes (use previous algo) + * + * \param jit_level Execute supported operators with JIT (support MLIR, + * NVRTC). 
Can only be used on Nvidia GPUs, this value indicates JIT level: + * 1 for basic elemwise opr; + * 2 for including reduce operator + * + * \param record_level flag optimize the inference performace with record the + * kernel tasks in first run, hereafter the inference all need to execute the + * recorded tasks. + * level = 0 means the normal inference, + * level = 1 means use record inference, + * level = 2 means record inference with free the extra memory + * + * \param graph_opt_level optimization level: + * 0: disable + * 1: level-1: inplace arith transformations during graph + * construction + * 2: level-2: level-1, plus global optimization before graph + * compiling + * 3: also enable JIT + * <0: corresponding level, with result check for debug + * + * \param async_exec_level exec: dispatch on separate threads for different + * comp_node. + * 0: do not perform async dispatch + * 1: dispatch async if there are more than one comp node with limited queue + * mask 0b10: async if there are multiple comp nodes with + * mask 0b100: always async + */ +struct LITE_API Options { + bool weight_preprocess = false; + bool fuse_preprocess = false; + bool fake_next_exec = false; + bool var_sanity_check_first_run = true; + bool const_shape = false; + bool force_dynamic_alloc = false; + bool force_output_dynamic_alloc = false; + bool no_profiling_on_shape_change = false; + uint8_t jit_level = 0; + uint8_t comp_node_seq_record_level = 0; + uint8_t graph_opt_level = 2; + uint16_t async_exec_level = 1; + + //! layout transform options + bool enable_nchw44 = false; + bool enable_nchw44_dot = false; + bool enable_nchw88 = false; + bool enable_nhwcd4 = false; + bool enable_nchw4 = false; + bool enable_nchw32 = false; + bool enable_nchw64 = false; +}; + +/*! + * \brief Configuration when load and compile the graph + * + * \param bare_model_cryption_name is the bare model cryption method name, bare + *model is not pack json info inside + * + *\param has_compression flag whether the model is compressed, the compress + *method will read form the model + */ +struct LITE_API Config { + bool has_compression = false; + int device_id = 0; + LiteDeviceType device_type = LiteDeviceType::LITE_CPU; + LiteBackend backend = LiteBackend::LITE_DEFAULT; + std::string bare_model_cryption_name = {}; + Options options = {}; +}; + +/*! + * \brief config the network input and output item + * + */ +struct LITE_API IO { + //! the tensor name in the graph corresponding to the IO + std::string name; + + //! Used to mark where the input tensor comes from and the output where copy + //! to, if is_host is true, the input is from host and output copy to host, + //! otherwise device. Sometimes The input is from device and output no need + //! copy to host, default is true. + bool is_host = true; + + //! The IO type, it can be SHAPE or VALUE, when SHAPE is set, the input or + //! output tensor value is invaid, only shape will be set, default is VALUE + LiteIOType io_type = LiteIOType::LITE_IO_VALUE; + + //! The layout of the config from user, if other layout is set before + //! forward or get after forward by input tensor reset, this layout will by + //! pass. if no other layout is set before forward, this layout will work. + //! if this layout is no set, the model will forward with its origin layout. + //! if in output, it will used to check. + Layout config_layout = {}; +}; + +/*! 
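A minimal sketch of the Options knobs documented above, set through the Config that owns them; the chosen values are illustrative, not recommended defaults.

```cpp
#include "lite/network.h"

lite::Config make_config() {
    lite::Config config;                            // Options live inside Config
    config.options.weight_preprocess = true;        // preprocess const weights once
    config.options.comp_node_seq_record_level = 1;  // record kernel tasks on the first run
    config.options.graph_opt_level = 2;             // level-1 plus global optimization
    config.options.enable_nchw44 = true;            // request the NCHW44 layout transform
    return config;
}
```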
+ * \brief the input and output information when load the network + * the NetworkIO will remain in the network until the network is destroyed + */ +struct LITE_API NetworkIO { + std::vector inputs = {}; + std::vector outputs = {}; +}; + +/*! + * \brief A user-implemented allocator interface + */ +class LITE_API Allocator { +public: + virtual ~Allocator() = default; + + //! allocate memory of size in the given device with the given align + virtual void* allocate(LiteDeviceType device_type, int device_id, + size_t size, size_t align) = 0; + + //! free the memory pointed by ptr in the given device + virtual void free(LiteDeviceType device_type, int device_id, void* ptr) = 0; +}; + +/*! + * \brief the thread affinith callback type + * \param thread_id thread_id is the a number begin from 0 to (nr_threads - 1), + * thread_id of (nr_threads - 1) is the main worker thread. + */ +using ThreadAffinityCallback = std::function; + +using AsyncCallback = std::function; + +/*! + * \brief the start/finish callback function + * \param unordered_map map from the io tensor name to the pair of which is the + * corresponding IO of user config and the realy input or output tensor. + */ +using StartCallback = std::function>>&)>; +using FinishCallback = std::function>>&)>; + +/*! + * \brief The network is construct form a model, implement model load, init, + * forward, and display some model information + */ +class LITE_API Network { +public: + class NetworkImplBase; + + ~Network(); + + Network(const Config& config = {}, const NetworkIO& networkio = {}); + + Network(const NetworkIO& networkio, const Config& config = {}); + + //! load the model form memory + void load_model(void* model_mem, size_t size); + + //! load the model from a model path + void load_model(std::string model_path); + + //! only compute the output tensor in user configured + void compute_only_configured_output(); + + //! get the network input and output tensor, the layout of which is + //! sync from mge tensor, when the name of input and output tensor are the + //! same, use LiteTensorPhase to separate + std::shared_ptr get_io_tensor( + std::string io_name, + LiteTensorPhase phase = LiteTensorPhase::LITE_IO); + + //! get the network input by index + std::shared_ptr get_input_tensor(size_t index); + + //! get the network output tensor by index + std::shared_ptr get_output_tensor(size_t index); + + //! set the network forward in async mode and set the async callback + //! function + Network& set_async_callback(const AsyncCallback& async_callback); + + //! set the start forward callback function, which will be execute before + //! forward. this can be used to check network input or dump model inputs + //! for debug + Network& set_start_callback(const StartCallback& start_callback); + + //! set the finish forward callback function, which will be execute after + //! forward. this can be used to dump model outputs for debug + Network& set_finish_callback(const FinishCallback& finish_callback); + + //! forward the network with filled input data and fill the output data + //! to the output tensor + void forward(); + + //! waite until forward finish in sync model + void wait(); + + //! get the input tensor name in the order in load return + std::string get_input_name(size_t index) const; + + //! get the output tensor name in the order in load return + std::string get_output_name(size_t index) const; + + //! get all the input tensor name in the order in load return + std::vector get_all_input_name() const; + + //! 
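A sketch tying IO, NetworkIO and the Network interface declared above together; the model path and the output tensor name are illustrative assumptions.

```cpp
#include <memory>
#include <string>
#include "lite/network.h"

void run_once(const std::string& model_path) {
    lite::Config config;
    config.options.const_shape = true;

    lite::IO device_output;
    device_output.name = "feature_map";  // hypothetical output name in the graph
    device_output.is_host = false;       // leave the result on the device side

    lite::NetworkIO io;
    io.outputs.push_back(device_output);

    auto network = std::make_shared<lite::Network>(config, io);
    network->load_model(model_path);
    // fill the input tensors here, then run synchronously
    network->forward();
    network->wait();
}
```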
get all the output tensor name in the order in load return + std::vector get_all_output_name() const; + + //! set/get device id, default device id = 0 + Network& set_device_id(int device_id); + int get_device_id() const; + + //! set/get stream id, default stream id = 0 + Network& set_stream_id(int stream_id); + int get_stream_id() const; + + //! enable profile the network, a file will be generated + void enable_profile_performance(std::string profile_file_path); + + //! get model extra info + const std::string& get_model_extra_info(); + + //! get device type + LiteDeviceType get_device_type() const; + +public: + friend class NetworkHelper; + +private: + //! update member from implement + void update_from_implement(); + + //! decrypt and parse the model file + void prase_model(std::shared_ptr model_data, size_t size); + +private: + bool m_loaded = false; + Config m_config; + NetworkIO m_network_io; + std::unique_ptr m_impl; + std::string m_extra_info; +}; + +/*********************** MGE special network function ***************/ +class LITE_API Runtime { +public: + //! When device is CPU, this interface will set the to be loaded model + //! run in multi thread mode with the given thread number. + static void set_cpu_threads_number(std::shared_ptr dst_network, + size_t nr_threads); + static size_t get_cpu_threads_number(std::shared_ptr dst_network); + + //! set threads affinity callback; + static void set_runtime_thread_affinity( + std::shared_ptr network, + const ThreadAffinityCallback& thread_affinity_callback); + + //! Set cpu default mode when device is CPU, in some low computation + //! device or single core device, this mode will get good performace + static void set_cpu_inplace_mode(std::shared_ptr dst_network); + static bool is_cpu_inplace_mode(std::shared_ptr dst_network); + + //! Set use tensorrt forward + static void use_tensorrt(std::shared_ptr dst_network); + + //! set opr algorithm selection strategy in the network + //! shared_batch_size: the batch size used by fastrun, + //! Non-zero value means that fastrun use this batch size + //! regardless of the batch size of the model. Zero means + //! fastrun use batch size of the model + //! binary_equal_between_batch: if the content of each input batch is binary + //! equal,whether the content of each output + //! batch is promised to be equal + static void set_network_algo_policy( + std::shared_ptr dst_network, + LiteAlgoSelectStrategy strategy, uint32_t shared_batch_size = 0, + bool binary_equal_between_batch = false); + + //! set workspace_limit for oprs with multiple algorithms, set + //! workspace limitation can save memory but may influence the performance + static void set_network_algo_workspace_limit( + std::shared_ptr dst_network, size_t workspace_limit); + + //! set the network memroy allocator, the allocator is defined by user + static void set_memory_allocator(std::shared_ptr dst_network, + std::shared_ptr user_allocator); + + //! share the runtime memory with other network, the weights is not shared + static void share_runtime_memory_with(std::shared_ptr dst_network, + std::shared_ptr src_network); + + //! Dump input/output values of all internal variables to output + //! file, in txt format + static void enable_io_txt_dump(std::shared_ptr dst_network, + std::string io_txt_out_file); + + //! Dump input/output values of all internal variables to output + //! directory, in binary format + static void enable_io_bin_dump(std::shared_ptr dst_network, + std::string io_bin_out_dir); + + //! 
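A sketch of the Runtime helpers declared above, typically applied after the Network is created and before the model is loaded or run; the thread count, strategy combination and workspace limit are illustrative values.

```cpp
#include <memory>
#include "lite/network.h"

void tune_runtime(std::shared_ptr<lite::Network> network) {
    // run the to-be-loaded CPU model with 4 worker threads
    lite::Runtime::set_cpu_threads_number(network, 4);

    // profile algorithms, but only among the reproducible ones
    lite::Runtime::set_network_algo_policy(
            network, LiteAlgoSelectStrategy::LITE_ALGO_PROFILE |
                             LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE);

    // cap extra algorithm workspace at 64 MiB to trade speed for memory
    lite::Runtime::set_network_algo_workspace_limit(network, 64 * 1024 * 1024);
}
```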
load a new network which will share weights with src network + static void shared_weight_with_network( + std::shared_ptr dst_network, + const std::shared_ptr src_network); +}; + +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/include/lite/tensor.h b/lite/include/lite/tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..6e7a56526d7e8f8590aebf3b7053d51b2b0ea51b --- /dev/null +++ b/lite/include/lite/tensor.h @@ -0,0 +1,224 @@ +/** + * \file inlude/lite/tensor.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "common_enum_c.h" +#include "macro.h" + +#include +#include +#include + +namespace lite { + +/*! + * \brief the simple layout description + */ +struct LITE_API Layout { + static constexpr uint32_t MAXDIM = 7; + size_t shapes[MAXDIM]; + size_t ndim = 0; + LiteDataType data_type = LiteDataType::LITE_FLOAT; + + //! get the total byte of a layout + size_t get_elem_size() const; + + //! compare whether the two layout is equal + bool operator==(const Layout& other) const; +}; + +/*! + * \brief warpper of the MegEngine Tensor + * + * The memory is not alloc directly, when call get_memory_ptr() the memory + * will be allocated in tensor implement, which will be deleted automatically + * + * Note: if the tensor memory is set through reset() interface, the memory is + * managed by the user, it will not be freed by the tensor + * + * If the device or layout is not set, when copy form other source tensor, its + * device and layout will be copy form the source tensor + * + * if is_pinned_host is set, the storage memory of the tensor is pinned memory, + * this is used to Optimize the H2D or D2H memory copy, if the device or layout + * is not set, when copy form other device(CUDA) tensor, this tensor + * will be automatically set to pinned tensor + */ +class LITE_API Tensor { + class TensorImpl; + +public: + class TensorImplBase; + + Tensor(); + Tensor(LiteDeviceType device_type, bool is_pinned_host = false); + Tensor(LiteDeviceType device_type, const Layout& layout, + bool is_pinned_host = false); + Tensor(int device_id, LiteDeviceType device_type, const Layout& layout = {}, + bool is_pinned_host = false); + Tensor(int device_id, int stream_id, LiteDeviceType device_type, + bool is_pinned_host = false); + Tensor(LiteBackend backend, + LiteDeviceType device_type = LiteDeviceType::LITE_CPU, + int device_id = 0, const Layout& layout = {}, + bool is_pinned_host = false); + ~Tensor(); + + LiteDeviceType get_device_type() const { return m_device_type; }; + + int get_device_id() const { return m_device_id; }; + + Layout get_layout() const { return m_layout; }; + + bool is_pinned_host() const { return m_is_pinned_host; }; + + //! set layout will change the layout and reallocate memory of the tensor + void set_layout(const Layout& layout); + + //! which will trigger memory alloc in tensor implement + void* get_memory_ptr() const; + + //! get the memory with the offset describe in idx + void* get_memory_ptr(const std::vector& idx) const; + + //! get the tensor capacity in byte + size_t get_tensor_total_size_in_byte() const; + + //! use the user allocated data to reset the memory of the tensor, the + //! memory will not be managed by the lite, later, the user should delete + //! it. + void reset(void* prepared_data, size_t data_length_in_byte); + + //! 
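A minimal sketch of building a host tensor with an explicit Layout and of handing it a caller-owned buffer through reset(); the shape is an illustrative assumption.

```cpp
#include <vector>
#include "lite/tensor.h"

void tensor_basics() {
    lite::Layout layout;
    layout.ndim = 4;
    layout.shapes[0] = 1;
    layout.shapes[1] = 3;
    layout.shapes[2] = 224;
    layout.shapes[3] = 224;
    layout.data_type = LiteDataType::LITE_FLOAT;

    lite::Tensor tensor(LiteDeviceType::LITE_CPU, layout);
    size_t bytes = tensor.get_tensor_total_size_in_byte();
    void* ptr = tensor.get_memory_ptr();  // the allocation happens on this call

    // alternatively, back the tensor with memory the caller keeps ownership of
    std::vector<float> user_buffer(1 * 3 * 224 * 224);
    tensor.reset(user_buffer.data(), bytes);
    (void)ptr;
}
```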
use the user allocated data and corresponding layout to reset the data + //! and layout of the tensor, the memory will not be managed by lite, later, + //! the user should delete it. + void reset(void* prepared_data, const Layout& layout); + + //! reshape the tensor with new shape, keep the data_type the same + void reshape(const std::vector& shape); + + //! get a new tensor slice from the origin tensor + std::shared_ptr slice(const std::vector& start, + const std::vector& end, + const std::vector& step = {}); + + //! set the tensor memory with zero + void fill_zero(); + + //! copy tensor form other tensor + //! Note: the best way for tensor copy is just set the dst device, left + //! layout empty, when copying the dst layout will be set the same with + //! src + void copy_from(const Tensor& src); + + //! share memory with other tensor + void share_memory_with(const Tensor& src_tensor); + + //! whether the memory of tensor is continue + bool is_continue_memory() const; + + //! update the menbers from the implement + void update_from_implement(); + +public: + friend class TensorHelper; + +private: + std::shared_ptr m_tensor_impl; + + //! flag whether the storage of the tensor is pinned, this is only used + //! when the compnode is not in CPU + bool m_is_pinned_host = false; + int m_device_id = 0; + Layout m_layout; + //! the device of the tensor should not be changed after the tensor has + //! constructed + LiteDeviceType m_device_type = LiteDeviceType::LITE_CPU; +}; + +/** + * \brief a class can hold any type data, but not check whether the visit type + * is valid + */ +class LITE_API LiteAny { +public: + LiteAny() = default; + template + LiteAny(T value) : m_holder(new AnyHolder(value)) { + m_is_string = std::is_same(); + } + + LiteAny(const LiteAny& any) { + m_holder = any.m_holder->clone(); + m_is_string = any.is_string(); + } + LiteAny& operator=(const LiteAny& any) { + m_holder = any.m_holder->clone(); + m_is_string = any.is_string(); + return *this; + } + bool is_string() const { return m_is_string; } + + class HolderBase { + public: + virtual ~HolderBase() = default; + virtual std::shared_ptr clone() = 0; + virtual size_t type_length() const = 0; + }; + + template + class AnyHolder : public HolderBase { + public: + AnyHolder(const T value) : + m_value(value) { + } + virtual std::shared_ptr clone() override { + return std::make_shared(m_value); + } + virtual size_t type_length() const override { return sizeof(T); } + + public: + T m_value; + }; + //! if type is miss matching, it will throw + void type_missmatch(size_t expect, size_t get) const; + + //! only check the storage type and the visit type length, so it's not safe + template + T unsafe_cast() const { + if (sizeof(T) != m_holder->type_length()) { + type_missmatch(m_holder->type_length(), sizeof(T)); + } + return static_cast*>(m_holder.get())->m_value; + } + //! only check the storage type and the visit type length, so it's not safe + void* cast_void_ptr() const { + return &static_cast*>(m_holder.get())->m_value; + } + +private: + std::shared_ptr m_holder; + bool m_is_string = false; +}; + +/*********************** special tensor function ***************/ +class LITE_API TensorUtils { +public: + //! concat all the input tensor to one on the specified dim, the result + //! tensor reside in dst_device_id of dst_device, if dst_device is + //! 
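A sketch of the copy and share helpers declared above; leaving the destination layout empty lets copy_from adopt the source layout, as the comment above notes.

```cpp
#include "lite/tensor.h"

void copy_and_share(const lite::Tensor& device_result) {
    // destination device only; layout and contents are taken from the source
    lite::Tensor host_copy(LiteDeviceType::LITE_CPU);
    host_copy.copy_from(device_result);

    // a second tensor aliasing the same storage, so another consumer can read
    // the data without an extra copy
    lite::Tensor alias;
    alias.share_memory_with(host_copy);
}
```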
LITE_DEVICE_DEFAULT, the device will get from the first tensor + static std::shared_ptr concat( + const std::vector& tensors, int dim, + LiteDeviceType dst_device = LiteDeviceType::LITE_DEVICE_DEFAULT, + int dst_device_id = -1); +}; +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/lite-c/include/lite-c/global_c.h b/lite/lite-c/include/lite-c/global_c.h new file mode 100644 index 0000000000000000000000000000000000000000..2b19c4e4a957286a147068d10c3ff98761d7b5a0 --- /dev/null +++ b/lite/lite-c/include/lite-c/global_c.h @@ -0,0 +1,169 @@ +/** + * \file lite-c/include/lite-c/global-c.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#ifndef LITE_C_GLOBAL_H_ +#define LITE_C_GLOBAL_H_ + +#include "macro.h" +#include "network_c.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*! \brief Get version + */ +LITE_API int LITE_get_version(int* major, int* minor, int* patch); + +/*! \brief Get the last error message. + * \return the message pointer + */ +LITE_API const char* LITE_get_last_error(); + +/*! \brief Get device count + * \param[in] device_type device type + * \return the device count + */ +LITE_API int LITE_get_device_count(LiteDeviceType device_type, size_t* count); + +/*! \brief try to coalesce all free memory in megenine + */ +LITE_API int LITE_try_coalesce_all_free_memory(); + +/** + * \brief Model decryption function + * + * \param[in] input_data is the decrypted model memory pointer + * \param[in] input_size the size the decrypted model memory in byte + * \param[in] key_data decryption key data + * \param[in] key_size the size of decryption key data + * \param[out] output_data the data of decrypted data, if output_data is + * nullptr, just query the output memory length, else write the decryted data to + * the output_data + * \return size of decrypted data + */ +typedef size_t (*LiteDecryptionFunc)(const void* input_data, size_t input_size, + const uint8_t* key_data, size_t key_size, + const void* output_data); + +/** + * \brief Model information parse function + * + * \param[in] info_data is the information memory + * \param[in] info_size the size the information memory + * \param[in] model_name the model name used for check whether the + * infomation match the model + * \param[in] config the model config, ParseInfoFunc can fill it with the + * information in json, the config will influence Network loading later + * \param[in] network_io the model IO, ParseInfoFunc can fill it with the + * information in json, the networkio will influence Network forwarding later + * \param[in] device_id the address to store device_id, default 0 + * \param[in] nr_threads the address to store nr_threads, default 1 + * \param[in] is_inplace_model the address to store is_cpu_inplace_mode, default + * \param[in] use_tensorrt the address to store is_cpu_inplace_mode, default + * false + */ +typedef int (*LiteParseInfoFunc)(const void* info_data, size_t info_size, + const char* model_name, LiteConfig* config, + LiteNetworkIO* network_io, int* device_id, + size_t* nr_threads, int* is_cpu_inplace_mode, + int* use_tensorrt); + +/** + * \brief register a custom decryption method and key to lite. + * + * \param[in] decrypt_name the name of the decryption, which will act as the + * hash key to find the decryption method. 
+ * + * \param[in] func the decryption function, which will decrypt the model with + * the registered key, return a vector that contain the decrypted model. + * \param[in] key_data the decryption key of the method + * \param[in] key_size the size of decryption key + */ +LITE_API int LITE_register_decryption_and_key(const char* decrypt_name, + const LiteDecryptionFunc func, + const uint8_t* key_data, + size_t key_size); + +/** + * \brief update decryption function or key of a custom decryption method. + * + * \param[in] decrypt_name the name of the decryption, which will act as the + * hash key to find the decryption method. + * + * \param[in] func the decryption function, which will decrypt the model with + * the registered key, return a vector that contain the decrypted model. if + * function is nullptr, it will not be updated. + * + * \param[in] key the decryption key of the method, if the size of key is zero, + * it will not be updated + */ +LITE_API int LITE_update_decryption_or_key(const char* decrypt_name, + const LiteDecryptionFunc func, + const uint8_t* key_data, + size_t key_size); + +/** + * \brief register a custom parser function to lite. + * + * \param[in] info_type the name of the parser function, which will act as the + * hash key to find the parser method. + * + * \param[in] parse_func the parser function, which will parse the given + * information and modify the Network Config and IO. + * + */ +LITE_API int LITE_register_parse_info_func(const char* info_type, + const LiteParseInfoFunc parse_func); + +/*! + * \brief Set the loader to the lite + * \param[in] loader_path is the file path which store the cache + */ +LITE_API int LITE_set_loader_lib_path(const char* loader_path); + +/*! + * \brief Set the algo policy cache file for CPU/CUDA ... + * \param[in] cache_path is the file path which store the cache + * \param[in] always_sync sync the cache when cache updated + */ +LITE_API int LITE_set_persistent_cache(const char* cache_path, int always_sync); + +/*! + * \brief Set the tensor policy cache file for CPU/CUDA ... + * \param[in] cache_path is the file path which store the cache + */ +LITE_API int LITE_set_tensor_rt_cache(const char* cache_path); + +/*! \brief Set the current log level. + * \param[in] level The new log level + */ +LITE_API int LITE_set_log_level(LiteLogLevel level); + +/*! \brief Get the current log level. + * \param[in] level The pointer to log level + */ +LITE_API int LITE_get_log_level(LiteLogLevel* level); +/*! + * \brief dump the algo policy cache to file, if the network is set to profile + * when forward, though this the algo policy will dump to file + * \param[in] cache_path is the file path which store the cache + */ +LITE_API int LITE_dump_persistent_cache(const char* cache_path); + +/*! + * \brief dump the tensorrt policy cache to file + */ +LITE_API int LITE_dump_tensor_rt_cache(); +#endif +#ifdef __cplusplus +} +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/lite-c/include/lite-c/network_c.h b/lite/lite-c/include/lite-c/network_c.h new file mode 100644 index 0000000000000000000000000000000000000000..84b13502f0f24f0ce0859cedc106e10cff5efb5c --- /dev/null +++ b/lite/lite-c/include/lite-c/network_c.h @@ -0,0 +1,525 @@ +/** + * \file lite-c/include/lite-c/network_c.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
+ */ + +#ifndef LITE_C_NETWORK_H_ +#define LITE_C_NETWORK_H_ + +#include "tensor_c.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*! + * \brief the inference options which will be translated to megenine + * + * \param weight_preprocess is the option wich optimize the inferece performance + * with preprocess the const weights + * + * \param fuse_preprocess fuse preprocess patten, like astype + pad_channel + + * dimshuffle + * + * \param fake_next_exec whether only to perform non-computing tasks (like + * memory allocation and queue initialization) for next exec. This would be + * reset to false when the graph is executed. + * + * \param var_sanity_check_first_run Disable var sanity check on the first run. + * Var sanity check is enabled on the first-time execution by default, and can + * be used to find some potential memory access errors in the operator + * implementation. + * + * \param const_shape This can be used to reduce memory usage since some + * static inference data structures can be omitted. + * + * \param force_dynamic_alloc force dynamic memory alloc for all vars + * + * \param force_output_dynamic_alloc force dynamic memory alloc for output vars + * which are used as CallbackCaller input when call compile() function + * + * \param no_profiling_on_shape_change do not re-profile to select best impl + * algo when input shape changes (use previous algo) + * + * \param jit_level Execute supported operators with JIT (support MLIR, + * NVRTC). Can only be used on Nvidia GPUs, this value indicates JIT level: + * 1 for basic elemwise opr; + * 2 for including reduce operator + * + * \param record_level flag optimize the inference performace with record the + * kernel tasks in first run, hereafter the inference all need to execute the + * recorded tasks. + * level = 0 means the normal inference, + * level = 1 means use record inference, + * level = 2 means record inference with free the extra memory + * + * \param graph_opt_level optimization level: + * 0: disable + * 1: level-1: inplace arith transformations during graph + * construction + * 2: level-2: level-1, plus global optimization before graph + * compiling + * 3: also enable JIT + * <0: corresponding level, with result check for debug + * + * \param async_exec_level exec: dispatch on separate threads for different + * comp_node. + * 0: do not perform async dispatch + * 1: dispatch async if there are more than one comp node with limited queue + * mask 0b10: async if there are multiple comp nodes with + * mask 0b100: always async + */ +typedef struct Options { + int weight_preprocess; + int fuse_preprocess; + int fake_next_exec; + int var_sanity_check_first_run; + int const_shape; + int force_dynamic_alloc; + int force_output_dynamic_alloc; + int no_profiling_on_shape_change; + int jit_level; + int comp_node_seq_record_level; + int graph_opt_level; + int async_exec_level; + + //! layout transform options + int enable_nchw44; + int enable_nchw44_dot; + int enable_nchw88; + int enable_nhwcd4; + int enable_nchw4; + int enable_nchw32; + int enable_nchw64; +} LiteOptions; + +//! define a default Options +extern LITE_API const LiteOptions default_option; + +/*! 
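+ * An illustrative way to start from default_config() (declared below) and
+ * adjust a few fields before creating a network; this is only a sketch, and
+ * the LITE_CPU enumerator name is assumed from common_enum_c.h:
+ * \code
+ *     LiteConfig config = *default_config();
+ *     config.device_type = LITE_CPU;
+ *     config.options.weight_preprocess = 1;
+ *     config.options.comp_node_seq_record_level = 1;
+ * \endcode
+ *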
+ * \brief Configuration when load and compile the graph + * + * \param bare_model_cryption_name is the bare model cryption method name, bare + *model is not pack json info inside + * + *\param has_compression flag whether the model is compressed, the compress + *method will read form the model + */ +typedef struct LiteConfig { + int has_compression; + int device_id; + LiteDeviceType device_type; + LiteBackend backend; + const char* bare_model_cryption_name; + LiteOptions options; +} LiteConfig; + +//! get default config +LITE_API LiteConfig* default_config(); + +/*! + * \brief config the network input and output item + * + */ +typedef struct LiteIO { + //! the tensor name in the graph corresponding to the IO + const char* name; + + //! Used to mark where the input tensor comes from and the output where copy + //! to, if is_host is true, the input is from host and output copy to host, + //! otherwise device. Sometimes The input is from device and output no need + //! copy to host, default is true. + int is_host; + + //! The IO type, it can be SHAPE or VALUE, when SHAPE is set, the input or + //! output tensor value is invaid, only shape will be set, default is VALUE + LiteIOType io_type; + + //! The layout of the config from user, if other layout is set before + //! forward or get after forward, this layout will by pass. if no other + //! layout is set before forward, this layout will work. if this layout is + //! no set, the model will forward with its origin layout. if in output, it + //! will used to check. + LiteLayout config_layout; +} LiteIO; + +//! define a default IO +extern LITE_API const LiteIO default_io; + +/*! + * \brief the input and output information when load the network + * the NetworkIO will remain in the network until the network is destroyed + */ +typedef struct LiteNetworkIO { + LiteIO* inputs; + LiteIO* outputs; + size_t input_size; //! the number IO in inputs + size_t output_size; //! the number IO in outputs +} LiteNetworkIO; + +//! get default NetworkIO +LITE_API LiteNetworkIO* default_network_io(); + +/*! + * \brief A user-implemented allocator function + */ +//! allocate memory of size in the given device with the given align +typedef void* (*LiteAllocate)(LiteDeviceType device_type, int device_id, + size_t size, size_t align); +//! free the memory pointed by ptr in the given device +typedef void (*LiteFree)(LiteDeviceType device_type, int device_id, void* ptr); + +/*! + * \brief the thread affinith callback type + * \param thread_id thread_id is the a number begin from 0 to (nr_threads - 1), + * thread_id of (nr_threads - 1) is the main worker thread. + */ +typedef int (*LiteThreadAffinityCallback)(int thread_id); + +typedef int (*LiteAsyncCallback)(); + +/*! + * \brief the start/finish callback function + * \param unordered_map map from the io tensor name to the pair of which is the + * corresponding IO of user config and the realy input or output tensor. + */ + +typedef int (*LiteStartCallback)(const LiteIO* inputs, + const LiteTensor* input_tensors, size_t size); + +typedef int (*LiteFinishCallback)(const LiteIO* outputs, + const LiteTensor* output_tensors, + size_t size); + +/*! + * \brief The network is construct form a model, implement model load, init, + * forward, and display some model information + */ +typedef void* LiteNetwork; + +/** + * \brief Create a lite Network object with default config and networkIO. 
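+ *
+ * A minimal create/destroy round trip looks like the sketch below (return
+ * codes are ignored for brevity):
+ * \code
+ *     LiteNetwork net;
+ *     LITE_make_default_network(&net);
+ *     // ... load a model and run inference ...
+ *     LITE_destroy_network(net);
+ * \endcode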
+ * \param[out] network The netwrok pointer + * \return int if the return is not zero, error happened, the error message + * can get by LITE_get_last_error + */ +LITE_API int LITE_make_default_network(LiteNetwork* network); + +/** + * \brief Create a lite Network object from the given config and networkIO. + * \param[in] config The configration to create the network + * \param[in] network_io The configration io to create the network + * \param[out] network The network pointer + */ +LITE_API int LITE_make_network(LiteNetwork* network, const LiteConfig config, + const LiteNetworkIO network_io); + +/** + * \brief Create a lite Network object from the given config and networkIO. + * \param[in] config The configration to create the network + * \param[out] network The network pointer + */ +LITE_API int LITE_make_network_config(LiteNetwork* network, const LiteConfig config); + + +/** + * \brief load the model to network form memory + * \param[in] model_mem The model in memory + * \param[in] size The size of the model memory + * \param[out] network The network to be load model in + */ +LITE_API int LITE_load_model_from_mem(LiteNetwork network, void* model_mem, + size_t size); + +/** + * \brief load the model to network form given path + * \param[in] model_path The model path + * \param[out] network The network to be load model in + */ +LITE_API int LITE_load_model_from_path(LiteNetwork network, + const char* model_path); + +/** + * \brief load a new network which will share weights with src network + * \param[in] origin_network The origin network pointer + * \param[out] network The network pointer + */ +LITE_API int LITE_shared_weight_with_network(LiteNetwork dst_network, + const LiteNetwork src_network); + +/** + * \brief Destroy a lite network object. + * \param[in] network The network pointer + * \return int if the return is not zero, error happened, the error message + * can get by LITE_get_last_error + */ +LITE_API int LITE_destroy_network(LiteNetwork network); + +/** + * \brief forward the network with filled input data and fill the output data + * to the output tensor + * \param[in] network The loaded model + */ +LITE_API int LITE_forward(const LiteNetwork network); + +/** + * \brief waite until forward finish in sync model + * \param[in] network The loaded model + */ +LITE_API int LITE_wait(const LiteNetwork network); + +/** + * \brief get the network input and ouput tensor, the layout of which is + * get from model + * \param[in] network The loaded model + * \param[in] io_name The input or output name + * \param[in] phase The tensor phase + * \param[out] tensor The IO tensor get from the network + */ +LITE_API int LITE_get_io_tensor(LiteNetwork network, const char* io_name, + LiteTensorPhase phase, LiteTensor* tensor); + +/** + * \brief get the input tensor name in the order in loaded model + * \param[in] network The loaded model + * \param[in] index The index of input tensor + * \param[out] name The input tensor name + */ +LITE_API int LITE_get_input_name(const LiteNetwork network, size_t index, + const char** name); + +/** + * \brief get the output tensor name in the order in loaded model + * \param[in] network The loaded model + * \param[in] index The index of output tensor + * \param[out] name The output tensor name + */ +LITE_API int LITE_get_output_name(const LiteNetwork network, size_t index, + const char** name); + +/** + * \brief get all the input tensor name in the order in loaded model + * \param[in] network The loaded model + * \param[in] size The number of the input tensor + 
* \param[out] name The input tensor names + */ +LITE_API int LITE_get_all_input_name(const LiteNetwork network, size_t* size, + const char** name); + +/** + * \brief get all the output tensor name in the order in loaded model + * \param[in] network The loaded model + * \param[in] size The number of output tensor + * \param[out] name The output tensor name + */ +LITE_API int LITE_get_all_output_name(const LiteNetwork network, size_t* size, + const char** name); + +/** + * \brief get whether the model is running in cpu inplace mode + * \param[in] network The loaded model + * \param[out] is_cpu_inplace_mode whether is in cpu inplace mode + */ +LITE_API int LITE_is_cpu_inplace_mode(const LiteNetwork network, + int* is_cpu_inplace_mode); + +/** + * \brief get the number of thread the network will run with + * \param[in] network The loaded model + * \param[out] nr_threads the thread number when the network running + */ +LITE_API int LITE_get_cpu_threads_number(const LiteNetwork network, + size_t* nr_threads); + +/** + * \brief get the device id the network will run with + * \param[in] network The loaded model + * \param[out] device_id the device id of the network will run + */ +LITE_API int LITE_get_device_id(const LiteNetwork network, int* device_id); + +/** + * \brief get the stream id the network will run with + * \param[in] network The loaded model + * \param[out] stream_id the stream id of the network will run + */ +LITE_API int LITE_get_stream_id(const LiteNetwork network, int* stream_id); + +/** + * \brief get the device type the network will run with + * \param[in] network The loaded model + * \param[out] device_type the device type of the network will run + */ +LITE_API int LITE_get_device_type(const LiteNetwork network, + LiteDeviceType* device_type); + +/** + * \brief get the device type the network will run with + * \param[in] network The loaded model + * \param[out] info : the json format memory + * \param[out] info_size: the json format memory size + */ +LITE_API int LITE_get_model_extra_info(const LiteNetwork network, + const char** info, int* info_size); + +/** + * \brief Set cpu default mode when device is CPU, in some low computation + * device or single core device, this mode will get good performace + * \param[in] network The loaded model + */ +LITE_API int LITE_set_cpu_inplace_mode(LiteNetwork network); + +/** + * \brief When device is CPU, this interface will set the to be loaded model + * run in multi thread mode with the given thread number. 
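+ *
+ * The thread number applies to the model loaded afterwards, so it is
+ * typically configured between network creation and model loading
+ * (illustrative sketch with a placeholder model path; error codes ignored):
+ * \code
+ *     LiteNetwork net;
+ *     LITE_make_default_network(&net);
+ *     LITE_set_cpu_threads_number(net, 4);
+ *     LITE_load_model_from_path(net, "./model.mge");
+ * \endcode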
+ * \param[in] network The loaded model + * \param[in] nr_threads The threads number + */ +LITE_API int LITE_set_cpu_threads_number(LiteNetwork network, + size_t nr_threads); + +/** + * \brief set device id, default device id = 0 + * \param[in] network The loaded model + * \param[in] device_id The device id to be set + */ +LITE_API int LITE_set_device_id(LiteNetwork network, int device_id); + +/** + * \brief set stream id, default stream id = 0 + * \param[in] network The loaded model + * \param[in] stream_id The stream id to be set + */ +LITE_API int LITE_set_stream_id(LiteNetwork network, int stream_id); + +/** + * \brief enable tensorrt + * \param[in] network The loaded model + */ +LITE_API int LITE_use_tensorrt(LiteNetwork network); + +/** + * \brief set opr algorithm selection strategy in the network + * \param[in] network The loaded model + * \param[in] select_strategy The operator algorithm selection strategy + */ +LITE_API int LITE_set_network_algo_policy(LiteNetwork network, + LiteAlgoSelectStrategy strategy); + +/** + * \brief set opr algorithm selection strategy in the network + * \param[in] network The loaded model + * \param[in] shared_batch_size: the batch size used by fastrun, + * Non-zero value means that fastrun use this batch size + * regardless of the batch size of the model. Zero means + * fastrun use batch size of the model + * \param[in] binary_equal_between_batch: if the content of each input batch is + * binary equal,whether the content of each output batch is + * promised to be equal + */ +LITE_API int LITE_set_network_algo_fastrun_config( + LiteNetwork network, unsigned int shared_batch_size, + int binary_equal_between_batch); + +/** + * \brief set workspace_limit for oprs with multiple algorithms, set + * workspace limit can save memory but may influence the performance + * \param[in] network The loaded model + * \param[in] workspace_limit The operator algorithm workspace limit + */ +LITE_API int LITE_set_network_algo_workspace_limit(LiteNetwork network, + size_t workspace_limit); + +/** + * \brief set the network forward in async mode and set the async callback + * function + * \param[in] network The loaded model + * \param[in] async_callback when network finish forwarding, the callbak + * will be called + */ +LITE_API int LITE_set_async_callback(LiteNetwork network, + const LiteAsyncCallback async_callback); + +/** + * \brief set the start forward callback function, which will be execute beform + * forward, this can be used to check network input or dump model inputs + * for debug + * \param[in] network The loaded model + * \param[in] start_callback when network start forwarding, the callbak + * will be called + */ +LITE_API int LITE_set_start_callback(LiteNetwork network, + const LiteStartCallback start_callback); + +/** + * \brief set the finish forward callback function, which will be execute after + * forward, this can be used to dump model outputs for debug + * \param[in] network The loaded model + * \param[in] finish_callback when network finish forwarding, the callbak + * will be called + */ +LITE_API int LITE_set_finish_callback(LiteNetwork network, + const LiteFinishCallback finish_callback); + +/** + * \brief set threads affinity callback + * \param[in] network The loaded model + * \param[in] thread_affinity_callback + */ +LITE_API int LITE_set_runtime_thread_affinity( + LiteNetwork network, + const LiteThreadAffinityCallback thread_affinity_callback); + +/** + * \brief set the network memroy allocator, the allocator is defined by user + * \param[in] 
network The loaded model + * \param[in] allocate_fun The allocate function of the user defined allocator + * \param[in] free_fun The free function of the user defined allocator + */ +LITE_API int LITE_set_memory_allocator(LiteNetwork network, + const LiteAllocate allocate_fun, + const LiteFree free_fun); + +/** + * \brief the dst_network share the runtime memory with src_network + * \param[in] src_network The source network + * \param[in] dst_network The dst network to shared memory with src_network + */ +LITE_API int LITE_share_runtime_memroy(LiteNetwork src_network, + LiteNetwork dst_network); + +/** + * \brief enable profile the network, a JSON format file will be generated + * \param[in] network The loaded model + * \param[in] profile_json_file_path The profile result file path + */ +LITE_API int LITE_enable_profile_performance( + LiteNetwork network, const char* profile_json_file_path); + +/** + * \brief Dump input/output values of all internal variables to output file, + * in text format + * \param[in] network The loaded model + * \param[in] io_txt_out_file The dumped txt file name + */ +LITE_API int LITE_enable_io_txt_dump(LiteNetwork network, + const char* io_txt_out_file); + +/** + * \brief Dump input/output values of all internal variables to output + * directory, in binary format + * \param[in] network The loaded model + * \param[in] io_bin_out_dir The dumped bin file directory + */ +LITE_API int LITE_enable_io_bin_dump(LiteNetwork network, + const char* io_bin_out_dir); + +#ifdef __cplusplus +} +#endif +#endif +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/lite-c/include/lite-c/tensor_c.h b/lite/lite-c/include/lite-c/tensor_c.h new file mode 100644 index 0000000000000000000000000000000000000000..96316a9a9f526a1c138e4a7aea263ae197a74b83 --- /dev/null +++ b/lite/lite-c/include/lite-c/tensor_c.h @@ -0,0 +1,251 @@ +/** + * \file lite-c/include/lite-c/tensor_c.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#ifndef LITE_TENSOR_C_H_ +#define LITE_TENSOR_C_H_ + +#include "common_enum_c.h" +#include "macro.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#include "stddef.h" +#include "stdint.h" + +#define LAYOUT_MAX_DIM (7) + +/*! + * \brief the simple layout description + */ +typedef struct LiteLayout { + size_t shapes[LAYOUT_MAX_DIM]; + size_t ndim; + LiteDataType data_type; +} LiteLayout; + +//! define a default LiteLayout +extern LITE_API const LiteLayout default_layout; + +/*! + * \brief warpper of the MegEngine Tensor + * + * if is_pinned_host is set, the storage memory of the tensor is pinned memory, + * this is used to Optimize the H2D or D2H memory copy, if the device or layout + * is not set, when copy form other device(CUDA, OpenCL) tensor, this tensor + * will be automatically set to pinned tensor + */ +typedef struct LiteTensorDesc { + //! flag whether the storage of the tensor is pinned, this is only used when + //! the compnode is not in CPU + int is_pinned_host; + + //! the layout of the tensor + LiteLayout layout; + + //! the device of the tensor should not be changed after the tensor has + //! constructed + LiteDeviceType device_type; + + //! device id of the tensor + int device_id; +} LiteTensorDesc; + +//! define a default TensorDesc +extern LITE_API const LiteTensorDesc default_desc; + +/*! 
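+ * A tensor is usually described first and then created through
+ * LITE_make_tensor(); the sketch below assumes the LITE_FLOAT enumerator
+ * from common_enum_c.h:
+ * \code
+ *     LiteTensorDesc desc = default_desc;
+ *     desc.layout.ndim = 4;
+ *     desc.layout.shapes[0] = 1;
+ *     desc.layout.shapes[1] = 3;
+ *     desc.layout.shapes[2] = 224;
+ *     desc.layout.shapes[3] = 224;
+ *     desc.layout.data_type = LITE_FLOAT;
+ *     LiteTensor tensor;
+ *     LITE_make_tensor(desc, &tensor);
+ * \endcode
+ *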
+ * \brief The pointer to a Lite Tensor object + */ +typedef void* LiteTensor; + +/** + * \brief Create a lite tensor object from the given describe. + * \param[in] tensor_describe The description to create the Tensor + * \param[out] tensor The Tensor pointer + * \return int if the return is not zero, error happened, the error message + * can get by LITE_get_last_error + */ +LITE_API int LITE_make_tensor(const LiteTensorDesc tensor_describe, + LiteTensor* tensor); + +/** + * \brief Destroy a lite tensor object. + * \param[in] tensor The Tensor pointer + * \return int if the return is not zero, error happened, the error message + * can get by LITE_get_last_error + */ +LITE_API int LITE_destroy_tensor(LiteTensor tensor); + +/** + * \brief change the layout of a Tensor object. + * \param[in] tensor The Tensor + * \param[out] layout The Layout to be set to a tensor + */ +LITE_API int LITE_set_tensor_layout(LiteTensor tensor, const LiteLayout layout); + +/** + * \brief use the user allocated data to reset the memory of the tensor, the + * memory will not be managed by the lite, later, the user should delete + * it. + * \param[in] tensor The Tensor + * \param[in] prepared_data The allocated memory which satisfy the Tensor + * \param[in] data_length_in_byte The length of the allocated memory + * layout + */ +LITE_API int LITE_reset_tensor_memory(LiteTensor tensor, void* prepared_data, + size_t data_length_in_byte); + +/** + * \brief use the user allocated data and corresponding layout to reset the + * data and layout of the tensor, the memory will not be managed by lite, later, + * the user should delete it. + * \param[in] tensor The Tensor + * \param[in] layout The Layout to be set to the tensor + * \param[in] prepared_data The allocated memory which satisfy the layout to be + * set + */ +LITE_API int LITE_reset_tensor(LiteTensor tensor, const LiteLayout layout, + void* prepared_data); + +/** + * \brief reshape a tensor with the memroy not change, the total number of + * element in the reshaped tensor must equal to the origin tensor, the input + * shape must only contain one or zero -1 to flag it can be deduced + * automatically. 
+ * \param[in] tensor The Tensor to be reshape + * \param[in] shape the user input shape + * \param[in] size the number of data in shape, + */ +LITE_API int LITE_tensor_reshape(LiteTensor tensor, const int* shape, int size); + +/** + * \brief slice a tensor with input param + * \param[in] tensor The Tensor to be slice + * \param[in] start start index of every axis of to be sliced + * \param[in] end end index of every axis of to be sliced + * \param[in] step step of every axis of to be sliced, if nullptr, step will be + * 1 + * \param[in] size the number axis to be sliced + * \param[out] sliced_tensor the result tensor sliced from the origin tensor + */ +LITE_API int LITE_tensor_slice(const LiteTensor tensor, const size_t* start, + const size_t* end, const size_t* step, + size_t size, LiteTensor* slice_tensor); + +/** + * \brief fill zero to the tensor + * \param[in] tensor The Tensor to be memset + */ +LITE_API int LITE_tensor_fill_zero(LiteTensor tensor); + +/** + * \brief copy tensor form other tensor + * \param[out] dst_tensor The Tensor to copy into + * \param[in] src_tensor The Tensor to copy from + */ +LITE_API int LITE_tensor_copy(LiteTensor dst_tensor, + const LiteTensor src_tensor); + +/** + * \brief share memory form other tensor + * \param[out] dst_tensor The Tensor to share into + * \param[in] src_tensor The Tensor to be shared + */ +LITE_API int LITE_tensor_share_memory_with(LiteTensor dst_tensor, + const LiteTensor src_tensor); + +/** + * \brief get the memory pointer of a Tensor object. + * \param[in] tensor The input Tensor + * \param[out] data a pointer to void pointer + */ +LITE_API int LITE_get_tensor_memory(const LiteTensor tensor, void** data); + +/** + * \brief get the memory pointer of a Tensor object. + * \param[in] tensor The input Tensor + * \param[in] index The coordinate in the tensor + * \param[in] size The lenght of coordinate + * \param[out] data a pointer to void pointer + */ +LITE_API int LITE_get_tensor_memory_with_index(const LiteTensor tensor, + const size_t* index, size_t size, + void** data); + +/** + * \brief get the tensor capacity in byte of a Tensor object. + * \param[in] tensor The input Tensor + * \param[out] size_ptr a pointer to the return size + + */ +LITE_API int LITE_get_tensor_total_size_in_byte(const LiteTensor tensor, + size_t* size); + +/** + * \brief get the tensor layout of a Tensor object. + * \param[in] tensor The input Tensor + * \param[out] layout_ptr a pointer will be write with the layout of the tensor + */ +LITE_API int LITE_get_tensor_layout(const LiteTensor tensor, + LiteLayout* layout); + +/** + * \brief get the tensor device of a Tensor object. + * \param[in] tensor The input Tensor + * \param[out] device_ptr a pointer will be write with the device of the tensor + */ +LITE_API int LITE_get_tensor_device_type(const LiteTensor tensor, + LiteDeviceType* device_type); + +/** + * \brief get the tensor device id of a Tensor object. + * \param[in] tensor The input Tensor + * \param[out] device_id a pointer will be write with the device id of the + * tensor + */ +LITE_API int LITE_get_tensor_device_id(const LiteTensor tensor, int* device_id); + +/** + * \brief whether the tensor is is_pinned_host. + * \param[in] tensor The input Tensor + * \param[out] is_pinned_host_ptr a int pointer will be write with whether the + * tensor is pinned host + */ +LITE_API int LITE_is_pinned_host(const LiteTensor tensor, int* is_pinned_host); + +/** + * \brief whether the tensor memory is continue. 
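+ *
+ * Typical query pattern for an existing tensor (sketch):
+ * \code
+ *     int is_continue = 0;
+ *     LITE_is_memory_continue(tensor, &is_continue);
+ * \endcode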
+ * \param[in] tensor The input Tensor + * \param[out] is_continue a int pointer will be write with whether the + * tensor continue + */ +LITE_API int LITE_is_memory_continue(const LiteTensor tensor, int* is_continue); +/** + * \brief concat the inputs tensor to one big tensor + * \param[in] tensors ptr The input Tensors + * \param[in] nr_tensors number input Tensor + * \param[in] dim the dim concat act on + * \param[in] dst_device the device type of result tensor, when + * LITE_DEVICE_DEFAULT, the result tensor device type will get from the first + * tensor + * \param[in] device_id the device id of result tensor, when -1, the result + * tensor device id will get from the first tensor + * \param[out] result_tensor the result tensor after concat + */ +LITE_API int LITE_tensor_concat(LiteTensor* tensors, int nr_tensor, int dim, + LiteDeviceType dst_device, int device_id, + LiteTensor* result_tensor); + +#ifdef __cplusplus +} +#endif +#endif +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/lite-c/src/common.h b/lite/lite-c/src/common.h new file mode 100644 index 0000000000000000000000000000000000000000..47208c386fa12c08d55d6737fe62e92fc3f1d16f --- /dev/null +++ b/lite/lite-c/src/common.h @@ -0,0 +1,73 @@ +/** + * \file lite-c/src/common.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#ifndef LITE_C_COMMON_H_ +#define LITE_C_COMMON_H_ + +#include "../src/misc.h" +#include "lite-c/network_c.h" +#include "lite-c/tensor_c.h" +#include "lite/network.h" + +#include +#include + +//! convert c Layout to lite::Layout +lite::Layout convert_to_layout(const LiteLayout& layout); + +//! convert lite::Layout to C Layout +LiteLayout convert_to_clayout(const lite::Layout& layout); + +//! convert c config to lite::config +lite::Config convert_to_lite_config(const LiteConfig c_config); + +//! convert C NetworkIO io to lite::NetworkIO +lite::NetworkIO convert_to_lite_io(const LiteNetworkIO c_network_io); + +/*! + * \brief handle exception + * \param e the exception + * \return the return value of the error + */ +int LiteHandleException(const std::exception& e); +#if LITE_ENABLE_EXCEPTION +/*! \brief macro to guard a function */ +#define LITE_CAPI_BEGIN() try { +/*! \brief every function starts with LITE_CAPI_BEGIN(); + * ends with LITE_CAPI_END or LITE_CAPI_END_WITH_STMS + */ +#define LITE_CAPI_END() \ + } \ + catch (std::exception & _except_) { \ + return LiteHandleException(_except_); \ + } \ + return 0; +#else +/*! \brief macro to guard a function */ +#define LITE_CAPI_BEGIN() { +/*! \brief every function starts with LITE_CAPI_BEGIN(); + * ends with LITE_CAPI_END or LITE_CAPI_END_WITH_STMS + */ +#define LITE_CAPI_END() \ + } \ + return 0; +#endif +/*! + * \brief catch the exception with stms + */ +#define LITE_CAPI_END_WITH_STMS(_stms) \ + } \ + catch (std::exception & _except_) { \ + _stms; \ + return LiteHandleException(_except_); \ + } \ + return 0; + +#endif +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/lite-c/src/global.cpp b/lite/lite-c/src/global.cpp new file mode 100644 index 0000000000000000000000000000000000000000..51145b784a9d99663a5d5147bb18e2f89c010c2a --- /dev/null +++ b/lite/lite-c/src/global.cpp @@ -0,0 +1,192 @@ +/** + * \file lite-c/src/tensor.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. 
All rights reserved. + */ + +#include "lite/global.h" +#include "common.h" +#include "lite-c/global_c.h" + +#include +#include + +namespace { + +class ErrorMsg { +public: + std::string& get_error_msg() { return error_msg; } + void set_error_msg(const std::string& msg) { error_msg = msg; } + +private: + std::string error_msg; +}; +ErrorMsg& get_global_error() { + static thread_local ErrorMsg error_msg; + return error_msg; +} +} // namespace + +int LiteHandleException(const std::exception& e) { + get_global_error().set_error_msg(e.what()); + return -1; +} + +const char* LITE_get_last_error() { + return get_global_error().get_error_msg().c_str(); +} + +int LITE_get_version(int* major, int* minor, int* patch) { + LITE_ASSERT(major && minor && patch, "The ptr pass to LITE api is null"); + lite::get_version(*major, *minor, *patch); + return 0; +} + +int LITE_get_device_count(LiteDeviceType device_type, size_t* count) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(count, "The ptr pass to LITE api is null"); + *count = lite::get_device_count(device_type); + LITE_CAPI_END(); +} + +int LITE_try_coalesce_all_free_memory(){ + LITE_CAPI_BEGIN(); + lite::try_coalesce_all_free_memory(); + LITE_CAPI_END(); +} + +int LITE_register_decryption_and_key(const char* decrypt_name, + const LiteDecryptionFunc func, + const uint8_t* key_data, size_t key_size) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(decrypt_name && key_data && func, + "The ptr pass to LITE api is null"); + std::vector key; + for (size_t i = 0; i < key_size; i++) { + key.push_back(key_data[i]); + } + auto decrypt_func = [func](const void* input_data, size_t input_size, + const std::vector& key) { + auto size = + func(input_data, input_size, key.data(), key.size(), nullptr); + std::vector output(size, 0); + func(input_data, input_size, key.data(), key.size(), output.data()); + return output; + }; + lite::register_decryption_and_key(decrypt_name, decrypt_func, key); + LITE_CAPI_END(); +} + +int LITE_update_decryption_or_key(const char* decrypt_name, + const LiteDecryptionFunc func, + const uint8_t* key_data, size_t key_size) { + LITE_CAPI_BEGIN(); + std::vector key; + for (size_t i = 0; i < key_size; i++) { + key.push_back(key_data[i]); + } + lite::DecryptionFunc decrypt_func = nullptr; + if (func) { + decrypt_func = [func](const void* input_data, size_t input_size, + const std::vector& key) { + auto size = func(input_data, input_size, key.data(), key.size(), + nullptr); + std::vector output(size, 0); + func(input_data, input_size, key.data(), key.size(), output.data()); + return output; + }; + } + lite::update_decryption_or_key(decrypt_name, decrypt_func, key); + LITE_CAPI_END(); +} + +int LITE_register_parse_info_func(const char* info_type, + const LiteParseInfoFunc parse_func) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(info_type && parse_func, "The ptr pass to LITE api is null"); + auto lite_func = [parse_func]( + const void* info_data, size_t info_size, + const std::string model_name, lite::Config& config, + lite::NetworkIO& network_io, + std::unordered_map& + separate_config_map, + std::string& extra_info) { + LITE_MARK_USED_VAR(extra_info); + size_t nr_threads = 1; + int device_id = 0, is_cpu_inplace_mode = false, use_tensorrt = false; + LiteNetworkIO c_io; + LiteConfig c_config; + auto ret = parse_func(info_data, info_size, model_name.c_str(), + &c_config, &c_io, &device_id, &nr_threads, + &is_cpu_inplace_mode, &use_tensorrt); + config = convert_to_lite_config(c_config); + network_io = convert_to_lite_io(c_io); + if (device_id != 0) { + 
separate_config_map["device_id"] = device_id; + } + if (nr_threads != 1) { + separate_config_map["nr_threads"] = nr_threads; + } + if (is_cpu_inplace_mode != false) { + separate_config_map["is_inplace_mode"] = is_cpu_inplace_mode; + } + if (use_tensorrt != false) { + separate_config_map["use_tensorrt"] = use_tensorrt; + } + return ret; + }; + lite::register_parse_info_func(info_type, lite_func); + LITE_CAPI_END(); +} + +int LITE_set_loader_lib_path(const char* loader_path) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(loader_path, "The ptr pass to LITE api is null"); + lite::set_loader_lib_path(loader_path); + LITE_CAPI_END(); +} + +int LITE_set_persistent_cache(const char* cache_path, int always_sync) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(cache_path, "The ptr pass to LITE api is null"); + lite::set_persistent_cache(cache_path, always_sync); + LITE_CAPI_END(); +} + +int LITE_set_tensor_rt_cache(const char* cache_path) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(cache_path, "The ptr pass to LITE api is null"); + lite::set_tensor_rt_cache(cache_path); + LITE_CAPI_END(); +} + +int LITE_set_log_level(LiteLogLevel level) { + LITE_CAPI_BEGIN(); + lite::set_log_level(level); + LITE_CAPI_END(); +} + +int LITE_get_log_level(LiteLogLevel* level) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(level, "The ptr pass to LITE api is null"); + *level = lite::get_log_level(); + LITE_CAPI_END(); +} + +int LITE_dump_persistent_cache(const char* cache_path) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(cache_path, "The ptr pass to LITE api is null"); + lite::dump_persistent_cache(cache_path); + LITE_CAPI_END(); +} + +int LITE_dump_tensor_rt_cache() { + LITE_CAPI_BEGIN(); + lite::dump_tensor_rt_cache(); + LITE_CAPI_END(); +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/lite-c/src/network.cpp b/lite/lite-c/src/network.cpp new file mode 100644 index 0000000000000000000000000000000000000000..409924287909058d399701b360377a4ab3ea2e23 --- /dev/null +++ b/lite/lite-c/src/network.cpp @@ -0,0 +1,580 @@ +/** + * \file lite-c/src/network.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "lite/network.h" +#include "common.h" +#include "lite-c/network_c.h" + +#include "../../src/network_impl_base.h" + +#include +#include +#include +#include + +//! define a default Options +const LiteOptions default_option = { + .weight_preprocess = false, + .fuse_preprocess = false, + .fake_next_exec = false, + .var_sanity_check_first_run = true, + .const_shape = false, + .force_dynamic_alloc = false, + .force_output_dynamic_alloc = false, + .no_profiling_on_shape_change = false, + .jit_level = 0, + .comp_node_seq_record_level = 0, + .graph_opt_level = 2, + .async_exec_level = 1, + //! layout transform options + .enable_nchw44 = 0, + .enable_nchw44_dot = 0, + .enable_nchw88 = 0, + .enable_nhwcd4 = 0, + .enable_nchw4 = 0, + .enable_nchw32 = 0, + .enable_nchw64 = 0, + +}; + +//! define a default config +LiteConfig default_config_t = {.has_compression = false, + .device_id = -1, + .device_type = LiteDeviceType::LITE_CPU, + .backend = LiteBackend::LITE_DEFAULT, + .bare_model_cryption_name = nullptr, + .options = default_option}; +LiteConfig* default_config() { + return &default_config_t; +} + +//! define a default IO +const LiteIO default_io = {.name = nullptr, + .is_host = true, + .io_type = LiteIOType::LITE_IO_VALUE, + .config_layout = default_layout}; + +//! 
define a default NetworkIO +LiteNetworkIO default_network_io_t = {.inputs = nullptr, + .outputs = nullptr, + .input_size = 0, + .output_size = 0}; +LiteNetworkIO* default_network_io() { + return &default_network_io_t; +} + +namespace { +std::unordered_map>& +get_gloabl_network_holder() { + static thread_local std::unordered_map> + network_holder; + return network_holder; +} + +/*! + * \brief A user-implemented allocator interface + */ +class UserAllocator : public lite::Allocator { +public: + UserAllocator(LiteAllocate allocate_func, LiteFree free_func) + : m_allocator(allocate_func), m_free(free_func) { + LITE_ASSERT(m_allocator && m_free); + } + + //! allocate memory of size in the given device with the given align + void* allocate(LiteDeviceType device_type, int device_id, size_t size, + size_t align) override { + return m_allocator(device_type, device_id, size, align); + } + + //! free the memory pointed by ptr in the given device + void free(LiteDeviceType device_type, int device_id, void* ptr) override { + m_free(device_type, device_id, ptr); + } + +private: + LiteAllocate m_allocator; + LiteFree m_free; +}; +} // namespace + +//! convert c config to lite::config +lite::Config convert_to_lite_config(const LiteConfig c_config) { + lite::Config lite_config; + lite_config.device_type = c_config.device_type; + if (c_config.bare_model_cryption_name) { + lite_config.bare_model_cryption_name = + c_config.bare_model_cryption_name; + } + lite_config.backend = c_config.backend; + lite_config.has_compression = c_config.has_compression; + lite_config.device_id = c_config.device_id; + + lite_config.options.weight_preprocess = c_config.options.weight_preprocess; + lite_config.options.fuse_preprocess = c_config.options.fuse_preprocess; + lite_config.options.fake_next_exec = c_config.options.fake_next_exec; + lite_config.options.var_sanity_check_first_run = + c_config.options.var_sanity_check_first_run; + lite_config.options.const_shape = c_config.options.const_shape; + lite_config.options.force_dynamic_alloc = c_config.options.const_shape; + lite_config.options.force_output_dynamic_alloc = + c_config.options.force_output_dynamic_alloc; + lite_config.options.no_profiling_on_shape_change = + c_config.options.no_profiling_on_shape_change; + lite_config.options.jit_level = c_config.options.jit_level; + lite_config.options.comp_node_seq_record_level = + c_config.options.comp_node_seq_record_level; + lite_config.options.graph_opt_level = c_config.options.graph_opt_level; + lite_config.options.async_exec_level = c_config.options.async_exec_level; + + lite_config.options.enable_nchw44 = c_config.options.enable_nchw44; + lite_config.options.enable_nchw44_dot = c_config.options.enable_nchw44_dot; + lite_config.options.enable_nchw88 = c_config.options.enable_nchw88; + lite_config.options.enable_nchw4 = c_config.options.enable_nchw4; + lite_config.options.enable_nhwcd4 = c_config.options.enable_nhwcd4; + lite_config.options.enable_nchw32 = c_config.options.enable_nchw32; + lite_config.options.enable_nchw64 = c_config.options.enable_nchw64; + + return lite_config; +} + +//! 
convert C NetworkIO io to lite::NetworkIO +lite::NetworkIO convert_to_lite_io(const LiteNetworkIO c_network_io) { + lite::NetworkIO network_io; + for (size_t i = 0; i < c_network_io.input_size; i++) { + LiteIO* c_io = c_network_io.inputs + i; + LITE_ASSERT(c_io->name, "input name of io tensor must set."); + network_io.inputs.push_back( + {c_io->name, static_cast(c_io->is_host), c_io->io_type, + convert_to_layout(c_io->config_layout)}); + } + for (size_t i = 0; i < c_network_io.output_size; i++) { + LiteIO* c_io = c_network_io.outputs + i; + LITE_ASSERT(c_io->name, "output name of io tensor must set."); + network_io.outputs.push_back( + {c_io->name, static_cast(c_io->is_host), c_io->io_type, + convert_to_layout(c_io->config_layout)}); + } + return network_io; +} + +int LITE_make_default_network(LiteNetwork* network) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + auto lite_network = std::make_shared(); + get_gloabl_network_holder()[lite_network.get()] = lite_network; + *network = lite_network.get(); + LITE_CAPI_END(); +} + +int LITE_make_network(LiteNetwork* network, const LiteConfig config, + const LiteNetworkIO network_io) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + auto lite_network = std::make_shared( + convert_to_lite_config(config), convert_to_lite_io(network_io)); + get_gloabl_network_holder()[lite_network.get()] = lite_network; + *network = lite_network.get(); + LITE_CAPI_END(); +} + +int LITE_make_network_config(LiteNetwork* network, const LiteConfig config) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + auto lite_network = + std::make_shared(convert_to_lite_config(config)); + get_gloabl_network_holder()[lite_network.get()] = lite_network; + *network = lite_network.get(); + LITE_CAPI_END(); +} + +int LITE_load_model_from_mem(LiteNetwork network, void* model_mem, + size_t size) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + LITE_ASSERT(model_mem, "The model memory pass to LITE api is null"); + static_cast(network)->load_model(model_mem, size); + LITE_CAPI_END(); +} + +int LITE_load_model_from_path(LiteNetwork network, const char* model_path) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + LITE_ASSERT(model_path, "The model path pass to LITE api is null"); + static_cast(network)->load_model(model_path); + LITE_CAPI_END(); +} + +int LITE_destroy_network(LiteNetwork network) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + get_gloabl_network_holder().erase(network); + LITE_CAPI_END(); +} + +int LITE_forward(const LiteNetwork network) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + static_cast(network)->forward(); + LITE_CAPI_END(); +} + +int LITE_wait(const LiteNetwork network) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + static_cast(network)->wait(); + LITE_CAPI_END(); +} + +int LITE_get_io_tensor(LiteNetwork network, const char* io_name, + LiteTensorPhase phase, LiteTensor* tensor) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + auto io_tensor = + static_cast(network)->get_io_tensor(io_name, phase); + *tensor = io_tensor.get(); + LITE_CAPI_END(); +} + +int LITE_get_input_name(const LiteNetwork network, size_t index, + const char** name) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network && name, "The network pass to 
LITE api is null"); + *name = lite::NetworkHelper::implement(static_cast(network)) + ->get_input_name(index); + LITE_CAPI_END(); +} + +int LITE_get_output_name(const LiteNetwork network, size_t index, + const char** name) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + LITE_ASSERT(name, "The name ptr pass to LITE api is null"); + *name = lite::NetworkHelper::implement(static_cast(network)) + ->get_output_name(index); + LITE_CAPI_END(); +} + +int LITE_get_all_input_name(const LiteNetwork network, size_t* size, + const char** name) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + auto&& names = + lite::NetworkHelper::implement(static_cast(network)) + ->get_all_input_name(); + if (size) + *size = names.size(); + if (name) { + for (auto in_name : names) { + *name = in_name; + name++; + } + } + LITE_CAPI_END(); +} + +int LITE_get_all_output_name(const LiteNetwork network, size_t* size, + const char** name) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + auto&& names = + lite::NetworkHelper::implement(static_cast(network)) + ->get_all_output_name(); + if (size) + *size = names.size(); + if (name) { + for (auto in_name : names) { + *name = in_name; + name++; + } + } + LITE_CAPI_END(); +} + +int LITE_set_device_id(LiteNetwork network, int device_id) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + static_cast(network)->set_device_id(device_id); + LITE_CAPI_END(); +} + +int LITE_get_device_id(const LiteNetwork network, int* device_id) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + LITE_ASSERT(device_id, "The device_id pass to LITE api is null"); + *device_id = static_cast(network)->get_device_id(); + LITE_CAPI_END(); +} + +int LITE_set_stream_id(LiteNetwork network, int stream_id) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + static_cast(network)->set_stream_id(stream_id); + LITE_CAPI_END(); +} + +int LITE_get_stream_id(const LiteNetwork network, int* stream_id) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + LITE_ASSERT(stream_id, "The stream_id pass to LITE api is null"); + *stream_id = static_cast(network)->get_stream_id(); + LITE_CAPI_END(); +} + +int LITE_get_model_extra_info(const LiteNetwork network, const char** info, + int* info_size) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + LITE_ASSERT(info_size, "The info and info_size are all null"); + auto& extra_info = + static_cast(network)->get_model_extra_info(); + *info_size = extra_info.size(); + *info = extra_info.c_str(); + LITE_MARK_USED_VAR(info); + LITE_CAPI_END(); +} + +int LITE_get_device_type(const LiteNetwork network, + LiteDeviceType* device_type) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + LITE_ASSERT(device_type, "The device_type pass to LITE api is null"); + *device_type = static_cast(network)->get_device_type(); + LITE_CAPI_END(); +} + +int LITE_set_async_callback(LiteNetwork network, + const LiteAsyncCallback async_callback) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + LITE_ASSERT(async_callback, "The ptr pass to LITE api is null"); + static_cast(network)->set_async_callback( + std::move(async_callback)); + LITE_CAPI_END(); +} + +int LITE_set_start_callback(LiteNetwork network, + const LiteStartCallback 
start_callback) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + auto lite_start_callback = + [start_callback]( + const std::unordered_map< + std::string, + std::pair>>& + inputs_map) -> void { + std::vector ios; + std::vector io_tensors; + size_t nr_io = 0; + for (const auto& io : inputs_map) { + nr_io++; + auto&& lite_io = io.second.first; + ios.push_back({lite_io.name.c_str(), lite_io.is_host, + lite_io.io_type, + convert_to_clayout(lite_io.config_layout)}); + io_tensors.push_back(io.second.second.get()); + } + start_callback(ios.data(), io_tensors.data(), nr_io); + }; + static_cast(network)->set_start_callback( + lite_start_callback); + LITE_CAPI_END(); +} + +int LITE_set_finish_callback(LiteNetwork network, + const LiteFinishCallback finish_callback) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + auto lite_finish_callback = + [finish_callback]( + const std::unordered_map< + std::string, + std::pair>>& + outputs_map) -> void { + std::vector ios; + std::vector io_tensors; + size_t nr_io = 0; + for (const auto& io : outputs_map) { + nr_io++; + auto&& lite_io = io.second.first; + ios.push_back({lite_io.name.c_str(), lite_io.is_host, + lite_io.io_type, + convert_to_clayout(lite_io.config_layout)}); + io_tensors.push_back(io.second.second.get()); + } + finish_callback(ios.data(), io_tensors.data(), nr_io); + }; + static_cast(network)->set_finish_callback( + lite_finish_callback); + LITE_CAPI_END(); +} + +int LITE_enable_profile_performance(LiteNetwork network, + const char* profile_json_file_path) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + static_cast(network)->enable_profile_performance( + profile_json_file_path); + LITE_CAPI_END(); +} + +int LITE_is_cpu_inplace_mode(const LiteNetwork network, + int* is_cpu_inplace_mode) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network && is_cpu_inplace_mode, + "The network pass to LITE api is null"); + std::shared_ptr network_shared{ + static_cast(network), [](void*) {}}; + *is_cpu_inplace_mode = lite::Runtime::is_cpu_inplace_mode(network_shared); + LITE_CAPI_END(); +} + +int LITE_get_cpu_threads_number(const LiteNetwork network, size_t* nr_threads) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + LITE_ASSERT(nr_threads, "The ptr pass to LITE api is null"); + std::shared_ptr network_shared{ + static_cast(network), [](void*) {}}; + *nr_threads = lite::Runtime::get_cpu_threads_number(network_shared); + LITE_CAPI_END(); +} + +int LITE_set_cpu_inplace_mode(LiteNetwork network) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + std::shared_ptr network_shared{ + static_cast(network), [](void*) {}}; + lite::Runtime::set_cpu_inplace_mode(network_shared); + LITE_CAPI_END(); +} + +int LITE_use_tensorrt(LiteNetwork network){ + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + std::shared_ptr network_shared{ + static_cast(network), [](void*) {}}; + lite::Runtime::use_tensorrt(network_shared); + LITE_CAPI_END(); +} + +int LITE_set_cpu_threads_number(LiteNetwork network, size_t nr_threads) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + std::shared_ptr network_shared{ + static_cast(network), [](void*) {}}; + lite::Runtime::set_cpu_threads_number(network_shared, nr_threads); + LITE_CAPI_END(); +} + +int LITE_set_network_algo_policy(LiteNetwork network, + LiteAlgoSelectStrategy strategy) { + 
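+    // Guarded C wrapper: the raw handle is wrapped in a non-owning
+    // std::shared_ptr (empty deleter) so it can be forwarded to the C++
+    // helper lite::Runtime::set_network_algo_policy below.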
LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + std::shared_ptr network_shared{ + static_cast(network), [](void*) {}}; + lite::Runtime::set_network_algo_policy(network_shared, strategy); + LITE_CAPI_END(); +} + +int LITE_set_network_algo_fastrun_config(LiteNetwork network, + unsigned int shared_batch_size, + int binary_equal_between_batch) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + std::shared_ptr network_shared{ + static_cast(network), [](void*) {}}; + lite::Runtime::set_network_algo_policy( + network_shared, LiteAlgoSelectStrategy(0), shared_batch_size, + binary_equal_between_batch); + LITE_CAPI_END(); +} + +int LITE_set_network_algo_workspace_limit(LiteNetwork network, + size_t workspace_limit) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + std::shared_ptr network_shared{ + static_cast(network), [](void*) {}}; + lite::Runtime::set_network_algo_workspace_limit(network_shared, + workspace_limit); + LITE_CAPI_END(); +} + +int LITE_set_runtime_thread_affinity( + LiteNetwork network, + const LiteThreadAffinityCallback thread_affinity_callback) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + std::shared_ptr network_shared{ + static_cast(network), [](void*) {}}; + lite::Runtime::set_runtime_thread_affinity( + network_shared, std::move(thread_affinity_callback)); + LITE_CAPI_END(); +} + +int LITE_set_memory_allocator(LiteNetwork network, + const LiteAllocate allocate_fun, + const LiteFree free_fun) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network && allocate_fun && free_fun, + "The ptr pass to LITE api is null"); + std::shared_ptr network_shared{ + static_cast(network), [](void*) {}}; + lite::Runtime::set_memory_allocator( + network_shared, + std::make_shared(allocate_fun, free_fun)); + LITE_CAPI_END(); +} + +int LITE_enable_io_txt_dump(LiteNetwork network, const char* io_txt_out_file) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + std::shared_ptr network_shared{ + static_cast(network), [](void*) {}}; + lite::Runtime::enable_io_txt_dump(network_shared, io_txt_out_file); + LITE_CAPI_END(); +} + +int LITE_enable_io_bin_dump(LiteNetwork network, const char* io_bin_out_dir) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + std::shared_ptr network_shared{ + static_cast(network), [](void*) {}}; + lite::Runtime::enable_io_bin_dump(network_shared, io_bin_out_dir); + LITE_CAPI_END(); +} + +int LITE_shared_weight_with_network(LiteNetwork dst_network, + const LiteNetwork src_network) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(dst_network && src_network, + "The network pass to LITE api is null"); + const std::shared_ptr src_shared_net{ + static_cast(src_network), [](void*) {}}; + std::shared_ptr dst_shared_net{ + static_cast(dst_network), [](void*) {}}; + lite::Runtime::shared_weight_with_network(dst_shared_net, src_shared_net); + LITE_CAPI_END(); +} + +int LITE_share_runtime_memroy(LiteNetwork dst_network, + LiteNetwork src_network) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(src_network && dst_network, + "The network pass to LITE api is null"); + std::shared_ptr src_shared{ + static_cast(src_network), [](void*) {}}; + std::shared_ptr dst_shared{ + static_cast(dst_network), [](void*) {}}; + lite::Runtime::share_runtime_memory_with(dst_shared, src_shared); + LITE_CAPI_END(); +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git 
a/lite/lite-c/src/tensor.cpp b/lite/lite-c/src/tensor.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6ed8499398b8f1cbdebe252d33690401402201e2 --- /dev/null +++ b/lite/lite-c/src/tensor.cpp @@ -0,0 +1,257 @@ +/** + * \file lite-c/src/tensor.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "lite/tensor.h" +#include "../../src/tensor_impl_base.h" +#include "common.h" +#include "lite-c/tensor_c.h" +#include +#include +#include + +const LiteLayout default_layout = {.shapes = {0, 0, 0, 0, 0}, + .ndim = 0, + .data_type = LiteDataType::LITE_FLOAT}; + +const LiteTensorDesc default_desc = {.is_pinned_host = false, + .layout = default_layout, + .device_type = LiteDeviceType::LITE_CPU, + .device_id = 0}; +namespace { +std::unordered_map>& +get_global_tensor_holder() { + static thread_local std::unordered_map> + global_holder; + return global_holder; +} +std::unordered_map& +get_global_tensor_attr_holder() { + static thread_local std::unordered_map + global_holder; + return global_holder; +} +} // namespace + +//! convert the lite::Layout to Layout +LiteLayout convert_to_clayout(const lite::Layout& layout) { + LiteLayout clayout; + clayout.ndim = layout.ndim; + LITE_ASSERT(layout.ndim < LAYOUT_MAX_DIM, "layout ndim is to large"); + for (size_t i = 0; i < layout.ndim; i++) { + clayout.shapes[i] = layout.shapes[i]; + } + clayout.data_type = layout.data_type; + return clayout; +} + +//! convert the C Layout to lite::Layout +lite::Layout convert_to_layout(const LiteLayout& clayout) { + lite::Layout layout; + layout.ndim = clayout.ndim; + LITE_ASSERT(layout.ndim < LAYOUT_MAX_DIM, "clayout ndim is to large"); + for (size_t i = 0; i < layout.ndim; i++) { + layout.shapes[i] = clayout.shapes[i]; + } + layout.data_type = clayout.data_type; + return layout; +} + +int LITE_make_tensor(const LiteTensorDesc tensor_describe, LiteTensor* tensor) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor, "The tensor pass to LITE_make_tensor is null"); + lite::Layout layout = convert_to_layout(tensor_describe.layout); + auto lite_tensor = std::make_shared( + tensor_describe.device_id, tensor_describe.device_type, layout, + tensor_describe.is_pinned_host); + get_global_tensor_holder()[lite_tensor.get()] = lite_tensor; + *tensor = lite_tensor.get(); + LITE_CAPI_END(); +} + +int LITE_destroy_tensor(LiteTensor tensor) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); + get_global_tensor_holder().erase(tensor); + LITE_CAPI_END(); +} + +int LITE_set_tensor_layout(LiteTensor tensor, const LiteLayout layout) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); + auto tensor_ptr = static_cast(tensor); + tensor_ptr->set_layout(convert_to_layout(layout)); + LITE_CAPI_END(); +} + +int LITE_reset_tensor_memory(LiteTensor tensor, void* prepared_data, + size_t data_length_in_byte) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); + LITE_ASSERT(prepared_data, "The prepared_data pass to LITE c_api is null"); + static_cast(tensor)->reset(prepared_data, + data_length_in_byte); + LITE_CAPI_END(); +} + +int LITE_reset_tensor(LiteTensor tensor, const LiteLayout layout, + void* prepared_data) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); + LITE_ASSERT(prepared_data, "The prepared_data pass to LITE c_api is null"); + 
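+    // Hand the user buffer and its layout to lite::Tensor::reset(); lite does
+    // not take ownership, the caller keeps managing this memory.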
static_cast(tensor)->reset(prepared_data, + convert_to_layout(layout)); + LITE_CAPI_END(); +} + +int LITE_tensor_reshape(LiteTensor tensor, const int* shape, int size) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor && shape, "The tensor pass to LITE c_api is null"); + std::vector shapes; + for (int i = 0; i < size; i++) { + shapes.push_back(shape[i]); + } + static_cast(tensor)->reshape(shapes); + LITE_CAPI_END(); +} + +int LITE_tensor_slice(const LiteTensor tensor, const size_t* start, + const size_t* end, const size_t* step, size_t size, + LiteTensor* slice_tensor) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor && start && end && slice_tensor, + "The tensor pass to LITE c_api is null"); + std::vector starts, ends, steps; + for (size_t i = 0; i < size; i++) { + starts.push_back(start[i]); + ends.push_back(end[i]); + if (step) { + steps.push_back(step[i]); + } + } + auto ret_tensor = + static_cast(tensor)->slice(starts, ends, steps); + get_global_tensor_holder()[ret_tensor.get()] = ret_tensor; + *slice_tensor = ret_tensor.get(); + LITE_CAPI_END(); +} + +int LITE_tensor_fill_zero(LiteTensor tensor) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); + static_cast(tensor)->fill_zero(); + LITE_CAPI_END(); +} + +int LITE_tensor_copy(LiteTensor dst_tensor, const LiteTensor src_tensor) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(dst_tensor && src_tensor, + "The tensor pass to LITE c_api is null"); + static_cast(dst_tensor) + ->copy_from(*static_cast(src_tensor)); + LITE_CAPI_END(); +} + +int LITE_tensor_share_memory_with(LiteTensor dst_tensor, + const LiteTensor src_tensor) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(dst_tensor && src_tensor, + "The tensor pass to LITE c_api is null"); + static_cast(dst_tensor) + ->share_memory_with(*static_cast(src_tensor)); + LITE_CAPI_END(); +} + +int LITE_get_tensor_memory(const LiteTensor tensor, void** data) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); + LITE_ASSERT(data, "The data ptr pass to LITE c_api is null"); + *data = static_cast(tensor)->get_memory_ptr(); + LITE_CAPI_END(); +} + +int LITE_get_tensor_memory_with_index(const LiteTensor tensor, + const size_t* index, size_t size, + void** data) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor && index && data, + "The tensor pass to LITE c_api is null"); + std::vector index_v; + for (size_t i = 0; i < size; i++) { + index_v.push_back(index[i]); + } + *data = static_cast(tensor)->get_memory_ptr(index_v); + LITE_CAPI_END(); +} + +int LITE_get_tensor_total_size_in_byte(const LiteTensor tensor, size_t* size) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); + LITE_ASSERT(size, "The size ptr pass to LITE c_api is null"); + *size = static_cast(tensor)->get_tensor_total_size_in_byte(); + LITE_CAPI_END(); +} + +int LITE_get_tensor_layout(const LiteTensor tensor, LiteLayout* layout) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); + LITE_ASSERT(layout, "The layout ptr pass to LITE c_api is null"); + *layout = convert_to_clayout( + static_cast(tensor)->get_layout()); + LITE_CAPI_END(); +} + +int LITE_get_tensor_device_type(const LiteTensor tensor, + LiteDeviceType* device_type) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); + LITE_ASSERT(device_type, "The device ptr pass to LITE c_api is null"); + *device_type = static_cast(tensor)->get_device_type(); + LITE_CAPI_END(); +} + +int LITE_get_tensor_device_id(const LiteTensor tensor, int* device_id) { + 
LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor && device_id, "The tensor pass to LITE c_api is null"); + *device_id = static_cast(tensor)->get_device_id(); + LITE_CAPI_END(); +} + +int LITE_is_pinned_host(const LiteTensor tensor, int* is_pinned_host) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); + LITE_ASSERT(is_pinned_host, + "The is_pinned_host ptr pass to LITE c_api is null"); + *is_pinned_host = static_cast(tensor)->is_pinned_host(); + LITE_CAPI_END(); +} + +int LITE_is_memory_continue(const LiteTensor tensor, int* is_continue) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); + LITE_ASSERT(is_continue, "The is_continue ptr pass to LITE c_api is null"); + *is_continue = static_cast(tensor)->is_continue_memory(); + LITE_CAPI_END(); +} + +int LITE_tensor_concat(LiteTensor* tensors, int nr_tensor, int dim, + LiteDeviceType dst_device, int device_id, + LiteTensor* result_tensor) { + LITE_CAPI_BEGIN(); + std::vector v_tensors; + for (int i = 0; i < nr_tensor; i++) { + v_tensors.push_back(*static_cast(tensors[i])); + } + auto tensor = + lite::TensorUtils::concat(v_tensors, dim, dst_device, device_id); + get_global_tensor_holder()[tensor.get()] = tensor; + *result_tensor = tensor.get(); + LITE_CAPI_END() +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/pylite/megenginelite/__init__.py b/lite/pylite/megenginelite/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c9e52af7b0737de25971d23de8b5e7c8a46c18c1 --- /dev/null +++ b/lite/pylite/megenginelite/__init__.py @@ -0,0 +1,12 @@ +# -*- coding: utf-8 -*- +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + +from .base import * +from .global_setting import * +from .network import * +from .struct import * +from .tensor import * +from .utils import * diff --git a/lite/pylite/megenginelite/base.py b/lite/pylite/megenginelite/base.py new file mode 100644 index 0000000000000000000000000000000000000000..f29718b464e38d11acb52a9359967b6ceea1e094 --- /dev/null +++ b/lite/pylite/megenginelite/base.py @@ -0,0 +1,152 @@ +# -*- coding: utf-8 -*- +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
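+# This module locates and loads the lite shared library through ctypes and
+# registers the C API signatures used by the other megenginelite modules.
+# Typical use (a sketch only; the packaged libs/liblite* library, or a path
+# given via the LITE_LIB_PATH environment variable, is assumed to exist):
+#   from megenginelite import version, set_log_level
+#   print(version)       # e.g. "1.0.0", queried through LITE_get_version
+#   set_log_level(2)     # 2 maps to LiteLogLevel.WARN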
+ +import ctypes +import glob +import logging +import os +import sys +from ctypes import * + +if sys.platform == "win32": + lib_path = os.path.join(os.path.dirname(__file__), "libs") + dll_paths = list(filter(os.path.exists, [lib_path,])) + assert len(dll_paths) > 0 + + kernel32 = ctypes.WinDLL("kernel32.dll", use_last_error=True) + has_load_library_attr = hasattr(kernel32, "AddDllDirectory") + old_error_mode = kernel32.SetErrorMode(0x0001) + + kernel32.LoadLibraryW.restype = ctypes.c_void_p + if has_load_library_attr: + kernel32.AddDllDirectory.restype = ctypes.c_void_p + kernel32.LoadLibraryExW.restype = ctypes.c_void_p + + for dll_path in dll_paths: + if sys.version_info >= (3, 8): + os.add_dll_directory(dll_path) + elif has_load_library_attr: + res = kernel32.AddDllDirectory(dll_path) + if res is None: + err = ctypes.WinError(ctypes.get_last_error()) + err.strerror += ' Error adding "{}" to the DLL search PATH.'.format( + dll_path + ) + raise err + else: + print("WARN: python or OS env have some issue, may load DLL failed!!!") + + import glob + + dlls = glob.glob(os.path.join(lib_path, "*.dll")) + path_patched = False + for dll in dlls: + is_loaded = False + if has_load_library_attr: + res = kernel32.LoadLibraryExW(dll, None, 0x00001100) + last_error = ctypes.get_last_error() + if res is None and last_error != 126: + err = ctypes.WinError(last_error) + err.strerror += ' Error loading "{}" or one of its dependencies.'.format( + dll + ) + raise err + elif res is not None: + is_loaded = True + if not is_loaded: + if not path_patched: + os.environ["PATH"] = ";".join(dll_paths + [os.environ["PATH"]]) + path_patched = True + res = kernel32.LoadLibraryW(dll) + if res is None: + err = ctypes.WinError(ctypes.get_last_error()) + err.strerror += ' Error loading "{}" or one of its dependencies.'.format( + dll + ) + raise err + + kernel32.SetErrorMode(old_error_mode) + + +class _LiteCLib: + def __init__(self): + cwd = os.getcwd() + package_dir = os.path.dirname(os.path.realpath(__file__)) + debug_path = os.getenv("LITE_LIB_PATH") + os.chdir(package_dir) + lite_libs = glob.glob("libs/liblite*") + os.chdir(cwd) + + if debug_path is None: + assert len(lite_libs) == 1 + self._lib = CDLL(os.path.join(package_dir, lite_libs[0])) + else: + self._lib = CDLL(debug_path) + self._register_api( + "LITE_get_version", [POINTER(c_int), POINTER(c_int), POINTER(c_int)] + ) + self.lib.LITE_get_version.restype = None + self._register_api("LITE_set_log_level", [c_int]) + self._register_api("LITE_get_log_level", []) + self._register_api("LITE_get_last_error", [], False) + self.lib.LITE_get_last_error.restype = c_char_p + + def _errcheck(self, result, func, args): + if result: + error = self.lib.LITE_get_last_error() + msg = error.decode("utf-8") + logging.error("{}".format(msg)) + raise RuntimeError("{}".format(msg)) + return result + + def _register_api(self, api_name, arg_types, error_check=True): + func = getattr(self.lib, api_name) + func.argtypes = arg_types + func.restype = c_int + if error_check: + func.errcheck = self._errcheck + + @property + def lib(self): + return self._lib + + @property + def version(self): + major = c_int() + minor = c_int() + patch = c_int() + self.lib.LITE_get_version(byref(major), byref(minor), byref(patch)) + return "{}.{}.{}".format(major.value, minor.value, patch.value) + + def set_log_level(self, level): + self.lib.LITE_set_log_level(level) + + def get_log_level(self): + return self.lib.LITE_get_log_level() + + +_lib = _LiteCLib() +version = _lib.version +set_log_level = 
_lib.set_log_level +get_log_level = _lib.get_log_level + +_Cnetwork = c_void_p +_Ctensor = c_void_p + + +class _LiteCObjMetaClass(type): + """metaclass for lite object""" + + def __new__(cls, name, bases, attrs): + for api in attrs["_api_"]: + _lib._register_api(*api) + del attrs["_api_"] + attrs["_lib"] = _lib.lib + return super().__new__(cls, name, bases, attrs) + + +class _LiteCObjBase(metaclass=_LiteCObjMetaClass): + _api_ = [] diff --git a/lite/pylite/megenginelite/global_setting.py b/lite/pylite/megenginelite/global_setting.py new file mode 100644 index 0000000000000000000000000000000000000000..fe04b3014f53c4af758cd761afc6685d10e4c4b6 --- /dev/null +++ b/lite/pylite/megenginelite/global_setting.py @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + +from ctypes import * + +import numpy as np + +from .base import _Ctensor, _lib, _LiteCObjBase +from .network import * +from .struct import LiteDataType, LiteDeviceType, LiteIOType, Structure +from .tensor import * + +LiteDecryptionFunc = CFUNCTYPE( + c_size_t, c_void_p, c_size_t, POINTER(c_uint8), c_size_t, c_void_p +) + + +class _GlobalAPI(_LiteCObjBase): + """ + get the api from the lib + """ + + _api_ = [ + ("LITE_get_device_count", [c_int, POINTER(c_size_t)]), + ("LITE_try_coalesce_all_free_memory", []), + ( + "LITE_register_decryption_and_key", + [c_char_p, LiteDecryptionFunc, POINTER(c_uint8), c_size_t], + ), + ( + "LITE_update_decryption_or_key", + [c_char_p, c_void_p, POINTER(c_uint8), c_size_t], + ), + ("LITE_set_loader_lib_path", [c_char_p]), + ("LITE_set_persistent_cache", [c_char_p, c_int]), + # ('LITE_set_tensor_rt_cache', [c_char_p]), + ("LITE_dump_persistent_cache", [c_char_p]), + ("LITE_dump_tensor_rt_cache", [c_char_p]), + ] + + +def decryption_func(func): + """the decryption function decorator + :type func: a function accept three array, in_arr, key_arr and out_arr, if out_arr is None, just query the out array lenght in byte + """ + + @CFUNCTYPE(c_size_t, c_void_p, c_size_t, POINTER(c_uint8), c_size_t, c_void_p) + def wrapper(c_in_data, in_length, c_key_data, key_length, c_out_data): + in_arr = np.frombuffer(c_in_data, dtype=np.uint8, count=in_length) + key_arr = np.frombuffer(c_key_data, dtype=np.uint8, count=key_length) + if c_out_data: + out_length = func(in_arr, None) + out_arr = np.frombuffer(c_out_data, dtype=np.uint8, count=out_length) + return func(in_arr, key_arr, out_arr) + # just query the output length + else: + return func(in_arr, key_arr, None) + + return wrapper + + +class LiteGlobal(object): + """ + some global config in lite + """ + + _api = _GlobalAPI()._lib + + @staticmethod + def register_decryption_and_key(decryption_name, decryption_func, key): + c_name = c_char_p(decryption_name.encode("utf-8")) + key_length = len(key) + c_key = (c_uint8 * key_length)(*key) + LiteGlobal._api.LITE_register_decryption_and_key( + c_name, decryption_func, c_key, key_length + ) + + @staticmethod + def update_decryption_key(decryption_name, key): + c_name = c_char_p(decryption_name.encode("utf-8")) + key_length = len(key) + c_key = (c_uint8 * key_length)(*key) + LiteGlobal._api.LITE_update_decryption_or_key(c_name, None, c_key, key_length) + + @staticmethod + def set_loader_lib_path(path): + c_path = c_char_p(path.encode("utf-8")) + LiteGlobal._api.LITE_set_loader_lib_path(c_path) + + @staticmethod + def set_persistent_cache(path, always_sync=False): + c_path = 
c_char_p(path.encode("utf-8")) + LiteGlobal._api.LITE_set_persistent_cache(c_path, always_sync) + + @staticmethod + def set_tensorrt_cache(path): + c_path = c_char_p(path.encode("utf-8")) + LiteGlobal._api.LITE_set_tensorrt_cache(c_path) + + @staticmethod + def dump_persistent_cache(path): + c_path = c_char_p(path.encode("utf-8")) + LiteGlobal._api.LITE_dump_persistent_cache(c_path) + + @staticmethod + def dump_tensorrt_cache(): + LiteGlobal._api.LITE_dump_tensorrt_cache() + + @staticmethod + def get_device_count(device_type): + count = c_size_t() + LiteGlobal._api.LITE_get_device_count(device_type, byref(count)) + return count.value + + @staticmethod + def try_coalesce_all_free_memory(): + LiteGlobal._api.LITE_try_coalesce_all_free_memory() diff --git a/lite/pylite/megenginelite/network.py b/lite/pylite/megenginelite/network.py new file mode 100644 index 0000000000000000000000000000000000000000..856dc757c714117647fdb5476c6b24177789ae53 --- /dev/null +++ b/lite/pylite/megenginelite/network.py @@ -0,0 +1,531 @@ +# -*- coding: utf-8 -*- +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + +from ctypes import * + +import numpy as np + +from .base import _Cnetwork, _Ctensor, _lib, _LiteCObjBase +from .struct import * +from .tensor import * + + +class LiteOptions(Structure): + """ + the inference options will be used to config a network + """ + + _fields_ = [ + ("weight_preprocess", c_int), + ("fuse_preprocess", c_int), + ("fake_next_exec", c_int), + ("var_sanity_check_first_run", c_int), + ("const_shape", c_int), + ("force_dynamic_alloc", c_int), + ("force_output_dynamic_alloc", c_int), + ("no_profiling_on_shape_change", c_int), + ("jit_level", c_int), + ("comp_node_seq_record_level", c_int), + ("graph_opt_level", c_int), + ("async_exec_level", c_int), + # layout transform options + ("enable_nchw44", c_int), + ("enable_nchw44_dot", c_int), + ("enable_nchw88", c_int), + ("enable_nhwcd4", c_int), + ("enable_nchw4", c_int), + ("enable_nchw32", c_int), + ("enable_nchw64", c_int), + ] + + def __init__(self): + self.weight_preprocess = False + self.fuse_preprocess = False + self.fake_next_exec = False + self.var_sanity_check_first_run = True + self.const_shape = False + self.force_dynamic_alloc = False + self.force_output_dynamic_alloc = False + self.no_profiling_on_shape_change = False + self.jit_level = 0 + self.comp_node_seq_record_level = 0 + self.graph_opt_level = 2 + self.async_exec_level = 1 + + def __repr__(self): + data = { + "weight_preprocess": bool(self.weight_preprocess), + "fuse_preprocess": bool(self.fuse_preprocess), + "fake_next_exec": bool(self.fake_next_exec), + "var_sanity_check_first_run": bool(self.var_sanity_check_first_run), + "const_shape": bool(self.const_shape), + "force_dynamic_alloc": bool(self.force_dynamic_alloc), + "force_output_dynamic_alloc": bool(self.force_output_dynamic_alloc), + "no_profiling_on_shape_change": bool(self.no_profiling_on_shape_change), + "jit_level": self.jit_level, + "comp_node_seq_record_level": self.comp_node_seq_record_level, + "graph_opt_level": self.graph_opt_level, + "async_exec_level": self.async_exec_level, + } + return data.__repr__() + + +class LiteConfig(Structure): + """ + Configuration when load and compile the graph + + bare_model_cryption_name: is the bare model cryption method name, bare + model is not pack model info inside + + use_loader_dynamic_param: when model forward with device loader of npu, + 
+ this flag marks whether the loader uses device input or output: set it to
+ a non-zero value when device input or output is used, otherwise set it to
+ zero
+
+ has_compression: flag marking whether the model is compressed; the matching
+ decompression method will be used to read the model
+ """
+
+ _fields_ = [
+ ("has_compression", c_int),
+ ("device_id", c_int),
+ ("device_type", c_int),
+ ("backend", c_int),
+ ("bare_model_cryption_name", c_char_p),
+ ("options", LiteOptions),
+ ]
+
+ def __init__(self, device_type=LiteDeviceType.LITE_CPU, option=None):
+ self.device_type = device_type
+ if option:
+ self.options = option
+ else:
+ self.options = LiteOptions()
+
+ self.bare_model_cryption_name = c_char_p(b"")
+ self.use_loader_dynamic_param = 0
+ self.has_compression = 0
+ self.backend = LiteBackend.LITE_DEFAULT
+
+ def __repr__(self):
+ data = {
+ "has_compression": bool(self.has_compression),
+ "device_id": self.device_id,
+ "device_type": LiteDeviceType(self.device_type),
+ "backend": LiteBackend(self.backend),
+ "bare_model_cryption_name": self.bare_model_cryption_name.decode("utf-8"),
+ "options": self.options,
+ }
+ return data.__repr__()
+
+
+class LiteIO(Structure):
+ """
+ config one network input or output item
+
+ name: the tensor name in the graph corresponding to the IO
+
+ is_host: marks where the input tensor comes from and where the output is
+ copied to. If is_host is true, the input comes from the host and the
+ output is copied back to the host; otherwise the device is used.
+ Sometimes the input comes from the device and the output does not need
+ to be copied to the host. Default is true.
+
+ io_type: the IO type, either SHAPE or VALUE. When SHAPE is set, the tensor
+ value is invalid and only the shape will be set. Default is VALUE.
+
+ config_layout: the layout configured by the user. If another layout is set
+ before forward or fetched after forward, this layout is bypassed; if no
+ other layout is set before forward, this layout takes effect; if it is
+ not set, the model forwards with its original layout. For outputs it is
+ used as a check.
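+
+ A minimal construction sketch (the tensor name "data" is only illustrative):
+
+ input_io = LiteIO("data", is_host=False, io_type=LiteIOType.LITE_IO_VALUE)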
+ """ + + _fields_ = [ + ("name", c_char_p), + ("is_host", c_int), + ("io_type", c_int), + ("config_layout", LiteLayout), + ] + + def __init__( + self, name, is_host=True, io_type=LiteIOType.LITE_IO_VALUE, layout=None + ): + if type(name) == str: + self.name = c_char_p(name.encode("utf-8")) + else: + self.name = c_char_p(name) + + if layout: + self.config_layout = layout + else: + self.config_layout = LiteLayout() + + self.is_host = is_host + self.io_type = io_type + + def __repr__(self): + data = { + "name": self.name, + "is_host": bool(self.is_host), + "io_type": LiteIOType(self.io_type), + "config_layout": self.config_layout, + } + return data.__repr__() + + def __hash__(self): + return hash(self.name) + + +class _LiteNetworkIO(Structure): + """ + the input and output information when load the network + """ + + _fields_ = [ + ("inputs", POINTER(LiteIO)), + ("outputs", POINTER(LiteIO)), + ("input_size", c_size_t), + ("output_size", c_size_t), + ] + + def __init__(self): + self.inputs = POINTER(LiteIO)() + self.outputs = POINTER(LiteIO)() + self.input_size = 0 + self.output_size = 0 + + +class LiteNetworkIO(object): + """ + the input and output information for user to construct _LiteNetWorkIO + """ + + def __init__(self): + self.inputs = [] + self.outputs = [] + + def add_input(self, input_io): + assert isinstance(input_io, LiteIO) + self.inputs.append(input_io) + + def add_output(self, output_io): + assert isinstance(output_io, LiteIO) + self.outputs.append(output_io) + + def _create_network_io(self): + network_io = _LiteNetworkIO() + length = 1 if len(self.inputs) == 0 else len(self.inputs) + self.c_inputs = (LiteIO * length)(*self.inputs) + length = 1 if len(self.outputs) == 0 else len(self.outputs) + self.c_outputs = (LiteIO * length)(*self.outputs) + network_io.inputs = pointer(self.c_inputs[0]) + network_io.outputs = pointer(self.c_outputs[0]) + network_io.input_size = len(self.inputs) + network_io.output_size = len(self.outputs) + return network_io + + def __repr__(self): + data = {"inputs": list(self.inputs), "outputs": list(self.outputs)} + return data.__repr__() + + +LiteAsyncCallback = CFUNCTYPE(c_int) + + +def start_finish_callback(func): + @CFUNCTYPE(c_int, POINTER(LiteIO), POINTER(_Ctensor), c_size_t) + def wrapper(c_ios, c_tensors, size): + ios = {} + for i in range(size): + tensor = LiteTensor() + tensor._tensor = c_tensors[i] + tensor.update() + io = c_ios[i] + ios[io] = tensor + return func(ios) + + return wrapper + + +class _NetworkAPI(_LiteCObjBase): + """ + get the network api from the lib + """ + + _api_ = [ + ("LITE_make_default_network", [POINTER(_Cnetwork)]), + ("LITE_make_network", [POINTER(_Cnetwork), LiteConfig, _LiteNetworkIO]), + ("LITE_load_model_from_mem", [_Cnetwork, c_void_p, c_size_t]), + ("LITE_load_model_from_path", [_Cnetwork, c_char_p]), + ("LITE_shared_weight_with_network", [_Cnetwork, _Ctensor]), + ("LITE_destroy_network", [_Cnetwork]), + ("LITE_forward", [_Cnetwork]), + ("LITE_wait", [_Cnetwork]), + ("LITE_get_io_tensor", [_Cnetwork, c_char_p, c_int, POINTER(_Ctensor)]), + ("LITE_get_input_name", [_Cnetwork, c_size_t, POINTER(c_char_p)]), + ("LITE_get_output_name", [_Cnetwork, c_size_t, POINTER(c_char_p)]), + ("LITE_get_all_input_name", [_Cnetwork, POINTER(c_size_t), POINTER(c_char_p)]), + ("LITE_get_all_output_name", [_Cnetwork, POINTER(c_size_t), POINTER(c_char_p)]), + ("LITE_is_cpu_inplace_mode", [_Cnetwork, POINTER(c_int)]), + ("LITE_get_cpu_threads_number", [_Cnetwork, POINTER(c_size_t)]), + ("LITE_get_device_id", [_Cnetwork, 
POINTER(c_int)]), + ("LITE_set_device_id", [_Cnetwork, c_int]), + ("LITE_set_cpu_inplace_mode", [_Cnetwork]), + ("LITE_use_tensorrt", [_Cnetwork]), + ("LITE_set_cpu_threads_number", [_Cnetwork, c_size_t]), + ("LITE_set_stream_id", [_Cnetwork, c_int]), + ("LITE_get_stream_id", [_Cnetwork, POINTER(c_int)]), + ("LITE_set_network_algo_policy", [_Cnetwork, c_int]), + ("LITE_set_network_algo_fastrun_config", [_Cnetwork, c_int, c_int]), + ("LITE_set_network_algo_workspace_limit", [_Cnetwork, c_size_t]), + ("LITE_share_runtime_memroy", [_Cnetwork, _Cnetwork]), + ("LITE_enable_profile_performance", [_Cnetwork, c_char_p]), + ("LITE_enable_io_txt_dump", [_Cnetwork, c_char_p]), + ("LITE_enable_io_bin_dump", [_Cnetwork, c_char_p]), + ("LITE_set_async_callback", [_Cnetwork, LiteAsyncCallback]), + ("LITE_set_start_callback", [_Cnetwork]), + ("LITE_set_finish_callback", [_Cnetwork]), + ] + + +class LiteNetwork(object): + """ + the network to load a model and forward + """ + + _api = _NetworkAPI()._lib + + def __init__(self, config=None, io=None): + """ + create a network with config and networkio + """ + self._network = _Cnetwork() + + if config: + self.config = config + else: + self.config = LiteConfig() + + if io: + self.network_io = io + else: + self.network_io = LiteNetworkIO() + + c_network_io = self.network_io._create_network_io() + self._api.LITE_make_network(byref(self._network), self.config, c_network_io) + + def __repr__(self): + data = {"config": self.config, "IOs": self.network_io} + return data.__repr__() + + def __del__(self): + self._api.LITE_destroy_network(self._network) + + def load(self, path): + c_path = c_char_p(path.encode("utf-8")) + self._api.LITE_load_model_from_path(self._network, c_path) + + def forward(self): + self._api.LITE_forward(self._network) + + def wait(self): + self._api.LITE_wait(self._network) + + def is_cpu_inplace_mode(self): + """ + whether the network run in cpu inpalce mode + """ + inplace = c_int() + self._api.LITE_is_cpu_inplace_mode(self._network, byref(inplace)) + return bool(inplace.value) + + def enable_cpu_inplace_mode(self): + """ + set cpu forward in inplace mode with which cpu forward only create one + thread + Note: this must be set before the network loaded + """ + self._api.LITE_set_cpu_inplace_mode(self._network) + + def use_tensorrt(self): + """ + Note: this must be set before the network loaded + """ + self._api.LITE_use_tensorrt(self._network) + + @property + def device_id(self): + """ + get the device id + """ + device_id = c_int() + self._api.LITE_get_device_id(self._network, byref(device_id)) + return device_id.value + + @device_id.setter + def device_id(self, device_id): + """ + set the device id + Note: this must be set before the network loaded + """ + self._api.LITE_set_device_id(self._network, device_id) + + @property + def stream_id(self): + """ + get the stream id + """ + stream_id = c_int() + self._api.LITE_get_stream_id(self._network, byref(stream_id)) + return stream_id.value + + @stream_id.setter + def stream_id(self, stream_id): + """ + set the stream id + Note: this must be set before the network loaded + """ + self._api.LITE_set_stream_id(self._network, stream_id) + + @property + def threads_number(self): + """ + get the thread number of the netwrok + """ + nr_thread = c_size_t() + self._api.LITE_get_cpu_threads_number(self._network, byref(nr_thread)) + return nr_thread.value + + @threads_number.setter + def threads_number(self, nr_threads): + """ + set the network forward in multithread mode, and the thread number + Note: this 
must be set before the network loaded + """ + self._api.LITE_set_cpu_threads_number(self._network, nr_threads) + + def get_io_tensor(self, name, phase=LiteTensorPhase.LITE_IO): + """ + get input or output tensor by its name + """ + if type(name) == str: + c_name = c_char_p(name.encode("utf-8")) + else: + c_name = c_char_p(name) + tensor = LiteTensor() + self._api.LITE_get_io_tensor( + self._network, c_name, phase, byref(tensor._tensor) + ) + tensor.update() + return tensor + + def get_input_name(self, index): + """ + get the input name by the index in the network + """ + c_name = c_char_p() + self._api.LITE_get_input_name(self._network, index, byref(c_name)) + return c_name.value.decode("utf-8") + + def get_output_name(self, index): + """ + get the output name by the index in the network + """ + c_name = c_char_p() + self._api.LITE_get_output_name(self._network, index, byref(c_name)) + return c_name.value.decode("utf-8") + + def get_all_input_name(self): + """ + get all the input tensor name in the network + """ + nr_input = c_size_t() + self._api.LITE_get_all_input_name(self._network, byref(nr_input), None) + + if nr_input.value > 0: + names = (c_char_p * nr_input.value)() + self._api.LITE_get_all_input_name(self._network, None, names) + ret_name = [names[i].decode("utf-8") for i in range(nr_input.value)] + return ret_name + + def get_all_output_name(self): + """ + get all the output tensor name in the network + """ + nr_output = c_size_t() + self._api.LITE_get_all_output_name(self._network, byref(nr_output), None) + + if nr_output.value > 0: + names = (c_char_p * nr_output.value)() + self._api.LITE_get_all_output_name(self._network, None, names) + ret_name = [names[i].decode("utf-8") for i in range(nr_output.value)] + return ret_name + + def share_weights_with(self, src_network): + """ + share weights with the loaded network + """ + assert isinstance(src_network, LiteNetwork) + self._api.LITE_shared_weight_with_network(self._network, src_network._network) + + def share_runtime_memroy(self, src_network): + """ + share runtime memory with the srouce network + """ + assert isinstance(src_network, LiteNetwork) + self._api.LITE_share_runtime_memroy(self._network, src_network._network) + + def async_with_callback(self, async_callback): + async_callback = LiteAsyncCallback(async_callback) + self._api.LITE_set_async_callback(self._network, async_callback) + + def set_start_callback(self, start_callback): + """ + when the network start forward, the callback will be called, + the start_callback with param mapping from LiteIO to the corresponding + LiteTensor + """ + self._api.LITE_set_start_callback(self._network, start_callback) + + def set_finish_callback(self, finish_callback): + """ + when the network finish forward, the callback will be called, + the finish_callback with param mapping from LiteIO to the corresponding + LiteTensor + """ + self._api.LITE_set_finish_callback(self._network, finish_callback) + + def enable_profile_performance(self, profile_file): + c_file = profile_file.encode("utf-8") + self._api.LITE_enable_profile_performance(self._network, c_file) + + def set_network_algo_workspace_limit(self, size_limit): + self._api.LITE_set_network_algo_workspace_limit(self._network, size_limit) + + def set_network_algo_policy( + self, policy, shared_batch_size=0, binary_equal_between_batch=False + ): + """ + shared_batch_size: the batch size used by fastrun, + Non-zero value means that fastrun use this batch size + regardless of the batch size of the model. 
Zero means + fastrun use batch size of the model + binary_equal_between_batch: if the content of each input batch is + binary equal,whether the content of each output batch is + promised to be equal + + """ + self._api.LITE_set_network_algo_policy(self._network, policy) + self._api.LITE_set_network_algo_fastrun_config( + self._network, shared_batch_size, binary_equal_between_batch + ) + + def io_txt_dump(self, txt_file): + c_file = txt_file.encode("utf-8") + self._api.LITE_enable_io_txt_dump(self._network, c_file) + + def io_bin_dump(self, bin_dir): + c_dir = bin_dir.encode("utf-8") + self._api.LITE_enable_io_bin_dump(self._network, c_dir) diff --git a/lite/pylite/megenginelite/struct.py b/lite/pylite/megenginelite/struct.py new file mode 100644 index 0000000000000000000000000000000000000000..3ae99fe3f0106a796391b440cb5b760912470ff1 --- /dev/null +++ b/lite/pylite/megenginelite/struct.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + +import logging +from ctypes import * +from enum import Enum, IntEnum + + +class LiteBackend(IntEnum): + LITE_DEFAULT = 0 + + +class LiteDeviceType(IntEnum): + LITE_CPU = 0 + LITE_CUDA = 1 + LITE_ATLAS = 3 + LITE_NPU = 4 + LITE_DEVICE_DEFAULT = 5 + + +class LiteDataType(IntEnum): + LITE_FLOAT = 0 + LITE_HALF = 1 + LITE_INT = 2 + LITE_INT16 = 3 + LITE_INT8 = 4 + LITE_UINT8 = 5 + + +class LiteTensorPhase(IntEnum): + LITE_IO = 0 + LITE_INPUT = 1 + LITE_OUTPUT = 2 + + +class LiteIOType(IntEnum): + """ + the input and output type, include SHAPE and VALUE + sometimes user only need the shape of the output tensor + """ + + LITE_IO_VALUE = 0 + LITE_IO_SHAPE = 1 + + +class LiteAlgoSelectStrategy(IntEnum): + """ + operation algorithm seletion strategy type, some operations have + multi algorithms, different algorithm has different attribute, according to + the strategy, the best algorithm will be selected. + + Note: These strategies can be combined + + LITE_ALGO_HEURISTIC | LITE_ALGO_PROFILE means: if profile cache not valid, + use heuristic instead + + LITE_ALGO_HEURISTIC | LITE_ALGO_REPRODUCIBLE means: heuristic choice the + reproducible algo + + LITE_ALGO_PROFILE | LITE_ALGO_REPRODUCIBLE means: profile the best + algorithm from the reproducible algorithms set + + LITE_ALGO_PROFILE | LITE_ALGO_OPTIMIZED means: profile the best + algorithm form the optimzed algorithms, thus profile will process fast + + LITE_ALGO_PROFILE | LITE_ALGO_OPTIMIZED | LITE_ALGO_REPRODUCIBLE means: + profile the best algorithm form the optimzed and reproducible algorithms + """ + + LITE_ALGO_HEURISTIC = 1 + LITE_ALGO_PROFILE = 2 + LITE_ALGO_REPRODUCIBLE = 4 + LITE_ALGO_OPTIMIZED = 8 + + +class LiteLogLevel(IntEnum): + """ + DEBUG: The most verbose level, printing debugging info + INFO: The default level + WARN: Printing warnings + ERROR: The least verbose level, printing errors only + """ + + DEBUG = 0 + INFO = 1 + WARN = 2 + ERROR = 3 diff --git a/lite/pylite/megenginelite/tensor.py b/lite/pylite/megenginelite/tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..0f2539ce508d5c95c75fad8ca1fa4a03d24bb9cf --- /dev/null +++ b/lite/pylite/megenginelite/tensor.py @@ -0,0 +1,471 @@ +# -*- coding: utf-8 -*- +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
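+# ctypes wrapper of the lite C tensor API: LiteLayout and LiteTensor, plus the
+# dtype mapping tables between numpy and LiteDataType.
+# Typical use (a sketch only; shape and dtype are illustrative):
+#   layout = LiteLayout([1, 3, 224, 224], "float32")
+#   tensor = LiteTensor(layout)
+#   tensor.set_data_by_copy(np.zeros((1, 3, 224, 224), "float32"))
+#   arr = tensor.to_numpy()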
+ +from ctypes import * + +import numpy as np + +from .base import _Ctensor, _lib, _LiteCObjBase +from .struct import LiteDataType, LiteDeviceType, LiteIOType, Structure + +MAX_DIM = 7 + +_lite_type_to_nptypes = { + LiteDataType.LITE_INT: np.int32, + LiteDataType.LITE_FLOAT: np.float32, + LiteDataType.LITE_UINT8: np.uint8, + LiteDataType.LITE_INT8: np.int8, + LiteDataType.LITE_INT16: np.int16, + LiteDataType.LITE_HALF: np.float16, +} + +_nptype_to_lite_type = {val: key for key, val in _lite_type_to_nptypes.items()} + +_str_nptypes_to_lite_nptypes = { + np.dtype("int32"): LiteDataType.LITE_INT, + np.dtype("float32"): LiteDataType.LITE_FLOAT, + np.dtype("uint8"): LiteDataType.LITE_UINT8, + np.dtype("int8"): LiteDataType.LITE_INT8, + np.dtype("int16"): LiteDataType.LITE_INT16, + np.dtype("float16"): LiteDataType.LITE_HALF, +} + +ctype_to_lite_dtypes = { + c_int: LiteDataType.LITE_INT, + c_uint: LiteDataType.LITE_INT, + c_float: LiteDataType.LITE_FLOAT, + c_ubyte: LiteDataType.LITE_UINT8, + c_byte: LiteDataType.LITE_INT8, + c_short: LiteDataType.LITE_INT16, + c_ushort: LiteDataType.LITE_INT16, +} + + +class LiteLayout(Structure): + """ + the simple layout description + """ + + _fields_ = [ + ("shapes", c_size_t * MAX_DIM), + ("ndim", c_size_t), + ("data_type", c_int), + ] + + def __init__(self, shape=None, dtype=None): + if shape: + shape = list(shape) + assert len(shape) <= MAX_DIM, "Layout max dim is 7." + self.shapes = (c_size_t * MAX_DIM)(*shape) + self.ndim = len(shape) + else: + self.shapes = (c_size_t * MAX_DIM)() + self.ndim = 0 + if not dtype: + self.data_type = LiteDataType.LITE_FLOAT + elif isinstance(dtype, LiteDataType): + self.data_type = dtype + elif type(dtype) == str: + self.data_type = _str_nptypes_to_lite_nptypes[np.dtype(dtype)] + elif isinstance(dtype, np.dtype): + ctype = np.ctypeslib.as_ctypes_type(dtype) + self.data_type = ctype_to_lite_dtypes[ctype] + elif isinstance(dtype, type): + self.data_type = _nptype_to_lite_type[dtype] + else: + raise RuntimeError("unkonw data type") + + def __repr__(self): + data = { + "shapes": list(self.shapes), + "ndim": self.ndim, + "data_type": _lite_type_to_nptypes[LiteDataType(self.data_type)], + } + return data.__repr__() + + +class _LiteTensorDesc(Structure): + """ + warpper of the MegEngine Tensor + + :is_pinned_host: when set, the storage memory of the tensor is pinned memory, + this is used to Optimize the H2D or D2H memory copy, if the device or layout + is not set, when copy form other device(CUDA) tensor, this tensor + will be automatically set to pinned tensor + """ + + _fields_ = [ + ("is_pinned_host", c_int), + ("layout", LiteLayout), + ("device_type", c_int), + ("device_id", c_int), + ] + + def __init__(self): + self.layout = LiteLayout() + self.device_type = LiteDeviceType.LITE_CPU + self.is_pinned_host = False + self.device_id = 0 + + def __repr__(self): + data = { + "is_pinned_host": self.is_pinned_host, + "layout": LiteLayout(self.layout), + "device_type": LiteDeviceType(self.device_type.value), + "device_id": self.device_id, + } + return data.__repr__() + + +class _TensorAPI(_LiteCObjBase): + """ + get the api from the lib + """ + + _api_ = [ + ("LITE_make_tensor", [_LiteTensorDesc, POINTER(_Ctensor)]), + ("LITE_set_tensor_layout", [_Ctensor, LiteLayout]), + ("LITE_reset_tensor_memory", [_Ctensor, c_void_p, c_size_t]), + ("LITE_reset_tensor", [_Ctensor, LiteLayout, c_void_p]), + ("LITE_tensor_reshape", [_Ctensor, POINTER(c_int), c_int]), + ( + "LITE_tensor_slice", + [ + _Ctensor, + POINTER(c_size_t), + 
POINTER(c_size_t), + POINTER(c_size_t), + c_size_t, + POINTER(_Ctensor), + ], + ), + ( + "LITE_tensor_concat", + [POINTER(_Ctensor), c_int, c_int, c_int, c_int, POINTER(_Ctensor),], + ), + ("LITE_tensor_fill_zero", [_Ctensor]), + ("LITE_tensor_copy", [_Ctensor, _Ctensor]), + ("LITE_tensor_share_memory_with", [_Ctensor, _Ctensor]), + ("LITE_get_tensor_memory", [_Ctensor, POINTER(c_void_p)]), + ("LITE_get_tensor_total_size_in_byte", [_Ctensor, POINTER(c_size_t)]), + ("LITE_get_tensor_layout", [_Ctensor, POINTER(LiteLayout)]), + ("LITE_get_tensor_device_type", [_Ctensor, POINTER(c_int)]), + ("LITE_get_tensor_device_id", [_Ctensor, POINTER(c_int)]), + ("LITE_destroy_tensor", [_Ctensor]), + ("LITE_is_pinned_host", [_Ctensor, POINTER(c_int)]), + ] + + +class LiteTensor(object): + """ + the tensor to hold a block of data + """ + + _api = _TensorAPI()._lib + + def __init__( + self, + layout=None, + device_type=LiteDeviceType.LITE_CPU, + device_id=0, + is_pinned_host=False, + ): + """ + create a Tensor with layout, device, is_pinned_host param + """ + self._tensor = _Ctensor() + if layout: + self._layout = layout + else: + self._layout = LiteLayout() + self._device_type = device_type + self._device_id = device_id + self._is_pinned_host = is_pinned_host + + tensor_desc = _LiteTensorDesc() + tensor_desc.layout = self._layout + tensor_desc.device_type = device_type + tensor_desc.device_id = device_id + tensor_desc.is_pinned_host = is_pinned_host + self._api.LITE_make_tensor(tensor_desc, byref(self._tensor)) + + def __del__(self): + self._api.LITE_destroy_tensor(self._tensor) + + def fill_zero(self): + """ + fill the buffer memory with zero + """ + self._api.LITE_tensor_fill_zero(self._tensor) + self.update() + + def share_memory_with(self, src_tensor): + """ + share the same memory with the src_tensor, the self memory will be freed + """ + assert isinstance(src_tensor, LiteTensor) + self._api.LITE_tensor_share_memory_with(self._tensor, src_tensor._tensor) + self.update() + + @property + def layout(self): + self._api.LITE_get_tensor_layout(self._tensor, byref(self._layout)) + return self._layout + + @layout.setter + def layout(self, layout): + assert isinstance(layout, LiteLayout) + self._layout = layout + self._api.LITE_set_tensor_layout(self._tensor, layout) + + @property + def is_pinned_host(self): + """ + whether the tensor is pinned tensor + """ + pinned = c_int() + self._api.LITE_is_pinned_host(self._tensor, byref(pinned)) + self._is_pinned_host = pinned + return bool(self._is_pinned_host) + + @property + def device_type(self): + """ + get device of the tensor + """ + device_type = c_int() + self._api.LITE_get_tensor_device_type(self._tensor, byref(device_type)) + self._device_type = device_type + return LiteDeviceType(device_type.value) + + @property + def device_id(self): + """ + get device id of the tensor + """ + device_id = c_int() + self._api.LITE_get_tensor_device_id(self._tensor, byref(device_id)) + self._device_id = device_id.value + return device_id.value + + @property + def is_continue(self): + """ + whether the tensor memory is continue + """ + is_continue = c_int() + self._api.LITE_is_memory_continue(self._tensor, byref(is_continue)) + return bool(is_continue.value) + + @property + def nbytes(self): + """ + get the length of the meomry in byte + """ + self.update() + length = c_size_t() + self._api.LITE_get_tensor_total_size_in_byte(self._tensor, byref(length)) + return length.value + + def update(self): + """ + update the member from C, this will auto used after slice, share + """ + 
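+ # Re-query the pinned-host flag, device type and layout from the C API so
+ # the cached Python-side members stay in sync with the underlying tensor.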
pinned = c_int() + self._api.LITE_is_pinned_host(self._tensor, byref(pinned)) + self._is_pinned_host = pinned + device_type = c_int() + self._api.LITE_get_tensor_device_type(self._tensor, byref(device_type)) + self._device_type = device_type + self._api.LITE_get_tensor_layout(self._tensor, byref(self._layout)) + + def copy_from(self, src_tensor): + """ + copy memory form the src_tensor + """ + assert isinstance(src_tensor, LiteTensor) + self._api.LITE_tensor_copy(self._tensor, src_tensor._tensor) + self.update() + + def reshape(self, shape): + """ + reshape the tensor with data not change, only change the shape + :param shape: int arrary of dst_shape + """ + shape = list(shape) + length = len(shape) + c_shape = (c_int * length)(*shape) + self._api.LITE_tensor_reshape(self._tensor, c_shape, length) + self.update() + + def slice(self, start, end, step=None): + """ + slice the tensor with gaven start, end, step + :param start: silce begin index of each dim + :param end: silce end index of each dim + :param step: silce step of each dim + """ + start = list(start) + end = list(end) + length = len(start) + assert length == len(end), "slice with different length of start and end." + if step: + assert length == len(step), "slice with different length of start and step." + step = list(step) + else: + step = [1 for i in range(length)] + c_start = (c_size_t * length)(*start) + c_end = (c_size_t * length)(*end) + c_step = (c_size_t * length)(*step) + slice_tensor = LiteTensor() + self._api.LITE_tensor_slice( + self._tensor, c_start, c_end, c_step, length, byref(slice_tensor._tensor) + ) + slice_tensor.update() + return slice_tensor + + def get_ctypes_memory(self): + """ + get the memory of the tensor, return c_void_p of the tensor memory + """ + self.update() + mem = c_void_p() + self._api.LITE_get_tensor_memory(self._tensor, byref(mem)) + return mem + + def set_data_by_share(self, data, length=0, layout=None): + """ + share the data to the tensor + param data: the data will shared to the tensor, it should be a + numpy.ndarray or ctypes data + """ + self.update() + if isinstance(data, np.ndarray): + assert ( + self.is_continue + ), "set_data_by_share can only apply in continue tensor." + assert ( + self.is_pinned_host or self.device_type == LiteDeviceType.LITE_CPU + ), "set_data_by_share can only apply in cpu tensor or pinned tensor." + + np_type = _lite_type_to_nptypes[LiteDataType(self._layout.data_type)] + c_type = np.ctypeslib.as_ctypes_type(np_type) + + if self.nbytes != data.nbytes: + self.layout = LiteLayout(data.shape, ctype_to_lite_dtypes[c_type]) + + self._shared_data = data + data = data.ctypes.data_as(POINTER(c_type)) + + if layout is not None: + self.layout = layout + else: + assert length == 0 or length == self.nbytes, "the data length is not match." + self._api.LITE_reset_tensor_memory(self._tensor, data, self.nbytes) + + def set_data_by_copy(self, data, data_length=0, layout=None): + """ + copy the data to the tensor + param data: the data to copy to tensor, it should be list, + numpy.ndarraya or ctypes with length + """ + self.update() + if layout is not None: + self.layout = layout + + assert self.is_continue, "set_data_by_copy can only apply in continue tensor." + assert ( + self.is_pinned_host or self.device_type == LiteDeviceType.LITE_CPU + ), "set_data_by_copy can only apply in cpu tensor or pinned tensor." 
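+ # The code below accepts three kinds of input: a plain Python list, a
+ # numpy ndarray (the layout is re-derived when the byte size differs), or
+ # a raw ctypes buffer copied with the given data_length.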
+ + np_type = _lite_type_to_nptypes[LiteDataType(self._layout.data_type)] + c_type = np.ctypeslib.as_ctypes_type(np_type) + + tensor_memory = c_void_p() + + if type(data) == list: + length = len(data) + self._api.LITE_get_tensor_memory(self._tensor, byref(tensor_memory)) + tensor_length = self.nbytes + assert ( + length * sizeof(c_type) <= tensor_length + ), "the length of input data to set to the tensor is too large." + arr = (c_type * length)(*data) + memmove(tensor_memory, arr, sizeof(c_type) * length) + + elif type(data) == np.ndarray: + if self.nbytes != data.nbytes: + self.layout = LiteLayout(data.shape, data.dtype) + arr = data.ctypes.data_as(POINTER(c_type)) + self._api.LITE_get_tensor_memory(self._tensor, byref(tensor_memory)) + assert self.nbytes == data.nbytes + memmove(tensor_memory, arr, self.nbytes) + else: + assert ( + data_length == self.nbytes or layout is not None + ), "when input data is ctypes, the length of input data or layout must set" + self._api.LITE_get_tensor_memory(self._tensor, byref(tensor_memory)) + memmove(tensor_memory, data, data_length) + + def to_numpy(self): + """ + get the buffer of the tensor + """ + self.update() + if self.nbytes <= 0: + return np.array([]) + if self.is_continue and ( + self.is_pinned_host or self.device_type == LiteDeviceType.LITE_CPU + ): + ptr = c_void_p() + self._api.LITE_get_tensor_memory(self._tensor, byref(ptr)) + + np_type = _lite_type_to_nptypes[LiteDataType(self._layout.data_type)] + shape = [self._layout.shapes[i] for i in range(self._layout.ndim)] + np_arr = np.zeros(shape, np_type) + if np_arr.nbytes: + memmove(np_arr.ctypes.data_as(c_void_p), ptr, np_arr.nbytes) + return np_arr + else: + tmp_tensor = LiteTensor(self.layout) + tmp_tensor.copy_from(self) + return tmp_tensor.to_numpy() + + def __repr__(self): + self.update() + data = { + "layout": self._layout, + "device_type": LiteDeviceType(self._device_type.value), + "device_id": int(self.device_id), + "is_pinned_host": bool(self._is_pinned_host), + } + return data.__repr__() + + +def LiteTensorConcat( + tensors, dim, device_type=LiteDeviceType.LITE_DEVICE_DEFAULT, device_id=-1 +): + """ + concat tensor in input dim to one tensor + dim : the dim to act concat + device_type: the result tensor device type + device_id: the result tensor device id + """ + api = _TensorAPI()._lib + length = len(tensors) + c_tensors = [t._tensor for t in tensors] + c_tensors = (_Ctensor * length)(*c_tensors) + result_tensor = LiteTensor() + api.LITE_tensor_concat( + cast(byref(c_tensors), POINTER(c_void_p)), + length, + dim, + device_type, + device_id, + byref(result_tensor._tensor), + ) + result_tensor.update() + return result_tensor diff --git a/lite/pylite/megenginelite/utils.py b/lite/pylite/megenginelite/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..aec8188a569d8afb274f5e65b4a8313467d4239f --- /dev/null +++ b/lite/pylite/megenginelite/utils.py @@ -0,0 +1,122 @@ +# -*- coding: utf-8 -*- +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
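+# TensorBatchCollector gathers per-sample sub-tensors into one contiguous
+# batched tensor. A usage sketch (shape, dtype and device are illustrative;
+# the default device is LITE_CUDA):
+#   collector = TensorBatchCollector(
+#       [4, 3, 224, 224],
+#       dtype=LiteDataType.LITE_FLOAT,
+#       device_type=LiteDeviceType.LITE_CPU,
+#   )
+#   idx = collector.collect(np.ones([3, 224, 224], "float32"))
+#   batch = collector.to_numpy()
+#   collector.free([idx])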
+ +import threading + +import numpy as np + +from .base import * +from .struct import * +from .tensor import * + + +class TensorBatchCollector: + """ + this is a tensor utils to collect subtensor in batch continuous + """ + + def __init__( + self, + shape, + dtype=LiteDataType.LITE_INT8, + device_type=LiteDeviceType.LITE_CUDA, + device_id=0, + is_pinned_host=False, + tensor=None, + ): + self._mutex = threading.Lock() + self.dev_type = device_type + self.is_pinned_host = is_pinned_host + self.dev_id = 0 + self.shape = shape + self.dtype = LiteLayout(dtype=dtype).data_type + self._free_list = list(range(self.shape[0])) + + if tensor is not None: + assert ( + tensor.layout.shapes[0 : tensor.layout.ndim] == shape + ), "The tensor set to TensorBatchCollector is not right." + self._tensor = tensor + self.dtype = tensor.layout.data_type + self.device_type = tensor.device_type + self.device_id = tensor.device_type + else: + self._tensor = LiteTensor( + LiteLayout(shape, dtype), device_type, device_id, is_pinned_host + ) + + def collect_id(self, array, batch_id): + if isinstance(array, np.ndarray): + shape = array.shape + assert list(shape) == self.shape[1:] + in_dtype = ctype_to_lite_dtypes[np.ctypeslib.as_ctypes_type(array.dtype)] + assert in_dtype == self.dtype + # get the batch index + with self._mutex: + if batch_id in self._free_list: + self._free_list.remove(batch_id) + # get the subtensor + subtensor = self._tensor.slice([batch_id], [batch_id + 1]) + if subtensor.device_type == LiteDeviceType.LITE_CPU: + subtensor.set_data_by_copy(array) + else: + pinned_tensor = LiteTensor( + subtensor.layout, self.dev_type, self.dev_id, True + ) + pinned_tensor.set_data_by_share(array) + subtensor.copy_from(pinned_tensor) + else: + assert isinstance(array, LiteTensor) + ndim = array.layout.ndim + shape = list(array.layout.shapes)[0:ndim] + assert list(shape) == self.shape[1:] + in_dtype = array.layout.data_type + assert in_dtype == self.dtype + # get the batch index + with self._mutex: + if batch_id in self._free_list: + self._free_list.remove(batch_id) + # get the subtensor + subtensor = self._tensor.slice([batch_id], [batch_id + 1]) + subtensor.copy_from(array) + + return batch_id + + def collect(self, array): + with self._mutex: + if len(self._free_list) == 0: + return -1 + idx = self._free_list.pop(0) + return self.collect_id(array, idx) + + def collect_by_ctypes(self, data, length): + """ + collect with ctypes data input + """ + with self._mutex: + if len(self._free_list) == 0: + return -1 + idx = self._free_list.pop(0) + # get the subtensor + subtensor = self._tensor.slice([idx], [idx + 1]) + if subtensor.device_type == LiteDeviceType.LITE_CPU: + subtensor.set_data_by_copy(data, length) + else: + pinned_tensor = LiteTensor( + subtensor.layout, self.dev_type, self.dev_id, True + ) + pinned_tensor.set_data_by_share(data, length) + subtensor.copy_from(pinned_tensor) + + def free(self, indexes): + with self._mutex: + self._free_list.extend(indexes) + + def get(self): + return self._tensor + + def to_numpy(self): + return self._tensor.to_numpy() diff --git a/lite/pylite/pylite.md b/lite/pylite/pylite.md new file mode 100644 index 0000000000000000000000000000000000000000..183875cc08dd7c19521ec1f74d9145eb8b0a1cef --- /dev/null +++ b/lite/pylite/pylite.md @@ -0,0 +1,199 @@ +# PyLite + +Lite的python接口提供更加方便灵活的使用Lite进行模型Inference,支持各种平台上运行,X86-CUDA,X86-CPU,Arm-CPU,Arm-CUDA平台。 + +## 安装 +### whl包安装 +Lite python接口的whl包会随着megbrain的发版发布,版本号和megbrain保持一致,目前发布的Lite的whl包,包括Linux、windows、macos平台,这些平台可以直接通过pip3安装。 
+```shell + python3 -m pip install --upgrade pip + python3 -m pip install megenginelite -i https://pypi.megvii-inc.com/simple +``` +### develop 安装 +开发模式下,可以使用Cmake编译出lite动态库liblite.so/liblite.dll/liblite_shared.dylib,并使用这个动态库进行开发和debug。该方式安装的pylite只能在本地机器上使用,不能copy到其他机器上使用。 +* 编译liblite.so。使用cmake编译出liblite.so + * clone megbrain工程到本地 + ```shell + git clone git@git-core.megvii-inc.com:brain-sdk/MegBrain.git + ``` + * 进行Cmake编译,这里的cmake编译同megbrain的cmake编译,使用参数和宏也完全一样 + * 编译准备 + ```shell + cd MegBrain + sh ./third_party/prepare.sh + mkdir build + cd build + ``` + * 编译X86-CUDA版本 + ```shell + cmake .. -DMGE_WITH_CUDA=ON -DMGE_WITH_TEST=ON -DCMAKE_BUILD_TYPE=Release && make -j$(nproc) + ``` + * 编译X86 CPU Only版本 + ```shell + cmake .. -DMGE_WITH_CUDA=OFF -DMGE_WITH_TEST=ON -DCMAKE_BUILD_TYPE=Release && make -j$(nproc) + ``` + * 编译完成之后,liblite.so 保存在build目录中的lite文件下 + * 将liblite.so copy到megenginelite的python源文件目录下,就可以使用megenginelite了。 + ```shell + MegBrain的工程目录为 ${mgb_hone} + cp ${mgb_hone}/build/lite/liblite.so ${mgb_home}/lite/pylite/megenginelite/ + cd ${mgb_home}/lite/pylite + python3 -m "import megenginelite" + ``` + 这样就可以在${mgb_home}/lite/pylite 目录下面开发和debug lite的python接口了 + +## python3中使用megenginelite +Lite的python接口是对其C/C++接口的一层封装,他们使用的模型都是相同的模型格式。megenginelite提供两种数据接口,分别是LiteTensor和LiteNetwork。 + +### LiteTensor +LiteTensor提供了用户对数据的操作接口,提供了接口包括: +* fill_zero: 将tensor的内存设置为全0 +* share_memory_with: 可以和其他LiteTensor的共享内存 +* copy_from: 从其他LiteTensor中copy数据到自身内存中 +* reshape: 改变该LiteTensor的shape,内存数据保持不变 +* slice: 对该LiteTensor中的数据进行切片,需要分别指定每一维切片的start,end,和step。 +* set_data_by_share: 调用之后使得该LiteTensor中的内存共享自输入的array的内存,输入的array必须是numpy的ndarray,并且tensor在CPU上 +* set_data_by_copy: 该LiteTensor将会从输入的data中copy数据,data可以是list和numpy的ndarray,需要保证data的数据量不超过tensor的容量,tensor在CPU上 +* to_numpy: 将该LiteTensor中数据copy到numpy的array中,返回给用户,如果是非连续的LiteTensor,如slice出来的,将copy到连续的numpy array中,该接口主要数为了debug,有性能问题。 + +#### 使用example +* LiteTensor 设置数据example +``` +def test_tensor_set_data(): + layout = LiteLayout([2, 16], "int8") + tensor = LiteTensor(layout) + assert tensor.nbytes == 2 * 16 + + data = [i for i in range(32)] + tensor.set_data_by_copy(data) + real_data = tensor.to_numpy() + for i in range(32): + assert real_data[i // 16][i % 16] == i + + arr = np.ones([2, 16], "int8") + tensor.set_data_by_copy(arr) + real_data = tensor.to_numpy() + for i in range(32): + assert real_data[i // 16][i % 16] == 1 + + for i in range(32): + arr[i // 16][i % 16] = i + tensor.set_data_by_share(arr) + real_data = tensor.to_numpy() + for i in range(32): + assert real_data[i // 16][i % 16] == i + + arr[0][8] = 100 + arr[1][3] = 20 + real_data = tensor.to_numpy() + assert real_data[0][8] == 100 + assert real_data[1][3] == 20 +``` +* tensor 共享内存example +```python +def test_tensor_share_memory_with(): + layout = LiteLayout([4, 32], "int16") + tensor = LiteTensor(layout) + assert tensor.nbytes == 4 * 32 * 2 + + arr = np.ones([4, 32], "int16") + for i in range(128): + arr[i // 32][i % 32] = i + tensor.set_data_by_share(arr) + real_data = tensor.to_numpy() + for i in range(128): + assert real_data[i // 32][i % 32] == i + + tensor2 = LiteTensor(layout) + tensor2.share_memory_with(tensor) + real_data = tensor.to_numpy() + real_data2 = tensor2.to_numpy() + for i in range(128): + assert real_data[i // 32][i % 32] == i + assert real_data2[i // 32][i % 32] == i + + arr[1][18] = 5 + arr[3][7] = 345 + real_data = tensor2.to_numpy() + assert real_data[1][18] == 5 + assert real_data[3][7] == 345 +``` +更多的使用可以参考pylite中test/test_tensor.py中的使用 +### 
LiteNetwork +LiteNetwork主要为用户提供模型载入,运行等功能。使用的模型见lite的readme中关于模型的部分 +* CPU基本模型载入运行的example +``` +def test_network_basic(): + source_dir = os.getenv("LITE_TEST_RESOUCE") + input_data_path = os.path.join(source_dir, "input_data.npy") + # read input to input_data + input_data = np.load(input_data_path) + model_path = os.path.join(source_dir, "shufflenet.mge") + + network = LiteNetwork() + network.load(model_path) + + input_name = network.get_input_name(0) + input_tensor = network.get_io_tensor(input_name) + output_name = network.get_output_name(0) + output_tensor = network.get_io_tensor(output_name) + + assert input_tensor.layout.shapes[0] == 1 + assert input_tensor.layout.shapes[1] == 3 + assert input_tensor.layout.shapes[2] == 224 + assert input_tensor.layout.shapes[3] == 224 + assert input_tensor.layout.data_type == LiteDataType.LITE_FLOAT + assert input_tensor.layout.ndim == 4 + + # copy input data to input_tensor of the network + input_tensor.set_data_by_copy(input_data) + for i in range(3): + network.forward() + network.wait() + + output_data = output_tensor.to_numpy() + print('shufflenet output max={}, sum={}'.format(output_data.max(), output_data.sum())) +``` +* CUDA上使用device内存作为模型输入,需要在构造network候配置config和IO信息 +``` +def test_network_device_IO(): + source_dir = os.getenv("LITE_TEST_RESOUCE") + input_data_path = os.path.join(source_dir, "input_data.npy") + model_path = os.path.join(source_dir, "shufflenet.mge") + # read input to input_data + input_data = np.load(input_data_path) + input_layout = LiteLayout([1, 3, 224, 224]) + host_input_data = LiteTensor(layout=input_layout) + host_input_data.set_data_by_share(input_data) + dev_input_data = LiteTensor(layout=input_layout, device_type=LiteDeviceType.LITE_CUDA) + dev_input_data.copy_from(host_input_data) + + # construct LiteOption + options = LiteOptions() + options.weight_preprocess = 1 + options.var_sanity_check_first_run = 0 + net_config = LiteConfig(device_type=LiteDeviceType.LITE_CUDA, option=options) + + # constuct LiteIO, is_host=False means the input tensor will use device memory + input_io = LiteIO("data", is_host=False) + ios = LiteNetworkIO() + ios.add_input(input_io) + + network = LiteNetwork(config=net_config, io=ios) + network.load(model_path) + + input_name = network.get_input_name(0) + dev_input_tensor = network.get_io_tensor(input_name) + output_name = network.get_output_name(0) + output_tensor = network.get_io_tensor(output_name) + + # copy input data to input_tensor of the network + dev_input_tensor.share_memory_with(dev_input_data) + for i in range(3): + network.forward() + network.wait() + + output_data = output_tensor.to_numpy() + print('shufflenet output max={}, sum={}'.format(output_data.max(), output_data.sum())) +``` +更多的使用可以参考pylite中test/test_network.py和test/test_network_cuda.py中的使用 diff --git a/lite/pylite/requires.txt b/lite/pylite/requires.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e7287299d95cb849599afdea389994405451369 --- /dev/null +++ b/lite/pylite/requires.txt @@ -0,0 +1 @@ +numpy>=1.18 diff --git a/lite/pylite/scripts/format.sh b/lite/pylite/scripts/format.sh new file mode 100755 index 0000000000000000000000000000000000000000..3b93c50e2b15725706190eb0bca07842367cb0b5 --- /dev/null +++ b/lite/pylite/scripts/format.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +set -e +cd $(dirname $0)/.. + +ISORT_ARG="" +BLACK_ARG="" + +while getopts 'd' OPT; do + case $OPT in + d) + ISORT_ARG="--diff --check-only" + BLACK_ARG="--diff --check" + ;; + ?) 
+More usage can be found in test/test_network.py and test/test_network_cuda.py in pylite.
diff --git a/lite/pylite/requires.txt b/lite/pylite/requires.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0e7287299d95cb849599afdea389994405451369
--- /dev/null
+++ b/lite/pylite/requires.txt
@@ -0,0 +1 @@
+numpy>=1.18
diff --git a/lite/pylite/scripts/format.sh b/lite/pylite/scripts/format.sh
new file mode 100755
index 0000000000000000000000000000000000000000..3b93c50e2b15725706190eb0bca07842367cb0b5
--- /dev/null
+++ b/lite/pylite/scripts/format.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+set -e
+cd $(dirname $0)/..
+
+ISORT_ARG=""
+BLACK_ARG=""
+
+while getopts 'd' OPT; do
+  case $OPT in
+    d)
+      ISORT_ARG="--diff --check-only"
+      BLACK_ARG="--diff --check"
+      ;;
+    ?)
+      echo "Usage: `basename $0` [-d]"
+  esac
+done
+
+isort $ISORT_ARG -j $(nproc) -rc megenginelite test
+black $BLACK_ARG --target-version=py35 -- megenginelite test
diff --git a/lite/pylite/setup.py b/lite/pylite/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..436c81dadc4a221f630eb029e15a9cab7682fd8f
--- /dev/null
+++ b/lite/pylite/setup.py
@@ -0,0 +1,127 @@
+# -*- coding: utf-8 -*-
+# This file is part of MegEngine, a deep learning framework developed by
+# Megvii.
+#
+# Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
+
+import os
+import re
+import pathlib
+import platform
+from distutils.file_util import copy_file
+from setuptools import setup, find_packages, Extension
+from setuptools.command.build_ext import build_ext as _build_ext
+
+class PrecompiledExtension(Extension):
+    def __init__(self, name):
+        super().__init__(name, sources=[])
+
+class build_ext(_build_ext):
+
+    def build_extension(self, ext):
+        if not isinstance(ext, PrecompiledExtension):
+            return super().build_extension(ext)
+
+        if not self.inplace:
+            fullpath = self.get_ext_fullpath(ext.name)
+            extdir = pathlib.Path(fullpath)
+            extdir.parent.mkdir(parents=True, exist_ok=True)
+
+            modpath = self.get_ext_fullname(ext.name).split('.')
+            if platform.system() == 'Windows':
+                modpath[-1] += '.dll'
+            elif platform.system() == 'Darwin':
+                modpath[-1] += '.dylib'
+            else:
+                modpath[-1] += '.so'
+            modpath = str(pathlib.Path(*modpath).resolve())
+
+            copy_file(modpath, fullpath, verbose=self.verbose, dry_run=self.dry_run)
+
+v = {}
+with open("megenginelite/version.py") as fp:
+    exec(fp.read(), v)
+__version__ = v['__version__']
+
+email = 'megengine@megvii.com'
+# https://www.python.org/dev/peps/pep-0440
+# Public version identifiers: [N!]N(.N)*[{a|b|rc}N][.postN][.devN]
+# Local version identifiers: <public version identifier>[+<local version label>]
+# PUBLIC_VERSION_POSTFIX is used to handle rc or dev info
+public_version_postfix = os.environ.get('PUBLIC_VERSION_POSTFIX')
+if public_version_postfix:
+    __version__ = '{}{}'.format(__version__, public_version_postfix)
+
+local_version = []
+strip_sdk_info = os.environ.get('STRIP_SDK_INFO', 'False').lower()
+sdk_name = os.environ.get('SDK_NAME', 'cpu')
+if 'true' == strip_sdk_info:
+    print('wheel version strip sdk info')
+else:
+    local_version.append(sdk_name)
+local_postfix = os.environ.get('LOCAL_VERSION')
+if local_postfix:
+    local_version.append(local_postfix)
+if len(local_version):
+    __version__ = '{}+{}'.format(__version__, '.'.join(local_version))
+
+packages = find_packages()
+megenginelite_data = [
+    str(f.relative_to('megenginelite'))
+    for f in pathlib.Path('megenginelite').glob('**/*')
+]
+
+if platform.system() == 'Windows':
+    megenginelite_data.remove('libs\\liblite_shared.dll')
+elif platform.system() == 'Darwin':
+    megenginelite_data.remove('libs/liblite_shared.dylib')
+else:
+    megenginelite_data.remove('libs/liblite_shared.so')
+
+with open('requires.txt') as f:
+    requires = f.read().splitlines()
+
+prebuild_modules = [PrecompiledExtension('megenginelite.libs.liblite_shared')]
+setup_kwargs = dict(
+    name='megenginelite',
+    version=__version__,
+    description='Inference Framework for MegEngine',
+    author='Megvii Engine Team',
+    author_email=email,
+    packages=packages,
+    package_data={
+        'megenginelite': megenginelite_data,
+    },
+    ext_modules=prebuild_modules,
+    install_requires=requires,
+    cmdclass={'build_ext': build_ext},
+)
+setup_kwargs.update(dict(
+    classifiers=[
+        'Development Status :: 3 - Alpha',
+        'Intended Audience :: Developers',
+        'Intended Audience :: 
Education', + 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: Apache Software License', + 'Programming Language :: C++', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + 'Topic :: Scientific/Engineering', + 'Topic :: Scientific/Engineering :: Mathematics', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + 'Topic :: Software Development', + 'Topic :: Software Development :: Libraries', + 'Topic :: Software Development :: Libraries :: Python Modules', + ], + license='Apache 2.0', + keywords='megengine deep learning', + data_files = [("megengine", [ + "../LICENSE", + "../ACKNOWLEDGMENTS", + ])] +)) + +setup(**setup_kwargs) diff --git a/lite/pylite/test/test_global.py b/lite/pylite/test/test_global.py new file mode 100644 index 0000000000000000000000000000000000000000..0cd4c85a08c6b768e23e08fd3f2236abfb8daa17 --- /dev/null +++ b/lite/pylite/test/test_global.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + +import os +import unittest + +import numpy as np + +from megenginelite import * + +set_log_level(2) + + +class TestShuffleNet(unittest.TestCase): + source_dir = os.getenv("LITE_TEST_RESOUCE") + input_data_path = os.path.join(source_dir, "input_data.npy") + correct_data_path = os.path.join(source_dir, "output_data.npy") + correct_data = np.load(correct_data_path).flatten() + input_data = np.load(input_data_path) + + def check_correct(self, out_data, error=1e-4): + out_data = out_data.flatten() + assert np.isfinite(out_data.sum()) + assert self.correct_data.size == out_data.size + for i in range(out_data.size): + assert abs(out_data[i] - self.correct_data[i]) < error + + def do_forward(self, network, times=3): + input_name = network.get_input_name(0) + input_tensor = network.get_io_tensor(input_name) + output_name = network.get_output_name(0) + output_tensor = network.get_io_tensor(output_name) + + input_tensor.set_data_by_copy(self.input_data) + for i in range(times): + network.forward() + network.wait() + + output_data = output_tensor.to_numpy() + self.check_correct(output_data) + + +class TestGlobal(TestShuffleNet): + def test_device_count(self): + LiteGlobal.try_coalesce_all_free_memory() + count = LiteGlobal.get_device_count(LiteDeviceType.LITE_CPU) + assert count > 0 + + def test_register_decryption_method(self): + @decryption_func + def function(in_arr, key_arr, out_arr): + if not out_arr: + return in_arr.size + else: + for i in range(in_arr.size): + out_arr[i] = in_arr[i] ^ key_arr[0] ^ key_arr[0] + return out_arr.size + + LiteGlobal.register_decryption_and_key("just_for_test", function, [15]) + config = LiteConfig() + config.bare_model_cryption_name = "just_for_test".encode("utf-8") + + network = LiteNetwork() + model_path = os.path.join(self.source_dir, "shufflenet.mge") + network.load(model_path) + + self.do_forward(network) + + def test_update_decryption_key(self): + wrong_key = [0] * 32 + LiteGlobal.update_decryption_key("AES_default", wrong_key) + + with self.assertRaises(RuntimeError): + config = LiteConfig() + config.bare_model_cryption_name = "AES_default".encode("utf-8") + network = LiteNetwork(config) + model_path = os.path.join(self.source_dir, "shufflenet_crypt_aes.mge") + network.load(model_path) + + right_key = 
[i for i in range(32)] + LiteGlobal.update_decryption_key("AES_default", right_key) + + config = LiteConfig() + config.bare_model_cryption_name = "AES_default".encode("utf-8") + network = LiteNetwork(config) + model_path = os.path.join(self.source_dir, "shufflenet_crypt_aes.mge") + network.load(model_path) + + self.do_forward(network) diff --git a/lite/pylite/test/test_network.py b/lite/pylite/test/test_network.py new file mode 100644 index 0000000000000000000000000000000000000000..7e3a20283cb92bae56825a4ca0ae01fcd0355bdd --- /dev/null +++ b/lite/pylite/test/test_network.py @@ -0,0 +1,405 @@ +# -*- coding: utf-8 -*- +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + +import os +import unittest + +import numpy as np + +from megenginelite import * + +set_log_level(2) + + +def test_version(): + print("Lite verson: {}".format(version)) + + +def test_network_io(): + input_io1 = LiteIO("data1", is_host=False, io_type=LiteIOType.LITE_IO_VALUE) + input_io2 = LiteIO( + "data2", + is_host=True, + io_type=LiteIOType.LITE_IO_SHAPE, + layout=LiteLayout([2, 4, 4]), + ) + io = LiteNetworkIO() + io.add_input(input_io1) + io.add_input(input_io2) + + output_io1 = LiteIO("out1", is_host=False) + output_io2 = LiteIO("out2", is_host=True, layout=LiteLayout([1, 1000])) + + io.add_output(output_io1) + io.add_output(output_io2) + + assert len(io.inputs) == 2 + assert len(io.outputs) == 2 + + assert io.inputs[0] == input_io1 + assert io.outputs[0] == output_io1 + + c_io = io._create_network_io() + + assert c_io.input_size == 2 + assert c_io.output_size == 2 + + +class TestShuffleNet(unittest.TestCase): + source_dir = os.getenv("LITE_TEST_RESOUCE") + input_data_path = os.path.join(source_dir, "input_data.npy") + correct_data_path = os.path.join(source_dir, "output_data.npy") + model_path = os.path.join(source_dir, "shufflenet.mge") + correct_data = np.load(correct_data_path).flatten() + input_data = np.load(input_data_path) + + def check_correct(self, out_data, error=1e-4): + out_data = out_data.flatten() + assert np.isfinite(out_data.sum()) + assert self.correct_data.size == out_data.size + for i in range(out_data.size): + assert abs(out_data[i] - self.correct_data[i]) < error + + def do_forward(self, network, times=3): + input_name = network.get_input_name(0) + input_tensor = network.get_io_tensor(input_name) + output_name = network.get_output_name(0) + output_tensor = network.get_io_tensor(output_name) + + input_tensor.set_data_by_copy(self.input_data) + for i in range(times): + network.forward() + network.wait() + + output_data = output_tensor.to_numpy() + self.check_correct(output_data) + + +class TestNetwork(TestShuffleNet): + def test_decryption(self): + model_path = os.path.join(self.source_dir, "shufflenet_crypt_aes.mge") + config = LiteConfig() + config.bare_model_cryption_name = "AES_default".encode("utf-8") + network = LiteNetwork(config) + network.load(model_path) + self.do_forward(network) + + def test_pack_model(self): + model_path = os.path.join(self.source_dir, "test_packed_model_rc4.lite") + network = LiteNetwork() + network.load(model_path) + self.do_forward(network) + + def test_network_basic(self): + network = LiteNetwork() + network.load(self.model_path) + + input_name = network.get_input_name(0) + input_tensor = network.get_io_tensor(input_name) + output_name = network.get_output_name(0) + output_tensor = network.get_io_tensor(output_name) + + assert input_tensor.layout.shapes[0] 
== 1 + assert input_tensor.layout.shapes[1] == 3 + assert input_tensor.layout.shapes[2] == 224 + assert input_tensor.layout.shapes[3] == 224 + assert input_tensor.layout.data_type == LiteDataType.LITE_FLOAT + assert input_tensor.layout.ndim == 4 + + self.do_forward(network) + + def test_network_shared_data(self): + network = LiteNetwork() + network.load(self.model_path) + + input_name = network.get_input_name(0) + input_tensor = network.get_io_tensor(input_name) + output_name = network.get_output_name(0) + output_tensor = network.get_io_tensor(output_name) + + input_tensor.set_data_by_share(self.input_data) + for i in range(3): + network.forward() + network.wait() + + output_data = output_tensor.to_numpy() + self.check_correct(output_data) + + def test_network_get_name(self): + network = LiteNetwork() + network.load(self.model_path) + + input_names = network.get_all_input_name() + assert input_names[0] == "data" + output_names = network.get_all_output_name() + assert output_names[0] == network.get_output_name(0) + + self.do_forward(network) + + def test_network_set_device_id(self): + network = LiteNetwork() + assert network.device_id == 0 + + network.device_id = 1 + network.load(self.model_path) + assert network.device_id == 1 + + with self.assertRaises(RuntimeError): + network.device_id = 1 + + self.do_forward(network) + + def test_network_set_stream_id(self): + network = LiteNetwork() + assert network.stream_id == 0 + + network.stream_id = 1 + network.load(self.model_path) + assert network.stream_id == 1 + + with self.assertRaises(RuntimeError): + network.stream_id = 1 + + self.do_forward(network) + + def test_network_set_thread_number(self): + network = LiteNetwork() + assert network.threads_number == 1 + + network.threads_number = 2 + network.load(self.model_path) + assert network.threads_number == 2 + + with self.assertRaises(RuntimeError): + network.threads_number = 2 + + self.do_forward(network) + + def test_network_cpu_inplace(self): + network = LiteNetwork() + assert network.is_cpu_inplace_mode() == False + + network.enable_cpu_inplace_mode() + network.load(self.model_path) + assert network.is_cpu_inplace_mode() == True + + with self.assertRaises(RuntimeError): + network.enable_cpu_inplace_mode() + + self.do_forward(network) + + def test_network_option(self): + option = LiteOptions() + option.weight_preprocess = 1 + option.var_sanity_check_first_run = 0 + + config = LiteConfig(option=option) + network = LiteNetwork(config=config) + network.load(self.model_path) + + self.do_forward(network) + + def test_network_reset_io(self): + option = LiteOptions() + option.var_sanity_check_first_run = 0 + config = LiteConfig(option=option) + + input_io = LiteIO("data") + ios = LiteNetworkIO() + ios.add_input(input_io) + network = LiteNetwork(config=config, io=ios) + network.load(self.model_path) + + input_tensor = network.get_io_tensor("data") + assert input_tensor.device_type == LiteDeviceType.LITE_CPU + + self.do_forward(network) + + def test_network_by_share(self): + network = LiteNetwork() + network.load(self.model_path) + + input_name = network.get_input_name(0) + input_tensor = network.get_io_tensor(input_name) + output_name = network.get_output_name(0) + output_tensor = network.get_io_tensor(output_name) + + assert input_tensor.device_type == LiteDeviceType.LITE_CPU + layout = LiteLayout(self.input_data.shape, self.input_data.dtype) + tensor_tmp = LiteTensor(layout=layout) + tensor_tmp.set_data_by_share(self.input_data) + input_tensor.share_memory_with(tensor_tmp) + + for i in range(3): 
+ network.forward() + network.wait() + + output_data = output_tensor.to_numpy() + self.check_correct(output_data) + + def test_network_share_weights(self): + option = LiteOptions() + option.var_sanity_check_first_run = 0 + config = LiteConfig(option=option) + + src_network = LiteNetwork(config=config) + src_network.load(self.model_path) + + new_network = LiteNetwork() + new_network.enable_cpu_inplace_mode() + new_network.share_weights_with(src_network) + + self.do_forward(src_network) + self.do_forward(new_network) + + def test_network_share_runtime_memory(self): + option = LiteOptions() + option.var_sanity_check_first_run = 0 + config = LiteConfig(option=option) + + src_network = LiteNetwork(config=config) + src_network.load(self.model_path) + + new_network = LiteNetwork() + new_network.enable_cpu_inplace_mode() + new_network.share_runtime_memroy(src_network) + new_network.load(self.model_path) + + self.do_forward(src_network) + self.do_forward(new_network) + + # def test_network_async(self): + # count = 0 + # finished = False + # + # def async_callback(): + # nonlocal finished + # finished = True + # return 0 + # + # option = LiteOptions() + # option.var_sanity_check_first_run = 0 + # config = LiteConfig(option=option) + # + # network = LiteNetwork(config=config) + # network.load(self.model_path) + # + # network.async_with_callback(async_callback) + # + # input_tensor = network.get_io_tensor(network.get_input_name(0)) + # output_tensor = network.get_io_tensor(network.get_output_name(0)) + # + # input_tensor.set_data_by_share(self.input_data) + # network.forward() + # + # while not finished: + # count += 1 + # + # assert count > 0 + # output_data = output_tensor.to_numpy() + # self.check_correct(output_data) + # + # def test_network_start_callback(self): + # network = LiteNetwork() + # network.load(self.model_path) + # start_checked = False + # + # @start_finish_callback + # def start_callback(ios): + # nonlocal start_checked + # start_checked = True + # assert len(ios) == 1 + # for key in ios: + # io = key + # data = ios[key].to_numpy().flatten() + # input_data = self.input_data.flatten() + # assert data.size == input_data.size + # assert io.name.decode("utf-8") == "data" + # for i in range(data.size): + # assert data[i] == input_data[i] + # return 0 + # + # network.set_start_callback(start_callback) + # self.do_forward(network, 1) + # assert start_checked == True + # + # def test_network_finish_callback(self): + # network = LiteNetwork() + # network.load(self.model_path) + # finish_checked = False + # + # @start_finish_callback + # def finish_callback(ios): + # nonlocal finish_checked + # finish_checked = True + # assert len(ios) == 1 + # for key in ios: + # io = key + # data = ios[key].to_numpy().flatten() + # output_data = self.correct_data.flatten() + # assert data.size == output_data.size + # for i in range(data.size): + # assert data[i] == output_data[i] + # return 0 + # + # network.set_finish_callback(finish_callback) + # self.do_forward(network, 1) + # assert finish_checked == True + + def test_enable_profile(self): + network = LiteNetwork() + network.load(self.model_path) + network.enable_profile_performance("./profile.json") + + self.do_forward(network) + + fi = open("./profile.json", "r") + fi.close() + os.remove("./profile.json") + + def test_io_txt_dump(self): + network = LiteNetwork() + network.load(self.model_path) + network.io_txt_dump("./io_txt.txt") + self.do_forward(network) + + def test_io_bin_dump(self): + import shutil + + folder = "./out" + network = LiteNetwork() 
+ network.load(self.model_path) + if not os.path.exists(folder): + os.mkdir(folder) + network.io_bin_dump(folder) + self.do_forward(network) + shutil.rmtree(folder) + + def test_algo_workspace_limit(self): + network = LiteNetwork() + network.load(self.model_path) + print("modify the workspace limit.") + network.set_network_algo_workspace_limit(10000) + self.do_forward(network) + + def test_network_algo_policy(self): + network = LiteNetwork() + network.load(self.model_path) + network.set_network_algo_policy( + LiteAlgoSelectStrategy.LITE_ALGO_PROFILE + | LiteAlgoSelectStrategy.LITE_ALGO_REPRODUCIBLE + ) + self.do_forward(network) + + def test_network_algo_policy_ignore_batch(self): + network = LiteNetwork() + network.load(self.model_path) + network.set_network_algo_policy( + LiteAlgoSelectStrategy.LITE_ALGO_PROFILE, + shared_batch_size=1, + binary_equal_between_batch=True, + ) + self.do_forward(network) diff --git a/lite/pylite/test/test_network_cuda.py b/lite/pylite/test/test_network_cuda.py new file mode 100644 index 0000000000000000000000000000000000000000..f7fe9e10129fb835bc03ab91cc395ff69ab41e9e --- /dev/null +++ b/lite/pylite/test/test_network_cuda.py @@ -0,0 +1,220 @@ +# -*- coding: utf-8 -*- +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + +import functools +import os +import unittest + +import numpy as np + +from megenginelite import * + +set_log_level(2) + + +def require_cuda(ngpu=1): + """a decorator that disables a testcase if cuda is not enabled""" + + def dector(func): + @functools.wraps(func) + def wrapped(*args, **kwargs): + if LiteGlobal.get_device_count(LiteDeviceType.LITE_CUDA) >= ngpu: + return func(*args, **kwargs) + + return wrapped + + return dector + + +class TestShuffleNetCuda(unittest.TestCase): + source_dir = os.getenv("LITE_TEST_RESOUCE") + input_data_path = os.path.join(source_dir, "input_data.npy") + correct_data_path = os.path.join(source_dir, "output_data.npy") + model_path = os.path.join(source_dir, "shufflenet.mge") + correct_data = np.load(correct_data_path).flatten() + input_data = np.load(input_data_path) + + def check_correct(self, out_data, error=1e-4): + out_data = out_data.flatten() + assert np.isfinite(out_data.sum()) + assert self.correct_data.size == out_data.size + for i in range(out_data.size): + assert abs(out_data[i] - self.correct_data[i]) < error + + def do_forward(self, network, times=3): + input_name = network.get_input_name(0) + input_tensor = network.get_io_tensor(input_name) + output_name = network.get_output_name(0) + output_tensor = network.get_io_tensor(output_name) + + input_tensor.set_data_by_copy(self.input_data) + for i in range(times): + network.forward() + network.wait() + + output_data = output_tensor.to_numpy() + self.check_correct(output_data) + + +class TestNetwork(TestShuffleNetCuda): + @require_cuda() + def test_network_basic(self): + config = LiteConfig() + config.device_type = LiteDeviceType.LITE_CUDA + network = LiteNetwork(config) + network.load(self.model_path) + + input_name = network.get_input_name(0) + input_tensor = network.get_io_tensor(input_name) + output_name = network.get_output_name(0) + output_tensor = network.get_io_tensor(output_name) + + assert input_tensor.layout.shapes[0] == 1 + assert input_tensor.layout.shapes[1] == 3 + assert input_tensor.layout.shapes[2] == 224 + assert input_tensor.layout.shapes[3] == 224 + assert input_tensor.layout.data_type == LiteDataType.LITE_FLOAT + assert 
input_tensor.layout.ndim == 4 + + self.do_forward(network) + + @require_cuda() + def test_network_shared_data(self): + config = LiteConfig() + config.device_type = LiteDeviceType.LITE_CUDA + network = LiteNetwork(config) + network.load(self.model_path) + + input_name = network.get_input_name(0) + input_tensor = network.get_io_tensor(input_name) + output_name = network.get_output_name(0) + output_tensor = network.get_io_tensor(output_name) + + input_tensor.set_data_by_share(self.input_data) + for i in range(3): + network.forward() + network.wait() + + output_data = output_tensor.to_numpy() + self.check_correct(output_data) + + @require_cuda(2) + def test_network_set_device_id(self): + config = LiteConfig() + config.device_type = LiteDeviceType.LITE_CUDA + network = LiteNetwork(config) + assert network.device_id == 0 + + network.device_id = 1 + network.load(self.model_path) + assert network.device_id == 1 + + with self.assertRaises(RuntimeError): + network.device_id = 1 + + self.do_forward(network) + + @require_cuda() + def test_network_option(self): + option = LiteOptions() + option.weight_preprocess = 1 + option.var_sanity_check_first_run = 0 + + config = LiteConfig(option=option) + config.device_type = LiteDeviceType.LITE_CUDA + network = LiteNetwork(config=config) + network.load(self.model_path) + + self.do_forward(network) + + @require_cuda() + def test_network_reset_io(self): + option = LiteOptions() + option.var_sanity_check_first_run = 0 + config = LiteConfig(option=option) + + config.device_type = LiteDeviceType.LITE_CUDA + input_io = LiteIO("data") + ios = LiteNetworkIO() + ios.add_input(input_io) + network = LiteNetwork(config=config, io=ios) + network.load(self.model_path) + + input_tensor = network.get_io_tensor("data") + assert input_tensor.device_type == LiteDeviceType.LITE_CPU + + self.do_forward(network) + + @require_cuda() + def test_network_share_weights(self): + option = LiteOptions() + option.var_sanity_check_first_run = 0 + config = LiteConfig(option=option) + config.device_type = LiteDeviceType.LITE_CUDA + + src_network = LiteNetwork(config=config) + src_network.load(self.model_path) + + new_network = LiteNetwork() + new_network.enable_cpu_inplace_mode() + new_network.share_weights_with(src_network) + + self.do_forward(src_network) + self.do_forward(new_network) + + @require_cuda() + def test_network_share_runtime_memory(self): + option = LiteOptions() + option.var_sanity_check_first_run = 0 + config = LiteConfig(option=option) + config.device_type = LiteDeviceType.LITE_CUDA + + src_network = LiteNetwork(config=config) + src_network.load(self.model_path) + + new_network = LiteNetwork() + new_network.enable_cpu_inplace_mode() + new_network.share_runtime_memroy(src_network) + new_network.load(self.model_path) + + self.do_forward(src_network) + self.do_forward(new_network) + + @require_cuda() + def test_enable_profile(self): + config = LiteConfig() + config.device_type = LiteDeviceType.LITE_CUDA + network = LiteNetwork(config) + network.load(self.model_path) + network.enable_profile_performance("./profile.json") + + self.do_forward(network) + + fi = open("./profile.json", "r") + fi.close() + os.remove("./profile.json") + + @require_cuda() + def test_algo_workspace_limit(self): + config = LiteConfig() + config.device_type = LiteDeviceType.LITE_CUDA + network = LiteNetwork(config) + network.load(self.model_path) + print("modify the workspace limit.") + network.set_network_algo_workspace_limit(10000) + self.do_forward(network) + + @require_cuda() + def 
test_network_algo_policy(self): + config = LiteConfig() + config.device_type = LiteDeviceType.LITE_CUDA + network = LiteNetwork(config) + network.load(self.model_path) + network.set_network_algo_policy( + LiteAlgoSelectStrategy.LITE_ALGO_PROFILE + | LiteAlgoSelectStrategy.LITE_ALGO_REPRODUCIBLE + ) + self.do_forward(network) diff --git a/lite/pylite/test/test_tensor.py b/lite/pylite/test/test_tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..61d6575b074004a7fbff8e53b43715d1e5dc1b49 --- /dev/null +++ b/lite/pylite/test/test_tensor.py @@ -0,0 +1,291 @@ +# -*- coding: utf-8 -*- +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + +import functools + +import numpy as np + +from megenginelite import * + + +def require_cuda(func): + """a decorator that disables a testcase if cuda is not enabled""" + + @functools.wraps(func) + def wrapped(*args, **kwargs): + if LiteGlobal.get_device_count(LiteDeviceType.LITE_CUDA): + return func(*args, **kwargs) + + return wrapped + + +def test_tensor_make(): + empty_layout = LiteLayout() + assert empty_layout.ndim == 0 + assert empty_layout.data_type == int(LiteDataType.LITE_FLOAT) + + empty_tensor = LiteTensor() + assert empty_tensor.layout.ndim == empty_layout.ndim + assert empty_tensor.layout.data_type == empty_layout.data_type + + layout = LiteLayout([4, 16]) + layout = LiteLayout(dtype="float32") + layout = LiteLayout([4, 16], "float32") + layout = LiteLayout([4, 16], "float16") + layout = LiteLayout([4, 16], np.float32) + layout = LiteLayout([4, 16], np.int8) + layout = LiteLayout([4, 16], LiteDataType.LITE_FLOAT) + + tensor = LiteTensor(layout) + tensor = LiteTensor(layout, LiteDeviceType.LITE_CPU) + assert tensor.layout == layout + assert tensor.device_type == LiteDeviceType.LITE_CPU + assert tensor.is_continue == True + assert tensor.is_pinned_host == False + assert tensor.nbytes == 4 * 16 * 4 + assert tensor.device_id == 0 + + tensor = LiteTensor(layout, device_id=1) + assert tensor.device_id == 1 + + +def test_tensor_set_data(): + layout = LiteLayout([2, 16], "int8") + tensor = LiteTensor(layout) + assert tensor.nbytes == 2 * 16 + + data = [i for i in range(32)] + tensor.set_data_by_copy(data) + real_data = tensor.to_numpy() + for i in range(32): + assert real_data[i // 16][i % 16] == i + + arr = np.ones([2, 16], "int8") + tensor.set_data_by_copy(arr) + real_data = tensor.to_numpy() + for i in range(32): + assert real_data[i // 16][i % 16] == 1 + + for i in range(32): + arr[i // 16][i % 16] = i + tensor.set_data_by_share(arr) + real_data = tensor.to_numpy() + for i in range(32): + assert real_data[i // 16][i % 16] == i + + arr[0][8] = 100 + arr[1][3] = 20 + real_data = tensor.to_numpy() + assert real_data[0][8] == 100 + assert real_data[1][3] == 20 + + +def test_fill_zero(): + layout = LiteLayout([4, 8], "int16") + tensor1 = LiteTensor(layout) + assert tensor1.nbytes == 4 * 8 * 2 + + tensor1.set_data_by_copy([i for i in range(32)]) + real_data = tensor1.to_numpy() + for i in range(32): + assert real_data[i // 8][i % 8] == i + + tensor1.fill_zero() + real_data = tensor1.to_numpy() + for i in range(32): + assert real_data[i // 8][i % 8] == 0 + + +def test_copy_from(): + layout = LiteLayout([4, 8], "int16") + tensor1 = LiteTensor(layout) + tensor2 = LiteTensor(layout) + assert tensor1.nbytes == 4 * 8 * 2 + assert tensor2.nbytes == 4 * 8 * 2 + + tensor1.set_data_by_copy([i for i in range(32)]) + 
tensor2.copy_from(tensor1) + real_data = tensor2.to_numpy() + for i in range(32): + assert real_data[i // 8][i % 8] == i + + tensor1.set_data_by_copy([i + 5 for i in range(32)]) + tensor2.copy_from(tensor1) + real_data = tensor2.to_numpy() + for i in range(32): + assert real_data[i // 8][i % 8] == i + 5 + + +def test_reshape(): + layout = LiteLayout([4, 8], "int16") + tensor1 = LiteTensor(layout) + assert tensor1.nbytes == 4 * 8 * 2 + + tensor1.set_data_by_copy([i for i in range(32)]) + real_data = tensor1.to_numpy() + for i in range(32): + assert real_data[i // 8][i % 8] == i + + tensor1.reshape([8, 4]) + real_data = tensor1.to_numpy() + for i in range(32): + assert real_data[i // 4][i % 4] == i + + +def test_slice(): + layout = LiteLayout([4, 8], "int32") + tensor1 = LiteTensor(layout) + assert tensor1.nbytes == 4 * 8 * 4 + + tensor1.set_data_by_copy([i for i in range(32)]) + real_data_org = tensor1.to_numpy() + for i in range(32): + assert real_data_org[i // 8][i % 8] == i + + tensor2 = tensor1.slice([1, 4], [3, 8]) + assert tensor2.layout.shapes[0] == 2 + assert tensor2.layout.shapes[1] == 4 + assert tensor2.is_continue == False + + real_data = tensor2.to_numpy() + for i in range(8): + row = i // 4 + col = i % 4 + assert real_data[row][col] == real_data_org[row + 1][col + 4] + + +def test_tensor_share_memory(): + layout = LiteLayout([4, 8], "int16") + tensor1 = LiteTensor(layout) + tensor2 = LiteTensor(layout) + assert tensor1.nbytes == 4 * 8 * 2 + assert tensor2.nbytes == 4 * 8 * 2 + + tensor1.set_data_by_copy([i for i in range(32)]) + tensor2.share_memory_with(tensor1) + real_data = tensor2.to_numpy() + for i in range(32): + assert real_data[i // 8][i % 8] == i + + tensor1.set_data_by_copy([i + 5 for i in range(32)]) + real_data = tensor2.to_numpy() + for i in range(32): + assert real_data[i // 8][i % 8] == i + 5 + + +def test_tensor_share_ctype_memory(): + layout = LiteLayout([4, 8], "int16") + tensor1 = LiteTensor(layout) + assert tensor1.nbytes == 4 * 8 * 2 + + arr = np.ones([4, 8], "int16") + for i in range(32): + arr[i // 8][i % 8] = i + tensor1.set_data_by_share(arr.ctypes.data, 4 * 8 * 2) + real_data = tensor1.to_numpy() + for i in range(32): + assert real_data[i // 8][i % 8] == i + + +@require_cuda +def test_tensor_share_ctype_memory_device(): + layout = LiteLayout([4, 8], "int16") + tensor_cpu = LiteTensor( + layout=layout, device_type=LiteDeviceType.LITE_CUDA, is_pinned_host=True + ) + tensor_cuda1 = LiteTensor(layout=layout, device_type=LiteDeviceType.LITE_CUDA) + tensor_cuda2 = LiteTensor(layout=layout, device_type=LiteDeviceType.LITE_CUDA) + assert tensor_cpu.nbytes == 4 * 8 * 2 + assert tensor_cuda1.nbytes == 4 * 8 * 2 + assert tensor_cuda2.nbytes == 4 * 8 * 2 + + arr = np.ones([4, 8], "int16") + for i in range(32): + arr[i // 8][i % 8] = i + tensor_cpu.set_data_by_share(arr.ctypes.data, 4 * 8 * 2) + tensor_cuda1.copy_from(tensor_cpu) + device_mem = tensor_cuda1.get_ctypes_memory() + tensor_cuda2.set_data_by_share(device_mem, tensor_cuda1.nbytes) + real_data1 = tensor_cuda1.to_numpy() + real_data2 = tensor_cuda2.to_numpy() + for i in range(32): + assert real_data1[i // 8][i % 8] == i + assert real_data2[i // 8][i % 8] == i + + +def test_tensor_share_memory_with(): + layout = LiteLayout([4, 32], "int16") + tensor = LiteTensor(layout) + assert tensor.nbytes == 4 * 32 * 2 + + arr = np.ones([4, 32], "int16") + for i in range(128): + arr[i // 32][i % 32] = i + tensor.set_data_by_share(arr) + real_data = tensor.to_numpy() + for i in range(128): + assert real_data[i // 32][i % 
32] == i + + tensor2 = LiteTensor(layout) + tensor2.share_memory_with(tensor) + real_data = tensor.to_numpy() + real_data2 = tensor2.to_numpy() + for i in range(128): + assert real_data[i // 32][i % 32] == i + assert real_data2[i // 32][i % 32] == i + + arr[1][18] = 5 + arr[3][7] = 345 + real_data = tensor2.to_numpy() + assert real_data[1][18] == 5 + assert real_data[3][7] == 345 + + +def test_empty_tensor(): + empty_tensor = LiteTensor() + assert empty_tensor.layout.ndim == 0 + assert empty_tensor.layout.data_type == int(LiteDataType.LITE_FLOAT) + # check empty tensor to numpy + data = empty_tensor.to_numpy() + + +def test_tensor_by_set_copy_with_new_layout(): + layout = LiteLayout([4, 32], "int16") + tensor = LiteTensor(layout) + assert tensor.nbytes == 4 * 32 * 2 + + arr = np.ones([8, 64], "int32") + tensor.set_data_by_copy(arr) + new_layout = tensor.layout + assert new_layout.ndim == 2 + assert new_layout.shapes[0] == 8 + assert new_layout.shapes[1] == 64 + + tensor = LiteTensor(layout) + tensor.set_data_by_share(arr) + new_layout = tensor.layout + assert new_layout.ndim == 2 + assert new_layout.shapes[0] == 8 + assert new_layout.shapes[1] == 64 + + +def test_tensor_concat(): + layout = LiteLayout([4, 32], "int16") + tensors = [] + arr = np.ones([4, 32], "int16") + for j in range(4): + for i in range(128): + arr[i // 32][i % 32] = j + tensor = LiteTensor(layout) + tensor.set_data_by_copy(arr) + tensors.append(tensor) + new_tensor = LiteTensorConcat(tensors, 0) + + real_data = new_tensor.to_numpy() + for j in range(4): + for i in range(128): + index = j * 128 + i + assert real_data[index // 32][index % 32] == j diff --git a/lite/pylite/test/test_utils.py b/lite/pylite/test/test_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7435825207c5ec723c7179b9b26f095bd215fe7b --- /dev/null +++ b/lite/pylite/test/test_utils.py @@ -0,0 +1,199 @@ +# -*- coding: utf-8 -*- +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
+ +import functools + +import numpy as np + +from megenginelite import * + + +def require_cuda(func): + """a decorator that disables a testcase if cuda is not enabled""" + + @functools.wraps(func) + def wrapped(*args, **kwargs): + if LiteGlobal.get_device_count(LiteDeviceType.LITE_CUDA): + return func(*args, **kwargs) + + return wrapped + + +@require_cuda +def test_tensor_collect_batch(): + batch_tensor = TensorBatchCollector( + [4, 8, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CUDA + ) + arr = np.ones([8, 8], "int32") + for i in range(4): + batch_tensor.collect(arr) + arr += 1 + data = batch_tensor.to_numpy() + assert data.shape[0] == 4 + assert data.shape[1] == 8 + assert data.shape[2] == 8 + for i in range(4): + for j in range(64): + assert data[i][j // 8][j % 8] == i + 1 + + +def test_tensor_collect_batch_cpu(): + batch_tensor = TensorBatchCollector( + [4, 8, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CPU + ) + arr = np.ones([8, 8], "int32") + for i in range(4): + batch_tensor.collect(arr) + arr += 1 + data = batch_tensor.to_numpy() + assert data.shape[0] == 4 + assert data.shape[1] == 8 + assert data.shape[2] == 8 + for i in range(4): + for j in range(64): + assert data[i][j // 8][j % 8] == i + 1 + + +@require_cuda +def test_tensor_collect_batch_by_index(): + batch_tensor = TensorBatchCollector( + [4, 8, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CUDA + ) + arr = np.ones([8, 8], "int32") + arr += 1 # ==2 + batch_tensor.collect_id(arr, 1) + arr -= 1 # ==1 + batch_tensor.collect_id(arr, 0) + arr += 2 # ==3 + batch_tensor.collect_id(arr, 2) + arr += 1 # ==4 + batch_tensor.collect_id(arr, 3) + + data = batch_tensor.to_numpy() + assert data.shape[0] == 4 + assert data.shape[1] == 8 + assert data.shape[2] == 8 + for i in range(4): + for j in range(64): + assert data[i][j // 8][j % 8] == i + 1 + + +@require_cuda +def test_tensor_collect_batch_tensor(): + batch_tensor = TensorBatchCollector( + [4, 6, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CUDA + ) + nparr = np.ones([6, 8], "int32") + tensor = LiteTensor(LiteLayout([6, 8], LiteDataType.LITE_INT)) + for i in range(4): + tensor.set_data_by_share(nparr) + batch_tensor.collect(tensor) + nparr += 1 + data = batch_tensor.to_numpy() + assert data.shape[0] == 4 + assert data.shape[1] == 6 + assert data.shape[2] == 8 + for i in range(4): + for j in range(48): + assert data[i][j // 8][j % 8] == i + 1 + + +def test_tensor_collect_batch_tensor_cpu(): + batch_tensor = TensorBatchCollector( + [4, 6, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CPU + ) + nparr = np.ones([6, 8], "int32") + tensor = LiteTensor(LiteLayout([6, 8], LiteDataType.LITE_INT)) + for i in range(4): + tensor.set_data_by_share(nparr) + batch_tensor.collect(tensor) + nparr += 1 + data = batch_tensor.to_numpy() + assert data.shape[0] == 4 + assert data.shape[1] == 6 + assert data.shape[2] == 8 + for i in range(4): + for j in range(48): + assert data[i][j // 8][j % 8] == i + 1 + + +@require_cuda +def test_tensor_collect_batch_ctypes(): + batch_tensor = TensorBatchCollector( + [4, 6, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CUDA + ) + nparr = np.ones([6, 8], "int32") + for i in range(4): + in_data = nparr.ctypes.data + batch_tensor.collect_by_ctypes(in_data, nparr.nbytes) + nparr += 1 + data = batch_tensor.to_numpy() + assert data.shape[0] == 4 + assert data.shape[1] == 6 + assert data.shape[2] == 8 + for i in range(4): + for j in range(48): + assert 
data[i][j // 8][j % 8] == i + 1 + + +def test_tensor_collect_batch_ctypes_cpu(): + batch_tensor = TensorBatchCollector( + [4, 6, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CPU + ) + nparr = np.ones([6, 8], "int32") + for i in range(4): + in_data = nparr.ctypes.data + batch_tensor.collect_by_ctypes(in_data, nparr.nbytes) + nparr += 1 + data = batch_tensor.to_numpy() + assert data.shape[0] == 4 + assert data.shape[1] == 6 + assert data.shape[2] == 8 + for i in range(4): + for j in range(48): + assert data[i][j // 8][j % 8] == i + 1 + + +@require_cuda +def test_tensor_collect_batch_device_tensor(): + all_tensor = LiteTensor( + LiteLayout([4, 6, 8], dtype=LiteDataType.LITE_INT), + device_type=LiteDeviceType.LITE_CUDA, + ) + batch_tensor = TensorBatchCollector([4, 6, 8], tensor=all_tensor) + nparr = np.ones([6, 8], "int32") + tensor = LiteTensor(LiteLayout([6, 8], LiteDataType.LITE_INT)) + for i in range(4): + tensor.set_data_by_share(nparr) + batch_tensor.collect(tensor) + nparr += 1 + data = batch_tensor.to_numpy() + assert data.shape[0] == 4 + assert data.shape[1] == 6 + assert data.shape[2] == 8 + for i in range(4): + for j in range(48): + assert data[i][j // 8][j % 8] == i + 1 + + +@require_cuda +def test_tensor_collect_batch_device_numpy(): + all_tensor = LiteTensor( + LiteLayout([4, 6, 8], dtype=LiteDataType.LITE_INT), + device_type=LiteDeviceType.LITE_CUDA, + ) + batch_tensor = TensorBatchCollector([4, 6, 8], tensor=all_tensor) + nparr = np.ones([6, 8], "int32") + for i in range(4): + batch_tensor.collect(nparr) + nparr += 1 + data = batch_tensor.to_numpy() + assert data.shape[0] == 4 + assert data.shape[1] == 6 + assert data.shape[2] == 8 + for i in range(4): + for j in range(48): + assert data[i][j // 8][j % 8] == i + 1 diff --git a/lite/src/decryption/aes_decrypt.h b/lite/src/decryption/aes_decrypt.h new file mode 100644 index 0000000000000000000000000000000000000000..5f9b134bead76580cd18f6841025df6759b8fa65 --- /dev/null +++ b/lite/src/decryption/aes_decrypt.h @@ -0,0 +1,53 @@ +/** + * \file src/decryption/aes_decrypt.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "./mbedtls/aes.h" +#include "decrypt_base.h" + +namespace lite { + +class AESDcryption { +public: + static std::vector decrypt_model(const void* model_mem, + size_t size, + const std::vector& key) { + mbedtls_aes_context ctx; + mbedtls_aes_init(&ctx); + mbedtls_aes_setkey_dec(&ctx, key.data(), 256); + + auto data = static_cast(model_mem); + //! first 16 bytes is IV + uint8_t iv[16]; + //! 
last 8 bytes is file size(length) + auto length_ptr = data + size - 8; + size_t length = 0; + for (int i = 0; i < 8; i++) { + length |= length_ptr[i] << (8 * (7 - i)); + } + std::copy(data, data + 16, iv); + auto output = std::vector(size - 24); + mbedtls_aes_crypt_cbc(&ctx, MBEDTLS_AES_DECRYPT, size - 24, iv, + data + 16, output.data()); + mbedtls_aes_free(&ctx); + output.erase(output.begin() + length, output.end()); + return output; + } + + static std::vector get_decrypt_key() { + std::vector key = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, + 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, + 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, + 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F}; + return key; + } +}; +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/decryption/decrypt_base.h b/lite/src/decryption/decrypt_base.h new file mode 100644 index 0000000000000000000000000000000000000000..d3df19f900e9d8d49b876df88f6c3253160fcc0f --- /dev/null +++ b/lite/src/decryption/decrypt_base.h @@ -0,0 +1,49 @@ +/** + * \file src/decryption/decrypt_base.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once +#include "lite/global.h" +#include "misc.h" + +namespace lite { + +struct DecryptionStaticData { + std::unordered_map< + std::string, + std::pair>>> + decryption_methods; + LITE_MUTEX map_mutex; +}; + +DecryptionStaticData& decryption_static_data(); + +template +struct DecryptionRegister; + +} // namespace lite + +#define CONCAT_IMPL(a, b) a##b +#define MACRO_CONCAT(a, b) CONCAT_IMPL(a, b) + +#define REGIST_DECRYPTION_METHOD(name_, func_, key_) \ + REGIST_DECRYPTION_METHOD_WITH_NUM(__COUNTER__, name_, func_, key_) + +#define REGIST_DECRYPTION_METHOD_WITH_NUM(number_, name_, func_, key_) \ + template <> \ + struct DecryptionRegister { \ + DecryptionRegister() { \ + register_decryption_and_key(name_, func_, key_); \ + } \ + }; \ + namespace { \ + DecryptionRegister MACRO_CONCAT(decryption_, number_); \ + } + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/decryption/mbedtls/aes.cc b/lite/src/decryption/mbedtls/aes.cc new file mode 100644 index 0000000000000000000000000000000000000000..83dece97f2a43bf31cdaf90c8648aff338c0b273 --- /dev/null +++ b/lite/src/decryption/mbedtls/aes.cc @@ -0,0 +1,1363 @@ +/* + * FIPS-197 compliant AES implementation + * + * Copyright (C) 2006-2015, ARM Limited, All Rights Reserved + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is part of mbed TLS (https://tls.mbed.org) + */ +/* + * The AES block cipher was designed by Vincent Rijmen and Joan Daemen. 
+ * + * http://csrc.nist.gov/encryption/aes/rijndael/Rijndael.pdf + * http://csrc.nist.gov/publications/fips/fips197/fips-197.pdf + */ + +/** + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#if !defined(MBEDTLS_CONFIG_FILE) +#include "config.h" +#else +#include MBEDTLS_CONFIG_FILE +#endif + +#if defined(MBEDTLS_AES_C) + +#include + +#include "aes.h" +#if defined(MBEDTLS_PADLOCK_C) +#include "mbedtls/padlock.h" +#endif +#if defined(MBEDTLS_AESNI_C) +#include "mbedtls/aesni.h" +#endif + +#if defined(MBEDTLS_SELF_TEST) +#if defined(MBEDTLS_PLATFORM_C) +#include "mbedtls/platform.h" +#else +#include +#define mbedtls_printf printf +#endif /* MBEDTLS_PLATFORM_C */ +#endif /* MBEDTLS_SELF_TEST */ + +#if !defined(MBEDTLS_AES_ALT) + +/* Implementation that should never be optimized out by the compiler */ +static void mbedtls_zeroize(void *v, size_t n) { + volatile unsigned char *p = (unsigned char *)v; + while (n--) *p++ = 0; +} + +/* + * 32-bit integer manipulation macros (little endian) + */ +#ifndef GET_UINT32_LE +#define GET_UINT32_LE(n, b, i) \ + { \ + (n) = ((uint32_t)(b)[(i)]) | ((uint32_t)(b)[(i) + 1] << 8) | \ + ((uint32_t)(b)[(i) + 2] << 16) | ((uint32_t)(b)[(i) + 3] << 24); \ + } +#endif + +#ifndef PUT_UINT32_LE +#define PUT_UINT32_LE(n, b, i) \ + { \ + (b)[(i)] = (unsigned char)(((n)) & 0xFF); \ + (b)[(i) + 1] = (unsigned char)(((n) >> 8) & 0xFF); \ + (b)[(i) + 2] = (unsigned char)(((n) >> 16) & 0xFF); \ + (b)[(i) + 3] = (unsigned char)(((n) >> 24) & 0xFF); \ + } +#endif + +#if defined(MBEDTLS_PADLOCK_C) && \ + (defined(MBEDTLS_HAVE_X86) || defined(MBEDTLS_PADLOCK_ALIGN16)) +static int aes_padlock_ace = -1; +#endif + +#if defined(MBEDTLS_AES_ROM_TABLES) +/* + * Forward S-box + */ +static const unsigned char FSb[256] = { + 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, + 0xFE, 0xD7, 0xAB, 0x76, 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, + 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, 0xB7, 0xFD, 0x93, 0x26, + 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, + 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, + 0xEB, 0x27, 0xB2, 0x75, 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, + 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, 0x53, 0xD1, 0x00, 0xED, + 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, + 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, + 0x50, 0x3C, 0x9F, 0xA8, 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, + 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, 0xCD, 0x0C, 0x13, 0xEC, + 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, + 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, + 0xDE, 0x5E, 0x0B, 0xDB, 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, + 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, 0xE7, 0xC8, 0x37, 0x6D, + 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, + 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, + 0x4B, 0xBD, 0x8B, 0x8A, 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, + 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, 0xE1, 0xF8, 0x98, 0x11, + 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 
0x28, 0xDF, + 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, + 0xB0, 0x54, 0xBB, 0x16}; + +/* + * Forward tables + */ +#define FT \ + \ + V(A5, 63, 63, C6), V(84, 7C, 7C, F8), V(99, 77, 77, EE), \ + V(8D, 7B, 7B, F6), V(0D, F2, F2, FF), V(BD, 6B, 6B, D6), \ + V(B1, 6F, 6F, DE), V(54, C5, C5, 91), V(50, 30, 30, 60), \ + V(03, 01, 01, 02), V(A9, 67, 67, CE), V(7D, 2B, 2B, 56), \ + V(19, FE, FE, E7), V(62, D7, D7, B5), V(E6, AB, AB, 4D), \ + V(9A, 76, 76, EC), V(45, CA, CA, 8F), V(9D, 82, 82, 1F), \ + V(40, C9, C9, 89), V(87, 7D, 7D, FA), V(15, FA, FA, EF), \ + V(EB, 59, 59, B2), V(C9, 47, 47, 8E), V(0B, F0, F0, FB), \ + V(EC, AD, AD, 41), V(67, D4, D4, B3), V(FD, A2, A2, 5F), \ + V(EA, AF, AF, 45), V(BF, 9C, 9C, 23), V(F7, A4, A4, 53), \ + V(96, 72, 72, E4), V(5B, C0, C0, 9B), V(C2, B7, B7, 75), \ + V(1C, FD, FD, E1), V(AE, 93, 93, 3D), V(6A, 26, 26, 4C), \ + V(5A, 36, 36, 6C), V(41, 3F, 3F, 7E), V(02, F7, F7, F5), \ + V(4F, CC, CC, 83), V(5C, 34, 34, 68), V(F4, A5, A5, 51), \ + V(34, E5, E5, D1), V(08, F1, F1, F9), V(93, 71, 71, E2), \ + V(73, D8, D8, AB), V(53, 31, 31, 62), V(3F, 15, 15, 2A), \ + V(0C, 04, 04, 08), V(52, C7, C7, 95), V(65, 23, 23, 46), \ + V(5E, C3, C3, 9D), V(28, 18, 18, 30), V(A1, 96, 96, 37), \ + V(0F, 05, 05, 0A), V(B5, 9A, 9A, 2F), V(09, 07, 07, 0E), \ + V(36, 12, 12, 24), V(9B, 80, 80, 1B), V(3D, E2, E2, DF), \ + V(26, EB, EB, CD), V(69, 27, 27, 4E), V(CD, B2, B2, 7F), \ + V(9F, 75, 75, EA), V(1B, 09, 09, 12), V(9E, 83, 83, 1D), \ + V(74, 2C, 2C, 58), V(2E, 1A, 1A, 34), V(2D, 1B, 1B, 36), \ + V(B2, 6E, 6E, DC), V(EE, 5A, 5A, B4), V(FB, A0, A0, 5B), \ + V(F6, 52, 52, A4), V(4D, 3B, 3B, 76), V(61, D6, D6, B7), \ + V(CE, B3, B3, 7D), V(7B, 29, 29, 52), V(3E, E3, E3, DD), \ + V(71, 2F, 2F, 5E), V(97, 84, 84, 13), V(F5, 53, 53, A6), \ + V(68, D1, D1, B9), V(00, 00, 00, 00), V(2C, ED, ED, C1), \ + V(60, 20, 20, 40), V(1F, FC, FC, E3), V(C8, B1, B1, 79), \ + V(ED, 5B, 5B, B6), V(BE, 6A, 6A, D4), V(46, CB, CB, 8D), \ + V(D9, BE, BE, 67), V(4B, 39, 39, 72), V(DE, 4A, 4A, 94), \ + V(D4, 4C, 4C, 98), V(E8, 58, 58, B0), V(4A, CF, CF, 85), \ + V(6B, D0, D0, BB), V(2A, EF, EF, C5), V(E5, AA, AA, 4F), \ + V(16, FB, FB, ED), V(C5, 43, 43, 86), V(D7, 4D, 4D, 9A), \ + V(55, 33, 33, 66), V(94, 85, 85, 11), V(CF, 45, 45, 8A), \ + V(10, F9, F9, E9), V(06, 02, 02, 04), V(81, 7F, 7F, FE), \ + V(F0, 50, 50, A0), V(44, 3C, 3C, 78), V(BA, 9F, 9F, 25), \ + V(E3, A8, A8, 4B), V(F3, 51, 51, A2), V(FE, A3, A3, 5D), \ + V(C0, 40, 40, 80), V(8A, 8F, 8F, 05), V(AD, 92, 92, 3F), \ + V(BC, 9D, 9D, 21), V(48, 38, 38, 70), V(04, F5, F5, F1), \ + V(DF, BC, BC, 63), V(C1, B6, B6, 77), V(75, DA, DA, AF), \ + V(63, 21, 21, 42), V(30, 10, 10, 20), V(1A, FF, FF, E5), \ + V(0E, F3, F3, FD), V(6D, D2, D2, BF), V(4C, CD, CD, 81), \ + V(14, 0C, 0C, 18), V(35, 13, 13, 26), V(2F, EC, EC, C3), \ + V(E1, 5F, 5F, BE), V(A2, 97, 97, 35), V(CC, 44, 44, 88), \ + V(39, 17, 17, 2E), V(57, C4, C4, 93), V(F2, A7, A7, 55), \ + V(82, 7E, 7E, FC), V(47, 3D, 3D, 7A), V(AC, 64, 64, C8), \ + V(E7, 5D, 5D, BA), V(2B, 19, 19, 32), V(95, 73, 73, E6), \ + V(A0, 60, 60, C0), V(98, 81, 81, 19), V(D1, 4F, 4F, 9E), \ + V(7F, DC, DC, A3), V(66, 22, 22, 44), V(7E, 2A, 2A, 54), \ + V(AB, 90, 90, 3B), V(83, 88, 88, 0B), V(CA, 46, 46, 8C), \ + V(29, EE, EE, C7), V(D3, B8, B8, 6B), V(3C, 14, 14, 28), \ + V(79, DE, DE, A7), V(E2, 5E, 5E, BC), V(1D, 0B, 0B, 16), \ + V(76, DB, DB, AD), V(3B, E0, E0, DB), V(56, 32, 32, 64), \ + V(4E, 3A, 3A, 74), V(1E, 0A, 0A, 14), V(DB, 49, 49, 92), \ + V(0A, 06, 06, 0C), V(6C, 24, 24, 
48), V(E4, 5C, 5C, B8), \ + V(5D, C2, C2, 9F), V(6E, D3, D3, BD), V(EF, AC, AC, 43), \ + V(A6, 62, 62, C4), V(A8, 91, 91, 39), V(A4, 95, 95, 31), \ + V(37, E4, E4, D3), V(8B, 79, 79, F2), V(32, E7, E7, D5), \ + V(43, C8, C8, 8B), V(59, 37, 37, 6E), V(B7, 6D, 6D, DA), \ + V(8C, 8D, 8D, 01), V(64, D5, D5, B1), V(D2, 4E, 4E, 9C), \ + V(E0, A9, A9, 49), V(B4, 6C, 6C, D8), V(FA, 56, 56, AC), \ + V(07, F4, F4, F3), V(25, EA, EA, CF), V(AF, 65, 65, CA), \ + V(8E, 7A, 7A, F4), V(E9, AE, AE, 47), V(18, 08, 08, 10), \ + V(D5, BA, BA, 6F), V(88, 78, 78, F0), V(6F, 25, 25, 4A), \ + V(72, 2E, 2E, 5C), V(24, 1C, 1C, 38), V(F1, A6, A6, 57), \ + V(C7, B4, B4, 73), V(51, C6, C6, 97), V(23, E8, E8, CB), \ + V(7C, DD, DD, A1), V(9C, 74, 74, E8), V(21, 1F, 1F, 3E), \ + V(DD, 4B, 4B, 96), V(DC, BD, BD, 61), V(86, 8B, 8B, 0D), \ + V(85, 8A, 8A, 0F), V(90, 70, 70, E0), V(42, 3E, 3E, 7C), \ + V(C4, B5, B5, 71), V(AA, 66, 66, CC), V(D8, 48, 48, 90), \ + V(05, 03, 03, 06), V(01, F6, F6, F7), V(12, 0E, 0E, 1C), \ + V(A3, 61, 61, C2), V(5F, 35, 35, 6A), V(F9, 57, 57, AE), \ + V(D0, B9, B9, 69), V(91, 86, 86, 17), V(58, C1, C1, 99), \ + V(27, 1D, 1D, 3A), V(B9, 9E, 9E, 27), V(38, E1, E1, D9), \ + V(13, F8, F8, EB), V(B3, 98, 98, 2B), V(33, 11, 11, 22), \ + V(BB, 69, 69, D2), V(70, D9, D9, A9), V(89, 8E, 8E, 07), \ + V(A7, 94, 94, 33), V(B6, 9B, 9B, 2D), V(22, 1E, 1E, 3C), \ + V(92, 87, 87, 15), V(20, E9, E9, C9), V(49, CE, CE, 87), \ + V(FF, 55, 55, AA), V(78, 28, 28, 50), V(7A, DF, DF, A5), \ + V(8F, 8C, 8C, 03), V(F8, A1, A1, 59), V(80, 89, 89, 09), \ + V(17, 0D, 0D, 1A), V(DA, BF, BF, 65), V(31, E6, E6, D7), \ + V(C6, 42, 42, 84), V(B8, 68, 68, D0), V(C3, 41, 41, 82), \ + V(B0, 99, 99, 29), V(77, 2D, 2D, 5A), V(11, 0F, 0F, 1E), \ + V(CB, B0, B0, 7B), V(FC, 54, 54, A8), V(D6, BB, BB, 6D), \ + V(3A, 16, 16, 2C) + +#define V(a, b, c, d) 0x##a##b##c##d +static const uint32_t FT0[256] = {FT}; +#undef V + +#define V(a, b, c, d) 0x##b##c##d##a +static const uint32_t FT1[256] = {FT}; +#undef V + +#define V(a, b, c, d) 0x##c##d##a##b +static const uint32_t FT2[256] = {FT}; +#undef V + +#define V(a, b, c, d) 0x##d##a##b##c +static const uint32_t FT3[256] = {FT}; +#undef V + +#undef FT + +/* + * Reverse S-box + */ +static const unsigned char RSb[256] = { + 0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38, 0xBF, 0x40, 0xA3, 0x9E, + 0x81, 0xF3, 0xD7, 0xFB, 0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87, + 0x34, 0x8E, 0x43, 0x44, 0xC4, 0xDE, 0xE9, 0xCB, 0x54, 0x7B, 0x94, 0x32, + 0xA6, 0xC2, 0x23, 0x3D, 0xEE, 0x4C, 0x95, 0x0B, 0x42, 0xFA, 0xC3, 0x4E, + 0x08, 0x2E, 0xA1, 0x66, 0x28, 0xD9, 0x24, 0xB2, 0x76, 0x5B, 0xA2, 0x49, + 0x6D, 0x8B, 0xD1, 0x25, 0x72, 0xF8, 0xF6, 0x64, 0x86, 0x68, 0x98, 0x16, + 0xD4, 0xA4, 0x5C, 0xCC, 0x5D, 0x65, 0xB6, 0x92, 0x6C, 0x70, 0x48, 0x50, + 0xFD, 0xED, 0xB9, 0xDA, 0x5E, 0x15, 0x46, 0x57, 0xA7, 0x8D, 0x9D, 0x84, + 0x90, 0xD8, 0xAB, 0x00, 0x8C, 0xBC, 0xD3, 0x0A, 0xF7, 0xE4, 0x58, 0x05, + 0xB8, 0xB3, 0x45, 0x06, 0xD0, 0x2C, 0x1E, 0x8F, 0xCA, 0x3F, 0x0F, 0x02, + 0xC1, 0xAF, 0xBD, 0x03, 0x01, 0x13, 0x8A, 0x6B, 0x3A, 0x91, 0x11, 0x41, + 0x4F, 0x67, 0xDC, 0xEA, 0x97, 0xF2, 0xCF, 0xCE, 0xF0, 0xB4, 0xE6, 0x73, + 0x96, 0xAC, 0x74, 0x22, 0xE7, 0xAD, 0x35, 0x85, 0xE2, 0xF9, 0x37, 0xE8, + 0x1C, 0x75, 0xDF, 0x6E, 0x47, 0xF1, 0x1A, 0x71, 0x1D, 0x29, 0xC5, 0x89, + 0x6F, 0xB7, 0x62, 0x0E, 0xAA, 0x18, 0xBE, 0x1B, 0xFC, 0x56, 0x3E, 0x4B, + 0xC6, 0xD2, 0x79, 0x20, 0x9A, 0xDB, 0xC0, 0xFE, 0x78, 0xCD, 0x5A, 0xF4, + 0x1F, 0xDD, 0xA8, 0x33, 0x88, 0x07, 0xC7, 0x31, 0xB1, 0x12, 0x10, 0x59, + 0x27, 0x80, 0xEC, 0x5F, 0x60, 0x51, 
0x7F, 0xA9, 0x19, 0xB5, 0x4A, 0x0D, + 0x2D, 0xE5, 0x7A, 0x9F, 0x93, 0xC9, 0x9C, 0xEF, 0xA0, 0xE0, 0x3B, 0x4D, + 0xAE, 0x2A, 0xF5, 0xB0, 0xC8, 0xEB, 0xBB, 0x3C, 0x83, 0x53, 0x99, 0x61, + 0x17, 0x2B, 0x04, 0x7E, 0xBA, 0x77, 0xD6, 0x26, 0xE1, 0x69, 0x14, 0x63, + 0x55, 0x21, 0x0C, 0x7D}; + +/* + * Reverse tables + */ +#define RT \ + \ + V(50, A7, F4, 51), V(53, 65, 41, 7E), V(C3, A4, 17, 1A), \ + V(96, 5E, 27, 3A), V(CB, 6B, AB, 3B), V(F1, 45, 9D, 1F), \ + V(AB, 58, FA, AC), V(93, 03, E3, 4B), V(55, FA, 30, 20), \ + V(F6, 6D, 76, AD), V(91, 76, CC, 88), V(25, 4C, 02, F5), \ + V(FC, D7, E5, 4F), V(D7, CB, 2A, C5), V(80, 44, 35, 26), \ + V(8F, A3, 62, B5), V(49, 5A, B1, DE), V(67, 1B, BA, 25), \ + V(98, 0E, EA, 45), V(E1, C0, FE, 5D), V(02, 75, 2F, C3), \ + V(12, F0, 4C, 81), V(A3, 97, 46, 8D), V(C6, F9, D3, 6B), \ + V(E7, 5F, 8F, 03), V(95, 9C, 92, 15), V(EB, 7A, 6D, BF), \ + V(DA, 59, 52, 95), V(2D, 83, BE, D4), V(D3, 21, 74, 58), \ + V(29, 69, E0, 49), V(44, C8, C9, 8E), V(6A, 89, C2, 75), \ + V(78, 79, 8E, F4), V(6B, 3E, 58, 99), V(DD, 71, B9, 27), \ + V(B6, 4F, E1, BE), V(17, AD, 88, F0), V(66, AC, 20, C9), \ + V(B4, 3A, CE, 7D), V(18, 4A, DF, 63), V(82, 31, 1A, E5), \ + V(60, 33, 51, 97), V(45, 7F, 53, 62), V(E0, 77, 64, B1), \ + V(84, AE, 6B, BB), V(1C, A0, 81, FE), V(94, 2B, 08, F9), \ + V(58, 68, 48, 70), V(19, FD, 45, 8F), V(87, 6C, DE, 94), \ + V(B7, F8, 7B, 52), V(23, D3, 73, AB), V(E2, 02, 4B, 72), \ + V(57, 8F, 1F, E3), V(2A, AB, 55, 66), V(07, 28, EB, B2), \ + V(03, C2, B5, 2F), V(9A, 7B, C5, 86), V(A5, 08, 37, D3), \ + V(F2, 87, 28, 30), V(B2, A5, BF, 23), V(BA, 6A, 03, 02), \ + V(5C, 82, 16, ED), V(2B, 1C, CF, 8A), V(92, B4, 79, A7), \ + V(F0, F2, 07, F3), V(A1, E2, 69, 4E), V(CD, F4, DA, 65), \ + V(D5, BE, 05, 06), V(1F, 62, 34, D1), V(8A, FE, A6, C4), \ + V(9D, 53, 2E, 34), V(A0, 55, F3, A2), V(32, E1, 8A, 05), \ + V(75, EB, F6, A4), V(39, EC, 83, 0B), V(AA, EF, 60, 40), \ + V(06, 9F, 71, 5E), V(51, 10, 6E, BD), V(F9, 8A, 21, 3E), \ + V(3D, 06, DD, 96), V(AE, 05, 3E, DD), V(46, BD, E6, 4D), \ + V(B5, 8D, 54, 91), V(05, 5D, C4, 71), V(6F, D4, 06, 04), \ + V(FF, 15, 50, 60), V(24, FB, 98, 19), V(97, E9, BD, D6), \ + V(CC, 43, 40, 89), V(77, 9E, D9, 67), V(BD, 42, E8, B0), \ + V(88, 8B, 89, 07), V(38, 5B, 19, E7), V(DB, EE, C8, 79), \ + V(47, 0A, 7C, A1), V(E9, 0F, 42, 7C), V(C9, 1E, 84, F8), \ + V(00, 00, 00, 00), V(83, 86, 80, 09), V(48, ED, 2B, 32), \ + V(AC, 70, 11, 1E), V(4E, 72, 5A, 6C), V(FB, FF, 0E, FD), \ + V(56, 38, 85, 0F), V(1E, D5, AE, 3D), V(27, 39, 2D, 36), \ + V(64, D9, 0F, 0A), V(21, A6, 5C, 68), V(D1, 54, 5B, 9B), \ + V(3A, 2E, 36, 24), V(B1, 67, 0A, 0C), V(0F, E7, 57, 93), \ + V(D2, 96, EE, B4), V(9E, 91, 9B, 1B), V(4F, C5, C0, 80), \ + V(A2, 20, DC, 61), V(69, 4B, 77, 5A), V(16, 1A, 12, 1C), \ + V(0A, BA, 93, E2), V(E5, 2A, A0, C0), V(43, E0, 22, 3C), \ + V(1D, 17, 1B, 12), V(0B, 0D, 09, 0E), V(AD, C7, 8B, F2), \ + V(B9, A8, B6, 2D), V(C8, A9, 1E, 14), V(85, 19, F1, 57), \ + V(4C, 07, 75, AF), V(BB, DD, 99, EE), V(FD, 60, 7F, A3), \ + V(9F, 26, 01, F7), V(BC, F5, 72, 5C), V(C5, 3B, 66, 44), \ + V(34, 7E, FB, 5B), V(76, 29, 43, 8B), V(DC, C6, 23, CB), \ + V(68, FC, ED, B6), V(63, F1, E4, B8), V(CA, DC, 31, D7), \ + V(10, 85, 63, 42), V(40, 22, 97, 13), V(20, 11, C6, 84), \ + V(7D, 24, 4A, 85), V(F8, 3D, BB, D2), V(11, 32, F9, AE), \ + V(6D, A1, 29, C7), V(4B, 2F, 9E, 1D), V(F3, 30, B2, DC), \ + V(EC, 52, 86, 0D), V(D0, E3, C1, 77), V(6C, 16, B3, 2B), \ + V(99, B9, 70, A9), V(FA, 48, 94, 11), V(22, 64, E9, 47), \ + V(C4, 8C, FC, A8), V(1A, 3F, F0, A0), V(D8, 2C, 
7D, 56), \ + V(EF, 90, 33, 22), V(C7, 4E, 49, 87), V(C1, D1, 38, D9), \ + V(FE, A2, CA, 8C), V(36, 0B, D4, 98), V(CF, 81, F5, A6), \ + V(28, DE, 7A, A5), V(26, 8E, B7, DA), V(A4, BF, AD, 3F), \ + V(E4, 9D, 3A, 2C), V(0D, 92, 78, 50), V(9B, CC, 5F, 6A), \ + V(62, 46, 7E, 54), V(C2, 13, 8D, F6), V(E8, B8, D8, 90), \ + V(5E, F7, 39, 2E), V(F5, AF, C3, 82), V(BE, 80, 5D, 9F), \ + V(7C, 93, D0, 69), V(A9, 2D, D5, 6F), V(B3, 12, 25, CF), \ + V(3B, 99, AC, C8), V(A7, 7D, 18, 10), V(6E, 63, 9C, E8), \ + V(7B, BB, 3B, DB), V(09, 78, 26, CD), V(F4, 18, 59, 6E), \ + V(01, B7, 9A, EC), V(A8, 9A, 4F, 83), V(65, 6E, 95, E6), \ + V(7E, E6, FF, AA), V(08, CF, BC, 21), V(E6, E8, 15, EF), \ + V(D9, 9B, E7, BA), V(CE, 36, 6F, 4A), V(D4, 09, 9F, EA), \ + V(D6, 7C, B0, 29), V(AF, B2, A4, 31), V(31, 23, 3F, 2A), \ + V(30, 94, A5, C6), V(C0, 66, A2, 35), V(37, BC, 4E, 74), \ + V(A6, CA, 82, FC), V(B0, D0, 90, E0), V(15, D8, A7, 33), \ + V(4A, 98, 04, F1), V(F7, DA, EC, 41), V(0E, 50, CD, 7F), \ + V(2F, F6, 91, 17), V(8D, D6, 4D, 76), V(4D, B0, EF, 43), \ + V(54, 4D, AA, CC), V(DF, 04, 96, E4), V(E3, B5, D1, 9E), \ + V(1B, 88, 6A, 4C), V(B8, 1F, 2C, C1), V(7F, 51, 65, 46), \ + V(04, EA, 5E, 9D), V(5D, 35, 8C, 01), V(73, 74, 87, FA), \ + V(2E, 41, 0B, FB), V(5A, 1D, 67, B3), V(52, D2, DB, 92), \ + V(33, 56, 10, E9), V(13, 47, D6, 6D), V(8C, 61, D7, 9A), \ + V(7A, 0C, A1, 37), V(8E, 14, F8, 59), V(89, 3C, 13, EB), \ + V(EE, 27, A9, CE), V(35, C9, 61, B7), V(ED, E5, 1C, E1), \ + V(3C, B1, 47, 7A), V(59, DF, D2, 9C), V(3F, 73, F2, 55), \ + V(79, CE, 14, 18), V(BF, 37, C7, 73), V(EA, CD, F7, 53), \ + V(5B, AA, FD, 5F), V(14, 6F, 3D, DF), V(86, DB, 44, 78), \ + V(81, F3, AF, CA), V(3E, C4, 68, B9), V(2C, 34, 24, 38), \ + V(5F, 40, A3, C2), V(72, C3, 1D, 16), V(0C, 25, E2, BC), \ + V(8B, 49, 3C, 28), V(41, 95, 0D, FF), V(71, 01, A8, 39), \ + V(DE, B3, 0C, 08), V(9C, E4, B4, D8), V(90, C1, 56, 64), \ + V(61, 84, CB, 7B), V(70, B6, 32, D5), V(74, 5C, 6C, 48), \ + V(42, 57, B8, D0) + +#define V(a, b, c, d) 0x##a##b##c##d +static const uint32_t RT0[256] = {RT}; +#undef V + +#define V(a, b, c, d) 0x##b##c##d##a +static const uint32_t RT1[256] = {RT}; +#undef V + +#define V(a, b, c, d) 0x##c##d##a##b +static const uint32_t RT2[256] = {RT}; +#undef V + +#define V(a, b, c, d) 0x##d##a##b##c +static const uint32_t RT3[256] = {RT}; +#undef V + +#undef RT + +/* + * Round constants + */ +static const uint32_t RCON[10] = { + 0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010, + 0x00000020, 0x00000040, 0x00000080, 0x0000001B, 0x00000036}; + +#else /* MBEDTLS_AES_ROM_TABLES */ + +/* + * Forward S-box & tables + */ +static unsigned char FSb[256]; +static uint32_t FT0[256]; +static uint32_t FT1[256]; +static uint32_t FT2[256]; +static uint32_t FT3[256]; + +/* + * Reverse S-box & tables + */ +static unsigned char RSb[256]; +static uint32_t RT0[256]; +static uint32_t RT1[256]; +static uint32_t RT2[256]; +static uint32_t RT3[256]; + +/* + * Round constants + */ +static uint32_t RCON[10]; + +/* + * Tables generation code + */ +#define ROTL8(x) ((x << 8) & 0xFFFFFFFF) | (x >> 24) +#define XTIME(x) ((x << 1) ^ ((x & 0x80) ? 0x1B : 0x00)) +#define MUL(x, y) ((x && y) ? 
pow[(log[x] + log[y]) % 255] : 0) + +static int aes_init_done = 0; + +static void aes_gen_tables(void) { + int i, x, y, z; + int pow[256]; + int log[256]; + + /* + * compute pow and log tables over GF(2^8) + */ + for (i = 0, x = 1; i < 256; i++) { + pow[i] = x; + log[x] = i; + x = (x ^ XTIME(x)) & 0xFF; + } + + /* + * calculate the round constants + */ + for (i = 0, x = 1; i < 10; i++) { + RCON[i] = (uint32_t)x; + x = XTIME(x) & 0xFF; + } + + /* + * generate the forward and reverse S-boxes + */ + FSb[0x00] = 0x63; + RSb[0x63] = 0x00; + + for (i = 1; i < 256; i++) { + x = pow[255 - log[i]]; + + y = x; + y = ((y << 1) | (y >> 7)) & 0xFF; + x ^= y; + y = ((y << 1) | (y >> 7)) & 0xFF; + x ^= y; + y = ((y << 1) | (y >> 7)) & 0xFF; + x ^= y; + y = ((y << 1) | (y >> 7)) & 0xFF; + x ^= y ^ 0x63; + + FSb[i] = (unsigned char)x; + RSb[x] = (unsigned char)i; + } + + /* + * generate the forward and reverse tables + */ + for (i = 0; i < 256; i++) { + x = FSb[i]; + y = XTIME(x) & 0xFF; + z = (y ^ x) & 0xFF; + + FT0[i] = ((uint32_t)y) ^ ((uint32_t)x << 8) ^ ((uint32_t)x << 16) ^ + ((uint32_t)z << 24); + + FT1[i] = ROTL8(FT0[i]); + FT2[i] = ROTL8(FT1[i]); + FT3[i] = ROTL8(FT2[i]); + + x = RSb[i]; + + RT0[i] = ((uint32_t)MUL(0x0E, x)) ^ ((uint32_t)MUL(0x09, x) << 8) ^ + ((uint32_t)MUL(0x0D, x) << 16) ^ + ((uint32_t)MUL(0x0B, x) << 24); + + RT1[i] = ROTL8(RT0[i]); + RT2[i] = ROTL8(RT1[i]); + RT3[i] = ROTL8(RT2[i]); + } +} + +#endif /* MBEDTLS_AES_ROM_TABLES */ + +void mbedtls_aes_init(mbedtls_aes_context *ctx) { + memset(ctx, 0, sizeof(mbedtls_aes_context)); +} + +void mbedtls_aes_free(mbedtls_aes_context *ctx) { + if (ctx == NULL) return; + + mbedtls_zeroize(ctx, sizeof(mbedtls_aes_context)); +} + +/* + * AES key schedule (encryption) + */ +#if !defined(MBEDTLS_AES_SETKEY_ENC_ALT) +int mbedtls_aes_setkey_enc(mbedtls_aes_context *ctx, const unsigned char *key, + unsigned int keybits) { + unsigned int i; + uint32_t *RK; + +#if !defined(MBEDTLS_AES_ROM_TABLES) + if (aes_init_done == 0) { + aes_gen_tables(); + aes_init_done = 1; + } +#endif + + switch (keybits) { + case 128: + ctx->nr = 10; + break; + case 192: + ctx->nr = 12; + break; + case 256: + ctx->nr = 14; + break; + default: + return (MBEDTLS_ERR_AES_INVALID_KEY_LENGTH); + } + +#if defined(MBEDTLS_PADLOCK_C) && defined(MBEDTLS_PADLOCK_ALIGN16) + if (aes_padlock_ace == -1) + aes_padlock_ace = mbedtls_padlock_has_support(MBEDTLS_PADLOCK_ACE); + + if (aes_padlock_ace) + ctx->rk = RK = MBEDTLS_PADLOCK_ALIGN16(ctx->buf); + else +#endif + ctx->rk = RK = ctx->buf; + +#if defined(MBEDTLS_AESNI_C) && defined(MBEDTLS_HAVE_X86_64) + if (mbedtls_aesni_has_support(MBEDTLS_AESNI_AES)) + return ( + mbedtls_aesni_setkey_enc((unsigned char *)ctx->rk, key, keybits)); +#endif + + for (i = 0; i < (keybits >> 5); i++) { + GET_UINT32_LE(RK[i], key, i << 2); + } + + switch (ctx->nr) { + case 10: + + for (i = 0; i < 10; i++, RK += 4) { + RK[4] = RK[0] ^ RCON[i] ^ ((uint32_t)FSb[(RK[3] >> 8) & 0xFF]) ^ + ((uint32_t)FSb[(RK[3] >> 16) & 0xFF] << 8) ^ + ((uint32_t)FSb[(RK[3] >> 24) & 0xFF] << 16) ^ + ((uint32_t)FSb[(RK[3]) & 0xFF] << 24); + + RK[5] = RK[1] ^ RK[4]; + RK[6] = RK[2] ^ RK[5]; + RK[7] = RK[3] ^ RK[6]; + } + break; + + case 12: + + for (i = 0; i < 8; i++, RK += 6) { + RK[6] = RK[0] ^ RCON[i] ^ ((uint32_t)FSb[(RK[5] >> 8) & 0xFF]) ^ + ((uint32_t)FSb[(RK[5] >> 16) & 0xFF] << 8) ^ + ((uint32_t)FSb[(RK[5] >> 24) & 0xFF] << 16) ^ + ((uint32_t)FSb[(RK[5]) & 0xFF] << 24); + + RK[7] = RK[1] ^ RK[6]; + RK[8] = RK[2] ^ RK[7]; + RK[9] = RK[3] ^ RK[8]; + RK[10] = RK[4] ^ RK[9]; + 
RK[11] = RK[5] ^ RK[10]; + } + break; + + case 14: + + for (i = 0; i < 7; i++, RK += 8) { + RK[8] = RK[0] ^ RCON[i] ^ ((uint32_t)FSb[(RK[7] >> 8) & 0xFF]) ^ + ((uint32_t)FSb[(RK[7] >> 16) & 0xFF] << 8) ^ + ((uint32_t)FSb[(RK[7] >> 24) & 0xFF] << 16) ^ + ((uint32_t)FSb[(RK[7]) & 0xFF] << 24); + + RK[9] = RK[1] ^ RK[8]; + RK[10] = RK[2] ^ RK[9]; + RK[11] = RK[3] ^ RK[10]; + + RK[12] = RK[4] ^ ((uint32_t)FSb[(RK[11]) & 0xFF]) ^ + ((uint32_t)FSb[(RK[11] >> 8) & 0xFF] << 8) ^ + ((uint32_t)FSb[(RK[11] >> 16) & 0xFF] << 16) ^ + ((uint32_t)FSb[(RK[11] >> 24) & 0xFF] << 24); + + RK[13] = RK[5] ^ RK[12]; + RK[14] = RK[6] ^ RK[13]; + RK[15] = RK[7] ^ RK[14]; + } + break; + } + + return (0); +} +#endif /* !MBEDTLS_AES_SETKEY_ENC_ALT */ + +/* + * AES key schedule (decryption) + */ +#if !defined(MBEDTLS_AES_SETKEY_DEC_ALT) +int mbedtls_aes_setkey_dec(mbedtls_aes_context *ctx, const unsigned char *key, + unsigned int keybits) { + int i, j, ret; + mbedtls_aes_context cty; + uint32_t *RK; + uint32_t *SK; + + mbedtls_aes_init(&cty); + +#if defined(MBEDTLS_PADLOCK_C) && defined(MBEDTLS_PADLOCK_ALIGN16) + if (aes_padlock_ace == -1) + aes_padlock_ace = mbedtls_padlock_has_support(MBEDTLS_PADLOCK_ACE); + + if (aes_padlock_ace) + ctx->rk = RK = MBEDTLS_PADLOCK_ALIGN16(ctx->buf); + else +#endif + ctx->rk = RK = ctx->buf; + + /* Also checks keybits */ + if ((ret = mbedtls_aes_setkey_enc(&cty, key, keybits)) != 0) goto exit; + + ctx->nr = cty.nr; + +#if defined(MBEDTLS_AESNI_C) && defined(MBEDTLS_HAVE_X86_64) + if (mbedtls_aesni_has_support(MBEDTLS_AESNI_AES)) { + mbedtls_aesni_inverse_key((unsigned char *)ctx->rk, + (const unsigned char *)cty.rk, ctx->nr); + goto exit; + } +#endif + + SK = cty.rk + cty.nr * 4; + + *RK++ = *SK++; + *RK++ = *SK++; + *RK++ = *SK++; + *RK++ = *SK++; + + for (i = ctx->nr - 1, SK -= 8; i > 0; i--, SK -= 8) { + for (j = 0; j < 4; j++, SK++) { + *RK++ = RT0[FSb[(*SK) & 0xFF]] ^ RT1[FSb[(*SK >> 8) & 0xFF]] ^ + RT2[FSb[(*SK >> 16) & 0xFF]] ^ RT3[FSb[(*SK >> 24) & 0xFF]]; + } + } + + *RK++ = *SK++; + *RK++ = *SK++; + *RK++ = *SK++; + *RK++ = *SK++; + +exit: + mbedtls_aes_free(&cty); + + return (ret); +} +#endif /* !MBEDTLS_AES_SETKEY_DEC_ALT */ + +#define AES_FROUND(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \ + { \ + X0 = *RK++ ^ FT0[(Y0)&0xFF] ^ FT1[(Y1 >> 8) & 0xFF] ^ \ + FT2[(Y2 >> 16) & 0xFF] ^ FT3[(Y3 >> 24) & 0xFF]; \ + \ + X1 = *RK++ ^ FT0[(Y1)&0xFF] ^ FT1[(Y2 >> 8) & 0xFF] ^ \ + FT2[(Y3 >> 16) & 0xFF] ^ FT3[(Y0 >> 24) & 0xFF]; \ + \ + X2 = *RK++ ^ FT0[(Y2)&0xFF] ^ FT1[(Y3 >> 8) & 0xFF] ^ \ + FT2[(Y0 >> 16) & 0xFF] ^ FT3[(Y1 >> 24) & 0xFF]; \ + \ + X3 = *RK++ ^ FT0[(Y3)&0xFF] ^ FT1[(Y0 >> 8) & 0xFF] ^ \ + FT2[(Y1 >> 16) & 0xFF] ^ FT3[(Y2 >> 24) & 0xFF]; \ + } + +#define AES_RROUND(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \ + { \ + X0 = *RK++ ^ RT0[(Y0)&0xFF] ^ RT1[(Y3 >> 8) & 0xFF] ^ \ + RT2[(Y2 >> 16) & 0xFF] ^ RT3[(Y1 >> 24) & 0xFF]; \ + \ + X1 = *RK++ ^ RT0[(Y1)&0xFF] ^ RT1[(Y0 >> 8) & 0xFF] ^ \ + RT2[(Y3 >> 16) & 0xFF] ^ RT3[(Y2 >> 24) & 0xFF]; \ + \ + X2 = *RK++ ^ RT0[(Y2)&0xFF] ^ RT1[(Y1 >> 8) & 0xFF] ^ \ + RT2[(Y0 >> 16) & 0xFF] ^ RT3[(Y3 >> 24) & 0xFF]; \ + \ + X3 = *RK++ ^ RT0[(Y3)&0xFF] ^ RT1[(Y2 >> 8) & 0xFF] ^ \ + RT2[(Y1 >> 16) & 0xFF] ^ RT3[(Y0 >> 24) & 0xFF]; \ + } + +/* + * AES-ECB block encryption + */ +#if !defined(MBEDTLS_AES_ENCRYPT_ALT) +int mbedtls_internal_aes_encrypt(mbedtls_aes_context *ctx, + const unsigned char input[16], + unsigned char output[16]) { + int i; + uint32_t *RK, X0, X1, X2, X3, Y0, Y1, Y2, Y3; + + RK = ctx->rk; + + GET_UINT32_LE(X0, input, 0); + X0 ^= *RK++; + 
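+    /* The other three input words below are loaded and whitened the same
+     * way (the initial AddRoundKey). The loop that follows then runs the
+     * middle rounds two at a time with AES_FROUND, and the final round
+     * applies the forward S-box FSb directly instead of the combined FT
+     * tables. */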
GET_UINT32_LE(X1, input, 4); + X1 ^= *RK++; + GET_UINT32_LE(X2, input, 8); + X2 ^= *RK++; + GET_UINT32_LE(X3, input, 12); + X3 ^= *RK++; + + for (i = (ctx->nr >> 1) - 1; i > 0; i--) { + AES_FROUND(Y0, Y1, Y2, Y3, X0, X1, X2, X3); + AES_FROUND(X0, X1, X2, X3, Y0, Y1, Y2, Y3); + } + + AES_FROUND(Y0, Y1, Y2, Y3, X0, X1, X2, X3); + + X0 = *RK++ ^ ((uint32_t)FSb[(Y0)&0xFF]) ^ + ((uint32_t)FSb[(Y1 >> 8) & 0xFF] << 8) ^ + ((uint32_t)FSb[(Y2 >> 16) & 0xFF] << 16) ^ + ((uint32_t)FSb[(Y3 >> 24) & 0xFF] << 24); + + X1 = *RK++ ^ ((uint32_t)FSb[(Y1)&0xFF]) ^ + ((uint32_t)FSb[(Y2 >> 8) & 0xFF] << 8) ^ + ((uint32_t)FSb[(Y3 >> 16) & 0xFF] << 16) ^ + ((uint32_t)FSb[(Y0 >> 24) & 0xFF] << 24); + + X2 = *RK++ ^ ((uint32_t)FSb[(Y2)&0xFF]) ^ + ((uint32_t)FSb[(Y3 >> 8) & 0xFF] << 8) ^ + ((uint32_t)FSb[(Y0 >> 16) & 0xFF] << 16) ^ + ((uint32_t)FSb[(Y1 >> 24) & 0xFF] << 24); + + X3 = *RK++ ^ ((uint32_t)FSb[(Y3)&0xFF]) ^ + ((uint32_t)FSb[(Y0 >> 8) & 0xFF] << 8) ^ + ((uint32_t)FSb[(Y1 >> 16) & 0xFF] << 16) ^ + ((uint32_t)FSb[(Y2 >> 24) & 0xFF] << 24); + + PUT_UINT32_LE(X0, output, 0); + PUT_UINT32_LE(X1, output, 4); + PUT_UINT32_LE(X2, output, 8); + PUT_UINT32_LE(X3, output, 12); + + return (0); +} +#endif /* !MBEDTLS_AES_ENCRYPT_ALT */ + +/* + * AES-ECB block decryption + */ +#if !defined(MBEDTLS_AES_DECRYPT_ALT) +int mbedtls_internal_aes_decrypt(mbedtls_aes_context *ctx, + const unsigned char input[16], + unsigned char output[16]) { + int i; + uint32_t *RK, X0, X1, X2, X3, Y0, Y1, Y2, Y3; + + RK = ctx->rk; + + GET_UINT32_LE(X0, input, 0); + X0 ^= *RK++; + GET_UINT32_LE(X1, input, 4); + X1 ^= *RK++; + GET_UINT32_LE(X2, input, 8); + X2 ^= *RK++; + GET_UINT32_LE(X3, input, 12); + X3 ^= *RK++; + + for (i = (ctx->nr >> 1) - 1; i > 0; i--) { + AES_RROUND(Y0, Y1, Y2, Y3, X0, X1, X2, X3); + AES_RROUND(X0, X1, X2, X3, Y0, Y1, Y2, Y3); + } + + AES_RROUND(Y0, Y1, Y2, Y3, X0, X1, X2, X3); + + X0 = *RK++ ^ ((uint32_t)RSb[(Y0)&0xFF]) ^ + ((uint32_t)RSb[(Y3 >> 8) & 0xFF] << 8) ^ + ((uint32_t)RSb[(Y2 >> 16) & 0xFF] << 16) ^ + ((uint32_t)RSb[(Y1 >> 24) & 0xFF] << 24); + + X1 = *RK++ ^ ((uint32_t)RSb[(Y1)&0xFF]) ^ + ((uint32_t)RSb[(Y0 >> 8) & 0xFF] << 8) ^ + ((uint32_t)RSb[(Y3 >> 16) & 0xFF] << 16) ^ + ((uint32_t)RSb[(Y2 >> 24) & 0xFF] << 24); + + X2 = *RK++ ^ ((uint32_t)RSb[(Y2)&0xFF]) ^ + ((uint32_t)RSb[(Y1 >> 8) & 0xFF] << 8) ^ + ((uint32_t)RSb[(Y0 >> 16) & 0xFF] << 16) ^ + ((uint32_t)RSb[(Y3 >> 24) & 0xFF] << 24); + + X3 = *RK++ ^ ((uint32_t)RSb[(Y3)&0xFF]) ^ + ((uint32_t)RSb[(Y2 >> 8) & 0xFF] << 8) ^ + ((uint32_t)RSb[(Y1 >> 16) & 0xFF] << 16) ^ + ((uint32_t)RSb[(Y0 >> 24) & 0xFF] << 24); + + PUT_UINT32_LE(X0, output, 0); + PUT_UINT32_LE(X1, output, 4); + PUT_UINT32_LE(X2, output, 8); + PUT_UINT32_LE(X3, output, 12); + + return (0); +} +#endif /* !MBEDTLS_AES_DECRYPT_ALT */ + +/* + * AES-ECB block encryption/decryption + */ +int mbedtls_aes_crypt_ecb(mbedtls_aes_context *ctx, int mode, + const unsigned char input[16], + unsigned char output[16]) { +#if defined(MBEDTLS_AESNI_C) && defined(MBEDTLS_HAVE_X86_64) + if (mbedtls_aesni_has_support(MBEDTLS_AESNI_AES)) + return (mbedtls_aesni_crypt_ecb(ctx, mode, input, output)); +#endif + +#if defined(MBEDTLS_PADLOCK_C) && defined(MBEDTLS_HAVE_X86) + if (aes_padlock_ace) { + if (mbedtls_padlock_xcryptecb(ctx, mode, input, output) == 0) + return (0); + + // If padlock data misaligned, we just fall back to + // unaccelerated mode + // + } +#endif + + if (mode == MBEDTLS_AES_ENCRYPT) + return (mbedtls_internal_aes_encrypt(ctx, input, output)); + else + return 
(mbedtls_internal_aes_decrypt(ctx, input, output)); +} + +#if defined(MBEDTLS_CIPHER_MODE_CBC) +/* + * AES-CBC buffer encryption/decryption + */ +int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx, int mode, size_t length, + unsigned char iv[16], const unsigned char *input, + unsigned char *output) { + int i; + unsigned char temp[16]; + + if (length % 16) return (MBEDTLS_ERR_AES_INVALID_INPUT_LENGTH); + +#if defined(MBEDTLS_PADLOCK_C) && defined(MBEDTLS_HAVE_X86) + if (aes_padlock_ace) { + if (mbedtls_padlock_xcryptcbc(ctx, mode, length, iv, input, output) == + 0) + return (0); + + // If padlock data misaligned, we just fall back to + // unaccelerated mode + // + } +#endif + + if (mode == MBEDTLS_AES_DECRYPT) { + while (length > 0) { + memcpy(temp, input, 16); + mbedtls_aes_crypt_ecb(ctx, mode, input, output); + + for (i = 0; i < 16; i++) + output[i] = (unsigned char)(output[i] ^ iv[i]); + + memcpy(iv, temp, 16); + + input += 16; + output += 16; + length -= 16; + } + } else { + while (length > 0) { + for (i = 0; i < 16; i++) + output[i] = (unsigned char)(input[i] ^ iv[i]); + + mbedtls_aes_crypt_ecb(ctx, mode, output, output); + memcpy(iv, output, 16); + + input += 16; + output += 16; + length -= 16; + } + } + + return (0); +} +#endif /* MBEDTLS_CIPHER_MODE_CBC */ + +#if defined(MBEDTLS_CIPHER_MODE_CFB) +/* + * AES-CFB128 buffer encryption/decryption + */ +int mbedtls_aes_crypt_cfb128(mbedtls_aes_context *ctx, int mode, size_t length, + size_t *iv_off, unsigned char iv[16], + const unsigned char *input, + unsigned char *output) { + int c; + size_t n = *iv_off; + + if (mode == MBEDTLS_AES_DECRYPT) { + while (length--) { + if (n == 0) mbedtls_aes_crypt_ecb(ctx, MBEDTLS_AES_ENCRYPT, iv, iv); + + c = *input++; + *output++ = (unsigned char)(c ^ iv[n]); + iv[n] = (unsigned char)c; + + n = (n + 1) & 0x0F; + } + } else { + while (length--) { + if (n == 0) mbedtls_aes_crypt_ecb(ctx, MBEDTLS_AES_ENCRYPT, iv, iv); + + iv[n] = *output++ = (unsigned char)(iv[n] ^ *input++); + + n = (n + 1) & 0x0F; + } + } + + *iv_off = n; + + return (0); +} + +/* + * AES-CFB8 buffer encryption/decryption + */ +int mbedtls_aes_crypt_cfb8(mbedtls_aes_context *ctx, int mode, size_t length, + unsigned char iv[16], const unsigned char *input, + unsigned char *output) { + unsigned char c; + unsigned char ov[17]; + + while (length--) { + memcpy(ov, iv, 16); + mbedtls_aes_crypt_ecb(ctx, MBEDTLS_AES_ENCRYPT, iv, iv); + + if (mode == MBEDTLS_AES_DECRYPT) ov[16] = *input; + + c = *output++ = (unsigned char)(iv[0] ^ *input++); + + if (mode == MBEDTLS_AES_ENCRYPT) ov[16] = c; + + memcpy(iv, ov + 1, 16); + } + + return (0); +} +#endif /*MBEDTLS_CIPHER_MODE_CFB */ + +#if defined(MBEDTLS_CIPHER_MODE_CTR) +/* + * AES-CTR buffer encryption/decryption + */ +int mbedtls_aes_crypt_ctr(mbedtls_aes_context *ctx, size_t length, + size_t *nc_off, unsigned char nonce_counter[16], + unsigned char stream_block[16], + const unsigned char *input, unsigned char *output) { + int c, i; + size_t n = *nc_off; + + while (length--) { + if (n == 0) { + mbedtls_aes_crypt_ecb(ctx, MBEDTLS_AES_ENCRYPT, nonce_counter, + stream_block); + + for (i = 16; i > 0; i--) + if (++nonce_counter[i - 1] != 0) break; + } + c = *input++; + *output++ = (unsigned char)(c ^ stream_block[n]); + + n = (n + 1) & 0x0F; + } + + *nc_off = n; + + return (0); +} +#endif /* MBEDTLS_CIPHER_MODE_CTR */ + +#endif /* !MBEDTLS_AES_ALT */ + +#if defined(MBEDTLS_SELF_TEST) +/* + * AES test vectors from: + * + * http://csrc.nist.gov/archive/aes/rijndael/rijndael-vals.zip + */ +static const 
unsigned char aes_test_ecb_dec[3][16] = { + {0x44, 0x41, 0x6A, 0xC2, 0xD1, 0xF5, 0x3C, 0x58, 0x33, 0x03, 0x91, 0x7E, + 0x6B, 0xE9, 0xEB, 0xE0}, + {0x48, 0xE3, 0x1E, 0x9E, 0x25, 0x67, 0x18, 0xF2, 0x92, 0x29, 0x31, 0x9C, + 0x19, 0xF1, 0x5B, 0xA4}, + {0x05, 0x8C, 0xCF, 0xFD, 0xBB, 0xCB, 0x38, 0x2D, 0x1F, 0x6F, 0x56, 0x58, + 0x5D, 0x8A, 0x4A, 0xDE}}; + +static const unsigned char aes_test_ecb_enc[3][16] = { + {0xC3, 0x4C, 0x05, 0x2C, 0xC0, 0xDA, 0x8D, 0x73, 0x45, 0x1A, 0xFE, 0x5F, + 0x03, 0xBE, 0x29, 0x7F}, + {0xF3, 0xF6, 0x75, 0x2A, 0xE8, 0xD7, 0x83, 0x11, 0x38, 0xF0, 0x41, 0x56, + 0x06, 0x31, 0xB1, 0x14}, + {0x8B, 0x79, 0xEE, 0xCC, 0x93, 0xA0, 0xEE, 0x5D, 0xFF, 0x30, 0xB4, 0xEA, + 0x21, 0x63, 0x6D, 0xA4}}; + +#if defined(MBEDTLS_CIPHER_MODE_CBC) +static const unsigned char aes_test_cbc_dec[3][16] = { + {0xFA, 0xCA, 0x37, 0xE0, 0xB0, 0xC8, 0x53, 0x73, 0xDF, 0x70, 0x6E, 0x73, + 0xF7, 0xC9, 0xAF, 0x86}, + {0x5D, 0xF6, 0x78, 0xDD, 0x17, 0xBA, 0x4E, 0x75, 0xB6, 0x17, 0x68, 0xC6, + 0xAD, 0xEF, 0x7C, 0x7B}, + {0x48, 0x04, 0xE1, 0x81, 0x8F, 0xE6, 0x29, 0x75, 0x19, 0xA3, 0xE8, 0x8C, + 0x57, 0x31, 0x04, 0x13}}; + +static const unsigned char aes_test_cbc_enc[3][16] = { + {0x8A, 0x05, 0xFC, 0x5E, 0x09, 0x5A, 0xF4, 0x84, 0x8A, 0x08, 0xD3, 0x28, + 0xD3, 0x68, 0x8E, 0x3D}, + {0x7B, 0xD9, 0x66, 0xD5, 0x3A, 0xD8, 0xC1, 0xBB, 0x85, 0xD2, 0xAD, 0xFA, + 0xE8, 0x7B, 0xB1, 0x04}, + {0xFE, 0x3C, 0x53, 0x65, 0x3E, 0x2F, 0x45, 0xB5, 0x6F, 0xCD, 0x88, 0xB2, + 0xCC, 0x89, 0x8F, 0xF0}}; +#endif /* MBEDTLS_CIPHER_MODE_CBC */ + +#if defined(MBEDTLS_CIPHER_MODE_CFB) +/* + * AES-CFB128 test vectors from: + * + * http://csrc.nist.gov/publications/nistpubs/800-38a/sp800-38a.pdf + */ +static const unsigned char aes_test_cfb128_key[3][32] = { + {0x2B, 0x7E, 0x15, 0x16, 0x28, 0xAE, 0xD2, 0xA6, 0xAB, 0xF7, 0x15, 0x88, + 0x09, 0xCF, 0x4F, 0x3C}, + {0x8E, 0x73, 0xB0, 0xF7, 0xDA, 0x0E, 0x64, 0x52, 0xC8, 0x10, 0xF3, 0x2B, + 0x80, 0x90, 0x79, 0xE5, 0x62, 0xF8, 0xEA, 0xD2, 0x52, 0x2C, 0x6B, 0x7B}, + {0x60, 0x3D, 0xEB, 0x10, 0x15, 0xCA, 0x71, 0xBE, 0x2B, 0x73, 0xAE, + 0xF0, 0x85, 0x7D, 0x77, 0x81, 0x1F, 0x35, 0x2C, 0x07, 0x3B, 0x61, + 0x08, 0xD7, 0x2D, 0x98, 0x10, 0xA3, 0x09, 0x14, 0xDF, 0xF4}}; + +static const unsigned char aes_test_cfb128_iv[16] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F}; + +static const unsigned char aes_test_cfb128_pt[64] = { + 0x6B, 0xC1, 0xBE, 0xE2, 0x2E, 0x40, 0x9F, 0x96, 0xE9, 0x3D, 0x7E, + 0x11, 0x73, 0x93, 0x17, 0x2A, 0xAE, 0x2D, 0x8A, 0x57, 0x1E, 0x03, + 0xAC, 0x9C, 0x9E, 0xB7, 0x6F, 0xAC, 0x45, 0xAF, 0x8E, 0x51, 0x30, + 0xC8, 0x1C, 0x46, 0xA3, 0x5C, 0xE4, 0x11, 0xE5, 0xFB, 0xC1, 0x19, + 0x1A, 0x0A, 0x52, 0xEF, 0xF6, 0x9F, 0x24, 0x45, 0xDF, 0x4F, 0x9B, + 0x17, 0xAD, 0x2B, 0x41, 0x7B, 0xE6, 0x6C, 0x37, 0x10}; + +static const unsigned char aes_test_cfb128_ct[3][64] = { + {0x3B, 0x3F, 0xD9, 0x2E, 0xB7, 0x2D, 0xAD, 0x20, 0x33, 0x34, 0x49, + 0xF8, 0xE8, 0x3C, 0xFB, 0x4A, 0xC8, 0xA6, 0x45, 0x37, 0xA0, 0xB3, + 0xA9, 0x3F, 0xCD, 0xE3, 0xCD, 0xAD, 0x9F, 0x1C, 0xE5, 0x8B, 0x26, + 0x75, 0x1F, 0x67, 0xA3, 0xCB, 0xB1, 0x40, 0xB1, 0x80, 0x8C, 0xF1, + 0x87, 0xA4, 0xF4, 0xDF, 0xC0, 0x4B, 0x05, 0x35, 0x7C, 0x5D, 0x1C, + 0x0E, 0xEA, 0xC4, 0xC6, 0x6F, 0x9F, 0xF7, 0xF2, 0xE6}, + {0xCD, 0xC8, 0x0D, 0x6F, 0xDD, 0xF1, 0x8C, 0xAB, 0x34, 0xC2, 0x59, + 0x09, 0xC9, 0x9A, 0x41, 0x74, 0x67, 0xCE, 0x7F, 0x7F, 0x81, 0x17, + 0x36, 0x21, 0x96, 0x1A, 0x2B, 0x70, 0x17, 0x1D, 0x3D, 0x7A, 0x2E, + 0x1E, 0x8A, 0x1D, 0xD5, 0x9B, 0x88, 0xB1, 0xC8, 0xE6, 0x0F, 0xED, + 0x1E, 0xFA, 0xC4, 
0xC9, 0xC0, 0x5F, 0x9F, 0x9C, 0xA9, 0x83, 0x4F, + 0xA0, 0x42, 0xAE, 0x8F, 0xBA, 0x58, 0x4B, 0x09, 0xFF}, + {0xDC, 0x7E, 0x84, 0xBF, 0xDA, 0x79, 0x16, 0x4B, 0x7E, 0xCD, 0x84, + 0x86, 0x98, 0x5D, 0x38, 0x60, 0x39, 0xFF, 0xED, 0x14, 0x3B, 0x28, + 0xB1, 0xC8, 0x32, 0x11, 0x3C, 0x63, 0x31, 0xE5, 0x40, 0x7B, 0xDF, + 0x10, 0x13, 0x24, 0x15, 0xE5, 0x4B, 0x92, 0xA1, 0x3E, 0xD0, 0xA8, + 0x26, 0x7A, 0xE2, 0xF9, 0x75, 0xA3, 0x85, 0x74, 0x1A, 0xB9, 0xCE, + 0xF8, 0x20, 0x31, 0x62, 0x3D, 0x55, 0xB1, 0xE4, 0x71}}; +#endif /* MBEDTLS_CIPHER_MODE_CFB */ + +#if defined(MBEDTLS_CIPHER_MODE_CTR) +/* + * AES-CTR test vectors from: + * + * http://www.faqs.org/rfcs/rfc3686.html + */ + +static const unsigned char aes_test_ctr_key[3][16] = { + {0xAE, 0x68, 0x52, 0xF8, 0x12, 0x10, 0x67, 0xCC, 0x4B, 0xF7, 0xA5, 0x76, + 0x55, 0x77, 0xF3, 0x9E}, + {0x7E, 0x24, 0x06, 0x78, 0x17, 0xFA, 0xE0, 0xD7, 0x43, 0xD6, 0xCE, 0x1F, + 0x32, 0x53, 0x91, 0x63}, + {0x76, 0x91, 0xBE, 0x03, 0x5E, 0x50, 0x20, 0xA8, 0xAC, 0x6E, 0x61, 0x85, + 0x29, 0xF9, 0xA0, 0xDC}}; + +static const unsigned char aes_test_ctr_nonce_counter[3][16] = { + {0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01}, + {0x00, 0x6C, 0xB6, 0xDB, 0xC0, 0x54, 0x3B, 0x59, 0xDA, 0x48, 0xD9, 0x0B, + 0x00, 0x00, 0x00, 0x01}, + {0x00, 0xE0, 0x01, 0x7B, 0x27, 0x77, 0x7F, 0x3F, 0x4A, 0x17, 0x86, 0xF0, + 0x00, 0x00, 0x00, 0x01}}; + +static const unsigned char aes_test_ctr_pt[3][48] = { + {0x53, 0x69, 0x6E, 0x67, 0x6C, 0x65, 0x20, 0x62, 0x6C, 0x6F, 0x63, 0x6B, + 0x20, 0x6D, 0x73, 0x67}, + + {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, + 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, + 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F}, + + {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, + 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23}}; + +static const unsigned char aes_test_ctr_ct[3][48] = { + {0xE4, 0x09, 0x5D, 0x4F, 0xB7, 0xA7, 0xB3, 0x79, 0x2D, 0x61, 0x75, 0xA3, + 0x26, 0x13, 0x11, 0xB8}, + {0x51, 0x04, 0xA1, 0x06, 0x16, 0x8A, 0x72, 0xD9, 0x79, 0x0D, 0x41, + 0xEE, 0x8E, 0xDA, 0xD3, 0x88, 0xEB, 0x2E, 0x1E, 0xFC, 0x46, 0xDA, + 0x57, 0xC8, 0xFC, 0xE6, 0x30, 0xDF, 0x91, 0x41, 0xBE, 0x28}, + {0xC1, 0xCF, 0x48, 0xA8, 0x9F, 0x2F, 0xFD, 0xD9, 0xCF, 0x46, 0x52, 0xE9, + 0xEF, 0xDB, 0x72, 0xD7, 0x45, 0x40, 0xA4, 0x2B, 0xDE, 0x6D, 0x78, 0x36, + 0xD5, 0x9A, 0x5C, 0xEA, 0xAE, 0xF3, 0x10, 0x53, 0x25, 0xB2, 0x07, 0x2F}}; + +static const int aes_test_ctr_len[3] = {16, 32, 36}; +#endif /* MBEDTLS_CIPHER_MODE_CTR */ + +/* + * Checkup routine + */ +int mbedtls_aes_self_test(int verbose) { + int ret = 0, i, j, u, v; + unsigned char key[32]; + unsigned char buf[64]; +#if defined(MBEDTLS_CIPHER_MODE_CBC) || defined(MBEDTLS_CIPHER_MODE_CFB) + unsigned char iv[16]; +#endif +#if defined(MBEDTLS_CIPHER_MODE_CBC) + unsigned char prv[16]; +#endif +#if defined(MBEDTLS_CIPHER_MODE_CTR) || defined(MBEDTLS_CIPHER_MODE_CFB) + size_t offset; +#endif +#if defined(MBEDTLS_CIPHER_MODE_CTR) + int len; + unsigned char nonce_counter[16]; + unsigned char stream_block[16]; +#endif + mbedtls_aes_context ctx; + + memset(key, 0, 32); + mbedtls_aes_init(&ctx); + + /* + * ECB mode + */ + for (i = 0; i < 6; i++) { + u = i >> 1; + v = i & 1; + + if (verbose != 0) + mbedtls_printf(" AES-ECB-%3d (%s): ", 128 + u * 64, + (v == MBEDTLS_AES_DECRYPT) ? 
"dec" : "enc"); + + memset(buf, 0, 16); + + if (v == MBEDTLS_AES_DECRYPT) { + mbedtls_aes_setkey_dec(&ctx, key, 128 + u * 64); + + for (j = 0; j < 10000; j++) + mbedtls_aes_crypt_ecb(&ctx, v, buf, buf); + + if (memcmp(buf, aes_test_ecb_dec[u], 16) != 0) { + if (verbose != 0) mbedtls_printf("failed\n"); + + ret = 1; + goto exit; + } + } else { + mbedtls_aes_setkey_enc(&ctx, key, 128 + u * 64); + + for (j = 0; j < 10000; j++) + mbedtls_aes_crypt_ecb(&ctx, v, buf, buf); + + if (memcmp(buf, aes_test_ecb_enc[u], 16) != 0) { + if (verbose != 0) mbedtls_printf("failed\n"); + + ret = 1; + goto exit; + } + } + + if (verbose != 0) mbedtls_printf("passed\n"); + } + + if (verbose != 0) mbedtls_printf("\n"); + +#if defined(MBEDTLS_CIPHER_MODE_CBC) + /* + * CBC mode + */ + for (i = 0; i < 6; i++) { + u = i >> 1; + v = i & 1; + + if (verbose != 0) + mbedtls_printf(" AES-CBC-%3d (%s): ", 128 + u * 64, + (v == MBEDTLS_AES_DECRYPT) ? "dec" : "enc"); + + memset(iv, 0, 16); + memset(prv, 0, 16); + memset(buf, 0, 16); + + if (v == MBEDTLS_AES_DECRYPT) { + mbedtls_aes_setkey_dec(&ctx, key, 128 + u * 64); + + for (j = 0; j < 10000; j++) + mbedtls_aes_crypt_cbc(&ctx, v, 16, iv, buf, buf); + + if (memcmp(buf, aes_test_cbc_dec[u], 16) != 0) { + if (verbose != 0) mbedtls_printf("failed\n"); + + ret = 1; + goto exit; + } + } else { + mbedtls_aes_setkey_enc(&ctx, key, 128 + u * 64); + + for (j = 0; j < 10000; j++) { + unsigned char tmp[16]; + + mbedtls_aes_crypt_cbc(&ctx, v, 16, iv, buf, buf); + + memcpy(tmp, prv, 16); + memcpy(prv, buf, 16); + memcpy(buf, tmp, 16); + } + + if (memcmp(prv, aes_test_cbc_enc[u], 16) != 0) { + if (verbose != 0) mbedtls_printf("failed\n"); + + ret = 1; + goto exit; + } + } + + if (verbose != 0) mbedtls_printf("passed\n"); + } + + if (verbose != 0) mbedtls_printf("\n"); +#endif /* MBEDTLS_CIPHER_MODE_CBC */ + +#if defined(MBEDTLS_CIPHER_MODE_CFB) + /* + * CFB128 mode + */ + for (i = 0; i < 6; i++) { + u = i >> 1; + v = i & 1; + + if (verbose != 0) + mbedtls_printf(" AES-CFB128-%3d (%s): ", 128 + u * 64, + (v == MBEDTLS_AES_DECRYPT) ? "dec" : "enc"); + + memcpy(iv, aes_test_cfb128_iv, 16); + memcpy(key, aes_test_cfb128_key[u], 16 + u * 8); + + offset = 0; + mbedtls_aes_setkey_enc(&ctx, key, 128 + u * 64); + + if (v == MBEDTLS_AES_DECRYPT) { + memcpy(buf, aes_test_cfb128_ct[u], 64); + mbedtls_aes_crypt_cfb128(&ctx, v, 64, &offset, iv, buf, buf); + + if (memcmp(buf, aes_test_cfb128_pt, 64) != 0) { + if (verbose != 0) mbedtls_printf("failed\n"); + + ret = 1; + goto exit; + } + } else { + memcpy(buf, aes_test_cfb128_pt, 64); + mbedtls_aes_crypt_cfb128(&ctx, v, 64, &offset, iv, buf, buf); + + if (memcmp(buf, aes_test_cfb128_ct[u], 64) != 0) { + if (verbose != 0) mbedtls_printf("failed\n"); + + ret = 1; + goto exit; + } + } + + if (verbose != 0) mbedtls_printf("passed\n"); + } + + if (verbose != 0) mbedtls_printf("\n"); +#endif /* MBEDTLS_CIPHER_MODE_CFB */ + +#if defined(MBEDTLS_CIPHER_MODE_CTR) + /* + * CTR mode + */ + for (i = 0; i < 6; i++) { + u = i >> 1; + v = i & 1; + + if (verbose != 0) + mbedtls_printf(" AES-CTR-128 (%s): ", + (v == MBEDTLS_AES_DECRYPT) ? 
"dec" : "enc"); + + memcpy(nonce_counter, aes_test_ctr_nonce_counter[u], 16); + memcpy(key, aes_test_ctr_key[u], 16); + + offset = 0; + mbedtls_aes_setkey_enc(&ctx, key, 128); + + if (v == MBEDTLS_AES_DECRYPT) { + len = aes_test_ctr_len[u]; + memcpy(buf, aes_test_ctr_ct[u], len); + + mbedtls_aes_crypt_ctr(&ctx, len, &offset, nonce_counter, + stream_block, buf, buf); + + if (memcmp(buf, aes_test_ctr_pt[u], len) != 0) { + if (verbose != 0) mbedtls_printf("failed\n"); + + ret = 1; + goto exit; + } + } else { + len = aes_test_ctr_len[u]; + memcpy(buf, aes_test_ctr_pt[u], len); + + mbedtls_aes_crypt_ctr(&ctx, len, &offset, nonce_counter, + stream_block, buf, buf); + + if (memcmp(buf, aes_test_ctr_ct[u], len) != 0) { + if (verbose != 0) mbedtls_printf("failed\n"); + + ret = 1; + goto exit; + } + } + + if (verbose != 0) mbedtls_printf("passed\n"); + } + + if (verbose != 0) mbedtls_printf("\n"); +#endif /* MBEDTLS_CIPHER_MODE_CTR */ + + ret = 0; + +exit: + mbedtls_aes_free(&ctx); + + return (ret); +} + +#endif /* MBEDTLS_SELF_TEST */ + +#endif /* MBEDTLS_AES_C */ diff --git a/lite/src/decryption/mbedtls/aes.h b/lite/src/decryption/mbedtls/aes.h new file mode 100644 index 0000000000000000000000000000000000000000..1e47c48cb2192de1078247cab19d17d7d7032a6a --- /dev/null +++ b/lite/src/decryption/mbedtls/aes.h @@ -0,0 +1,349 @@ +/** + * \file aes.h + * + * \brief AES block cipher + * + * Copyright (C) 2006-2015, ARM Limited, All Rights Reserved + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is part of mbed TLS (https://tls.mbed.org) + */ + +/** + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#ifndef MBEDTLS_AES_H +#define MBEDTLS_AES_H + +#if !defined(MBEDTLS_CONFIG_FILE) +#include "config.h" +#else +#include MBEDTLS_CONFIG_FILE +#endif + +#include +#include + +/* padlock.c and aesni.c rely on these values! */ +#define MBEDTLS_AES_ENCRYPT 1 +#define MBEDTLS_AES_DECRYPT 0 + +#define MBEDTLS_ERR_AES_INVALID_KEY_LENGTH -0x0020 /**< Invalid key length. */ +#define MBEDTLS_ERR_AES_INVALID_INPUT_LENGTH \ + -0x0022 /**< Invalid data input length. 
*/ + +#if (defined(__ARMCC_VERSION) || defined(_MSC_VER)) && !defined(inline) && \ + !defined(__cplusplus) +#define inline __inline +#endif + +#if !defined(MBEDTLS_AES_ALT) +// Regular implementation +// + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * \brief AES context structure + * + * \note buf is able to hold 32 extra bytes, which can be used: + * - for alignment purposes if VIA padlock is used, and/or + * - to simplify key expansion in the 256-bit case by + * generating an extra round key + */ +typedef struct { + int nr; /*!< number of rounds */ + uint32_t* rk; /*!< AES round keys */ + uint32_t buf[68]; /*!< unaligned data */ +} mbedtls_aes_context; + +/** + * \brief Initialize AES context + * + * \param ctx AES context to be initialized + */ +void mbedtls_aes_init(mbedtls_aes_context* ctx); + +/** + * \brief Clear AES context + * + * \param ctx AES context to be cleared + */ +void mbedtls_aes_free(mbedtls_aes_context* ctx); + +/** + * \brief AES key schedule (encryption) + * + * \param ctx AES context to be initialized + * \param key encryption key + * \param keybits must be 128, 192 or 256 + * + * \return 0 if successful, or MBEDTLS_ERR_AES_INVALID_KEY_LENGTH + */ +int mbedtls_aes_setkey_enc(mbedtls_aes_context* ctx, const unsigned char* key, + unsigned int keybits); + +/** + * \brief AES key schedule (decryption) + * + * \param ctx AES context to be initialized + * \param key decryption key + * \param keybits must be 128, 192 or 256 + * + * \return 0 if successful, or MBEDTLS_ERR_AES_INVALID_KEY_LENGTH + */ +int mbedtls_aes_setkey_dec(mbedtls_aes_context* ctx, const unsigned char* key, + unsigned int keybits); + +/** + * \brief AES-ECB block encryption/decryption + * + * \param ctx AES context + * \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT + * \param input 16-byte input block + * \param output 16-byte output block + * + * \return 0 if successful + */ +int mbedtls_aes_crypt_ecb(mbedtls_aes_context* ctx, int mode, + const unsigned char input[16], + unsigned char output[16]); + +#if defined(MBEDTLS_CIPHER_MODE_CBC) +/** + * \brief AES-CBC buffer encryption/decryption + * Length should be a multiple of the block + * size (16 bytes) + * + * \note Upon exit, the content of the IV is updated so that you can + * call the function same function again on the following + * block(s) of data and get the same result as if it was + * encrypted in one call. This allows a "streaming" usage. + * If on the other hand you need to retain the contents of the + * IV, you should either save it manually or use the cipher + * module instead. + * + * \param ctx AES context + * \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT + * \param length length of the input data + * \param iv initialization vector (updated after use) + * \param input buffer holding the input data + * \param output buffer holding the output data + * + * \return 0 if successful, or MBEDTLS_ERR_AES_INVALID_INPUT_LENGTH + */ +int mbedtls_aes_crypt_cbc(mbedtls_aes_context* ctx, int mode, size_t length, + unsigned char iv[16], const unsigned char* input, + unsigned char* output); +#endif /* MBEDTLS_CIPHER_MODE_CBC */ + +#if defined(MBEDTLS_CIPHER_MODE_CFB) +/** + * \brief AES-CFB128 buffer encryption/decryption. + * + * Note: Due to the nature of CFB you should use the same key schedule for + * both encryption and decryption. So a context initialized with + * mbedtls_aes_setkey_enc() for both MBEDTLS_AES_ENCRYPT and + * MBEDTLS_AES_DECRYPT. 
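+ * (That is, do not call mbedtls_aes_setkey_dec() for CFB; the inverse key
+ * schedule it builds is only used by the ECB and CBC decryption paths.)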
+ * + * \note Upon exit, the content of the IV is updated so that you can + * call the function same function again on the following + * block(s) of data and get the same result as if it was + * encrypted in one call. This allows a "streaming" usage. + * If on the other hand you need to retain the contents of the + * IV, you should either save it manually or use the cipher + * module instead. + * + * \param ctx AES context + * \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT + * \param length length of the input data + * \param iv_off offset in IV (updated after use) + * \param iv initialization vector (updated after use) + * \param input buffer holding the input data + * \param output buffer holding the output data + * + * \return 0 if successful + */ +int mbedtls_aes_crypt_cfb128(mbedtls_aes_context* ctx, int mode, size_t length, + size_t* iv_off, unsigned char iv[16], + const unsigned char* input, unsigned char* output); + +/** + * \brief AES-CFB8 buffer encryption/decryption. + * + * Note: Due to the nature of CFB you should use the same key schedule for + * both encryption and decryption. So a context initialized with + * mbedtls_aes_setkey_enc() for both MBEDTLS_AES_ENCRYPT and + * MBEDTLS_AES_DECRYPT. + * + * \note Upon exit, the content of the IV is updated so that you can + * call the function same function again on the following + * block(s) of data and get the same result as if it was + * encrypted in one call. This allows a "streaming" usage. + * If on the other hand you need to retain the contents of the + * IV, you should either save it manually or use the cipher + * module instead. + * + * \param ctx AES context + * \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT + * \param length length of the input data + * \param iv initialization vector (updated after use) + * \param input buffer holding the input data + * \param output buffer holding the output data + * + * \return 0 if successful + */ +int mbedtls_aes_crypt_cfb8(mbedtls_aes_context* ctx, int mode, size_t length, + unsigned char iv[16], const unsigned char* input, + unsigned char* output); +#endif /*MBEDTLS_CIPHER_MODE_CFB */ + +#if defined(MBEDTLS_CIPHER_MODE_CTR) +/** + * \brief AES-CTR buffer encryption/decryption + * + * Warning: You have to keep the maximum use of your counter in mind! + * + * Note: Due to the nature of CTR you should use the same key schedule for + * both encryption and decryption. So a context initialized with + * mbedtls_aes_setkey_enc() for both MBEDTLS_AES_ENCRYPT and + * MBEDTLS_AES_DECRYPT. + * + * \param ctx AES context + * \param length The length of the data + * \param nc_off The offset in the current stream_block (for resuming + * within current cipher stream). The offset pointer to + * should be 0 at the start of a stream. + * \param nonce_counter The 128-bit nonce and counter. + * \param stream_block The saved stream-block for resuming. Is overwritten + * by the function. 
+ * \param input The input data stream + * \param output The output data stream + * + * \return 0 if successful + */ +int mbedtls_aes_crypt_ctr(mbedtls_aes_context* ctx, size_t length, + size_t* nc_off, unsigned char nonce_counter[16], + unsigned char stream_block[16], + const unsigned char* input, unsigned char* output); +#endif /* MBEDTLS_CIPHER_MODE_CTR */ + +/** + * \brief Internal AES block encryption function + * (Only exposed to allow overriding it, + * see MBEDTLS_AES_ENCRYPT_ALT) + * + * \param ctx AES context + * \param input Plaintext block + * \param output Output (ciphertext) block + * + * \return 0 if successful + */ +int mbedtls_internal_aes_encrypt(mbedtls_aes_context* ctx, + const unsigned char input[16], + unsigned char output[16]); + +/** + * \brief Internal AES block decryption function + * (Only exposed to allow overriding it, + * see MBEDTLS_AES_DECRYPT_ALT) + * + * \param ctx AES context + * \param input Ciphertext block + * \param output Output (plaintext) block + * + * \return 0 if successful + */ +int mbedtls_internal_aes_decrypt(mbedtls_aes_context* ctx, + const unsigned char input[16], + unsigned char output[16]); + +#if !defined(MBEDTLS_DEPRECATED_REMOVED) +#if defined(MBEDTLS_DEPRECATED_WARNING) +#define MBEDTLS_DEPRECATED __attribute__((deprecated)) +#else +#define MBEDTLS_DEPRECATED +#endif +/** + * \brief Internal AES block encryption function + * (Only exposed to allow overriding it, + * see MBEDTLS_AES_ENCRYPT_ALT) + * + * \deprecated Superseded by mbedtls_aes_encrypt_ext() in 2.5.0 + * + * \param ctx AES context + * \param input Plaintext block + * \param output Output (ciphertext) block + */ +MBEDTLS_DEPRECATED static inline void mbedtls_aes_encrypt( + mbedtls_aes_context* ctx, const unsigned char input[16], + unsigned char output[16]) { + mbedtls_internal_aes_encrypt(ctx, input, output); +} + +/** + * \brief Internal AES block decryption function + * (Only exposed to allow overriding it, + * see MBEDTLS_AES_DECRYPT_ALT) + * + * \deprecated Superseded by mbedtls_aes_decrypt_ext() in 2.5.0 + * + * \param ctx AES context + * \param input Ciphertext block + * \param output Output (plaintext) block + */ +MBEDTLS_DEPRECATED static inline void mbedtls_aes_decrypt( + mbedtls_aes_context* ctx, const unsigned char input[16], + unsigned char output[16]) { + mbedtls_internal_aes_decrypt(ctx, input, output); +} + +#undef MBEDTLS_DEPRECATED +#endif /* !MBEDTLS_DEPRECATED_REMOVED */ + +#ifdef __cplusplus +} +#endif + +#else /* MBEDTLS_AES_ALT */ +#include "aes_alt.h" +#endif /* MBEDTLS_AES_ALT */ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * \brief Checkup routine + * + * \return 0 if successful, or 1 if the test failed + */ +int mbedtls_aes_self_test(int verbose); + +#ifdef __cplusplus +} +#endif + +#endif /* aes.h */ diff --git a/lite/src/decryption/mbedtls/config.h b/lite/src/decryption/mbedtls/config.h new file mode 100644 index 0000000000000000000000000000000000000000..57f12656c5ecd7ffed32315a1661dcce1a0cd87a --- /dev/null +++ b/lite/src/decryption/mbedtls/config.h @@ -0,0 +1,5 @@ +#pragma once + +#define MBEDTLS_AES_C +#define MBEDTLS_AES_ROM_TABLES +#define MBEDTLS_CIPHER_MODE_CBC diff --git a/lite/src/decryption/rc4/rc4_cryption_base.h b/lite/src/decryption/rc4/rc4_cryption_base.h new file mode 100644 index 0000000000000000000000000000000000000000..e811acba59a8bebd58dd9f492c714ad858375692 --- /dev/null +++ b/lite/src/decryption/rc4/rc4_cryption_base.h @@ -0,0 +1,156 @@ +/** + * \file src/decryption/rc4/rc4_cryption_base.h + * + * This file is part of 
MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ +#pragma once + +#include +#include + +namespace lite { +namespace rc4 { + +#define m256(x) static_cast(x) + +/*! \brief Pseudo-random byte stream for RC4. + */ +class RC4RandStream { +public: + RC4RandStream() = default; + + RC4RandStream(uint64_t key) { reset(key); } + + void reset(uint64_t init_key) { + i_ = j_ = 0; + for (int i = 0; i < 256; i++) + s_[i] = i; + uint8_t j = 0; + for (int i = 0; i < 256; i++) { + j = j + s_[i] + m256(init_key >> ((i % 8) * 8)); + std::swap(s_[i], s_[j]); + } + // drop + for (int i = 0; i < 768; i++) { + next8(); + } + for (int i = 0, t = next8(); i < t; i++) { + next8(); + } + } + + uint8_t next8() { + i_++; + uint8_t a = s_[i_]; + j_ += a; + uint8_t b = s_[j_]; + s_[i_] = b; + s_[j_] = a; + uint8_t c = s_[m256((i_ << 5) ^ (j_ >> 3))] + + s_[m256((j_ << 5) ^ (i_ >> 3))]; + return (s_[m256(a + b)] + s_[c ^ 0xAA]) ^ s_[m256(j_ + b)]; + } + + uint64_t next64() { + uint64_t rst; + uint8_t* buf = reinterpret_cast(&rst); + for (int i = 0; i < 8; i++) { + buf[i] = next8(); + } + return rst; + } + +private: + uint8_t s_[256], i_ = 0, j_ = 0; +}; +#undef m256 + +/*! + * \brief fast and secure 64-bit hash + * see https://code.google.com/p/fast-hash/ + */ +class FastHash64 { +public: + FastHash64(uint64_t seed) + : hash_{seed}, + mul0_{key_gen_hash_mul0()}, + mul1_{key_gen_hash_mul1()} {} + + void feed(uint64_t val) { + val ^= val >> 23; + val *= mul0_; + val ^= val >> 47; + hash_ ^= val; + hash_ *= mul1_; + } + + uint64_t get() { return hash_; } + +private: + uint64_t hash_; + const uint64_t mul0_, mul1_; + + static uint64_t key_gen_hash_mul0() { + uint64_t rst; + uint8_t volatile* buf = reinterpret_cast(&rst); + buf[2] = 50; + buf[3] = 244; + buf[6] = 39; + buf[1] = 92; + buf[5] = 89; + buf[4] = 155; + buf[0] = 55; + buf[7] = 33; + return rst; + } + + static uint64_t key_gen_hash_mul1() { + uint64_t rst; + uint8_t volatile* buf = reinterpret_cast(&rst); + buf[6] = 3; + buf[2] = 109; + buf[7] = 136; + buf[1] = 25; + buf[5] = 85; + buf[0] = 101; + buf[4] = 242; + buf[3] = 30; + return rst; + } +}; + +// The encryption keys are always inlined. +static inline uint64_t key_gen_enc_key() { + uint64_t rst; + uint8_t volatile* buf = reinterpret_cast(&rst); + buf[4] = 120; + buf[3] = 121; + buf[7] = 122; + buf[6] = 123; + buf[0] = 124; + buf[5] = 125; + buf[2] = 126; + buf[1] = 127; + return rst; +} + +static inline uint64_t key_gen_hash_key() { + uint64_t rst; + uint8_t volatile* buf = reinterpret_cast(&rst); + buf[2] = 101; + buf[5] = 102; + buf[4] = 103; + buf[7] = 104; + buf[1] = 105; + buf[3] = 106; + buf[6] = 107; + buf[0] = 108; + return rst; +} +} // namespace rc4 +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/decryption/rc4/rc4_cryption_impl.cpp b/lite/src/decryption/rc4/rc4_cryption_impl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ca477bcfbce0e922788a44820d6e7460c239a8a8 --- /dev/null +++ b/lite/src/decryption/rc4/rc4_cryption_impl.cpp @@ -0,0 +1,219 @@ +/** + * \file src/decryption/rc4/rc4_cryption_impl.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "rc4_cryption_impl.h" +#include "../../misc.h" + +#include + +using namespace lite; + +/*! 
+ * \brief Read the input stream once in order to initialize the decryption
+ * state.
+ */
+void RC4Impl::init_rc4_state() {
+    rc4::RC4RandStream enc_stream(m_enc_key);
+    rc4::FastHash64 dechash(m_hash_key);
+
+    size_t offset = 0;
+
+    std::vector<uint64_t> buffer(128);
+    size_t remaining = m_model_length - sizeof(uint64_t);
+    while (remaining > 0) {
+        size_t toread = std::min(remaining, buffer.size() * sizeof(uint64_t));
+        memcpy(buffer.data(), static_cast<const uint8_t*>(m_model_mem) + offset,
+               toread);
+        offset += toread;
+        remaining -= toread;
+
+        for (size_t i = 0; i < toread / sizeof(uint64_t); ++i) {
+            uint64_t value = buffer[i];
+            value ^= enc_stream.next64();
+            dechash.feed(value);
+        }
+    }
+
+    uint64_t hashvalue;
+    memcpy(&hashvalue, static_cast<const uint8_t*>(m_model_mem) + offset,
+           sizeof(hashvalue));
+    offset += sizeof(hashvalue);
+
+    hashvalue ^= dechash.get() ^ enc_stream.next64();
+    m_state.hash_stream.reset(hashvalue);
+    m_state.enc_stream.reset(m_enc_key);
+}
+
+std::vector<uint8_t> RC4Impl::decrypt_model() {
+    std::vector<uint8_t> result(m_model_length, 0);
+
+    uint8_t* ptr = result.data();
+    for (size_t i = 0; i < m_model_length; ++i) {
+        ptr[i] = static_cast<const uint8_t*>(m_model_mem)[i];
+        ptr[i] ^= m_state.hash_stream.next8() ^ m_state.enc_stream.next8();
+    }
+    return result;
+}
+
+/*! \brief Encrypt the data in m_buffer.
+ *
+ * The basic idea is to calculate a 64-bit hash from the buffer and append
+ * it to the end of the buffer. The requirement is that changing any byte,
+ * including the appended hash value, corrupts every byte of the recovered
+ * model.
+ *
+ * Encryption:
+ *
+ * 1. First calculate a 64-bit hash, called the plain hash value, from the
+ *    buffer.
+ * 2. Initialize an RC4 stream with the plain hash value.
+ * 3. Obfuscate the model body with the RC4 stream defined in step 2.
+ * 4. Calculate the hash value of the obfuscated model, called the hash
+ *    value after hashing.
+ * 5. Encrypt the model body with an RC4 stream made from the encryption key.
+ * 6. Bit-xor the hash value after hashing with the plain hash value, called
+ *    the mixed hash.
+ * 7. Encrypt the mixed hash with the RC4 stream defined in step 5, called
+ *    the protected hash.
+ * 8. Append the protected hash to the buffer.
+ *
+ * Decryption:
+ * 1. Decrypt the model body with an RC4 stream made from the encryption key,
+ *    which reverses steps 5 and 7 of encryption and yields the mixed hash.
+ * 2. Calculate the hash value of the decrypted model, which equals the
+ *    hash value after hashing in step 4 of encryption.
+ * 3. Bit-xor the hash value after hashing and the mixed hash to get the
+ *    plain hash value, which reverses step 6 of encryption.
+ * 4. Un-obfuscate the model body with the plain hash value, which reverses
+ *    step 3 of encryption.
+ *
+ * Think:
+ * 1. If any byte in the model body is broken, the hash value after hashing
+ *    will be broken in step 2, hence the plain hash value in step 3 will
+ *    also be broken, and finally the model body will be broken in step 4.
+ * 2. If the protected hash is broken, the plain hash value in step 3 will
+ *    be broken, and finally the model body will be broken.
+ */
+std::vector<uint8_t> RC4Impl::encrypt_model() {
+    size_t total_length = (m_model_length + (sizeof(size_t) - 1)) /
+                          sizeof(size_t) * sizeof(size_t);
+    std::vector<uint8_t> pad_model(total_length, 0);
+    memcpy(pad_model.data(), m_model_mem, m_model_length);
+
+    // Calculate the hash of the model.
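+    // Step 1 of the scheme documented above: hash the zero-padded model
+    // 64 bits at a time to obtain the plain hash value. It seeds the
+    // obfuscation stream in step 2 and is folded into the protected hash
+    // in step 6.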
+    rc4::FastHash64 plainhash(m_hash_key);
+    uint64_t* ptr = reinterpret_cast<uint64_t*>(pad_model.data());
+    size_t len = pad_model.size() / sizeof(uint64_t);
+
+    for (size_t i = 0; i < len; ++i)
+        plainhash.feed(ptr[i]);
+    uint64_t plainhash_value = plainhash.get();
+
+    // Encrypt the model.
+    rc4::RC4RandStream hash_enc(plainhash_value);
+    rc4::RC4RandStream outmost_enc(m_enc_key);
+    rc4::FastHash64 afterhashenc_hash(m_hash_key);
+
+    for (size_t i = 0; i < len; ++i) {
+        uint64_t value = ptr[i] ^ hash_enc.next64();
+        afterhashenc_hash.feed(value);
+        ptr[i] = value ^ outmost_enc.next64();
+    }
+
+    uint64_t protected_hash =
+            plainhash_value ^ afterhashenc_hash.get() ^ outmost_enc.next64();
+
+    size_t end = pad_model.size();
+    pad_model.resize(pad_model.size() + sizeof(uint64_t));
+    ptr = reinterpret_cast<uint64_t*>(&pad_model[end]);
+    *ptr = protected_hash;
+    return pad_model;
+}
+
+/*!
+ * \brief Read the input stream once in order to initialize the decryption
+ * state.
+ */
+void SimpleFastRC4Impl::init_sfrc4_state() {
+    rc4::RC4RandStream enc_stream(m_enc_key);
+    rc4::FastHash64 dechash(m_hash_key);
+
+    size_t offset = 0;
+    std::vector<uint64_t> buffer(128);
+    size_t remaining = m_model_length - sizeof(uint64_t);
+    while (remaining > 0) {
+        size_t toread = std::min(remaining, buffer.size() * sizeof(uint64_t));
+        memcpy(buffer.data(), static_cast<const uint8_t*>(m_model_mem) + offset,
+               toread);
+        offset += toread;
+        remaining -= toread;
+
+        for (size_t i = 0; i < toread / sizeof(uint64_t); ++i) {
+            uint64_t value = buffer[i];
+            dechash.feed(value);
+        }
+    }
+
+    uint64_t hashvalue;
+    memcpy(&hashvalue, static_cast<const uint8_t*>(m_model_mem) + offset,
+           sizeof(hashvalue));
+
+    offset += sizeof(hashvalue);
+
+    /*! \brief Verify the stored hash value. */
+    if (hashvalue != dechash.get())
+        LITE_THROW(
+                "The checksum of the file cannot be verified. The file may "
+                "have been encrypted with a different algorithm or key.");
+
+    m_state.hash_stream.reset(m_hash_key);
+    m_state.enc_stream.reset(m_enc_key);
+}
+
+std::vector<uint8_t> SimpleFastRC4Impl::decrypt_model() {
+    std::vector<uint8_t> result(m_model_length, 0);
+    uint8_t* ptr = result.data();
+    for (size_t i = 0; i < m_model_length; ++i) {
+        ptr[i] = static_cast<const uint8_t*>(m_model_mem)[i];
+        ptr[i] ^= m_state.enc_stream.next8();
+    }
+    return result;
+}
+
+std::vector<uint8_t> SimpleFastRC4Impl::encrypt_model() {
+    size_t total_length = (m_model_length + (sizeof(size_t) - 1)) /
+                          sizeof(size_t) * sizeof(size_t);
+    std::vector<uint8_t> pad_model(total_length, 0);
+    memcpy(pad_model.data(), m_model_mem, m_model_length);
+
+    // Calculate the hash of the model.
+    rc4::FastHash64 enchash(m_hash_key);
+    uint64_t* ptr = reinterpret_cast<uint64_t*>(pad_model.data());
+    size_t len = pad_model.size() / sizeof(uint64_t);
+
+    // Encrypt the model.
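+    // SimpleFastRC4 is a lighter variant of the scheme documented above for
+    // RC4Impl: the body is only XOR-ed with a single RC4 stream derived from
+    // the encryption key, and a FastHash64 checksum of the encrypted words
+    // is appended so that init_sfrc4_state() can verify integrity before
+    // decryption.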
+ rc4::RC4RandStream out_enc(m_enc_key); + for (size_t i = 0; i < len; ++i) { + ptr[i] = ptr[i] ^ out_enc.next64(); + enchash.feed(ptr[i]); + } + + uint64_t hash_value = enchash.get(); + + size_t end = pad_model.size(); + pad_model.resize(pad_model.size() + sizeof(uint64_t)); + ptr = reinterpret_cast(&pad_model[end]); + *ptr = hash_value; + + return pad_model; +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/decryption/rc4/rc4_cryption_impl.h b/lite/src/decryption/rc4/rc4_cryption_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..b5c88e836729a5e27662a0209c9d0163310bfede --- /dev/null +++ b/lite/src/decryption/rc4/rc4_cryption_impl.h @@ -0,0 +1,79 @@ +/** + * \file src/decryption/rc4/rc4_cryption_impl.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ +#pragma once +#include "rc4_cryption_base.h" + +#include +#include + +namespace lite { + +class RC4Impl { + struct RC4State { + rc4::RC4RandStream enc_stream; + rc4::RC4RandStream hash_stream; + } m_state; + +public: + RC4Impl(const void* model_mem, size_t size, const std::vector& key) + : m_model_mem(model_mem), m_model_length(size) { + const uint8_t* data = key.data(); + m_hash_key = *reinterpret_cast(data); + m_enc_key = *reinterpret_cast(data + 8); + } + + std::vector encrypt_model(); + std::vector decrypt_model(); + + /*! \brief Read the input stream once in order to initialize the decryption + * state. + */ + void init_rc4_state(); + +private: + const void* m_model_mem; + size_t m_model_length; + + uint64_t m_hash_key; + uint64_t m_enc_key; +}; + +class SimpleFastRC4Impl { + struct SFRC4State { + rc4::RC4RandStream enc_stream; + rc4::RC4RandStream hash_stream; + } m_state; + +public: + SimpleFastRC4Impl(const void* model_mem, size_t size, + const std::vector& key) + : m_model_mem(model_mem), m_model_length(size) { + const uint8_t* data = key.data(); + m_hash_key = *reinterpret_cast(data); + m_enc_key = *reinterpret_cast(data + 8); + } + std::vector encrypt_model(); + std::vector decrypt_model(); + + /*! \brief Read the input stream once in order to initialize the decryption + * state. + */ + void init_sfrc4_state(); + +private: + const void* m_model_mem; + size_t m_model_length; + + uint64_t m_hash_key; + uint64_t m_enc_key; +}; + +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/decryption/rc4_cryption.cpp b/lite/src/decryption/rc4_cryption.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d8105deec04b40350f5b9b356fb996a8bdcfaf53 --- /dev/null +++ b/lite/src/decryption/rc4_cryption.cpp @@ -0,0 +1,58 @@ +/** + * \file src/decryption/rc4_cryption.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
+ */ + +#include "rc4_cryption.h" +#include "rc4/rc4_cryption_impl.h" + +#include + +using namespace lite; + +std::vector RC4::decrypt_model(const void* model_mem, size_t size, + const std::vector& key) { + RC4Impl rc4_impl(model_mem, size, key); + rc4_impl.init_rc4_state(); + return rc4_impl.decrypt_model(); +} + +std::vector RC4::encrypt_model(const void* model_mem, size_t size, + const std::vector& key) { + RC4Impl rc4_impl(model_mem, size, key); + return rc4_impl.encrypt_model(); +} + +std::vector RC4::get_decrypt_key() { + std::vector keys(128, 0); + uint64_t* data = reinterpret_cast(keys.data()); + data[0] = rc4::key_gen_hash_key(); + data[1] = rc4::key_gen_enc_key(); + return keys; +}; + +std::vector SimpleFastRC4::decrypt_model( + const void* model_mem, size_t size, const std::vector& key) { + SimpleFastRC4Impl simple_fast_rc4_impl(model_mem, size, key); + simple_fast_rc4_impl.init_sfrc4_state(); + return simple_fast_rc4_impl.decrypt_model(); +} +std::vector SimpleFastRC4::encrypt_model( + const void* model_mem, size_t size, const std::vector& key) { + SimpleFastRC4Impl simple_fast_rc4_impl(model_mem, size, key); + return simple_fast_rc4_impl.encrypt_model(); +} + +std::vector SimpleFastRC4::get_decrypt_key() { + std::vector keys(128, 0); + uint64_t* data = reinterpret_cast(keys.data()); + data[0] = rc4::key_gen_hash_key(); + data[1] = rc4::key_gen_enc_key(); + return keys; +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/decryption/rc4_cryption.h b/lite/src/decryption/rc4_cryption.h new file mode 100644 index 0000000000000000000000000000000000000000..1c5c9f89aded18559ecef610cf4cb31650ca604c --- /dev/null +++ b/lite/src/decryption/rc4_cryption.h @@ -0,0 +1,44 @@ +/** + * \file src/decryption/rc4_cryption.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ +#pragma once + +#include "rc4/rc4_cryption_base.h" + +#include + +namespace lite { + +class RC4 { +public: + static std::vector decrypt_model(const void* model_mem, + size_t size, + const std::vector& key); + + static std::vector encrypt_model(const void* model_mem, + size_t size, + const std::vector& key); + + static std::vector get_decrypt_key(); +}; + +class SimpleFastRC4 { +public: + static std::vector decrypt_model(const void* model_mem, + size_t size, + const std::vector& key); + static std::vector encrypt_model(const void* model_mem, + size_t size, + const std::vector& key); + + static std::vector get_decrypt_key(); +}; + +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/function_base.h b/lite/src/function_base.h new file mode 100644 index 0000000000000000000000000000000000000000..42c1abe8a0e5a11dd9330be3141e839cda4768ba --- /dev/null +++ b/lite/src/function_base.h @@ -0,0 +1,53 @@ +/** + * \file src/function_base.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
+ */ + +#pragma once +#include +#include "misc.h" +#include "type_info.h" +// template +namespace lite { +class TensorImplDft; +class NetworkImplDft; +namespace { + +template +struct class_type_name { + std::string operator()() { return ""; } +}; +#define ADD_STATEMENT(class_name, backend_name) \ + template <> \ + struct class_type_name { \ + std::string operator()() { return #backend_name; } \ + } +ADD_STATEMENT(TensorImplDft, Dft); +ADD_STATEMENT(NetworkImplDft, Dft); +#undef ADD_STATEMENT +} // namespace + +// if it can't find the function, ignore +template +ret_type try_call_func(std::string func_name, Args... args) { + mark_used_variable(func_name); + mark_used_variable(args...); + return nullptr; +} + +// if it can't find the function, throw error +template +ret_type call_func(std::string func_name, Args... args) { + mark_used_variable(args...); + auto backend_name = class_type_name()(); + auto msg_info = + func_name + " is not aviliable in " + backend_name + " backend."; + LITE_THROW(msg_info.c_str()); +} +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/global.cpp b/lite/src/global.cpp new file mode 100644 index 0000000000000000000000000000000000000000..378127e2fdb4e0e3f72376b6b5aa40eb0bba6e4b --- /dev/null +++ b/lite/src/global.cpp @@ -0,0 +1,256 @@ +/** + * \file src/global.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include + +#include "lite/global.h" +#include "decryption/aes_decrypt.h" +#include "decryption/decrypt_base.h" +#include "decryption/rc4_cryption.h" +#include "misc.h" +#include "parse_info/parse_info_base.h" +#include "parse_info/default_parse.h" + +#if LITE_BUILD_WITH_MGE +#include "megbrain/common.h" +#include "megbrain/comp_node.h" +#include "megbrain/serialization/extern_c_opr.h" +#include "megbrain/version.h" +#include "megcore_opencl.h" +#include "mge/algo_cache/file_cache.h" +#include "mge/common.h" +#if MGB_ENABLE_TENSOR_RT +#include "megbrain/tensorrt/tensorrt_engine_cache.h" +#endif +#if LITE_WITH_CUDA +#include "mge/algo_cache/redis_cache.h" +#endif +#endif + +#include +#include + +using namespace lite; + +lite::DecryptionStaticData& lite::decryption_static_data() { + static lite::DecryptionStaticData global_map; + return global_map; +} + +void lite::get_version(int& major, int& minor, int& patch) { +#if LITE_BUILD_WITH_MGE + auto version = mgb::get_version(); + major = version.major; + minor = version.minor; + patch = version.patch; +#else + //! 
without mge, the version set the max version + major = 8; + minor = 9999; + patch = 0; +#endif +} + +size_t lite::get_device_count(LiteDeviceType device_type) { +#if LITE_BUILD_WITH_MGE + auto mgb_device_type = to_compnode_locator(device_type).type; + return mgb::CompNode::get_device_count(mgb_device_type); +#else + LITE_MARK_USED_VAR(device_type); + LITE_THROW("no lite backend avialible, please check build macro."); +#endif +} + +bool lite::register_decryption_and_key(std::string decrypt_name, + const DecryptionFunc& func, + const std::vector& key) { + LITE_LOCK_GUARD(decryption_static_data().map_mutex); + auto& global_map = decryption_static_data().decryption_methods; + if (global_map.find(decrypt_name) != global_map.end()) { + LITE_THROW(ssprintf("The decryption method %s is already registered.", + decrypt_name.c_str())); + return false; + } else { + auto key_pointer = std::make_shared>(key); + global_map[decrypt_name] = {func, key_pointer}; + LITE_LOG("Registered ecryption method %s.", decrypt_name.c_str()); + return true; + } +} + +bool lite::update_decryption_or_key(std::string decrypt_name, + const DecryptionFunc& func, + const std::vector& key) { + LITE_LOCK_GUARD(decryption_static_data().map_mutex); + auto& global_map = decryption_static_data().decryption_methods; + if (global_map.find(decrypt_name) != global_map.end()) { + std::shared_ptr> key_pointer; + DecryptionFunc new_func; + if (func) { + new_func = func; + LITE_LOG("%s decryption function is updated.", + decrypt_name.c_str()); + } else { + new_func = global_map[decrypt_name].first; + } + if (key.size()) { + key_pointer = std::make_shared>(key); + LITE_LOG("%s decryption key is updated.", decrypt_name.c_str()); + } else { + key_pointer = global_map[decrypt_name].second; + } + global_map[decrypt_name] = {new_func, key_pointer}; + return true; + } else { + LITE_THROW(ssprintf("The decryption method %s is not registered.", + decrypt_name.c_str())); + return false; + } +} + +lite::ParseInfoStaticData& lite::parse_info_static_data() { + static lite::ParseInfoStaticData global_map; + return global_map; +} + +bool lite::register_parse_info_func(std::string info_type, + const ParseInfoFunc& parse_func) { + LITE_LOCK_GUARD(parse_info_static_data().map_mutex); + auto& global_map = parse_info_static_data().parse_info_methods; + if (global_map.find(info_type) != global_map.end()) { + LITE_THROW(ssprintf("The parse info method %s is already registered.", + info_type.c_str())); + return false; + } else { + global_map[info_type] = parse_func; + LITE_LOG("Registered infomation parser method %s.", info_type.c_str()); + return true; + } +} + +#if LITE_BUILD_WITH_MGE + +namespace { +struct CacheControl { + LITE_MUTEX cache_mutex; + std::string cache_type = "file"; + std::atomic_size_t config_algo_times{0}; + std::atomic_size_t config_trt_times{0}; +}; +CacheControl cache_control; +} // namespace + + +void lite::try_coalesce_all_free_memory() { + mgb::CompNode::try_coalesce_all_free_memory(); +} + +void lite::set_loader_lib_path(const std::string& loader_path) { + const char* lib_path = loader_path.c_str(); + LITE_LOG("load a device loader of path %s.", lib_path); + auto handle = dlopen(lib_path, RTLD_LAZY); + LITE_ASSERT(handle, "failed to open c opr lib %s: %s", lib_path, dlerror()); + const char* entry = MGB_C_OPR_INIT_FUNC_STR; + auto func = dlsym(handle, entry); + LITE_ASSERT(func, "can not resolve %s: %s", entry, dlerror()); + typedef void (*entry_f_t)(void*); + reinterpret_cast(func)( + 
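`register_decryption_and_key` stores a method under its name together with a shared copy of the key, and `update_decryption_or_key` lets either part be swapped later (an empty function or key keeps the existing one). A sketch of registering a user-defined method at runtime; the toy XOR cipher and the name `"XOR_demo"` are made up for illustration, and `DecryptionFunc` is assumed to have the same `(const void*, size_t, const std::vector<uint8_t>&) -> std::vector<uint8_t>` shape as the built-in `decrypt_model` functions registered at the end of this file:

```cpp
#include <cstdint>
#include <vector>

#include "lite/global.h"  // register_decryption_and_key / update_decryption_or_key

// Hypothetical toy cipher: XOR every byte with a rolling key byte.  It only
// illustrates the registration flow, it is not a real encryption scheme.
static std::vector<uint8_t> xor_decrypt(const void* model_mem, size_t size,
                                        const std::vector<uint8_t>& key) {
    auto* src = static_cast<const uint8_t*>(model_mem);
    std::vector<uint8_t> out(size);
    for (size_t i = 0; i < size; ++i) {
        out[i] = src[i] ^ key[i % key.size()];
    }
    return out;
}

static void register_xor_method() {
    std::vector<uint8_t> key = {0x12, 0x34, 0x56, 0x78};
    // Throws if "XOR_demo" is already registered.
    lite::register_decryption_and_key("XOR_demo", xor_decrypt, key);
    // Later, the key (or the function) can be replaced without re-registering;
    // an empty function argument keeps the registered one.
    lite::update_decryption_or_key("XOR_demo", {}, {0x9a, 0xbc, 0xde, 0xf0});
}
```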
reinterpret_cast(&mgb_get_extern_c_opr_api_versioned)); +} + +void lite::set_persistent_cache(const std::string& cache_path, + bool always_sync) { + LITE_LOCK_GUARD(cache_control.cache_mutex); + cache_control.cache_type = "file"; + if (cache_control.config_algo_times >= 1) { + LITE_WARN( + "The cache has been set,maybe some model is using now, change " + "it now may cause unknow error!!"); + } + cache_control.config_algo_times++; + mgb::PersistentCache::set_impl(std::make_shared( + cache_path.c_str(), always_sync)); +} + +void lite::dump_persistent_cache(const std::string& cache_path) { + LITE_LOCK_GUARD(cache_control.cache_mutex); + LITE_ASSERT(cache_control.cache_type == "file", + "now cache type is redis, it can't be dumped."); + static_cast(mgb::PersistentCache::inst()) + .dump_cache(cache_path.c_str()); +} + +//! Set the TensorRT engine cache path for serialized prebuilt ICudaEngine +void lite::set_tensor_rt_cache(std::string tensorrt_cache_path) { +#if MGB_ENABLE_TENSOR_RT + LITE_LOCK_GUARD(cache_control.cache_mutex); + if (cache_control.config_trt_times >= 1) { + LITE_WARN( + "The trt cache has been set,maybe some model is using now, " + "change it now may cause unknow error!!"); + } + cache_control.config_trt_times++; + mgb::TensorRTEngineCache::enable_engine_cache(true); + mgb::TensorRTEngineCache::set_impl( + std::make_shared(tensorrt_cache_path)); +#else + LITE_MARK_USED_VAR(tensorrt_cache_path); + LITE_THROW("TensorRT is disable at compile time."); +#endif +} + +void lite::dump_tensor_rt_cache() { +#if MGB_ENABLE_TENSOR_RT + if (mgb::TensorRTEngineCache::enable_engine_cache()) { + mgb::TensorRTEngineCache::inst().dump_cache(); + } +#else + LITE_THROW("TensorRT is disable at compile time."); +#endif +} + +#else //LITE_BUILD_WITH_MGE +void lite::try_coalesce_all_free_memory() {} + +void lite::set_loader_lib_path(const std::string& ) { + LITE_THROW("mge is disbale at build time, please build with mge"); +} + +void lite::set_persistent_cache(const std::string&, bool) { + LITE_THROW("mge is disbale at build time, please build with mge"); +} + +void lite::dump_persistent_cache(const std::string& ) { + LITE_THROW("mge is disbale at build time, please build with mge"); +} + +//! Set the TensorRT engine cache path for serialized prebuilt ICudaEngine +void lite::set_tensor_rt_cache(std::string ) { + LITE_THROW("mge is disbale at build time, please build with mge"); +} + +void lite::dump_tensor_rt_cache() { + LITE_THROW("mge is disbale at build time, please build with mge"); +} +#endif +namespace lite { +REGIST_DECRYPTION_METHOD("AES_default", lite::AESDcryption::decrypt_model, + lite::AESDcryption::get_decrypt_key()); + +REGIST_DECRYPTION_METHOD("RC4_default", lite::RC4::decrypt_model, + lite::RC4::get_decrypt_key()); + +REGIST_DECRYPTION_METHOD("SIMPLE_FAST_RC4_default", + lite::SimpleFastRC4::decrypt_model, + lite::SimpleFastRC4::get_decrypt_key()); + +REGIST_PARSE_INFO_FUNCTION("LITE_default", lite::default_parse_info); +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/lite_build_config.h.in b/lite/src/lite_build_config.h.in new file mode 100644 index 0000000000000000000000000000000000000000..e1607948da80840360da040e0de8d68b203dfa98 --- /dev/null +++ b/lite/src/lite_build_config.h.in @@ -0,0 +1,37 @@ +/** + * \file lite/src/lite_build_config.h.in + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
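`set_persistent_cache` installs a process-wide `InFilePersistentCache` for fast-run algorithm choices and `dump_persistent_cache` writes it back to disk; the TensorRT pair works the same way for serialized engines. A typical flow, sketched with placeholder paths:

```cpp
#include <string>

#include "lite/global.h"

// Sketch: enable the fast-run algo cache before networks are loaded, run
// inference in between, then persist the collected algorithm choices.
void run_with_algo_cache() {
    const std::string algo_cache = "/tmp/megengine_algo.cache";  // placeholder path

    // Install a file-backed PersistentCache.  With always_sync=false the cache
    // stays in memory until dump_persistent_cache() is called.
    lite::set_persistent_cache(algo_cache, /*always_sync=*/false);

    // ... load networks and run inference here ...

    // Write the accumulated entries back to disk for the next run.
    lite::dump_persistent_cache(algo_cache);
}
```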
+ */ +#ifndef _HEADER_LITE_BUILD_CONFIG +#define _HEADER_LITE_BUILD_CONFIG + +#cmakedefine01 LITE_ENABLE_LOGGING +#cmakedefine01 LITE_ENABLE_EXCEPTION +#cmakedefine01 LITE_WITH_CUDA +#cmakedefine01 LITE_ASSERT_LOC + +#ifndef LITE_ENABLE_LOGGING +#define LITE_ENABLE_LOGGING 1 +#endif + +#ifndef LITE_ENABLE_EXCEPTION +#if __cpp_exceptions || __EXCEPTIONS || \ + (defined(_MSC_VER) && defined(_CPPUNWIND)) +#define LITE_ENABLE_EXCEPTION 1 +#else +#define LITE_ENABLE_EXCEPTION 0 +#endif +#endif + +#ifndef LITE_WITH_CUDA +#define LITE_WITH_CUDA 0 +#endif + +#ifndef LITE_ASSERT_LOC +#define LITE_ASSERT_LOC 0 +#endif +#endif // _HEADER_LITE_BUILD_CONFIG diff --git a/lite/src/mge/algo_cache/file_cache.cpp b/lite/src/mge/algo_cache/file_cache.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7d96d4b0bcd91f042d8e699227150c95b33da0d4 --- /dev/null +++ b/lite/src/mge/algo_cache/file_cache.cpp @@ -0,0 +1,254 @@ +/** + * \file lite/src/mge/algo_cache/file_cache.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2020 Megvii Inc. All rights reserved. + */ + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "../common.h" +#include "file_cache.h" + +using namespace lite; + +//////////////////////// InFilePersistentCache::InputMemory /////////////// +class InFilePersistentCache::InputMemory { + const uint8_t* m_ptr; + size_t m_offset = 0; + size_t m_size; + +public: + InputMemory(const uint8_t* bin, size_t size) : m_ptr{bin}, m_size{size} {} + + template + void read(T& val) { + static_assert(std::is_trivially_copyable::value, + "only support trivially copyable type"); + LITE_ASSERT(m_offset + sizeof(T) <= m_size); + memcpy(&val, m_ptr, sizeof(T)); + m_offset += sizeof(T); + m_ptr += sizeof(T); + } + + template + void read(T* buf, size_t size) { + static_assert(std::is_trivially_copyable::value && sizeof(T) == 1, + "only support read bytes"); + LITE_ASSERT(m_offset + size <= m_size); + memcpy(buf, m_ptr, size); + m_offset += size; + m_ptr += size; + } +}; + +//////////////////////// InFilePersistentCache::InputFile /////////////// +class InFilePersistentCache::InputFile { + FILE* m_fp; + +public: + InputFile(const char* path) : m_fp{fopen(path, "rb")} { + LITE_ASSERT(m_fp, "failed to open %s: %s", path, strerror(errno)); + } + ~InputFile() { + if (m_fp) { + fclose(m_fp); + } + } + + template + void read(T& val) { + static_assert(std::is_trivially_copyable::value, + "only support trivially copyable type"); + auto ret = fread(&val, sizeof(T), 1, m_fp); + LITE_ASSERT(ret == 1); + } + + template + void read(T* buf, size_t size) { + static_assert(std::is_trivially_copyable::value && sizeof(T) == 1, + "only support read bytes"); + auto ret = fread(buf, size, 1, m_fp); + LITE_ASSERT(ret == 1); + } +}; + +//////////////////////// InFilePersistentCache::OutputFile /////////////// +class InFilePersistentCache::OutputFile { + FILE* m_fp; + +public: + OutputFile(const char* path) : m_fp{fopen(path, "wb")} { + LITE_ASSERT(m_fp, "failed to open %s: %s", path, strerror(errno)); + } + ~OutputFile() { + if (m_fp) { + fclose(m_fp); + } + } + + template + void write(T val) { + auto ret = fwrite(&val, sizeof(T), 1, m_fp); + LITE_ASSERT(ret == 1); + } + + template + void write(const T* buf, size_t size) { + static_assert(sizeof(T) == 1, "only support write bytes"); + auto ret = fwrite(buf, size, 1, m_fp); + LITE_ASSERT(ret == 1); + } + + void flush() { fflush(m_fp); } + + void set_head() { fseek(m_fp, 0, 
SEEK_SET); } +}; + +//////////////////////// InFilePersistentCache::BlobStorage /////////////// + +template +InFilePersistentCache::BlobStorage& +InFilePersistentCache::BlobStorage::init_from_input(Input& inp) { + uint32_t data_size; + inp.read(data_size); + size = data_size; + data_refhold = std::make_unique(size); + inp.read(data_refhold.get(), size); + ptr = data_refhold.get(); + return *this; +} + +void InFilePersistentCache::BlobStorage::write_to_file( + OutputFile& out_file) const { + uint32_t u_size = size; + out_file.write(u_size); + out_file.write(data_refhold.get(), u_size); +} + +InFilePersistentCache::BlobStorage& +InFilePersistentCache::BlobStorage::init_data_ref(const Blob& b) { + data_refhold = std::make_unique(b.size + 1); + memcpy(data_refhold.get(), b.ptr, b.size); + data_refhold.get()[b.size] = 0; // for C-string safety + ptr = data_refhold.get(); + size = b.size; + return *this; +} + +//////////////////////// InFilePersistentCache ////////////////////// + +template +void InFilePersistentCache::read_cache(Input& inp) { + uint32_t nr_category; + inp.read(nr_category); + char category_buf[256]; + for (uint32_t i = 0; i < nr_category; i++) { + uint32_t category_size; + inp.read(category_size); + inp.read(category_buf, category_size); + category_buf[category_size] = '\0'; + + std::string category(category_buf); + mgb_log_debug("load new category: %s", category_buf); + + // read bobs + uint32_t nr_bobs; + inp.read(nr_bobs); + for (uint32_t j = 0; j < nr_bobs; j++) { + BlobStorage key_storage; + key_storage.init_from_input(inp).init_hash(); + mgb_log_debug("read key: %zu", key_storage.hash); + m_cache[category][std::move(key_storage)].init_from_input(inp); + } + } +} + +InFilePersistentCache::InFilePersistentCache(const char* path, + bool always_open) { + if (!access(path, F_OK)) { + mgb_log_debug("use fastrun cache: %s", path); + InputFile inp(path); + read_cache(inp); + } + if (always_open) { + m_always_open_file = std::make_shared(path); + } +} + +InFilePersistentCache::InFilePersistentCache(const uint8_t* bin, size_t size) { + LITE_ASSERT(bin); + InputMemory inp(bin, size); + read_cache(inp); +} + +void InFilePersistentCache::dump_cache(const char* path) { + OutputFile out_file(path); + dump_cache(&out_file); +} + +void InFilePersistentCache::dump_cache(OutputFile* out_file) { + uint32_t nr_category = m_cache.size(); + out_file->write(nr_category); + + for (const auto& cached_category : m_cache) { + uint32_t category_size = cached_category.first.size(); + out_file->write(category_size); + out_file->write(cached_category.first.data(), category_size); + mgb_log_debug("write new category: %s", cached_category.first.c_str()); + + uint32_t nr_bobs = cached_category.second.size(); + out_file->write(nr_bobs); + for (const auto& item : cached_category.second) { + mgb_log_debug("dump key: %zu", item.first.hash); + item.first.write_to_file(*out_file); + item.second.write_to_file(*out_file); + } + } +} + +mgb::Maybe InFilePersistentCache::get( + const std::string& category, const Blob& key) { + decltype(m_cache.begin()) iter0; + { + MGB_LOCK_GUARD(m_mtx); + iter0 = m_cache.find(category); + if (iter0 == m_cache.end()) + return mgb::None; + } + + BlobStorage key_storage; + key_storage.Blob::operator=(key); + key_storage.init_hash(); + + MGB_LOCK_GUARD(m_mtx); + + auto iter1 = iter0->second.find(key_storage); + if (iter1 == iter0->second.end()) + return mgb::None; + return iter1->second; +} + +void InFilePersistentCache::put(const std::string& category, const Blob& key, + const Blob& 
value) { + BlobStorage key_storage; + key_storage.init_data_ref(key).init_hash(); + + MGB_LOCK_GUARD(m_mtx); + auto size0 = m_cache.size(); + m_cache[category][std::move(key_storage)].init_data_ref(value); + if (m_cache.size() > size0) { + mgb_log_debug("new cache category: %s", category.c_str()); + } + if (m_always_open_file) { + m_always_open_file->set_head(); + dump_cache(m_always_open_file.get()); + m_always_open_file->flush(); + } +} +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/mge/algo_cache/file_cache.h b/lite/src/mge/algo_cache/file_cache.h new file mode 100644 index 0000000000000000000000000000000000000000..340780ec8009cc595abee184296fe2a4cef21716 --- /dev/null +++ b/lite/src/mge/algo_cache/file_cache.h @@ -0,0 +1,85 @@ +/** + * \file lite/src/mge/algo_cache/file_cache.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2020 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "lite_build_config.h" +#if LITE_BUILD_WITH_MGE + +#include "megbrain/utils/persistent_cache.h" + +namespace lite { + +/** + * dump format: + * + * all integers in local endian (effectively little endian as I can see) + * + * dump format: + * + * []* + */ +//! TODO: fix one thread set cache when other threads is using old cache +class InFilePersistentCache final : public mgb::PersistentCache { + class InputFile; + class InputMemory; + class OutputFile; + struct BlobStorage : public Blob { + std::unique_ptr data_refhold; + size_t hash = 0; + + template + BlobStorage& init_from_input(Input& inp); + void write_to_file(OutputFile& out_file) const; + BlobStorage& init_data_ref(const Blob& b); + + BlobStorage& init_hash() { + hash = mgb::XXHash{}.update(ptr, size).digest(); + return *this; + } + + bool operator==(const BlobStorage& rhs) const { + return size == rhs.size && !memcmp(ptr, rhs.ptr, size); + } + + struct Hash { + size_t operator()(const BlobStorage& b) const { return b.hash; } + }; + }; + std::unordered_map> + m_cache; + LITE_MUTEX m_mtx; + std::shared_ptr m_always_open_file; + + template + void read_cache(Input& inp); + +public: + InFilePersistentCache() = default; + InFilePersistentCache(const char* path, bool always_open = false); + InFilePersistentCache(const uint8_t* bin, size_t size); + + /** + * \warning You should invoke \c dump_cache mannually to save the cache + * file. + */ + void dump_cache(const char* path); + void dump_cache(OutputFile* out_file); + + mgb::Maybe get(const std::string& category, const Blob& key) override; + void put(const std::string& category, const Blob& key, + const Blob& value) override; +}; + +} // namespace lite +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/mge/algo_cache/redis_cache.cpp b/lite/src/mge/algo_cache/redis_cache.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ecf0ccbcb1604f98e61854cd4d827cf3037a34e0 --- /dev/null +++ b/lite/src/mge/algo_cache/redis_cache.cpp @@ -0,0 +1,241 @@ +/** + * \file lite/src/mge/algo_cache/redis_cache.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2020 Megvii Inc. All rights reserved. 
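For reference, the on-disk layout implied by `read_cache()` and `dump_cache()` above is a simple length-prefixed stream; category names are limited to 255 bytes by the reader's stack buffer:

```cpp
// Reconstructed layout of an InFilePersistentCache file, as produced by
// dump_cache() and consumed by read_cache() above:
//
//   uint32_t nr_category
//   repeated nr_category times:
//     uint32_t category_size
//     uint8_t  category[category_size]   // category name, no terminator on disk
//     uint32_t nr_blobs
//     repeated nr_blobs times:
//       uint32_t key_size
//       uint8_t  key[key_size]           // BlobStorage written by write_to_file()
//       uint32_t value_size
//       uint8_t  value[value_size]
//
// All integers are written in the host's native (little-endian) byte order.
```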
+ */ + +#include "lite_build_config.h" + +#if !defined(WIN32) && LITE_BUILD_WITH_MGE && LITE_WITH_CUDA +#include "../../misc.h" +#include "redis_cache.h" + +#include +#include + +namespace { + +/* +** Translation Table as described in RFC1113 +*/ +static const char cb64[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + +/* +** Translation Table to decode: +*https://github.com/dgiardini/imgcalkap/blob/master/base64.c +*/ +static const char cd64[] = + "|$$$}rstuvwxyz{$$$$$$$>?@ABCDEFGHIJKLMNOPQRSTUVW$$$$$$XYZ[\\]^_`" + "abcdefghijklmnopq"; + +/* +** encodeblock +** +** encode 3 8-bit binary bytes as 4 '6-bit' characters +*/ +void encodeblock(unsigned char in[3], unsigned char out[4], int len) { + out[0] = cb64[in[0] >> 2]; + out[1] = cb64[((in[0] & 0x03) << 4) | ((in[1] & 0xf0) >> 4)]; + out[2] = (unsigned char)(len > 1 ? cb64[((in[1] & 0x0f) << 2) | + ((in[2] & 0xc0) >> 6)] + : '='); + out[3] = (unsigned char)(len > 2 ? cb64[in[2] & 0x3f] : '='); +} + +/* +** decodeblock +** +** decode 4 '6-bit' characters into 3 8-bit binary bytes +*/ +void decodeblock(unsigned char in[4], unsigned char out[3]) { + out[0] = (unsigned char)(in[0] << 2 | in[1] >> 4); + out[1] = (unsigned char)(in[1] << 4 | in[2] >> 2); + out[2] = (unsigned char)(((in[2] << 6) & 0xc0) | in[3]); +} + +/** + * Encode string to base64 string + * @param input - source string + * @param outdata - target base64 string + * @param linesize - max size of line + */ +void encode(const std::vector& input, + std::vector& outdata, int linesize = 76) { + outdata.clear(); + + unsigned char in[3], out[4]; + int i, len, blocksout = 0; + size_t j = 0; + + auto* indata = reinterpret_cast(input.data()); + unsigned int insize = input.size(); + + while (j <= insize) { + len = 0; + for (i = 0; i < 3; i++) { + in[i] = (unsigned char)indata[j]; + j++; + if (j <= insize) { + len++; + } else { + in[i] = 0; + } + } + if (len) { + encodeblock(in, out, len); + for (i = 0; i < 4; i++) { + outdata.push_back(out[i]); + } + blocksout++; + } + if (blocksout >= (linesize / 4) || (j == insize)) { + if (blocksout) { + outdata.push_back('\r'); + outdata.push_back('\n'); + } + blocksout = 0; + } + } +} + +/** + * Decode base64 string ot source + * @param input - base64 string + * @param outdata - source string + */ +void decode(const std::vector& input, + std::vector& outdata) { + outdata.clear(); + + unsigned char in[4], out[3], v; + int i, len; + size_t j = 0; + + auto* indata = reinterpret_cast(input.data()); + unsigned int insize = input.size(); + + while (j <= insize) { + for (len = 0, i = 0; i < 4 && (j <= insize); i++) { + v = 0; + while ((j <= insize) && v == 0) { + v = (unsigned char)indata[j++]; + v = (unsigned char)((v < 43 || v > 122) ? 0 : cd64[v - 43]); + if (v) { + v = (unsigned char)((v == '$') ? 
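`encodeblock` follows the RFC 1113 translation table directly: three input bytes become four characters from `cb64`, with `=` padding when fewer than three bytes remain. A one-group sanity check, placed in the same translation unit as the helpers above:

```cpp
#include <cassert>

// Sketch: the classic example, the three bytes "Man" encode to "TWFu".
// encodeblock() is the anonymous-namespace helper defined above.
static void encodeblock_demo() {
    unsigned char in[3] = {'M', 'a', 'n'};
    unsigned char out[4];
    encodeblock(in, out, /*len=*/3);
    assert(out[0] == 'T' && out[1] == 'W' && out[2] == 'F' && out[3] == 'u');
}
```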
0 : v - 61); + } + } + if (j <= insize) { + len++; + if (v) { + in[i] = (unsigned char)(v - 1); + } + } else { + in[i] = 0; + } + } + if (len) { + decodeblock(in, out); + for (i = 0; i < len - 1; i++) { + outdata.push_back(out[i]); + } + } + } +} + +/** + * Encode binary data to base64 buffer + * @param input - source data + * @param outdata - target base64 buffer + * @param linesize + */ +void encode(const std::string& input, std::string& outdata, int linesize = 76) { + std::vector out; + std::vector in(input.begin(), input.end()); + encode(in, out, linesize); + outdata = std::string(out.begin(), out.end()); +} + +/** + * Decode base64 buffer to source binary data + * @param input - base64 buffer + * @param outdata - source binary data + */ +void decode(const std::string& input, std::string& outdata) { + std::vector in(input.begin(), input.end()); + std::vector out; + decode(in, out); + outdata = std::string(out.begin(), out.end()); +} + +} // namespace + +using namespace lite; + +RedisCache::RedisCache(std::string redis_ip, size_t port, std::string password) + : m_ip(redis_ip), m_port(port), m_password(password) { + m_client.auth(password); + m_client.connect( + m_ip, m_port, + [](const std::string& host, std::size_t port, + cpp_redis::connect_state status) { + if (status == cpp_redis::connect_state::dropped) { + LITE_LOG("client disconnected from %s.", host.c_str()); + LITE_LOG("Redis server connect to %s :%zu failed.", + host.c_str(), port); + } + }, + std::uint32_t(200)); +} + +mgb::Maybe RedisCache::get( + const std::string& category, const mgb::PersistentCache::Blob& key) { + LITE_LOCK_GUARD(m_mtx); + if (m_old == nullptr) { + return mgb::None; + } + auto mem_result = m_old->get(category, key); + if (mem_result.valid()) + return mem_result; + + std::string key_str(static_cast(key.ptr), key.size); + std::string redis_key_str; + encode(category + '@' + key_str, redis_key_str, 24); + auto result = m_client.get(redis_key_str); + m_client.sync_commit(std::chrono::milliseconds(100)); + LITE_ASSERT(is_valid()); + auto content = result.get(); + if (content.is_null()) + return mgb::None; + std::string decode_content; + decode(content.as_string(), decode_content); + m_old->put(category, key, {decode_content.data(), decode_content.length()}); + + return m_old->get(category, key); +} + +void RedisCache::put(const std::string& category, const Blob& key, + const mgb::PersistentCache::Blob& value) { + // ScopedTimer t1(std::string("put") + category); + LITE_LOCK_GUARD(m_mtx); + std::string key_str(static_cast(key.ptr), key.size); + std::string redis_key_str; + encode(category + '@' + key_str, redis_key_str); + std::string value_str(static_cast(value.ptr), value.size); + std::string redis_value_str; + encode(value_str, redis_value_str); + + auto result = m_client.set(redis_key_str, redis_value_str); + if (m_old == nullptr) { + return; + } + m_old->put(category, key, value); + m_client.sync_commit(std::chrono::milliseconds(100)); + LITE_ASSERT(is_valid()); +} +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/mge/algo_cache/redis_cache.h b/lite/src/mge/algo_cache/redis_cache.h new file mode 100644 index 0000000000000000000000000000000000000000..d0bd0032f629c8ad4872dd3fec71b5f61124391b --- /dev/null +++ b/lite/src/mge/algo_cache/redis_cache.h @@ -0,0 +1,47 @@ +/** + * \file lite/src/mge/algo_cache/redis_cache.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. 
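`RedisCache` treats the cache handed to `init()` as both a read-through fallback and a local mirror for entries fetched from Redis (the class declaration follows in `redis_cache.h` just below). A sketch of installing it process-wide with a file cache as the local tier; the endpoint and password are placeholders, and the guard mirrors the one used by `redis_cache.cpp`:

```cpp
#include <memory>

#include "lite_build_config.h"

#if !defined(WIN32) && LITE_BUILD_WITH_MGE && LITE_WITH_CUDA
#include "file_cache.h"
#include "redis_cache.h"

// Sketch: Redis-backed PersistentCache with an in-memory local tier.
void install_redis_algo_cache() {
    auto redis = std::make_shared<lite::RedisCache>(
            "127.0.0.1", 6379, "password");  // placeholder endpoint
    if (!redis->is_valid())
        return;  // connection dropped, keep the default cache

    // Entries fetched from Redis are mirrored into this local cache by get().
    redis->init(std::make_shared<lite::InFilePersistentCache>());
    mgb::PersistentCache::set_impl(redis);
}
#endif
```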
+ * + * \copyright Copyright (c) 2020-2020 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "lite_build_config.h" + +#if !defined(WIN32) && LITE_BUILD_WITH_MGE && LITE_WITH_CUDA +#include +#include +#include +#include "megbrain/utils/persistent_cache.h" + +namespace lite { + +//! TODO: fix one thread set cache when other threads is using old cache +class RedisCache final : public mgb::PersistentCache { +public: + RedisCache(std::string redis_ip, size_t port, std::string password); + + bool is_valid() { return m_client.is_connected(); } + ~RedisCache() {} + void init(std::shared_ptr old) { m_old = old; } + + mgb::Maybe get(const std::string& category, const Blob& key) override; + + void put(const std::string& category, const Blob& key, + const Blob& value) override; + +private: + std::shared_ptr m_old; + LITE_MUTEX m_mtx; + cpp_redis::client m_client; + const std::string m_ip; + const size_t m_port; + const std::string m_password; +}; + +} // namespace lite +#endif +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/mge/common.cpp b/lite/src/mge/common.cpp new file mode 100644 index 0000000000000000000000000000000000000000..08fdcaa756be423811ef77237c19d6c78517b980 --- /dev/null +++ b/lite/src/mge/common.cpp @@ -0,0 +1,191 @@ +/** + * \file src/mge/common.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "common.h" +#include "megdnn/dtype.h" + +using namespace lite; +using namespace mgb; + +enum class CompressionMethod { + NO_COMPRESSION = 0, + FLOAT32_STRIDE_FLOAT32_BASE_UINT8_WEIGHTS = 1, + FLOAT32_STRIDE_FLOAT32_BASE_UINT16_WEIGHTS = 2, +}; + +void lite::decompressed_tensor_value_loader( + void* ptr_, const mgb::TensorLayout& layout, + mgb::serialization::InputFile& fin) { + uint8_t compress_flag; + fin.read(&compress_flag, sizeof(compress_flag)); + size_t num_weights = layout.total_nr_elems(); + switch (CompressionMethod(compress_flag)) { + case CompressionMethod::NO_COMPRESSION: { + mgb::serialization::GraphLoadConfig::default_tensor_value_loader( + ptr_, layout, fin); + break; + } + case CompressionMethod::FLOAT32_STRIDE_FLOAT32_BASE_UINT8_WEIGHTS: { + if (ptr_) { + float stride, base; + std::vector weights(num_weights); + fin.read(&stride, sizeof(stride)); + fin.read(&base, sizeof(base)); + fin.read(weights.data(), num_weights * sizeof(uint8_t)); + auto* ptr = static_cast(ptr_); + for (size_t i = 0; i < num_weights; ++i) + ptr[i] = stride * weights[i] + base; + } else { + fin.skip(sizeof(float) * 2 + num_weights * sizeof(uint8_t)); + } + break; + } + case CompressionMethod::FLOAT32_STRIDE_FLOAT32_BASE_UINT16_WEIGHTS: { + if (ptr_) { + float stride, base; + std::vector weights(num_weights); + fin.read(&stride, sizeof(stride)); + fin.read(&base, sizeof(base)); + fin.read(weights.data(), num_weights * sizeof(uint16_t)); + auto* ptr = static_cast(ptr_); + for (size_t i = 0; i < num_weights; ++i) + ptr[i] = stride * weights[i] + base; + } else { + fin.skip(sizeof(float) * 2 + num_weights * sizeof(uint16_t)); + } + break; + } + default: + LITE_THROW("Unexpected compression method"); + } +} + +LTensorLayout lite::to_impl_layout(const Layout& layout) { + mgb::TensorLayout mge_layout; + mge_layout.ndim = layout.ndim; + LITE_ASSERT(layout.ndim < TensorShape::MAX_NDIM, + "lite layout ndim is to large"); + for (size_t i = 0; i < layout.ndim; i++) { + 
mge_layout.shape[i] = layout.shapes[i]; + } + mge_layout.init_contiguous_stride(); + switch (layout.data_type) { + case LiteDataType::LITE_FLOAT: + mge_layout.dtype = mgb::dtype::Float32(); + break; + case LiteDataType::LITE_HALF: + mge_layout.dtype = mgb::dtype::Float16(); + break; + case LiteDataType::LITE_INT: + mge_layout.dtype = mgb::dtype::Int32(); + break; + case LiteDataType::LITE_INT8: + mge_layout.dtype = mgb::dtype::Int8(); + break; + case LiteDataType::LITE_UINT8: + mge_layout.dtype = mgb::dtype::Uint8(); + break; + case LiteDataType::LITE_INT16: + mge_layout.dtype = mgb::dtype::Int16(); + break; + default: + LITE_THROW(mgb::ssprintf("unsupport dtype in lite enum id is %d.", + static_cast(layout.data_type))); + } + return mge_layout; +} + +Layout lite::to_lite_layout(const LTensorLayout& mge_layout) { + Layout layout; + if (!mge_layout.dtype.valid()) { + return layout; + } + layout.ndim = mge_layout.ndim; + LITE_ASSERT(layout.ndim < layout.MAXDIM, "tensor layout ndim is to large"); + for (size_t i = 0; i < layout.ndim; i++) { + layout.shapes[i] = mge_layout.shape[i]; + } + switch (mge_layout.dtype.enumv()) { + case mgb::DTypeEnum::Float32: + layout.data_type = LiteDataType::LITE_FLOAT; + break; + case mgb::DTypeEnum::Float16: + layout.data_type = LiteDataType::LITE_HALF; + break; + case mgb::DTypeEnum::Int32: + layout.data_type = LiteDataType::LITE_INT; + break; + case mgb::DTypeEnum::Int16: + layout.data_type = LiteDataType::LITE_INT16; + break; + case mgb::DTypeEnum::Int8: + layout.data_type = LiteDataType::LITE_INT8; + break; + case mgb::DTypeEnum::Uint8: + layout.data_type = LiteDataType::LITE_UINT8; + break; + default: + LITE_THROW(mgb::ssprintf("unsupport dtype in lite : %s.", + mge_layout.to_string().c_str())); + } + return layout; +} + +mgb::CompNode::Locator lite::to_compnode_locator(const LiteDeviceType& device) { + mgb::CompNode::Locator loc; + switch (device) { + case LiteDeviceType::LITE_CPU: + loc.type = mgb::CompNode::DeviceType::CPU; + break; + case LiteDeviceType::LITE_CUDA: + loc.type = mgb::CompNode::DeviceType::CUDA; + break; + case LiteDeviceType::LITE_ATLAS: + loc.type = mgb::CompNode::DeviceType::ATLAS; + break; + case LiteDeviceType::LITE_OPENCL: + loc.type = mgb::CompNode::DeviceType::OPENCL; + break; + case LiteDeviceType::LITE_DEVICE_DEFAULT: + loc.type = mgb::CompNode::DeviceType::UNSPEC; + break; + default: + LITE_THROW( + ssprintf("lite unsupported compnode type: enum value: %d.", + (int)(device))); + } + return loc; +} + +LiteDeviceType lite::get_device_from_locator( + const mgb::CompNode::Locator& locator) { + switch (locator.type) { + case mgb::CompNode::DeviceType::CPU: + case mgb::CompNode::DeviceType::MULTITHREAD: + return LiteDeviceType::LITE_CPU; + case mgb::CompNode::DeviceType::CUDA: + return LiteDeviceType::LITE_CUDA; + case mgb::CompNode::DeviceType::ATLAS: + return LiteDeviceType::LITE_ATLAS; + case mgb::CompNode::DeviceType::OPENCL: + return LiteDeviceType::LITE_OPENCL; + case mgb::CompNode::DeviceType::UNSPEC: + return LiteDeviceType::LITE_DEVICE_DEFAULT; + default: + LITE_THROW( + ssprintf("lite unsupported compnode type: enum value: %d.", + (int)(locator.type))); + } +} +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/mge/common.h b/lite/src/mge/common.h new file mode 100644 index 0000000000000000000000000000000000000000..4d4066d973908093c0acfbe1d7f1a73749c59406 --- /dev/null +++ b/lite/src/mge/common.h @@ -0,0 +1,66 @@ +/** + * \file src/mge/common.h + * + * This file is part of 
MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "../misc.h" +#include "lite/network.h" +#include "lite/tensor.h" +#include "megbrain/comp_node.h" +#include "megbrain/serialization/serializer.h" +#include "megbrain/tensor.h" + +//! rename mge name L* +namespace lite { +using LTensorLayout = mgb::TensorLayout; +using LComputingGraph = mgb::ComputingGraph; +using LDeviceTensorStorage = mgb::DeviceTensorStorage; +} // namespace lite + +namespace lite { +/*! + * \brief transform mgelite Layout to mgb TensorLayout + */ +LTensorLayout to_impl_layout(const Layout& layout); + +/*! + * \brief transform mgb TensorLayout to mgelite Layout + */ +Layout to_lite_layout(const mgb::TensorLayout& mge_layout); + +/*! + * \brief transform mgelite device to mgb CompNode Locator + */ +mgb::CompNode::Locator to_compnode_locator(const LiteDeviceType& device); + +/*! + * \brief transform mgb CompNode Locator to lite Device + */ +LiteDeviceType get_device_from_locator(const mgb::CompNode::Locator& locator); + +/*! \brief A megbrain tensor loader with weight decompression. + * + * The weight to be compressed must start with a byte of compression flag (CF). + * + * 1. CF = 0: no compression. + * 2. CF = 1: float32 stride + float32 base + uint8 weight (return s*w+b) + * 3. CF = 2: float32 stride + float32 base + uint16 weight (return s*w+b) + * + */ +void decompressed_tensor_value_loader(void* ptr_, + const mgb::TensorLayout& layout, + mgb::serialization::InputFile& fin); + +} // namespace lite +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/mge/function_dft.h b/lite/src/mge/function_dft.h new file mode 100644 index 0000000000000000000000000000000000000000..a997a3f357e6a330cb2be7b584cd651811ef925f --- /dev/null +++ b/lite/src/mge/function_dft.h @@ -0,0 +1,212 @@ + +/** + * \file src/mge/function_dft.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
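As the comment above documents, a compressed weight blob starts with a one-byte compression flag, followed for CF = 1 or CF = 2 by a `float32` stride, a `float32` base, and the quantized weights, which `decompressed_tensor_value_loader` expands as `stride * w + base`. A sketch of the matching CF = 1 packer for preparing such a blob offline; this helper is hypothetical and not part of lite:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical packer for CF = 1 (float32 stride + float32 base + uint8
// weights), laid out exactly as decompressed_tensor_value_loader() reads it.
std::vector<uint8_t> pack_weights_cf1(const std::vector<float>& w) {
    assert(!w.empty());
    auto mm = std::minmax_element(w.begin(), w.end());
    float base = *mm.first;
    float stride =
            (*mm.second > *mm.first) ? (*mm.second - *mm.first) / 255.f : 1.f;

    std::vector<uint8_t> out;
    out.push_back(1);  // compression flag: CF = 1
    auto append = [&](const void* p, size_t n) {
        auto* b = static_cast<const uint8_t*>(p);
        out.insert(out.end(), b, b + n);
    };
    append(&stride, sizeof(stride));
    append(&base, sizeof(base));
    for (float v : w)  // quantize so that v ~ stride * q + base
        out.push_back(static_cast<uint8_t>((v - base) / stride + 0.5f));
    return out;
}
```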
+ */ + +#pragma once +#if LITE_BUILD_WITH_MGE +#include "function_base.h" +#include "network_impl.h" +#include "network_impl_base.h" +#include "tensor_impl.h" +namespace lite { + +#define THROW_FUNC_ERROR(func_name) \ + auto msg_info = func_name + " is not aviliable in Dft backend."; \ + LITE_THROW(msg_info.c_str()) + +// the functions used for dft's tensor.cpp are as followed: + +template <> +inline std::shared_ptr +call_func>( + std::string func_name) { + if (func_name == "create_tensor") { + return std::make_shared(); + } + THROW_FUNC_ERROR(func_name); +} + +template <> +inline std::shared_ptr +call_func>( + std::string func_name, LiteDeviceType device_type, + bool is_pinned_host) { + if (func_name == "create_tensor") { + return std::make_shared(device_type, is_pinned_host); + } + THROW_FUNC_ERROR(func_name); +} + +template <> +inline std::shared_ptr +call_func>( + std::string func_name, int device_id, LiteDeviceType device_type, + const Layout layout, bool is_pinned_host) { + if (func_name == "create_tensor") { + return std::make_shared(device_id, device_type, layout, + is_pinned_host); + } + THROW_FUNC_ERROR(func_name); +} + +template <> +inline std::shared_ptr +call_func>( + std::string func_name, LiteDeviceType device_type, const Layout layout, + bool is_pinned_host) { + if (func_name == "create_tensor") { + return std::make_shared(device_type, layout, + is_pinned_host); + } + THROW_FUNC_ERROR(func_name); +} + +template <> +inline std::shared_ptr +call_func>( + std::string func_name, int device_id, int stream_id, + LiteDeviceType device_type, bool is_pinned_host) { + if (func_name == "create_tensor") { + return std::make_shared(device_id, stream_id, + device_type, is_pinned_host); + } + THROW_FUNC_ERROR(func_name); +} + +// the functions used for dft's network.cpp are as followed: + +template <> +inline std::unique_ptr +call_func>( + std::string func_name) { + if (func_name == "create_network") { + return std::make_unique(); + } + THROW_FUNC_ERROR(func_name); +} + +template <> +inline Network::NetworkImplBase* +try_call_func( + std::string func_name) { + if (func_name == "parse_model") { + return new NetworkImplDft(); + } + THROW_FUNC_ERROR(func_name); +} + +#define CALL_FUNC(func_name, ...) 
\ + network_impl->cast_final_safe().func_name(__VA_ARGS__) + +template <> +inline void call_func( + std::string func_name, Network::NetworkImplBase* network_impl, + size_t num) { + if (func_name == "set_cpu_threads_number") { + CALL_FUNC(set_cpu_threads_number, num); + } else if (func_name == "set_network_algo_workspace_limit") { + CALL_FUNC(set_network_algo_workspace_limit, num); + } else { + THROW_FUNC_ERROR(func_name); + } +} + +template <> +inline void call_func( + std::string func_name, Network::NetworkImplBase* network_impl) { + if (func_name == "use_tensorrt") { + CALL_FUNC(use_tensorrt); + } else if (func_name == "set_cpu_inplace_mode") { + CALL_FUNC(set_cpu_inplace_mode); + } else { + THROW_FUNC_ERROR(func_name); + } +} + +template <> +inline size_t call_func( + std::string func_name, Network::NetworkImplBase* network_impl) { + if (func_name == "get_cpu_threads_number") { + return CALL_FUNC(get_cpu_threads_number); + } + THROW_FUNC_ERROR(func_name); +} + +template <> +inline bool call_func( + std::string func_name, Network::NetworkImplBase* network_impl) { + if (func_name == "is_cpu_inplace_mode") { + return CALL_FUNC(is_cpu_inplace_mode); + } + THROW_FUNC_ERROR(func_name); +} + +template <> +inline void call_func( + std::string func_name, Network::NetworkImplBase* network_impl, + ThreadAffinityCallback thread_affinity_callback) { + if (func_name == "set_runtime_thread_affinity") { + return CALL_FUNC(set_runtime_thread_affinity, + std::move(thread_affinity_callback)); + } + THROW_FUNC_ERROR(func_name); +} + +template <> +inline void call_func( + std::string func_name, Network::NetworkImplBase* network_impl, + LiteAlgoSelectStrategy strategy, uint32_t shared_batch_size, + bool binary_equal_between_batch) { + if (func_name == "set_network_algo_policy") { + return CALL_FUNC(set_network_algo_policy, strategy, shared_batch_size, + binary_equal_between_batch); + } + THROW_FUNC_ERROR(func_name); +} + +template <> +inline void call_func( + std::string func_name, Network::NetworkImplBase* network_impl, + std::shared_ptr user_allocator) { + if (func_name == "set_memory_allocator") { + return CALL_FUNC(set_memory_allocator, user_allocator); + } + THROW_FUNC_ERROR(func_name); +} + +template <> +inline void call_func( + std::string func_name, Network::NetworkImplBase* network_impl, + std::string file_name) { + if (func_name == "enable_io_txt_dump") { + return CALL_FUNC(enable_io_txt_dump, file_name); + } else if (func_name == "enable_io_bin_dump") { + return CALL_FUNC(enable_io_bin_dump, file_name); + } + THROW_FUNC_ERROR(func_name); +} + +template <> +inline void call_func( + std::string func_name, Network::NetworkImplBase* network_impl, + Network::NetworkImplBase* src_network_impl) { + if (func_name == "share_runtime_memory_with") { + CALL_FUNC(share_runtime_memory_with, src_network_impl); + } else if (func_name == "shared_weight_with") { + CALL_FUNC(shared_weight_with, src_network_impl); + } else { + THROW_FUNC_ERROR(func_name); + } +} +#undef THROW_FUNC_ERROR + +} // namespace lite +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/mge/memory_allocator.h b/lite/src/mge/memory_allocator.h new file mode 100644 index 0000000000000000000000000000000000000000..d321fd9fbeeab482403a40c04b5f8e2101159232 --- /dev/null +++ b/lite/src/mge/memory_allocator.h @@ -0,0 +1,69 @@ +/** + * \file src/mge/memory_alloctor.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. 
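Each specialization above routes a setting by name to `NetworkImplDft` through `CALL_FUNC` and throws for anything the backend does not implement, so the device-agnostic frontend never has to include backend headers. A usage sketch, assuming `impl` points at a `NetworkImplDft` and that this header is compiled with `LITE_BUILD_WITH_MGE`:

```cpp
#include "function_dft.h"

// Sketch: frontend-side calls that land in NetworkImplDft via the
// specializations above.
void configure_dft_backend(lite::Network::NetworkImplBase* impl) {
    using namespace lite;
    // Routed to NetworkImplDft::set_cpu_threads_number(4).
    call_func<NetworkImplDft, void>("set_cpu_threads_number", impl, size_t(4));
    // Routed to NetworkImplDft::set_cpu_inplace_mode().
    call_func<NetworkImplDft, void>("set_cpu_inplace_mode", impl);
    // Any other name throws a "not available in Dft backend" error.
}
```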
+ * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "common.h" +#include "megbrain/dtype.h" +#include "network_impl.h" + +#include "megbrain/graph/cg.h" + +namespace lite { + +class UserStaticMemAlloc final : public mgb::cg::DeviceMemoryAllocator { + std::shared_ptr m_allocator = nullptr; + +public: + UserStaticMemAlloc(std::shared_ptr allocator) + : m_allocator(allocator) {} + + void alloc_static(LComputingGraph*, LDeviceTensorStorage& dest, + size_t size) override { + if (size < dest.size()) { + return; + } + auto cn = dest.comp_node_allow_invalid(); + LITE_ASSERT(cn.valid(), "The compnode is invalid when alloc memory."); + LiteDeviceType device_type = + get_device_from_locator(cn.locator_logical()); + int device_id = cn.locator_logical().device; + auto ptr_alloc = static_cast(m_allocator->allocate( + device_type, device_id, size, cn.get_mem_addr_alignment())); + auto storage = std::shared_ptr( + ptr_alloc, + [allocator = m_allocator, device_type, device_id](void* ptr) { + allocator->free(device_type, device_id, ptr); + }); + dest.reset(cn, size, storage); + } + void alloc_dynamic(mgb::VarNode*, mgb::DeviceTensorStorage& dest, + size_t size) override { + alloc_static(nullptr, dest, size); + } + + void defrag_prealloc_contig(mgb::ComputingGraph*, mgb::CompNode comp_node, + size_t size) override { + LiteDeviceType device_type = + get_device_from_locator(comp_node.locator_logical()); + int device_id = comp_node.locator_logical().device; + auto ptr_tmp = + m_allocator->allocate(device_type, device_id, size, + comp_node.get_mem_addr_alignment()); + m_allocator->free(device_type, device_id, ptr_tmp); + } +}; + +} // namespace lite +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/mge/network_impl.cpp b/lite/src/mge/network_impl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ae176632edcccb48e566f219d62efcb165703e05 --- /dev/null +++ b/lite/src/mge/network_impl.cpp @@ -0,0 +1,781 @@ +/** + * \file src/mge/network_impl.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
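`UserStaticMemAlloc` adapts a user-supplied `lite::Allocator` to MegBrain's `DeviceMemoryAllocator`, so static graph memory can be served by the caller. A sketch of a CPU-only allocator that satisfies the two calls made above, `allocate(device_type, device_id, size, align)` and `free(device_type, device_id, ptr)`; the exact virtual signatures and the header that declares `lite::Allocator` are assumptions inferred from those call sites:

```cpp
#include <stdlib.h>  // posix_memalign (POSIX)

#include "lite/tensor.h"  // assumed home of the public lite::Allocator interface

// Sketch: aligned malloc/free allocator that could be passed to
// set_memory_allocator() and reach UserStaticMemAlloc above.
class AlignedCpuAllocator : public lite::Allocator {
public:
    void* allocate(LiteDeviceType device_type, int /*device_id*/, size_t size,
                   size_t align) override {
        if (device_type != LiteDeviceType::LITE_CPU)
            return nullptr;  // only the CPU path is handled in this sketch
        void* ptr = nullptr;
        // align comes from CompNode::get_mem_addr_alignment() and is assumed
        // to be a power of two.
        return posix_memalign(&ptr, align, size) == 0 ? ptr : nullptr;
    }
    void free(LiteDeviceType /*device_type*/, int /*device_id*/,
              void* ptr) override {
        ::free(ptr);
    }
};
```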
+ */ + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "network_impl.h" +#include "common.h" +#include "lite/network.h" +#include "memory_allocator.h" +#include "parse_model/model_parser.h" +#include "parse_info/parse_info_base.h" + +#include "megbrain/common.h" +#include "megbrain/comp_node.h" +#include "megbrain/comp_node_env.h" +#include "megbrain/gopt/inference.h" +#include "megbrain/graph.h" +#include "megbrain/graph/cg.h" +#include "megbrain/opr/io.h" +#include "megbrain/tensor.h" + +#if MGB_OPENCL +#include "megcore_opencl.h" +#endif + +#include +#include +#include + +using namespace lite; +using namespace mgb; + +LITE_DYN_TYPE_OBJ_FINAL_IMPL(NetworkImplDft); + +void NetworkImplDft::set_config(const Config& config) { + m_user_config = std::make_unique(); + *m_user_config = config; + m_load_config.comp_graph = mgb::ComputingGraph::make(); + m_compnode_locator = to_compnode_locator(m_user_config->device_type); + m_compnode_locator.device = config.device_id; +} + +void NetworkImplDft::shared_weight_with(const NetworkImplBase* src_network) { + application_config(); + const auto& src_impl = src_network->cast_final_safe(); + LITE_ASSERT(src_impl.m_loader, + "Clone network must after the network is loaded."); + m_load_result = src_impl.m_loader->load(m_load_config, true); + + //! flag weather the mode is cross compnode model + cross_compnode_model_detect(); + + //! update the IO of the network + update_io(); + + //! replace the IO when there is device input or output + compile_graph(); +} + +void NetworkImplDft::application_config() { + auto device_type = m_user_config->device_type; + m_compnode_locator.type = to_compnode_locator(device_type).type; + m_compnode_locator.device = m_user_config->device_id; + if (m_nr_threads > 1 && device_type == LiteDeviceType::LITE_CPU) { + m_compnode_locator.type = mgb::CompNode::DeviceType::MULTITHREAD; + m_compnode_locator.device = m_user_config->device_id; + } + //! 
model options +#define ConfigOption(mge_name, lite_name) \ + options.mge_name = m_user_config->options.lite_name; + + auto&& options = m_load_config.comp_graph->options(); + ConfigOption(graph_opt.weight_preprocess, weight_preprocess); + ConfigOption(graph_opt.fuse_preprocess, fuse_preprocess); + ConfigOption(fake_next_exec, fake_next_exec); + ConfigOption(var_sanity_check_first_run, var_sanity_check_first_run); + m_load_config.const_var_shape = m_user_config->options.const_shape; + ConfigOption(force_dynamic_alloc, force_dynamic_alloc); + ConfigOption(force_output_dynamic_alloc, force_output_dynamic_alloc); + ConfigOption(no_profiling_on_shape_change, no_profiling_on_shape_change); + LITE_ASSERT(m_user_config->options.jit_level == 0 || + (m_user_config->options.jit_level > 0 && + device_type == LiteDeviceType::LITE_CUDA), + "jit only support in cuda device."); + ConfigOption(graph_opt.jit, jit_level); + ConfigOption(comp_node_seq_record_level, comp_node_seq_record_level); + ConfigOption(graph_opt_level, graph_opt_level); + ConfigOption(async_exec_level, async_exec_level); + +#undef ConfigOption +#define ConfigOptionLayoutTransform(name) \ + if (m_user_config->options.name) { \ + options.graph_opt.name(); \ + } + ConfigOptionLayoutTransform(enable_nchw44); + ConfigOptionLayoutTransform(enable_nchw44_dot); + ConfigOptionLayoutTransform(enable_nchw88); + ConfigOptionLayoutTransform(enable_nhwcd4); + ConfigOptionLayoutTransform(enable_nchw4); + ConfigOptionLayoutTransform(enable_nchw32); + ConfigOptionLayoutTransform(enable_nchw64); +#undef ConfigOptionLayoutTransform + if (m_user_config->has_compression) { + m_load_config.tensor_value_loader = decompressed_tensor_value_loader; + } + + //! if device is LITE_NONE, the compnode information is stored in model + if (device_type != LiteDeviceType::LITE_DEVICE_DEFAULT) { + //! currently not set Locator type because an atlas mgb model is a + //! cross-compnode graph + if (device_type == LiteDeviceType::LITE_ATLAS) { + m_load_config.comp_node_mapper = + [this](mgb::CompNode::Locator& loc) { + if (loc.type == mgb::CompNode::DeviceType::ATLAS) { + loc.device = m_compnode_locator.device; + loc.stream = m_compnode_locator.stream; + } else if (loc.type == + mgb::CompNode::DeviceType::MULTITHREAD) { + loc.stream = m_nr_threads; + } + }; + } else { + m_load_config.comp_node_mapper = + [this](mgb::CompNode::Locator& loc) { + loc = m_compnode_locator; + }; + } + } +} + +void NetworkImplDft::set_memory_allocator( + std::shared_ptr user_allocator) { + auto allocator = std::make_shared(user_allocator); + LITE_ASSERT(m_load_config.comp_graph); + m_load_config.comp_graph->set_device_memory_allocator(allocator); +} + +//! 
share the runtime memory with other network, the weights is not shared +void NetworkImplDft::share_runtime_memory_with( + Network::NetworkImplBase* network_impl) { + LITE_ASSERT(network_impl); + LITE_ASSERT(m_load_config.comp_graph); + m_load_config.comp_graph->share_device_memory_with( + *(network_impl->cast_final_safe() + .m_load_config.comp_graph)); +} + +void NetworkImplDft::set_cpu_inplace_mode() { + LITE_ASSERT(m_user_config->device_type == LiteDeviceType::LITE_CPU, + "cpu inplace mode is only avaliable in CPU."); + m_is_cpu_inplace_mode = true; + if (m_compnode_locator.type == mgb::CompNode::DeviceType::CPU) { + m_compnode_locator.device = mgb::CompNode::Locator::DEVICE_CPU_DEFAULT; + } else { + LITE_ASSERT( + m_compnode_locator.type == CompNode::DeviceType::MULTITHREAD, + "cpu inplace mode is only avaliable in CPU."); + m_compnode_locator.device = + mgb::CompNode::Locator::DEVICE_MULTITHREAD_DEFAULT; + } +} + +void NetworkImplDft::set_cpu_threads_number(size_t nr_threads) { + LITE_ASSERT(m_user_config->device_type == LiteDeviceType::LITE_CPU, + "multi threads mode is only avaliable in CPU."); + if (nr_threads > 1) { + m_nr_threads = nr_threads; + m_compnode_locator.type = mgb::CompNode::DeviceType::MULTITHREAD; + m_compnode_locator.nr_threads = nr_threads; + } +} + +void NetworkImplDft::set_runtime_thread_affinity( + const ThreadAffinityCallback& thread_affinity_callback) { + LITE_ASSERT(m_user_config->device_type == LiteDeviceType::LITE_CPU, + "multi threads mode is only avaliable in CPU."); + mgb::CompNode::Locator loc; + m_load_config.comp_node_mapper(loc); + auto cn = mgb::CompNode::load(loc); + if (m_nr_threads > 1) { + mgb::CompNodeEnv::from_comp_node(cn).cpu_env().set_affinity( + thread_affinity_callback); + } else { + mgb::CompNodeEnv::from_comp_node(cn).cpu_env().dispatch( + [thread_affinity_callback](void) { + thread_affinity_callback(0); + }); + } +} + +void NetworkImplDft::set_device_id(int device_id) { + m_compnode_locator.device = device_id; + m_user_config->device_id = device_id; +} + +void NetworkImplDft::set_stream_id(int stream_id) { + m_compnode_locator.stream = stream_id; +} + +void NetworkImplDft::use_tensorrt() { + auto&& options = m_load_config.comp_graph->options(); + options.graph_opt.tensorrt = true; +} + +//! 
set the callback in async model +void NetworkImplDft::set_async_callback(const AsyncCallback& callback) { + LITE_ASSERT(!m_is_cpu_inplace_mode, + "cpu inplace mode not support async mode"); + LITE_ASSERT(m_user_config->device_type == LiteDeviceType::LITE_CPU || + m_user_config->device_type == LiteDeviceType::LITE_CUDA, + "Now only cpu and cuda>10.0 support async mode"); + m_async = true; + m_async_callback = std::move(callback); +} + +void NetworkImplDft::make_output_spec() { + m_output_spec.clear(); + for (auto&& out : m_network_io->outputs) { + if (m_load_result.output_var_map.count(out.name)) { + auto&& load_out = m_load_result.output_var_map[out.name]; + auto cb = [&out, this](const mgb::DeviceTensorND& dv) mutable { + mgb::CompNode comp_node = dv.comp_node(); + if (out.io_type == LiteIOType::LITE_IO_SHAPE) { + auto mgb_layout = dv.layout(); + out.lite_tensor->set_layout(to_lite_layout(mgb_layout)); + } else { + TensorHelper::implement(out.lite_tensor) + ->cast_final_safe() + .copy_from_mge_tensor(dv); + out.lite_tensor->update_from_implement(); + } + if (m_async) { + out.have_sync = true; + bool need_exec_cb = true; + for (auto&& j : m_network_io->outputs) { + if (!j.have_sync) { + need_exec_cb = false; + } + } + if (need_exec_cb) { + for (auto&& j : m_network_io->outputs) { + j.have_sync = false; + } + comp_node.add_callback([this]() { finish(); }); + } + } + }; + m_output_spec.emplace_back(load_out, std::move(cb)); + } else { + LITE_THROW(ssprintf("no output named : %s in the mode", + out.name.c_str())); + } + } +} + +void NetworkImplDft::replace_dev_input_pass() { + mgb::CompNode::Locator locator; + m_load_config.comp_node_mapper(locator); + //! CPU is not need use device input + if (locator.type == mgb::CompNode::DeviceType::CPU) { + return; + } + //! repalce the H2D with VolatileSharedDeviceTensor, and keep the dev tensor + //! in m_network_io.input, user can directly change the dev tensor + //! storage through m_network_io.input.lite_tensor->reset() befor forward + using DeviceTensorMap = + std::unordered_map>; + DeviceTensorMap name2dev_tensor; + + mgb::ThinHashMap host_val2var; + + //! 
construct host_val2var that maps from host tensor to corresponding var + auto on_opr = [&](mgb::cg::OperatorNodeBase* opr) { + if (opr->same_type()) { + mgb::HostTensorND* tensor = + opr->cast_final() + .host_data() + .get(); + host_val2var[tensor] = opr->output(0); + } + }; + mgb::cg::DepOprIter dep_iter{on_opr}; + for (auto i : m_load_result.output_var_list) { + dep_iter.add(i.node()->owner_opr()); + } + + mgb::ThinHashMap inp_var_map, out_var_map; + + mgb::SmallVector to_clear; + for (auto&& config_in : m_network_io->inputs) { + if (!config_in.is_host) { + auto host_val = m_load_result.tensor_map[config_in.name]; + auto dev_val = TensorHelper::implement(config_in.lite_tensor) + ->cast_final_safe() + .m_dev_tensor; + auto dev_var = mgb::opr::VolatileSharedDeviceTensor::make( + *m_load_result.graph, dev_val, {config_in.name}); + inp_var_map[host_val2var.at(host_val.get())] = dev_var; + name2dev_tensor[config_in.name] = dev_val; + } + } + auto new_ovar = + mgb::cg::replace_vars(m_load_result.output_var_list, inp_var_map); + for (size_t i = 0; i < new_ovar.size(); ++i) { + out_var_map[m_load_result.output_var_list[i]] = new_ovar[i]; + } + for (auto&& i : m_load_result.output_var_map) { + i.second = out_var_map.at(i.second); + } + for (auto&& i : m_load_result.output_var_map_id) { + i.second = out_var_map.at(i.second); + } + for (size_t i = 0; i < m_load_result.output_var_list.size(); i++) { + new_ovar[i].rename(m_load_result.output_var_list[i].node()->name()); + } + m_load_result.output_var_list = std::move(new_ovar); +} + +void NetworkImplDft::cross_compnode_model_detect() { + mgb::ThinHashSet nr_used_device_type; + auto on_opr = [&](mgb::cg::OperatorNodeBase* opr) { + for (auto j : opr->output()) { + if (j->comp_node() != mgb::CompNode::default_cpu()) { + nr_used_device_type.insert( + get_device_from_locator(j->comp_node().locator())); + } + } + }; + mgb::cg::DepOprIter dep_iter{on_opr}; + for (auto i : m_load_result.output_var_list) { + dep_iter.add(i.node()->owner_opr()); + } + m_nr_device_type = nr_used_device_type.size(); +} + +void NetworkImplDft::load_model( + std::shared_ptr model_mem, size_t size, + std::unordered_map separate_config_map) { + if (!m_loader) { + m_input_file = mgb::serialization::InputFile::make_mem_proxy( + model_mem, size, false); + auto format = + mgb::serialization::GraphLoader::identify_graph_dump_format( + *m_input_file); + if (!format.valid()) { + LITE_THROW("invalid model format"); + } + m_loader = mgb::serialization::GraphLoader::make( + std::move(m_input_file), format.val()); + } + + + //! applay the user configration to mge model + application_config(); + + //! config some flag get from json config file + if (separate_config_map.find("device_id") != separate_config_map.end()) { + set_device_id(separate_config_map["device_id"].unsafe_cast()); + } + if (separate_config_map.find("number_threads") != + separate_config_map.end() && + separate_config_map["number_threads"].unsafe_cast() > 1) { + set_cpu_threads_number( + separate_config_map["number_threads"].unsafe_cast()); + } + if (separate_config_map.find("enable_inplace_model") != + separate_config_map.end() && + separate_config_map["enable_inplace_model"].unsafe_cast()) { + set_cpu_inplace_mode(); + } + if (separate_config_map.find("use_tensorrt") != separate_config_map.end() && + separate_config_map["use_tensorrt"].unsafe_cast()) { + use_tensorrt(); + } + + m_load_result = m_loader->load(m_load_config, true); + + cross_compnode_model_detect(); + + //! update the IO of the network + update_io(); + + //! 
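`load_model` reads a small set of per-model overrides out of `separate_config_map`, which is typically filled in by the packed-model JSON parser; everything else in the map is ignored at this level. For reference, the recognized keys and the setters they reach:

```cpp
// Keys consumed from separate_config_map in load_model() above:
//
//   "device_id"             -> set_device_id(value)
//   "number_threads"        -> set_cpu_threads_number(value), only when value > 1
//   "enable_inplace_model"  -> set_cpu_inplace_mode(), only when the flag is true
//   "use_tensorrt"          -> use_tensorrt(), only when the flag is true
```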
replace the IO when there is device input or output + compile_graph(); +} + +void NetworkImplDft::compile_graph() { + modify_exection_policy(); + replace_dev_input_pass(); + make_output_spec(); + m_execute_func = m_load_result.graph_compile(m_output_spec); +} + +void NetworkImplDft::start() const { + if (m_start_callback) { + std::unordered_map>> + input_io_map; + for (auto&& io_inner : m_network_io->inputs) { + input_io_map[io_inner.name] = { + IO{io_inner.name, io_inner.is_host, io_inner.io_type, + io_inner.config_layout}, + io_inner.lite_tensor}; + } + m_start_callback(input_io_map); + } +} + +void NetworkImplDft::forward() { + start(); + LITE_ASSERT(m_execute_func, "forward must be called after network loaded."); + m_execute_func->execute(); +} + +void NetworkImplDft::wait() { + if (!m_async) { + m_execute_func->wait(); + } + finish(); +} + +void NetworkImplDft::finish() const { + if (m_async) { + LITE_ASSERT(m_async_callback, + "The callback func must set when async mode."); + m_async_callback(); + } + if (m_finish_callback) { + std::unordered_map>> + output_io_map; + for (auto&& io_inner : m_network_io->outputs) { + output_io_map[io_inner.name] = { + IO{io_inner.name, io_inner.is_host, io_inner.io_type, + io_inner.config_layout}, + io_inner.lite_tensor}; + } + m_finish_callback(output_io_map); + } + output_plugin_result(); +} + +void NetworkImplDft::set_io(const NetworkIO& network_io) { + m_network_io = std::make_unique(); + for (auto&& in : network_io.inputs) { + m_network_io->inputs.emplace_back(in); + } + for (auto&& out : network_io.outputs) { + m_network_io->outputs.emplace_back(out); + } +} + +void NetworkImplDft::update_io() { + update_input(); + update_output(); +} + +void NetworkImplDft::update_input() { + auto device_type = m_user_config->device_type; + auto device_id = m_compnode_locator.device; + auto stream_id = m_compnode_locator.stream; + //! if cpu all input and output are host + if (device_type == LiteDeviceType::LITE_CPU) { + for (auto&& in : m_network_io->inputs) { + in.is_host = true; + } + } + //! if cross compnode model, modify the device input if it is not valid + if (m_nr_device_type > 1) { + for (auto&& in_tensor_iter : m_load_result.tensor_map) { + for (auto&& config_in : m_network_io->inputs) { + //! if tensor is set to device input + if (in_tensor_iter.first == config_in.name && + !config_in.is_host) { + //! if the origin compnode of the tensor is not the device, + //! 
set the input to host + if (get_device_from_locator( + in_tensor_iter.second->comp_node().locator()) == + LiteDeviceType::LITE_CPU) { + config_in.is_host = true; + LITE_WARN( + "The input tensor %s of the cross device model " + "should not from device.", + config_in.name.c_str()); + } + } + } + } + } + for (auto&& in_tensor_iter : m_load_result.tensor_map) { + bool found = false; + for (auto&& config_in : m_network_io->inputs) { + if (in_tensor_iter.first == config_in.name) { + found = true; + if (config_in.is_host) { + config_in.lite_tensor = std::make_shared( + device_id, stream_id, device_type, true); + TensorHelper::implement(config_in.lite_tensor) + ->cast_final_safe() + .m_host_tensor = in_tensor_iter.second; + config_in.lite_tensor->update_from_implement(); + } else { + config_in.lite_tensor = std::make_shared( + device_id, stream_id, device_type); + config_in.lite_tensor->set_layout( + to_lite_layout(in_tensor_iter.second->layout())); + } + if (config_in.config_layout.ndim && + !(config_in.config_layout == + config_in.lite_tensor->get_layout())) { + config_in.lite_tensor->set_layout(config_in.config_layout); + } + } + } + if (!found) { + IOInner io_in; + io_in.name = in_tensor_iter.first; + io_in.lite_tensor = std::make_shared(device_id, stream_id, + device_type, true); + TensorHelper::implement(io_in.lite_tensor) + ->cast_final_safe() + .m_host_tensor = in_tensor_iter.second; + io_in.lite_tensor->update_from_implement(); + m_network_io->inputs.push_back(io_in); + } + } + //! delete the IO that is not the network + for (auto it = m_network_io->inputs.begin(); + it != m_network_io->inputs.end();) { + if (it->lite_tensor == nullptr) { + LITE_LOG("%s is not the network input, ignore it.", + it->name.c_str()); + it = m_network_io->inputs.erase(it); + } else { + it++; + } + } +} + +void NetworkImplDft::update_output() { + auto device_type = m_user_config->device_type; + auto device_id = m_compnode_locator.device; + auto stream_id = m_compnode_locator.stream; + if (device_type == LiteDeviceType::LITE_CPU) { + for (auto&& out : m_network_io->outputs) { + out.is_host = true; + } + } + //! delete the output that is not the network + for (auto out_it = m_network_io->outputs.begin(); + out_it != m_network_io->outputs.end();) { + if (std::find_if(m_load_result.output_var_list.begin(), + m_load_result.output_var_list.end(), + [out_it](const mgb::SymbolVar var) { + return var.node()->name() == out_it->name; + }) == m_load_result.output_var_list.end()) { + LITE_LOG("%s is not the network output, ignore it.", + out_it->name.c_str()); + out_it = m_network_io->outputs.erase(out_it); + } else { + out_it++; + } + } + //! user config the output tensor, so only compute the config output + if (m_compute_configured_output_only) { + LITE_ASSERT(m_network_io->outputs.size() > 0, + "compute configured output only with no configure output."); + for (auto out_it = m_network_io->outputs.begin(); + out_it != m_network_io->outputs.end(); out_it++) { + //! use pinned memory to copy form device + if (out_it->is_host) { + out_it->lite_tensor = std::make_shared( + device_id, stream_id, device_type, true); + } else { + out_it->lite_tensor = std::make_shared( + device_id, stream_id, device_type); + } + } + //! 
user not set, use default output + } else { + for (auto&& out : m_load_result.output_var_list) { + auto it = std::find_if(m_network_io->outputs.begin(), + m_network_io->outputs.end(), + [&out](const IOInner io) { + return io.name == out.node()->name(); + }); + if (it != m_network_io->outputs.end()) { + if (it->is_host) { + it->lite_tensor = std::make_shared( + device_id, stream_id, device_type, true); + } else { + it->lite_tensor = std::make_shared( + device_id, stream_id, device_type); + } + } else { + IOInner output; + output.name = out.node()->name(); + output.lite_tensor = std::make_shared( + device_id, stream_id, device_type, true); + m_network_io->outputs.push_back({output}); + } + } + } +} + +std::shared_ptr NetworkImplDft::get_io_tensor(std::string io_name, + LiteTensorPhase phase) { + if (phase == LiteTensorPhase::LITE_INPUT || + phase == LiteTensorPhase::LITE_IO) { + for (auto&& config_in : m_network_io->inputs) { + if (io_name == config_in.name) { + return config_in.lite_tensor; + } + } + } + if (phase == LiteTensorPhase::LITE_OUTPUT || + phase == LiteTensorPhase::LITE_IO) { + for (auto&& config_out : m_network_io->outputs) { + if (io_name == config_out.name) { + config_out.lite_tensor->update_from_implement(); + return config_out.lite_tensor; + } + } + } + LITE_THROW(mgb::ssprintf( + "tensor name must be %s input tensor name or the registered " + "output tensor name if NetworkIO is set, if NetworkIO is not set, " + "the output tensor is all the network output tensor, or the output " + "tensor is only the registered tensor.", + io_name.c_str())); + return nullptr; +} + +std::shared_ptr NetworkImplDft::get_input_tensor(size_t index) { + return get_io_tensor(get_input_name(index)); +} + +std::shared_ptr NetworkImplDft::get_output_tensor(size_t index) { + return get_io_tensor(get_output_name(index)); +} + +//! set opr algorithm selection strategy in the network +void NetworkImplDft::set_network_algo_policy(LiteAlgoSelectStrategy strategy, + uint32_t shared_batch_size, + bool binary_equal_between_batch) { + using S = megdnn::param::ExecutionPolicy::Strategy; + auto dst_strategy = static_cast(0); + if (static_cast(strategy) & + LiteAlgoSelectStrategy::LITE_ALGO_HEURISTIC) { + dst_strategy = dst_strategy | S::HEURISTIC; + } + if (static_cast(strategy) & + LiteAlgoSelectStrategy::LITE_ALGO_PROFILE) { + dst_strategy = dst_strategy | S::PROFILE; + } + if (static_cast(strategy) & + LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE) { + dst_strategy = dst_strategy | S::REPRODUCIBLE; + } + if (static_cast(strategy) & + LiteAlgoSelectStrategy::LITE_ALGO_OPTIMIZED) { + dst_strategy = dst_strategy | S::OPTIMIZED; + } + m_execution_policy = dst_strategy; + + auto&& fast_run_config = + m_load_config.comp_graph->options().fast_run_config; + fast_run_config.binary_equal_between_batch = binary_equal_between_batch; + fast_run_config.shared_batch_size = shared_batch_size; + + if (m_execute_func) { + LITE_WARN( + "set_network_algo_policy maybe cause error after loaded " + "network!!!!"); + modify_exection_policy(); + } +} + +void NetworkImplDft::modify_exection_policy() { + mgb::SymbolVarArray vars; + for (auto i : m_output_spec) { + vars.push_back(i.first); + } + if (static_cast(m_execution_policy) != 0) + mgb::gopt::modify_opr_algo_strategy_inplace(vars, m_execution_policy); +} + +//! 
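To complement `update_output()` and `get_io_tensor()` above, a hedged sketch of reading results after `wait()` returns; the float output dtype is an assumption of this example, not of the implementation.

```cpp
// Illustrative sketch: enumerate and print the network outputs.
#include "lite/network.h"
#include "lite/tensor.h"

#include <cstdio>
#include <memory>

void dump_outputs(std::shared_ptr<lite::Network> network) {
    for (auto&& name : network->get_all_output_name()) {
        auto out = network->get_io_tensor(name, LiteTensorPhase::LITE_OUTPUT);
        auto layout = out->get_layout();
        // float dtype is a placeholder assumption here
        auto* data = static_cast<float*>(out->get_memory_ptr());
        std::printf("output %s: ndim=%zu, first value %f\n", name.c_str(),
                    static_cast<size_t>(layout.ndim), data[0]);
    }
}
```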
set opr algorithm selection strategy in the network +void NetworkImplDft::set_network_algo_workspace_limit(size_t workspace_limit) { + mgb::SymbolVarArray vars; + for (auto i : m_output_spec) { + vars.push_back(i.first); + } + mgb::gopt::set_opr_algo_workspace_limit_inplace(vars, workspace_limit); +} + +//! get the input tensor name in the order of graph +std::vector NetworkImplDft::get_all_output_name() const { + std::vector output_names; + for (auto& output : m_network_io->outputs) { + output_names.push_back(output.name.c_str()); + } + return output_names; +} + +//! get the input tensor name in the order of graph +std::vector NetworkImplDft::get_all_input_name() const { + std::vector input_names; + for (auto& input : m_load_result.tensor_map) { + input_names.push_back(input.first.c_str()); + } + return input_names; +} + +//! get the output tensor name in the order of graph +const char* NetworkImplDft::get_output_name(size_t index) const { + LITE_ASSERT( + index < m_load_result.output_var_list.size(), + "The output tensor index is large than the total outputs number."); + return m_load_result.output_var_list[index].node()->name().c_str(); +} + +//! get the input tensor name in the order of graph +const char* NetworkImplDft::get_input_name(size_t index) const { + LITE_ASSERT( + index < m_load_result.tensor_map.size(), + "The input tensor index is large than the total inputs number."); + size_t i = 0; + for (auto& input : m_load_result.tensor_map) { + if (i == index) { + return input.first.c_str(); + } + i++; + } + LITE_THROW(ssprintf("no input tensor of index %zu.", index)); +} + +//! Plugin part +void NetworkImplDft::enable_profile_performance(std::string profile_json_file) { +#if MGB_ENABLE_JSON +#if MGB_OPENCL + mgb::CompNode::enable_opencl_profile(true); +#endif + m_profiler = std::make_unique( + m_load_config.comp_graph.get()); + m_profiler_output_file = profile_json_file; +#else + LITE_MARK_USED_VAR(profile_json_file); + LITE_THROW("JSON is disable at compile time."); +#endif +} + +void NetworkImplDft::enable_io_txt_dump(std::string io_txt_out_file) { + auto iodump = std::make_unique( + m_load_config.comp_graph.get(), io_txt_out_file.c_str()); + iodump->print_addr(false); + m_iodump = std::move(iodump); +} + +void NetworkImplDft::enable_io_bin_dump(std::string io_bin_out_dir) { + m_iodump = std::make_unique( + m_load_config.comp_graph.get(), io_bin_out_dir.c_str()); +} + +void inline NetworkImplDft::output_plugin_result() const { +#if MGB_ENABLE_JSON + if (m_profiler && m_execute_func) { + m_profiler->to_json_full(m_execute_func.get()) + ->writeto_fpath(m_profiler_output_file); + } +#endif +} +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/mge/network_impl.h b/lite/src/mge/network_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..fd466da82d4360e3b65792a1f08762223c64e208 --- /dev/null +++ b/lite/src/mge/network_impl.h @@ -0,0 +1,242 @@ +/** + * \file src/mge/network_impl.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
+ */ + +#pragma once + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "lite/network.h" +#include "network_impl_base.h" +#include "tensor_impl.h" + +#include "megbrain/graph/bases.h" +#include "megbrain/plugin/opr_io_dump.h" +#include "megbrain/plugin/profiler.h" +#include "megbrain/serialization/extern_c_opr.h" +#include "megbrain/serialization/file.h" +#include "megbrain/serialization/load_dump_config.h" +#include "megbrain/serialization/serializer.h" +#include "megbrain/utils/thin/hash_table.h" + +#include +#include + +namespace lite { + +/*! + * \brief implement the Network, contain the mgb related member + */ +class NetworkImplDft final : public Network::NetworkImplBase { + LITE_DYN_TYPE_OBJ_FINAL_DECL; + +public: + using S = megdnn::param::ExecutionPolicy::Strategy; + //! set the config of the network, include: + //! the inference device + //! the other inference options, such as record_level, weight_preprocess... + void set_config(const Config& config) override; + + //! set the special io infomation, if not set, default io tensor will used, + //! this is special for input/output is not host tensor, default the + //! input/output tensors are host tensor + void set_io(const NetworkIO& network_io) override; + + //! only compute the output tensor in user configured + void compute_only_configured_output() override { + m_compute_configured_output_only = true; + } + + //! get the network input and ouput tensor, the layout of which is + //! sync from mge tensor + std::shared_ptr get_io_tensor( + std::string io_name, + LiteTensorPhase phase = LiteTensorPhase::LITE_IO) override; + + //! get the input tensor by index in the load_result tensormap + std::shared_ptr get_input_tensor(size_t index) override; + + //! get the output tensor by index in the load_result output_var_list + std::shared_ptr get_output_tensor(size_t index) override; + + //! get all the input tensor name in the order in load return + std::vector get_all_input_name() const override; + + //! get all the output tensor name in the order in load return + std::vector get_all_output_name() const override; + + //! get the input tensor name in the order in load return + const char* get_input_name(size_t index) const override; + + //! get the output tensor name in the order in load return + const char* get_output_name(size_t index) const override; + + //! set the callback in async model + void set_async_callback(const AsyncCallback& callback) override; + + //! set the start callback which will execute before network forward + void set_start_callback(const StartCallback& callback) override { + m_start_callback = std::move(callback); + } + + //! set the finish callback which will execute after network forward + void set_finish_callback(const FinishCallback& callback) override { + m_finish_callback = std::move(callback); + } + + //! load the model and get the m_load_result + void load_model(std::shared_ptr model_mem, size_t size, + std::unordered_map + separate_config_map = {}) override; + + //! forward the network with filled input data and fill the output data + //! to the output tensor + void forward() override; + + //! in sync model, wait utile the inference finish + void wait() override; + + virtual LiteDeviceType get_device_type() const override { + return m_user_config->device_type; + } + + //! Set cpu default mode when device is CPU, in some low computation + //! 
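The async-related members declared here (`set_async_callback`, the `m_async` flag used by `wait()`/`finish()` above) suggest the following caller-side pattern. This is a sketch only: whether registering an async callback alone switches the network into async mode is an assumption, and the callback signature is assumed to take no arguments because `finish()` invokes it as `m_async_callback()`.

```cpp
// Illustrative sketch of asynchronous inference with a completion callback.
#include "lite/network.h"

#include <atomic>
#include <memory>

std::atomic<bool> g_inference_done{false};

void run_async(std::shared_ptr<lite::Network> network) {
    // the callback runs once the outputs are ready
    network->set_async_callback([]() { g_inference_done.store(true); });
    network->forward();
    // ... overlap other work here, then poll g_inference_done ...
}
```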
device or single core device, this mode will get good performace + void set_cpu_inplace_mode(); + bool is_cpu_inplace_mode() const { return m_is_cpu_inplace_mode; } + + //! When device is CPU, this interface will set the to be loaded model + //! run in multi thread mode with the given thread number. + void set_cpu_threads_number(size_t nr_threads); + size_t get_cpu_threads_number() const { return m_nr_threads; } + + //! set device id, default device id = 0 + void set_device_id(int device_id) override; + int get_device_id() const override { return m_compnode_locator.device; }; + + LiteBackend get_backend_type() const override { + return LiteBackend::LITE_DEFAULT; + } + //! set stream id, default stream id = 0 + void set_stream_id(int stream_id) override; + int get_stream_id() const override { return m_compnode_locator.stream; }; + + //! enable tensorrt + void use_tensorrt(); + + //! enable profile the network, a JSON format file will be generated + void enable_profile_performance( + std::string profile_json_file_path) override; + + /********************** mge special function ************************/ + //! load a new network which will share weights with src network + void shared_weight_with(const NetworkImplBase* src_network); + + //! share the runtime memory with other network, the weights is not shared + void share_runtime_memory_with(NetworkImplBase* network); + //! set threads affinity callback; + void set_runtime_thread_affinity( + const ThreadAffinityCallback& thread_affinity_callback); + + //! set the network memroy allocator, the allocator is defined by user + void set_memory_allocator(std::shared_ptr user_allocator); + + //! set opr algorithm selection strategy in the network + void set_network_algo_policy(LiteAlgoSelectStrategy strategy, + uint32_t shared_batch_size, + bool binary_equal_between_batch); + + //! set workspace_limit for oprs with multiple algorithms, set + //! workspace limitation can save memory but may influence the performance + void set_network_algo_workspace_limit(size_t workspace_limit); + + //! Dump input/output values of all internal variables to output file, + //! in text format + void enable_io_txt_dump(std::string io_txt_out_file); + + //! Dump input/output values of all internal variables to output + //! directory, in binary format + void enable_io_bin_dump(std::string io_bin_out_dir); + +private: + //! construct the outputspec according to the m_network_io, and set the + //! call_back to the outputspec + void make_output_spec(); + + //! modify the execution policy + void modify_exection_policy(); + + //! if the input is dev tensor, the pass will replace the H2D Opr to + //! VolatileSharedDeviceTensor Opr + void replace_dev_input_pass(); + + //! check whether the model is cross compnode + void cross_compnode_model_detect(); + + //! when the model have loaded, update the IO, if not set networkio, update + //! the networkio with the IO of loaded model + void update_io(); + + void update_input(); + void update_output(); + + //! when the model info have loaded, update the config according the model + //! info, finaly use it in compute graph + void application_config(); + + //! after finish forwarding the netwark, output the result of plugin to file + void output_plugin_result() const; + + //! when finish forwarding the network, the function will be called + void finish() const; + + //! before forwarding the network, the function will be called + void start() const; + + //! 
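The CPU options declared above are surfaced to users through the static `Runtime` helpers added later in this diff, and they must be applied before the model is loaded (the helpers assert this). A hedged sketch of the intended call order, with an arbitrary thread count:

```cpp
// Illustrative sketch: tune the CPU runtime, then load the model.
#include "lite/network.h"

#include <memory>
#include <string>

std::shared_ptr<lite::Network> make_cpu_network(const std::string& path) {
    lite::Config config;
    config.device_type = LiteDeviceType::LITE_CPU;
    auto network =
            std::make_shared<lite::Network>(config, lite::NetworkIO{});

    // a thread count of 4 is arbitrary; must be set before load_model
    lite::Runtime::set_cpu_threads_number(network, 4);
    // single-core or memory-constrained targets may prefer:
    // lite::Runtime::set_cpu_inplace_mode(network);

    network->load_model(path);
    return network;
}
```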
compile the graph to get the execute function + void compile_graph(); + +private: + bool m_async = false; + bool m_is_cpu_inplace_mode = false; + int m_nr_device_type = 0; + size_t m_nr_threads = 1; + bool m_compute_configured_output_only = false; + mgb::CompNode::Locator m_compnode_locator; + + AsyncCallback m_async_callback = nullptr; + std::unique_ptr m_network_io; + std::unique_ptr m_user_config; + std::unique_ptr m_execute_func; + + //! The model load related data + S m_execution_policy = static_cast(0); + std::unique_ptr m_input_file; + mgb::serialization::GraphLoadConfig m_load_config; + mgb::serialization::GraphLoader::LoadResult m_load_result; + mgb::ComputingGraph::OutputSpec m_output_spec; + std::shared_ptr m_loader; + + //! start and finish callback + StartCallback m_start_callback = nullptr; + FinishCallback m_finish_callback = nullptr; + + //! profile and io dump related data +#if MGB_ENABLE_JSON + std::unique_ptr m_profiler; + std::string m_profiler_output_file; +#endif + std::unique_ptr m_iodump; +}; + +} // namespace lite + +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/mge/tensor_impl.cpp b/lite/src/mge/tensor_impl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1c699a3d49b248d8ff9c4819014f3f16a7e46c75 --- /dev/null +++ b/lite/src/mge/tensor_impl.cpp @@ -0,0 +1,435 @@ +/** + * \file inlude/mge/tensor.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "tensor_impl.h" +#include "common.h" + +#include "lite/tensor.h" + +#include "megbrain/comp_node.h" +#include "megbrain/tensor.h" + +#include + +using namespace lite; + +/**********************TensorImpl****************************/ + +LITE_DYN_TYPE_OBJ_FINAL_IMPL(TensorImplDft); + +TensorImplDft::TensorImplDft() { + m_host_tensor = + std::make_shared(mgb::CompNode::default_cpu()); +} + +TensorImplDft::TensorImplDft(LiteDeviceType device, bool is_pinned_host) { + auto cn = mgb::CompNode::load(to_compnode_locator(device)); + if (device == LiteDeviceType::LITE_DEVICE_DEFAULT) { + device = LiteDeviceType::LITE_CPU; + } + if (device == LiteDeviceType::LITE_CPU) { + m_host_tensor = std::make_shared( + mgb::CompNode::default_cpu()); + } else if (is_pinned_host) { + m_host_tensor = std::make_shared(cn); + } else { + m_dev_tensor = std::make_shared(cn); + } +} + +TensorImplDft::TensorImplDft(LiteDeviceType device, const Layout& layout, + bool is_pinned_host) { + auto cn = mgb::CompNode::load(to_compnode_locator(device)); + auto mge_layout = to_impl_layout(layout); + if (device == LiteDeviceType::LITE_DEVICE_DEFAULT) { + device = LiteDeviceType::LITE_CPU; + } + if (device == LiteDeviceType::LITE_CPU) { + m_host_tensor = std::make_shared( + mgb::CompNode::default_cpu(), mge_layout); + } else if (is_pinned_host) { + m_host_tensor = std::make_shared(cn, mge_layout); + } else { + m_dev_tensor = std::make_shared(cn, mge_layout); + } +} + +TensorImplDft::TensorImplDft(int device_id, LiteDeviceType device_type, + const Layout& layout, bool is_pinned_host) { + auto locator = to_compnode_locator(device_type); + locator.device = device_id; + auto cn = mgb::CompNode::load(locator); + if (device_type == LiteDeviceType::LITE_DEVICE_DEFAULT) { + device_type = LiteDeviceType::LITE_CPU; + } + if (layout.ndim) { + auto mge_layout = to_impl_layout(layout); + if (device_type == 
LiteDeviceType::LITE_CPU) { + m_host_tensor = std::make_shared( + mgb::CompNode::default_cpu(), mge_layout); + } else if (is_pinned_host) { + m_host_tensor = std::make_shared(cn, mge_layout); + } else { + m_dev_tensor = + std::make_shared(cn, mge_layout); + } + } else { + if (device_type == LiteDeviceType::LITE_CPU) { + m_host_tensor = std::make_shared( + mgb::CompNode::default_cpu()); + } else if (is_pinned_host) { + m_host_tensor = std::make_shared(cn); + } else { + m_dev_tensor = std::make_shared(cn); + } + } +} + +TensorImplDft::TensorImplDft(int device_id, int stream_id, + LiteDeviceType device_type, bool is_pinned_host) { + auto locator = to_compnode_locator(device_type); + locator.device = device_id; + locator.stream = stream_id; + auto cn = mgb::CompNode::load(locator); + if (get_device_from_locator(locator) == LiteDeviceType::LITE_CPU) { + m_host_tensor = std::make_shared( + mgb::CompNode::default_cpu()); + } else if (is_pinned_host) { + m_host_tensor = std::make_shared(cn); + } else { + m_dev_tensor = std::make_shared(cn); + } +} + +LiteDeviceType TensorImplDft::get_device_type() const { + if (is_host()) { + return LiteDeviceType::LITE_CPU; + } else { + return get_device_from_locator(m_dev_tensor->comp_node().locator()); + } +} + +int TensorImplDft::get_device_id() const { + if (is_host()) { + return m_host_tensor->comp_node().locator().device; + } else { + return m_dev_tensor->comp_node().locator().device; + } +} + +bool TensorImplDft::is_pinned_host() const { + return is_host() && + get_device_from_locator(m_host_tensor->comp_node().locator()) != + LiteDeviceType::LITE_CPU; +} + +void TensorImplDft::set_mge_tensor_compnode(const mgb::CompNode& comp_node) { + if (is_host()) { + m_host_tensor->comp_node(comp_node, true); + } else { + m_dev_tensor->comp_node(comp_node, true); + } +} + +Layout TensorImplDft::get_layout() const { + if (is_host()) { + return to_lite_layout(m_host_tensor->layout()); + } else { + return to_lite_layout(m_dev_tensor->layout()); + } +} + +void* TensorImplDft::get_memory_ptr() const { + if (is_host()) { + return static_cast(m_host_tensor->raw_ptr()); + } else { + return static_cast(m_dev_tensor->raw_ptr()); + } +} + +void* TensorImplDft::get_memory_ptr(const std::vector& idx) const { + if (is_host()) { + auto elemsize_log = m_host_tensor->layout().dtype.size_log(); + switch (elemsize_log) { + case 0: + return static_cast( + m_host_tensor->ptr(idx.begin(), idx.end())); + break; + case 1: + return static_cast( + m_host_tensor->ptr(idx.begin(), idx.end())); + break; + case 2: + return static_cast( + m_host_tensor->ptr(idx.begin(), idx.end())); + break; + default: + LITE_THROW("not supported data_type."); + } + } else { + auto elemsize_log = m_dev_tensor->layout().dtype.size_log(); + switch (elemsize_log) { + case 0: + return static_cast( + m_dev_tensor->ptr(idx.begin(), idx.end())); + break; + case 1: + return static_cast( + m_dev_tensor->ptr(idx.begin(), idx.end())); + break; + case 2: + return static_cast( + m_dev_tensor->ptr(idx.begin(), idx.end())); + break; + default: + LITE_THROW("not supported data_type."); + } + } +} + +std::shared_ptr TensorImplDft::slice( + const std::vector& start, const std::vector& end, + const std::vector& step) { + Layout layout; + mgb::TensorLayout layout_mge; + if (is_host()) { + layout_mge = m_host_tensor->layout(); + layout = to_lite_layout(m_host_tensor->layout()); + } else { + layout_mge = m_dev_tensor->layout(); + layout = to_lite_layout(m_dev_tensor->layout()); + } + + size_t length = start.size(); + LITE_ASSERT(length 
== end.size() && length <= layout.ndim, + "The start and end must be the same size and less than layout " + "ndim."); + std::vector slices; + if (step.size()) { + LITE_ASSERT(length == step.size(), + "The start and step must be the same size."); + for (size_t i = 0; i < length; i++) { + slices.push_back(mgb::Slice{start[i], end[i], step[i]}); + } + } else { + for (size_t i = 0; i < length; i++) { + slices.push_back(mgb::Slice{start[i], end[i]}); + } + } + auto subspec = mgb::SubTensorSpec::make_from_offset_elem(layout_mge, 0); + size_t axis = 0; + for (auto&& i : slices) { + subspec.merge_with(i.apply(subspec.layout(), axis)); + axis++; + } + auto ret = std::make_shared(); + auto& impl = TensorHelper::implement(ret)->cast_final_safe(); + if (is_host()) { + *impl.m_host_tensor = m_host_tensor->sub(subspec); + } else { + impl.m_dev_tensor = std::make_shared( + m_dev_tensor->sub(subspec)); + impl.m_host_tensor = nullptr; + } + LITE_ASSERT(is_host() == impl.is_host()); + return ret; +} + +void TensorImplDft::fill_zero() { + if (is_host()) { + auto mge_layout = m_host_tensor->layout(); + if (m_host_tensor->layout().is_physical_contiguous()) { + auto ptr = get_memory_ptr(); + std::memset(ptr, 0, + mge_layout.dtype.size(mge_layout.total_nr_elems())); + } else { + TensorImplDft tmp(LiteDeviceType::LITE_CPU, + to_lite_layout(mge_layout), true); + tmp.fill_zero(); + this->copy_from(&tmp); + } + } else { + mgb::dev_tensor_memset(*m_dev_tensor, 0); + m_dev_tensor->sync(); + } +} + +void TensorImplDft::share_memory_with(const TensorImplBase* src_tensor_impl) { + auto src_dft_tensor = static_cast(src_tensor_impl); + LITE_ASSERT(is_host() == src_dft_tensor->is_host(), + "share memory must happen in same device"); + //! make shape the src memory is ready + src_tensor_impl->get_memory_ptr(); + if (is_host()) { + *m_host_tensor = *src_dft_tensor->m_host_tensor; + } else { + *m_dev_tensor = *src_dft_tensor->m_dev_tensor; + } +} + +void TensorImplDft::set_layout(const Layout& layout) { + bool host = is_host(); + auto mgb_layout = to_impl_layout(layout); + if (host) { + m_host_tensor->dtype(mgb_layout.dtype); + m_host_tensor->resize(mgb_layout); + } else { + m_dev_tensor->dtype(mgb_layout.dtype); + m_dev_tensor->resize(mgb_layout); + } +} + +void TensorImplDft::reshape(const Layout& layout) { + auto mgb_layout = to_impl_layout(layout); + bool host = is_host(); + if (host) { + m_host_tensor->resize(mgb_layout); + } else { + m_dev_tensor->resize(mgb_layout); + } +} + +void TensorImplDft::reset(void* prepared_data) { + auto raw_ptr = static_cast(prepared_data); + auto raw_storage = std::shared_ptr(raw_ptr, [](void*) {}); + bool host = is_host(); + if (host) { + auto cn = m_host_tensor->comp_node(); + auto mge_layout = m_host_tensor->layout(); + size_t size = mge_layout.span().dist_byte(); + mgb::HostTensorStorage storage; + storage.reset(cn, size, raw_storage); + m_host_tensor->reset(storage, mge_layout); + } else { + auto cn = m_dev_tensor->comp_node(); + auto mge_layout = m_dev_tensor->layout(); + size_t size = mge_layout.span().dist_byte(); + mgb::DeviceTensorStorage storage; + storage.reset(cn, size, raw_storage); + m_dev_tensor->reset(storage, mge_layout); + } +} + +void TensorImplDft::reset(void* prepared_data, const Layout& layout) { + set_layout(layout); + reset(prepared_data); +} + +bool TensorImplDft::is_continue_memory() const { + if (is_host()) { + return m_host_tensor->layout().is_physical_contiguous(); + } else { + return m_dev_tensor->layout().is_physical_contiguous(); + } +} + +void 
TensorImplDft::copy_from(const TensorImplBase* src_impl) { + if (is_continue_memory()) { + copy_from_continue(src_impl); + } else { + copy_from_fixlayout(src_impl); + } +} + +void TensorImplDft::copy_from_continue(const TensorImplBase* src_impl) { + auto src = static_cast(src_impl); + if (is_host()) { + //! host to host + if (src->is_host()) { + m_host_tensor->copy_from(*src->m_host_tensor); + //! device to host + } else { + auto src_cn = src->m_dev_tensor->comp_node(); + auto dst_cn = m_host_tensor->comp_node(); + if (src_cn != dst_cn && m_host_tensor->layout().ndim > 0) { + LITE_WARN( + "The dst tensor memroy is alloced before coping, " + "then pinned memroy would not use to optmize the " + "copy performance."); + //! When D2H in megbrain and the compnode of src and dst is not + //! equal, there must be one compnode that is cpu-default, so + //! here, we use temp tensor for transition + auto tmp_impl = std::make_shared(); + tmp_impl->set_mge_tensor_compnode(src_cn); + tmp_impl->m_host_tensor->copy_from(*src->m_dev_tensor).sync(); + m_host_tensor->copy_from(*tmp_impl->m_host_tensor); + } else { + //! if dst compnode is not valid(memory is not alloced), the + //! tensor is pinned host tensor + m_host_tensor->comp_node(src_cn, true); + m_host_tensor->copy_from(*src->m_dev_tensor).sync(); + } + } + } else { + //! host to device + if (src->is_host()) { + m_dev_tensor->copy_from(*src->m_host_tensor).sync(); + //! device to device + } else { + m_dev_tensor->copy_from(*src->m_dev_tensor).sync(); + } + } +} + +void TensorImplDft::copy_from_fixlayout(const TensorImplBase* src_impl) { + auto src = static_cast(src_impl); + if (is_host()) { + //! host to host + if (src->is_host()) { + m_host_tensor->copy_from_fixlayout(*src->m_host_tensor); + //! device to host + } else { + auto src_cn = src->m_dev_tensor->comp_node(); + auto dst_cn = m_host_tensor->comp_node(); + if (src_cn != dst_cn && m_host_tensor->layout().ndim > 0) { + LITE_WARN( + "The dst tensor memroy is alloced before coping, " + "then pinned memroy would not use to optmize the " + "copy performance."); + //! When D2H in megbrain and the compnode of src and dst is not + //! equal, there must be one compnode that is cpu-default, so + //! here, we use temp tensor for transition + auto tmp_impl = std::make_shared(); + tmp_impl->set_mge_tensor_compnode(src_cn); + tmp_impl->m_host_tensor->copy_from(*src->m_dev_tensor).sync(); + m_host_tensor->copy_from_fixlayout(*tmp_impl->m_host_tensor); + } else { + //! if dst compnode is not valid(memory is not alloced), the + //! tensor is pinned host tensor + m_host_tensor->comp_node(src_cn, true); + m_host_tensor->copy_from_fixlayout(*src->m_dev_tensor).sync(); + } + } + } else { + //! host to device + if (src->is_host()) { + m_dev_tensor->copy_from_fixlayout(*src->m_host_tensor).sync(); + //! 
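The warning in the copy path above points at the recommended pattern: create the destination on the CPU with no layout set, so `copy_from()` can adopt the source layout and use the pinned-memory D2H path instead of a staged copy. The sketch below is a rough illustration only; the public `lite::Tensor` constructor and `copy_from()` signatures are assumed to mirror `TensorImplDft` and may differ from the real `lite/tensor.h`.

```cpp
// Illustrative sketch: pull a device-side result back to the host.
#include "lite/tensor.h"

#include <memory>

std::shared_ptr<lite::Tensor> to_host(std::shared_ptr<lite::Tensor> dev_out) {
    // CPU destination with no layout yet, so the copy picks the layout
    // from the source and can use the fast D2H path
    auto host = std::make_shared<lite::Tensor>(LiteDeviceType::LITE_CPU);
    host->copy_from(dev_out);  // public signature assumed, see note above
    return host;
}
```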
device to device + } else { + m_dev_tensor->copy_from_fixlayout(*src->m_dev_tensor).sync(); + } + } +} + +void TensorImplDft::copy_from_mge_tensor(const mgb::DeviceTensorND& dv) { + if (is_host()) { + auto src_cn = dv.comp_node(); + m_host_tensor->comp_node(src_cn, true); + m_host_tensor->copy_from(dv); + } else { + m_dev_tensor->copy_from(dv); + } +} + +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/mge/tensor_impl.h b/lite/src/mge/tensor_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..74991d41c670210eed89da2ab251e682efec50ad --- /dev/null +++ b/lite/src/mge/tensor_impl.h @@ -0,0 +1,128 @@ +/** + * \file src/mge/tensor_impl.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "lite/tensor.h" +#include "tensor_impl_base.h" + +#include "megbrain/tensor.h" + +#include + +namespace lite { + +/*! + * \brief implement the Tensor in mge + */ +class TensorImplDft final : public Tensor::TensorImplBase { + LITE_DYN_TYPE_OBJ_FINAL_DECL; + +public: + TensorImplDft(); + TensorImplDft(LiteDeviceType device, bool is_pinned_host = false); + TensorImplDft(LiteDeviceType device, const Layout& layout, + bool is_pinned_host = false); + TensorImplDft(int device_id, LiteDeviceType device, + const Layout& layout = {}, bool is_pinned_host = false); + TensorImplDft(int device_id, int stream_id, LiteDeviceType device, + bool is_pinned_host = false); + + virtual ~TensorImplDft() = default; + + LiteDeviceType get_device_type() const override; + + int get_device_id() const override; + + LiteBackend get_backend_type() const override { + return LiteBackend::LITE_DEFAULT; + } + Layout get_layout() const override; + + bool is_pinned_host() const override; + + //! which will trigger memory alloc in tensor implement + void* get_memory_ptr() const override; + + //! which will trigger memory alloc in tensor implement if memory is not + //! allocated, and compute the ptr in the gaven idx + void* get_memory_ptr(const std::vector& idx) const override; + + //! set layout will change the layout and reallocate memory of the tensor + void set_layout(const Layout& layout) override; + + //! use the user allocated data to reset the memory of the tensor, the + //! memory will not be managed by the lite, later, the user should delete + //! it. + void reset(void* prepared_data) override; + + //! use the user allocated data and corresponding layout to reset the data + //! and layout of the tensor, the memory will not be managed by lite, later, + //! the user should delete it. + void reset(void* prepared_data, const Layout& layout) override; + + //! get a new tensor slice from the origin tensor + std::shared_ptr slice( + const std::vector& start, const std::vector& end, + const std::vector& step = {}) override; + + //! set the tensor memory with zero + void fill_zero() override; + + //! reshape the tensor with new shape, keep the data_type the same + void reshape(const Layout& layout) override; + + //! copy tensor form other tensor + //! Note: the best way for tensor copy is just set the dst device, left + //! layout empty, when copying the dst layout will be set the same with + //! src + void copy_from(const TensorImplBase* src_impl) override; + + //! share memory with other tensor + void share_memory_with(const TensorImplBase* src_impl) override; + + //! 
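A short sketch of the `slice()` interface declared in the tensor implement, assuming the public `lite::Tensor` forwards to the implementation shown earlier in this diff; the index element type is assumed to be `size_t` and the bounds are placeholders.

```cpp
// Illustrative sketch: take rows [0, 1) and columns [0, 4) of a 2-D tensor.
#include "lite/tensor.h"

#include <memory>
#include <vector>

std::shared_ptr<lite::Tensor> first_row(std::shared_ptr<lite::Tensor> t) {
    std::vector<size_t> start = {0, 0};
    std::vector<size_t> end = {1, 4};   // placeholder bounds
    auto row = t->slice(start, end);    // step omitted, contiguous slice
    // the slice is a sub-tensor sharing storage with t, no copy is made
    return row;
}
```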
whether the memory of tensor is continue + bool is_continue_memory() const override; + + //! get host tensor + std::shared_ptr host_tensor() const { + return m_host_tensor; + } + //! get device tensor + std::shared_ptr dev_tensor() const { + return m_dev_tensor; + } + //! copy from mgb tensor + void copy_from_mge_tensor(const mgb::DeviceTensorND& dv); + +public: + friend class NetworkImplDft; + +private: + bool is_host() const { return m_host_tensor != nullptr; }; + + void copy_from_continue(const TensorImplBase* src_impl); + + void copy_from_fixlayout(const TensorImplBase* src_impl); + + void set_mge_tensor_compnode(const mgb::CompNode& comp_node); + +private: + std::shared_ptr m_host_tensor; + std::shared_ptr m_dev_tensor; +}; + +} // namespace lite + +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/misc.cpp b/lite/src/misc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c44d024179fd8f6b9c08ad14ac89dfd7a4c276a9 --- /dev/null +++ b/lite/src/misc.cpp @@ -0,0 +1,154 @@ +/** + * \file inlude/misc.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "./misc.h" +#include "lite/global.h" + +#include +#include +#include + +#if LITE_BUILD_WITH_MGE +#include "megbrain/common.h" +#endif + +#ifdef __ANDROID__ +#include +#endif + +using namespace lite; + +namespace lite { +namespace log_detail { + +LiteLogLevel current_log_level = LiteLogLevel::ERROR; + +template +constexpr size_t countof(T (&)[N]) { + return N; +} +} // namespace log_detail +} // namespace lite + +namespace { +std::string svsprintf(const char* fmt, va_list ap_orig) { + int size = 100; /* Guess we need no more than 100 bytes */ + char* p; + + if ((p = (char*)malloc(size)) == nullptr) + return "svsprintf: malloc failed"; + + for (;;) { + va_list ap; + va_copy(ap, ap_orig); + int n = vsnprintf(p, size, fmt, ap); + va_end(ap); + + if (n < 0) + return "svsprintf: vsnprintf failed"; + + if (n < size) { + std::string rst(p); + free(p); + return rst; + } + + size = n + 1; + + char* np = (char*)realloc(p, size); + if (!np) { + free(p); + return "svsprintf: realloc failed"; + } else + p = np; + } +} +} // namespace + +void lite::set_log_level(LiteLogLevel l) { + log_detail::current_log_level = l; +#if LITE_BUILD_WITH_MGE + mgb::LogLevel lite_log_level = mgb::LogLevel::DEBUG; + switch (l) { + case LiteLogLevel::DEBUG: + lite_log_level = mgb::LogLevel::DEBUG; + break; + case LiteLogLevel::INFO: + lite_log_level = mgb::LogLevel::INFO; + break; + case LiteLogLevel::WARN: + lite_log_level = mgb::LogLevel::WARN; + break; + case LiteLogLevel::ERROR: + lite_log_level = mgb::LogLevel::ERROR; + break; + default: + LITE_THROW("unkonw loglevel"); + } + mgb::set_log_level(lite_log_level); +#endif +} + +LiteLogLevel lite::get_log_level() { + return log_detail::current_log_level; +} + +std::string lite::ssprintf(const char* format, ...) { + va_list ap; + va_start(ap, format); + auto ret = svsprintf(format, ap); + va_end(ap); + return ret; +} + +void lite::print_log(LiteLogLevel level, const char* format, ...) 
{ + if (static_cast(level) < static_cast(get_log_level())) { + return; + } + using namespace std::chrono; + + auto now = system_clock::now(); + auto now_time_t = system_clock::to_time_t(now); + + tm now_tm; + +#if _WIN32 + localtime_s(&now_tm, &now_time_t); +#else + localtime_r(&now_time_t, &now_tm); +#endif + + auto now_trunc_to_sec = system_clock::from_time_t(mktime(&now_tm)); + auto microsec = duration_cast(now - now_trunc_to_sec); + + char time_buffer[100]; + snprintf(time_buffer, log_detail::countof(time_buffer), + "%02d:%02d:%02d.%06ld ", now_tm.tm_hour, now_tm.tm_min, + now_tm.tm_sec, long(microsec.count())); + + const char* prefix[] = {"LITE[DBG] ", "LITE[INF] ", "LITE[WRN] ", + "LITE[ERR] "}; + std::string out; + out += prefix[int(level)]; + out += time_buffer; + + va_list ap; + va_start(ap, format); + auto ret = svsprintf(format, ap); + va_end(ap); + out += ret; + +#ifdef __ANDROID__ + __android_log_print(ANDROID_LOG_INFO, "lite", "%s", out.c_str()); +#else + fprintf(stderr, "%s\n", out.c_str()); +#endif +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/misc.h b/lite/src/misc.h new file mode 100644 index 0000000000000000000000000000000000000000..c6799feda0c6974a69b8703d84f99e5828ddb92b --- /dev/null +++ b/lite/src/misc.h @@ -0,0 +1,254 @@ +/** + * \file include/misc.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once +#include "lite_build_config.h" + +#include +#include +#include +#include +#include "lite/common_enum_c.h" +#include "lite/global.h" + +namespace lite { +#if LITE_ENABLE_EXCEPTION +/*! \brief The error class in lite. + * + * It can be used to represent both an error caused by the invalid + * input of the caller or an invalid runtime condition. + * + * The necessary presumption should be guaranteed by assertions instead of + * exceptions. + */ +class Error : public std::exception { +public: + Error(const std::string& msg) : m_msg("Error: " + msg) {} + const char* what() const noexcept override { return m_msg.c_str(); } + +private: + std::string m_msg; +}; +#endif + +std::string ssprintf(const char* fmt = 0, ...) + __attribute__((format(printf, 1, 2))); + +/*! + * \brief Print a message. + * + * The message is printed only if level is above or equals to the current log + * level. + */ +void print_log(LiteLogLevel level, const char* format = 0, ...) + __attribute__((format(printf, 2, 3))); +} // namespace lite + +#if LITE_ENABLE_LOGGING +#define LITE_LOG_(level, msg...) \ + do { \ + lite::print_log(LiteLogLevel::level, ##msg); \ + } while (0) +#else +#define LITE_LOG_(level, msg...) (void)0 +#endif + +#define LITE_LOG(fmt...) LITE_LOG_(DEBUG, fmt); +#define LITE_DEBUG(fmt...) LITE_LOG_(DEBUG, fmt); +#define LITE_WARN(fmt...) LITE_LOG_(WARN, fmt); +#define LITE_ERROR(fmt...) LITE_LOG_(ERROR, fmt); + +#if LITE_ENABLE_EXCEPTION +#define LITE_THROW(msg) throw lite::Error(msg) +#else +#define LITE_THROW(msg) \ + do { \ + LITE_ERROR(msg); \ + __builtin_trap(); \ + } while (0) +#endif + +#if LITE_ENABLE_EXCEPTION +#define LITE_ERROR_HANDLER_BEGIN try { +#define LITE_ERROR_HANDLER_END \ + } \ + catch (const ::lite::Error& e) { \ + std::string msg = std::string("Lite exception: ") + e.what(); \ + LITE_ERROR("%s.", msg.c_str()); \ + throw; \ + } + +#else +#define LITE_ERROR_HANDLER_BEGIN +#define LITE_ERROR_HANDLER_END +#endif + +/*! \brief Return an error if the given pointer is null pointer. 
+ * + * The macro is used to ensure the validity of the passing context pointer. + */ +#define LITE_CHECK_NON_NULL_POINTER(ptr) \ + LITE_ASSERT(ptr != nullptr, "Input ptr is null.") + +//! branch prediction hint: likely to take +#define lite_likely(v) __builtin_expect(static_cast(v), 1) + +//! branch prediction hint: unlikely to take +#define lite_unlikely(v) __builtin_expect(static_cast(v), 0) + +#if LITE_ENABLE_LOGGING +#if LITE_ASSERT_LOC +#define LITE_ASSERT(expr, msg...) \ + do { \ + if (lite_unlikely(!(expr))) { \ + auto info = lite::ssprintf(msg); \ + LITE_THROW( \ + lite::ssprintf("Assert \' %s \' failed at file : %s \n" \ + "line %d : %s,\nextra " \ + "message: %s", \ + #expr, __FILE__, __LINE__, \ + __PRETTY_FUNCTION__, info.c_str())); \ + } \ + } while (0) +#else +#define LITE_ASSERT(expr, msg...) \ + do { \ + if (lite_unlikely(!(expr))) { \ + auto info = lite::ssprintf(msg); \ + LITE_THROW(lite::ssprintf( \ + "Assert \' %s \' failed at file : %s \n" \ + "line %d : %s,\nextra " \ + "message: %s", \ + #expr, "about location info, please build with debug", \ + __LINE__, __PRETTY_FUNCTION__, info.c_str())); \ + } \ + } while (0) +#endif +#else +#define LITE_ASSERT(expr, msg...) \ + do { \ + if (lite_unlikely(!(expr))) { \ + auto msg_string = lite::ssprintf(msg); \ + LITE_THROW(msg_string.c_str()); \ + } \ + } while (0) +#endif + +#define LITE_MARK_USED_VAR(var) ((void)var) + +namespace lite { +class ScopedTimer { +public: + typedef std::chrono::system_clock Clock; + typedef std::chrono::nanoseconds Nsec; + + ScopedTimer(std::string name) : m_name(name) { m_start = Clock::now(); } + ~ScopedTimer() { + m_stop = Clock::now(); + std::chrono::duration elapsed = m_stop - m_start; + Nsec u = std::chrono::duration_cast(elapsed); + auto msg = ssprintf("%s used time %fms.", m_name.c_str(), + static_cast(u.count()) / 1000000.f); + LITE_LOG("%s", msg.c_str()); + } + +private: + std::chrono::time_point m_start, m_stop; + const std::string m_name; +}; + +class Timer { +public: + typedef std::chrono::system_clock Clock; + typedef std::chrono::nanoseconds Nsec; + + Timer(std::string name) : m_name(name) { m_start = Clock::now(); } + double get_used_time() { + m_stop = Clock::now(); + std::chrono::duration elapsed = m_stop - m_start; + Nsec u = std::chrono::duration_cast(elapsed); + return static_cast(u.count()) / 1000000.0; + } + void print_used_time(int iter) { + m_stop = Clock::now(); + std::chrono::duration elapsed = m_stop - m_start; + Nsec u = std::chrono::duration_cast(elapsed); + printf("%s used time %f ms\n", (m_name + std::to_string(iter)).c_str(), + static_cast(u.count()) / 1000000.0); + } + void reset_start() { m_start = Clock::now(); } + +private: + std::chrono::time_point m_start, m_stop; + const std::string m_name; +}; + +inline void mark_used_variable() {} +template +inline void mark_used_variable(T firstArg, Arg... args) { + LITE_MARK_USED_VAR(firstArg); + mark_used_variable(args...); +} +} // namespace lite + +#if defined(_WIN32) +#include +#include +#undef CONST +#define F_OK 0 +#define RTLD_LAZY 0 +// On the windows platform we use a lib_filename without a full path so +// the win-api "LoadLibrary" would uses a standard search strategy to +// find the lib module. As we cannot access to the lib_filename without a +// full path, we should not use "access(a, b)" to verify it. 
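For internal code inside `lite/src`, the assertion and timing helpers above combine as in the following sketch; the function and message are hypothetical and only illustrate the printf-style `LITE_ASSERT` and the scope-based logging of `ScopedTimer`.

```cpp
// Illustrative sketch using the helpers from misc.h inside lite/src.
#include "misc.h"

#include <cstddef>
#include <cstring>

void checked_copy(const void* src, void* dst, size_t size) {
    // printf-style message, thrown (or trapped) when the check fails
    LITE_ASSERT(src != nullptr && dst != nullptr,
                "invalid buffer, size=%zu", size);
    lite::ScopedTimer timer("checked_copy");
    std::memcpy(dst, src, size);
}  // ~ScopedTimer logs "checked_copy used time ...ms." via LITE_LOG
```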
+#define access(a, b) false +static inline void* dlopen(const char* file, int) { + return static_cast(LoadLibrary(file)); +} + +static inline char* dlerror() { + const char* errmsg = "dlerror not aviable in windows"; + return const_cast(errmsg); +} + +static inline void* dlsym(void* handle, const char* name) { + FARPROC symbol = GetProcAddress((HMODULE)handle, name); + return reinterpret_cast(symbol); +} +#elif __linux__ || __unix__ || __APPLE__ +#include +#include +#endif + +#if __DEPLOY_ON_XP_SP2__ +//! refer to +//! https://docs.microsoft.com/en-us/cpp/build/configuring-programs-for-windows-xp?view=msvc-160 +//! xp sp2 do not support vc runtime fully, casused by KERNEL32.dll do not +//! implement some base apis for c++ std function, for example, +//! std::mutex/std::thread/std::condition_variable as a workround, we will +//! disable some MegEngine feature on xp sp2 env, for exampe, multi-thread etc! +#define LITE_MUTEX size_t +#define LITE_RECURSIVE_MUTEX size_t +#define LITE_LOCK_GUARD(mtx) LITE_MARK_USED_VAR(mtx) +#define LITE_LOCK_GUARD_UNIQUE(mtx) LITE_MARK_USED_VAR(mtx) +#define LITE_LOCK_GUARD_SHARED(mtx) LITE_MARK_USED_VAR(LITE_MARK_USED_VAR) +#else +#define LITE_MUTEX std::mutex +#define LITE_RECURSIVE_MUTEX std::recursive_mutex +#define LITE_LOCK_GUARD(mtx) \ + std::lock_guard LITE_LOCK_GUARD_CTOR(mtx) + +#define LITE_LOCK_GUARD_UNIQUE(mtx) \ + std::unique_lock LITE_LOCK_GUARD_CTOR(mtx) + +#define LITE_LOCK_GUARD_SHARED(mtx) \ + std::shared_lock LITE_LOCK_GUARD_CTOR(mtx) +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/network.cpp b/lite/src/network.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8f779d8792c5d761548468d117d07418f1e4b5f9 --- /dev/null +++ b/lite/src/network.cpp @@ -0,0 +1,501 @@ +/** + * \file src/network.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "lite/network.h" +#include "function_base.h" +#include "network_impl_base.h" +#include "parse_info/parse_info_base.h" +#include "parse_model/model_parser.h" +#include "type_info.h" +#if LITE_BUILD_WITH_MGE +#include "mge/function_dft.h" +#include "mge/network_impl.h" +#endif + +#include +#include + +using namespace lite; + +/** + * \brief Construct the new work implement + * the order must be : + * 1. creeat the implement + * 2. config and load + * 3. set_io + */ +Network::Network(const Config& config, const NetworkIO& network_io) { + LITE_ERROR_HANDLER_BEGIN + m_config = config; + m_network_io = network_io; + if (config.backend == LiteBackend::LITE_DEFAULT) { + m_impl = call_func>( + "create_network"); + } else if (config.backend == LiteBackend::LITE_RK_NPU) { + m_impl = call_func>( + "create_network"); + } + m_impl->set_config(config); + m_impl->set_io(network_io); + LITE_ERROR_HANDLER_END +} + +Network::Network(const NetworkIO& network_io, const Config& config) { + LITE_ERROR_HANDLER_BEGIN + m_config = config; + m_network_io = network_io; + if (config.backend == LiteBackend::LITE_DEFAULT) { + m_impl = call_func>( + "create_network"); + } else if (config.backend == LiteBackend::LITE_RK_NPU) { + m_impl = call_func>( + "create_network"); + } + m_impl->set_config(config); + m_impl->set_io(network_io); + LITE_ERROR_HANDLER_END +} + +void Network::load_model(void* model_mem, size_t size) { + LITE_ERROR_HANDLER_BEGIN + LITE_CHECK_NON_NULL_POINTER(m_impl); + //! 
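A hedged sketch of the in-memory load path defined just below: the model buffer is wrapped in a non-owning `shared_ptr` with a no-op deleter, so ownership and lifetime remain with the caller.

```cpp
// Illustrative sketch: load a model from a caller-owned buffer.
#include "lite/network.h"

#include <memory>
#include <vector>

std::shared_ptr<lite::Network> load_from_memory(
        std::vector<char>& model_blob) {
    auto network = std::make_shared<lite::Network>(lite::Config{},
                                                   lite::NetworkIO{});
    // load_model(void*, size_t) does not take ownership; keeping
    // model_blob alive for the network's lifetime is the safe choice.
    network->load_model(model_blob.data(), model_blob.size());
    return network;
}
```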
this model_mem is managed by user + std::shared_ptr model{model_mem, [](void*) {}}; + prase_model(model, size); + LITE_ERROR_HANDLER_END +} + +void Network::load_model(std::string model_path) { + LITE_ERROR_HANDLER_BEGIN + LITE_CHECK_NON_NULL_POINTER(m_impl); + FILE* fin = fopen(model_path.c_str(), "rb"); + LITE_ASSERT(fin, "failed to open %s: %s", model_path.c_str(), + strerror(errno)); + fseek(fin, 0, SEEK_END); + size_t size = ftell(fin); + fseek(fin, 0, SEEK_SET); + void* ptr = malloc(size); + std::shared_ptr buf{ptr, ::free}; + auto nr = fread(buf.get(), 1, size, fin); + LITE_ASSERT(nr == size); + fclose(fin); + prase_model(buf, size); + LITE_ERROR_HANDLER_END +} + +void Network::prase_model(std::shared_ptr model_data, size_t size) { + std::unordered_map separate_config_map; + ModelParser model_parser(model_data, size); + //! parse the model info + if (model_parser.parse_model_info(m_config, m_network_io, + separate_config_map, m_extra_info)) { + if (m_config.backend == LiteBackend::LITE_DEFAULT && + m_impl->get_backend_type() != LiteBackend::LITE_DEFAULT) { + m_impl.reset(try_call_func( + "parse_model")); + } else if (m_config.backend == LiteBackend::LITE_RK_NPU && + m_impl->get_backend_type() != LiteBackend::LITE_RK_NPU) { + m_impl.reset(try_call_func( + "parse_model")); + } + m_impl->set_config(m_config); + m_impl->set_io(m_network_io); + } + //! decryption the model + size_t model_length; + auto&& model_shared_ptr = model_parser.parse_model(model_length, m_config); + + m_impl->load_model(model_shared_ptr, model_length, separate_config_map); + m_loaded = true; + update_from_implement(); +} + +Network::~Network() = default; + +void Network::update_from_implement() { + m_config.device_type = m_impl->get_device_type(); +} + +void Network::compute_only_configured_output() { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(!m_loaded, + "compute_only_configured_output should be used before model " + "loaded."); + LITE_CHECK_NON_NULL_POINTER(m_impl); + return m_impl->compute_only_configured_output(); + LITE_ERROR_HANDLER_END +} + +std::shared_ptr Network::get_io_tensor(std::string name, + LiteTensorPhase phase) { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(m_loaded, "get_io_tensor should be used after model loaded."); + LITE_CHECK_NON_NULL_POINTER(m_impl); + return m_impl->get_io_tensor(name, phase); + LITE_ERROR_HANDLER_END +} + +std::shared_ptr Network::get_input_tensor(size_t index) { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(m_loaded, + "get_input_tensor should be used after model loaded."); + LITE_CHECK_NON_NULL_POINTER(m_impl); + return m_impl->get_input_tensor(index); + LITE_ERROR_HANDLER_END +} + +std::shared_ptr Network::get_output_tensor(size_t index) { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(m_loaded, + "get_output_tensor should be used after model loaded."); + LITE_CHECK_NON_NULL_POINTER(m_impl); + return m_impl->get_output_tensor(index); + LITE_ERROR_HANDLER_END +} + +Network& Network::set_async_callback(const AsyncCallback& callback) { + LITE_ERROR_HANDLER_BEGIN + LITE_CHECK_NON_NULL_POINTER(m_impl); + m_impl->set_async_callback(std::move(callback)); + return *this; + LITE_ERROR_HANDLER_END +} + +Network& Network::set_start_callback(const StartCallback& callback) { + LITE_ERROR_HANDLER_BEGIN + LITE_CHECK_NON_NULL_POINTER(m_impl); + m_impl->set_start_callback(std::move(callback)); + return *this; + LITE_ERROR_HANDLER_END +} + +Network& Network::set_finish_callback(const FinishCallback& callback) { + LITE_ERROR_HANDLER_BEGIN + LITE_CHECK_NON_NULL_POINTER(m_impl); + 
m_impl->set_finish_callback(std::move(callback)); + return *this; + LITE_ERROR_HANDLER_END +} + +Network& Network::set_device_id(int device_id) { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(!m_loaded, "set_device_id should be used before model loaded."); + LITE_CHECK_NON_NULL_POINTER(m_impl); + m_impl->set_device_id(device_id); + return *this; + LITE_ERROR_HANDLER_END +} + +Network& Network::set_stream_id(int stream_id) { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(!m_loaded, "set_stream_id should be used before model loaded."); + LITE_CHECK_NON_NULL_POINTER(m_impl); + m_impl->set_stream_id(stream_id); + return *this; + LITE_ERROR_HANDLER_END +} + +void Network::forward() { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(m_loaded, "forward should be used after model loaded."); + LITE_CHECK_NON_NULL_POINTER(m_impl.get()); + m_impl->forward(); + LITE_ERROR_HANDLER_END +} + +void Network::wait() { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(m_loaded, "wait should be used after model loaded."); + LITE_CHECK_NON_NULL_POINTER(m_impl); + m_impl->wait(); + LITE_ERROR_HANDLER_END +} + +std::string Network::get_input_name(size_t index) const { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(m_loaded, "get_input_name should be used after model loaded."); + LITE_CHECK_NON_NULL_POINTER(m_impl); + return m_impl->get_input_name(index); + LITE_ERROR_HANDLER_END +} + +std::string Network::get_output_name(size_t index) const { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(m_loaded, "get_output_name should be used after model loaded."); + LITE_CHECK_NON_NULL_POINTER(m_impl); + return m_impl->get_output_name(index); + LITE_ERROR_HANDLER_END +} + +std::vector Network::get_all_input_name() const { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(m_loaded, + "get_all_input_name should be used after model loaded."); + LITE_CHECK_NON_NULL_POINTER(m_impl); + auto all_input_name = m_impl->get_all_input_name(); + std::vector all_names; + for (auto& name : all_input_name) { + all_names.push_back(name); + } + return all_names; + LITE_ERROR_HANDLER_END +} + +std::vector Network::get_all_output_name() const { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(m_loaded, + "get_all_output_name should be used after model loaded."); + LITE_CHECK_NON_NULL_POINTER(m_impl); + auto all_output_name = m_impl->get_all_output_name(); + std::vector all_names; + for (auto& name : all_output_name) { + all_names.push_back(name); + } + return all_names; + LITE_ERROR_HANDLER_END +} + +int Network::get_device_id() const { + LITE_ERROR_HANDLER_BEGIN + LITE_CHECK_NON_NULL_POINTER(m_impl); + return m_impl->get_device_id(); + LITE_ERROR_HANDLER_END +} + +int Network::get_stream_id() const { + LITE_ERROR_HANDLER_BEGIN + LITE_CHECK_NON_NULL_POINTER(m_impl); + return m_impl->get_stream_id(); + LITE_ERROR_HANDLER_END +} + +void Network::enable_profile_performance(std::string profile_file_path) { + LITE_ERROR_HANDLER_BEGIN + m_impl->enable_profile_performance(profile_file_path); + LITE_ERROR_HANDLER_END +} + +const std::string& Network::get_model_extra_info() { + LITE_ERROR_HANDLER_BEGIN + return m_extra_info; + LITE_ERROR_HANDLER_END +} + +LiteDeviceType Network::get_device_type() const { + LITE_ERROR_HANDLER_BEGIN + return m_impl->get_device_type(); + LITE_ERROR_HANDLER_END +} + +/*********************** MGE special network function ***************/ + +void Runtime::set_cpu_threads_number(std::shared_ptr network, + size_t nr_threads) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl = NetworkHelper::implement(network); + if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { + 
LITE_ASSERT( + !NetworkHelper::loaded(network), + "set_cpu_threads_number should be used before model loaded."); + call_func("set_cpu_threads_number", network_impl, + nr_threads); + return; + } + LITE_THROW("set_cpu_threads_number is not aviliable in the backend."); + LITE_ERROR_HANDLER_END +} + +void Runtime::use_tensorrt(std::shared_ptr network) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl = NetworkHelper::implement(network); + if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { + LITE_ASSERT(!NetworkHelper::loaded(network), + "use_tensorrt should be used before model loaded."); + call_func("use_tensorrt", network_impl); + return; + } + LITE_THROW("use_tensorrt is not aviliable in the backend."); + LITE_ERROR_HANDLER_END +} + +size_t Runtime::get_cpu_threads_number(const std::shared_ptr network) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl = NetworkHelper::implement(network); + if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { + return call_func("get_cpu_threads_number", + network_impl); + } + LITE_THROW("get_cpu_threads_number is not aviliable in the backend."); + LITE_ERROR_HANDLER_END +} + +void Runtime::set_runtime_thread_affinity( + std::shared_ptr network, + const ThreadAffinityCallback& thread_affinity_callback) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl = NetworkHelper::implement(network); + if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { + LITE_ASSERT(NetworkHelper::loaded(network), + "set_runtime_thread_affinity should be used after model " + "loaded."); + call_func("set_runtime_thread_affinity", + network_impl, thread_affinity_callback); + + return; + } + LITE_THROW("set_runtime_thread_affinity is not aviliable in the backend."); + LITE_ERROR_HANDLER_END +} + +void Runtime::set_cpu_inplace_mode(std::shared_ptr network) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl = NetworkHelper::implement(network); + if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { + LITE_ASSERT(!NetworkHelper::loaded(network), + "set_cpu_inplace_mode should be used before model loaded."); + call_func("set_cpu_inplace_mode", network_impl); + return; + } + LITE_THROW("set_cpu_inplace_mode is not aviliable in the backend."); + LITE_ERROR_HANDLER_END +} + +bool Runtime::is_cpu_inplace_mode(const std::shared_ptr network) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl = NetworkHelper::implement(network); + if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { + return call_func("is_cpu_inplace_mode", + network_impl); + } + LITE_THROW("is_cpu_inplace_mode is not aviliable in the backend."); + LITE_ERROR_HANDLER_END +} + +//! set opr algorithm selection strategy in the network +void Runtime::set_network_algo_policy(std::shared_ptr network, + LiteAlgoSelectStrategy strategy, + uint32_t shared_batch_size, + bool binary_equal_between_batch) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl = NetworkHelper::implement(network); + if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { + call_func("set_network_algo_policy", network_impl, + strategy, shared_batch_size, + binary_equal_between_batch); + return; + } + LITE_THROW("set_network_algo_policy is not aviliable in the backend."); + LITE_ERROR_HANDLER_END +} + +//! 
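A short sketch of the algorithm-policy knobs exposed through `Runtime`, with placeholder values. Note the ordering constraints visible in the implementation: the policy is best set before `load_model()` (the implement warns otherwise), while the workspace limit requires a loaded network.

```cpp
// Illustrative sketch of fast-run style algorithm selection.
#include "lite/network.h"

#include <memory>

void configure_fast_run(std::shared_ptr<lite::Network> network) {
    // best called before load_model(); the implement warns when the
    // policy changes after the graph has been compiled
    lite::Runtime::set_network_algo_policy(
            network, LiteAlgoSelectStrategy::LITE_ALGO_PROFILE,
            /*shared_batch_size=*/1,
            /*binary_equal_between_batch=*/false);
}

void cap_workspace(std::shared_ptr<lite::Network> network) {
    // requires a loaded network; the limit is in bytes and trades
    // speed for peak memory (100 MiB here is a placeholder)
    lite::Runtime::set_network_algo_workspace_limit(network,
                                                    100 * 1024 * 1024);
}
```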
set opr algorithm selection strategy in the network +void Runtime::set_network_algo_workspace_limit(std::shared_ptr network, + size_t workspace_limit) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl = NetworkHelper::implement(network); + if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { + LITE_ASSERT(NetworkHelper::loaded(network), + "set_network_algo_policy should be used after model " + "loaded."); + call_func("set_network_algo_workspace_limit", + network_impl, workspace_limit); + return; + } + LITE_THROW( + "set_network_algo_workspace_limit is not aviliable in the " + "backend."); + LITE_ERROR_HANDLER_END +} + +//! set the network memroy allocator, the allocator is defined by user +void Runtime::set_memory_allocator(std::shared_ptr network, + std::shared_ptr user_allocator) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl = NetworkHelper::implement(network); + if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { + LITE_ASSERT(!NetworkHelper::loaded(network), + "set_memory_allocator should be used before model loaded."); + call_func("set_memory_allocator", network_impl, + user_allocator); + return; + } + LITE_THROW("set_memory_allocator is not aviliable in the backend."); + LITE_ERROR_HANDLER_END +} + +void Runtime::share_runtime_memory_with(std::shared_ptr dst_network, + std::shared_ptr src_network) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl_dst = NetworkHelper::implement(dst_network); + if (network_impl_dst->get_backend_type() == LiteBackend::LITE_DEFAULT) { + LITE_ASSERT(!NetworkHelper::loaded(dst_network), + "share_runtime_memory_with should be used before model " + "loaded."); + call_func("share_runtime_memory_with", + network_impl_dst, + NetworkHelper::implement(src_network)); + return; + } + LITE_THROW("share_runtime_memory_with is not aviliable in the backend."); + LITE_ERROR_HANDLER_END +} + +void Runtime::enable_io_txt_dump(std::shared_ptr network, + std::string io_txt_out_file) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl = NetworkHelper::implement(network); + if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { + call_func("enable_io_txt_dump", network_impl, + io_txt_out_file); + return; + } + LITE_THROW("enable_io_txt_dump is not aviliable in the backend."); + LITE_ERROR_HANDLER_END +} + +void Runtime::enable_io_bin_dump(std::shared_ptr network, + std::string io_bin_out_dir) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl = NetworkHelper::implement(network); + if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { + call_func("enable_io_bin_dump", network_impl, + io_bin_out_dir); + return; + } + LITE_THROW("enable_io_bin_dump is not aviliable in the backend."); + LITE_ERROR_HANDLER_END +} + +void Runtime::shared_weight_with_network( + std::shared_ptr dst_network, + const std::shared_ptr src_network) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl_dst = NetworkHelper::implement(dst_network); + if (network_impl_dst->get_backend_type() == LiteBackend::LITE_DEFAULT) { + LITE_ASSERT(NetworkHelper::loaded(src_network), + "shared_weight_with_network should be used after the src " + "network " + "loaded."); + auto src_implment = NetworkHelper::implement(src_network); + call_func("shared_weight_with", network_impl_dst, + src_implment); + NetworkHelper::loaded(dst_network, true); + return; + } + LITE_THROW("shared_weight_with_network is not aviliable in the backend."); + LITE_ERROR_HANDLER_END +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/network_impl_base.h 
b/lite/src/network_impl_base.h new file mode 100644 index 0000000000000000000000000000000000000000..c90af5b5d316482f1704764b6901983ab69945b2 --- /dev/null +++ b/lite/src/network_impl_base.h @@ -0,0 +1,161 @@ +/** + * \file src/network_impl_base.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "lite/network.h" +#include "misc.h" +#include "tensor_impl_base.h" +#include "type_info.h" + +#include + +namespace lite { + +/*! + * \brief the Inner IO data struct, add some inner data from IO + */ +class IOInner : public IO { +public: + //! use to flag the corresponding lite_tensor is filled, when the + //! value of lite_tensor is filled, the have_sync is true, other wise false, + //! this is used in async mode + bool have_sync = false; + //! Real input and output data location + std::shared_ptr lite_tensor = nullptr; + + IOInner() = default; + IOInner(const IO& io) { + name = io.name; + is_host = io.is_host; + io_type = io.io_type; + config_layout = io.config_layout; + } +}; + +/*! + * \brief the realy network IO info when network run + */ +struct NetworkIOInner { + std::vector inputs; + std::vector outputs; +}; + +/*! + * \brief implement the Network, contain the mgb related member + */ +class Network::NetworkImplBase : public DynTypeObj { +public: + virtual ~NetworkImplBase() = default; + + //! set the config of the network, include: + //! the inference device + //! the other inference options, such as record_level, weight_preprocess... + virtual void set_config(const Config& config) = 0; + + //! set the special io infomation, if not set, default io tensor will used, + //! this is special for input/output is not host tensor, default the + //! input/output tensors are host tensor + virtual void set_io(const NetworkIO& network_io) = 0; + + //! only compute the output tensor in user configured + virtual void compute_only_configured_output() = 0; + + //! get the network input and ouput tensor, the layout of which is + //! sync from mge tensor + virtual std::shared_ptr get_io_tensor( + std::string io_name, + LiteTensorPhase phase = LiteTensorPhase::LITE_IO) = 0; + + //! get the input tensor by index in the load_result tensormap + virtual std::shared_ptr get_input_tensor(size_t index) = 0; + + //! get the output tensor by index in the load_result output_var_list + virtual std::shared_ptr get_output_tensor(size_t index) = 0; + + //! get all the input tensor name in the order in load return + virtual std::vector get_all_input_name() const = 0; + + //! get all the output tensor name in the order in load return + virtual std::vector get_all_output_name() const = 0; + + //! get the input tensor name in the order in load return + virtual const char* get_input_name(size_t index) const = 0; + + //! get the output tensor name in the order in load return + virtual const char* get_output_name(size_t index) const = 0; + + //! set the callback in async model + virtual void set_async_callback(const AsyncCallback& callback) = 0; + + //! set the start callback which will execute before network forward + virtual void set_start_callback(const StartCallback& callback) = 0; + + //! set the finish callback which will execute after network forward + virtual void set_finish_callback(const FinishCallback& callback) = 0; + + //! 
load the model and get the m_load_result + virtual void load_model(std::shared_ptr model_mem, size_t size, + std::unordered_map + separate_config_map = {}) = 0; + + //! forward the network with filled input data and fill the output data + //! to the output tensor + virtual void forward() = 0; + + //! in sync model, wait utile the inference finish + virtual void wait() = 0; + + //! set device id, default device id = 0 + virtual void set_device_id(int device_id) = 0; + virtual int get_device_id() const = 0; + virtual LiteBackend get_backend_type() const = 0; + //! set stream id, default stream id = 0 + virtual void set_stream_id(int stream_id) = 0; + virtual int get_stream_id() const = 0; + + virtual LiteDeviceType get_device_type() const = 0; + + //! enable profile the network, a file will be generated + virtual void enable_profile_performance(std::string profile_file_path) = 0; +}; + +/******************************** friend class *****************************/ +/*! + * \brief friend class of Network, for convenient accessing the Network members + */ +class NetworkHelper { +public: + static bool loaded(const std::shared_ptr network) { + LITE_ASSERT(network); + return network->m_loaded; + } + static void loaded(const std::shared_ptr network, bool loaded) { + LITE_ASSERT(network); + network->m_loaded = loaded; + } + static Network::NetworkImplBase* implement(const Network* network) { + LITE_ASSERT(network); + return network->m_impl.get(); + } + static Network::NetworkImplBase* implement( + const std::shared_ptr network) { + LITE_ASSERT(network); + return network->m_impl.get(); + } + static void implement(const std::shared_ptr network, + std::unique_ptr impl) { + LITE_ASSERT(network); + network->m_impl = std::move(impl); + } +}; + +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/parse_info/default_parse.h b/lite/src/parse_info/default_parse.h new file mode 100644 index 0000000000000000000000000000000000000000..921b6354d566e6cdc4813cf63244dc2af7bd5c41 --- /dev/null +++ b/lite/src/parse_info/default_parse.h @@ -0,0 +1,246 @@ +/** + * \file src/parse_info/default_parse.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "../misc.h" + +#include "lite/global.h" +#include "lite/network.h" +#include "nlohmann/json.hpp" + +namespace lite { +//! The LITE_default parse info function +bool default_parse_info( + const void* info_ptr, size_t length, const std::string& model_name, + Config& config, NetworkIO& network_io, + std::unordered_map& separate_config_map, + std::string& extra_info) { + using json = nlohmann::json; + std::string json_string(static_cast(info_ptr), length); + auto info = json::parse(json_string); + + if (!info["valid"]) { + return false; + } + auto info_model_name = info["name"]; + if (info_model_name != model_name) { + LITE_THROW( + ssprintf("infomation of model name is not match, packed model " + "is %s, but json info get %s.", + model_name.c_str(), + static_cast(info_model_name).c_str())); + } + //! 
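// The NetworkHelper above is a "friend accessor": Network keeps its members
// private and befriends exactly one helper class that internal code uses to
// reach them, so the public API stays small.  A self-contained sketch of the
// same idiom (illustrative only, unrelated to the lite types):
#include <cassert>

class Widget {
public:
    int value() const { return m_value; }

private:
    friend class WidgetHelper;  // the only external write path to m_value
    int m_value = 0;
};

class WidgetHelper {
public:
    static void set_value(Widget& w, int v) { w.m_value = v; }
};

inline int widget_helper_demo() {
    Widget w;
    WidgetHelper::set_value(w, 42);  // what NetworkHelper::loaded() does for Network
    assert(w.value() == 42);
    return w.value();
}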
check version + std::string model_version = info["version"]; + int major = std::stoi(model_version.substr(0, model_version.find("."))); + int start = model_version.find(".") + 1; + int minor = std::stoi( + model_version.substr(start, model_version.find(".", start))); + start = model_version.find(".", start) + 1; + int patch = std::stoi(model_version.substr(start)); + int lite_major, lite_minor, lite_patch; + lite::get_version(lite_major, lite_minor, lite_patch); + size_t model_version_sum = (major * 10000 + minor) * 100 + patch; + size_t lite_version_sum = + (lite_major * 10000 + lite_minor) * 100 + lite_patch; + if (model_version_sum > lite_version_sum) { + LITE_WARN("Lite load the future version model !!!!!!!!!!!!!"); + } + + if (info.contains("has_compression")) { + config.has_compression = info["has_compression"]; + } + if (info.contains("backend")) { + if (info["backend"] == "MGE") { + config.backend = LiteBackend::LITE_DEFAULT; + } + if (info["backend"] == "RK") { + config.backend = LiteBackend::LITE_RK_NPU; + } + } + + auto get_device_type = [](std::string type) -> LiteDeviceType { + if (type == "CPU") + return LiteDeviceType::LITE_CPU; + if (type == "CUDA") + return LiteDeviceType::LITE_CUDA; + if (type == "OPENCL") + return LiteDeviceType::LITE_OPENCL; + if (type == "ATLAS") + return LiteDeviceType::LITE_ATLAS; + if (type == "NPU") + return LiteDeviceType::LITE_NPU; + else { + LITE_THROW(ssprintf("LITE not support device type of %s.", + type.c_str())); + } + }; + if (info.contains("device")) { + auto device_json = info["device"]; + config.device_type = get_device_type(device_json["type"]); + if (device_json.contains("device_id")) { + separate_config_map["device_id"] = + static_cast(device_json["device_id"]); + } + if (device_json.contains("number_threads")) { + separate_config_map["number_threads"] = + static_cast(device_json["number_threads"]); + } + if (device_json.contains("enable_inplace_model")) { + separate_config_map["enable_inplace_model"] = + static_cast(device_json["enable_inplace_model"]); + } + if (device_json.contains("use_tensorrt")) { + separate_config_map["use_tensorrt"] = + static_cast(device_json["use_tensorrt"]); + } + } + //! 
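// The version check above collapses "major.minor.patch" into one ordinal via
// (major * 10000 + minor) * 100 + patch.  A small illustrative sketch of that
// encoding (assumes minor < 10000 and patch < 100 so the ordering stays
// monotonic):
#include <cstddef>

inline size_t encode_version(int major, int minor, int patch) {
    return (static_cast<size_t>(major) * 10000 + minor) * 100 + patch;
}
// A model written by 8.10.1 loaded with lite 8.9.2:
//   encode_version(8, 10, 1) == 8001001 > encode_version(8, 9, 2) == 8000902,
// so the "future version model" warning above would be emitted.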
options + if (info.contains("options")) { + auto options = info["options"]; + if (options.contains("weight_preprocess")) + config.options.weight_preprocess = options["weight_preprocess"]; + if (options.contains("fuse_preprocess")) + config.options.fuse_preprocess = options["fuse_preprocess"]; + if (options.contains("fake_next_exec")) + config.options.fake_next_exec = options["fake_next_exec"]; + if (options.contains("var_sanity_check_first_run")) + config.options.var_sanity_check_first_run = + options["var_sanity_check_first_run"]; + if (options.contains("const_shape")) + config.options.const_shape = options["const_shape"]; + if (options.contains("force_dynamic_alloc")) + config.options.force_dynamic_alloc = options["force_dynamic_alloc"]; + if (options.contains("force_output_dynamic_alloc")) + config.options.force_output_dynamic_alloc = + options["force_output_dynamic_alloc"]; + if (options.contains("no_profiling_on_shape_change")) + config.options.no_profiling_on_shape_change = + options["no_profiling_on_shape_change"]; + if (options.contains("jit_level")) + config.options.jit_level = options["jit_level"]; + if (options.contains("comp_node_seq_record_level")) + config.options.comp_node_seq_record_level = + options["comp_node_seq_record_level"]; + if (options.contains("graph_opt_level")) + config.options.graph_opt_level = options["graph_opt_level"]; + if (options.contains("async_exec_level")) + config.options.async_exec_level = options["async_exec_level"]; + } + //! IO + auto get_io_type = [](std::string type) -> LiteIOType { + if (type == "value") + return LiteIOType::LITE_IO_VALUE; + if (type == "shape") + return LiteIOType::LITE_IO_SHAPE; + else { + LITE_THROW( + ssprintf("LITE not support IO type of %s.", type.c_str())); + } + }; + auto get_data_type = [](std::string type) -> LiteDataType { + if (type == "float32") + return LiteDataType::LITE_FLOAT; + if (type == "float16") + return LiteDataType::LITE_HALF; + if (type == "int32") + return LiteDataType::LITE_INT; + if (type == "int16") + return LiteDataType::LITE_INT16; + if (type == "int8") + return LiteDataType::LITE_INT8; + if (type == "uint8") + return LiteDataType::LITE_UINT8; + else { + LITE_THROW(ssprintf("LITE not support data type of %s.", + type.c_str())); + } + }; +#define SET_SHAPE(shape_json_, config_) \ + do { \ + int ndim = 0; \ + for (int i = 0; i < 4; i++) { \ + if (shape_json_.contains(shape_name[i])) { \ + ndim++; \ + config_.config_layout.shapes[i] = shape_json_[shape_name[i]]; \ + } else { \ + break; \ + } \ + } \ + config_.config_layout.ndim = ndim; \ + } while (0) + +#define Config_IO(io_json_, io_config_) \ + if (io_json_.contains("is_host")) \ + io_config_.is_host = io_json_["is_host"]; \ + if (io_json_.contains("io_type")) \ + io_config_.io_type = get_io_type(io_json_["io_type"]); \ + if (io_json_.contains("dtype")) \ + io_config_.config_layout.data_type = get_data_type(io_json_["dtype"]); \ + if (io_json_.contains("shape")) { \ + auto shape_json = io_json_["shape"]; \ + SET_SHAPE(shape_json, io_config_); \ + } + + const std::string shape_name[] = {"dim0", "dim1", "dim2", "dim3"}; + if(info.contains("IO")){ + auto IOs = info["IO"]; + if(IOs.contains("inputs")){ + auto inputs = IOs["inputs"]; + for (size_t i = 0; i < inputs.size(); i++) { + auto input_json = inputs[i]; + bool found = false; + for (auto&& io_config : network_io.inputs) { + if (io_config.name == input_json["name"]) { + found = true; + Config_IO(input_json, io_config); + } + } + if (!found) { + IO input; + input.name = input_json["name"]; + 
Config_IO(input_json, input); + network_io.inputs.push_back(input); + } + } + } + if (IOs.contains("outputs")) { + auto outputs = IOs["outputs"]; + for (size_t i = 0; i < outputs.size(); i++) { + auto output_json = outputs[i]; + bool found = false; + for (auto&& io_config : network_io.outputs) { + if (io_config.name == output_json["name"]) { + found = true; + Config_IO(output_json, io_config); + } + } + if (!found) { + IO output; + output.name = output_json["name"]; + Config_IO(output_json, output); + network_io.outputs.push_back(output); + } + } + } + } + //! extra_info + if (info.contains("extra_info")) { + extra_info = info["extra_info"].dump(); + } + return true; +#undef GET_BOOL +#undef Config_IO +} + +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/parse_info/parse_info_base.h b/lite/src/parse_info/parse_info_base.h new file mode 100644 index 0000000000000000000000000000000000000000..d54ed05410fc06d605e43ec1e049ec7fcbfa02f0 --- /dev/null +++ b/lite/src/parse_info/parse_info_base.h @@ -0,0 +1,40 @@ +/** + * \file src/parse_info/parse_info_base.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once +#include "lite/global.h" +#include "mutex" + +namespace lite { + +struct ParseInfoStaticData { + std::unordered_map parse_info_methods; + LITE_MUTEX map_mutex; +}; + +ParseInfoStaticData& parse_info_static_data(); + +template +struct ParseInfoRegister; +} // namespace lite + +#define REGIST_PARSE_INFO_FUNCTION(name_, func_) \ + REGIST_PARSE_INFO_FUNCTION_WITH_NUM(__COUNTER__, name_, func_) + +#define REGIST_PARSE_INFO_FUNCTION_WITH_NUM(number_, name_, func_) \ + template <> \ + struct ParseInfoRegister { \ + ParseInfoRegister() { register_parse_info_func(name_, func_); } \ + }; \ + namespace { \ + ParseInfoRegister parse_info_##number_; \ + } + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/parse_model/model_parser.cpp b/lite/src/parse_model/model_parser.cpp new file mode 100644 index 0000000000000000000000000000000000000000..de45f48409f162e5920ada98f8077c804e9e6141 --- /dev/null +++ b/lite/src/parse_model/model_parser.cpp @@ -0,0 +1,134 @@ +/** + * \file src/model_parser.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "model_parser.h" +#include "decryption/decrypt_base.h" +#include "parse_info/parse_info_base.h" + +using namespace lite; +using namespace model_parse; + +std::string ModelParser::sm_model_tag = "packed_model"; + +void ModelParser::parse_header() { + size_t tag_length = sm_model_tag.size(); + + //! parse model tag + const char* ptr = static_cast(m_model.get()); + std::string tag(static_cast(ptr), tag_length); + if (sm_model_tag == tag) { + m_is_bare_model = false; + } else { + //! 
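// An example of the JSON info document that default_parse_info() above
// understands -- illustrative values only; every section except "valid",
// "name" and "version" is optional, and "extra_info" may carry arbitrary JSON
// that is passed through to the caller as a string.
static const char* const example_model_info = R"json({
    "valid": true,
    "name": "shufflenet",
    "version": "8.9.1",
    "has_compression": false,
    "backend": "MGE",
    "device": {"type": "CPU", "device_id": 0, "number_threads": 2},
    "options": {"weight_preprocess": true, "var_sanity_check_first_run": false},
    "IO": {
        "inputs":  [{"name": "data", "is_host": true, "dtype": "float32",
                     "shape": {"dim0": 1, "dim1": 3, "dim2": 224, "dim3": 224}}],
        "outputs": [{"name": "prob", "is_host": true}]
    }
})json";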
if no tag, the model is bare model, return + m_is_bare_model = true; + return; + } + + uint8_t* buffer = static_cast(m_model.get()) + tag_length; + auto packed_model = GetPackModel(buffer); + auto models = packed_model->models(); + LITE_ASSERT(models->size() == 1, "Now only support one model"); + auto model = models->Get(0); + m_model_name = model->header()->name()->c_str(); + m_model_decryption_name = + model->header()->model_decryption_method()->c_str(); + m_info_decryption_name = model->header()->info_decryption_method()->c_str(); + m_info_parse_func_name = model->header()->info_parse_method()->c_str(); + + m_info = model->info(); + m_model_data = model->data(); +} + +bool ModelParser::parse_model_info( + Config& network_config, NetworkIO& network_io, + std::unordered_map& isolated_config_map, + std::string& extra_info) const { + //! no model info, no parse, direct return + if (m_is_bare_model || !m_info) { + return false; + } + size_t info_length = m_info->data()->size(); + const uint8_t* info_data = m_info->data()->Data(); + //! decryption the info + auto info_ptr = decrypt_memory(info_data, info_length, + m_info_decryption_name, info_length); + //! parse the info + LITE_LOCK_GUARD(parse_info_static_data().map_mutex); + auto it_parse = parse_info_static_data().parse_info_methods.find( + m_info_parse_func_name); + if (it_parse == parse_info_static_data().parse_info_methods.end()) { + LITE_THROW(ssprintf("can't find model info parse function %s.", + m_info_parse_func_name.c_str())); + } + auto model_info_parse_func = + parse_info_static_data().parse_info_methods[m_info_parse_func_name]; + //! convert for NetworkIOInner to NetworkIO + if (model_info_parse_func) { + model_info_parse_func(info_ptr.get(), info_length, m_model_name, + network_config, network_io, isolated_config_map, + extra_info); + } else { + LITE_THROW(ssprintf("model info parse function of %s is empty", + m_info_parse_func_name.c_str())); + } + return true; +} + +std::shared_ptr ModelParser::parse_model(size_t& model_length, + const Config& config) const { + if (m_is_bare_model) { + if (config.bare_model_cryption_name.size() == 0) { + model_length = m_total_length; + return m_model; + } else { + return decrypt_memory( + static_cast(m_model.get()), m_total_length, + config.bare_model_cryption_name, model_length); + } + } + LITE_ASSERT(m_model_data, "packed model parse error!"); + model_length = m_model_data->data()->size(); + const uint8_t* model_data = m_model_data->data()->Data(); + LITE_ASSERT(model_length > 0, "The loaded model is of zero length."); + return decrypt_memory(model_data, model_length, m_model_decryption_name, + model_length); +} + +std::shared_ptr ModelParser::decrypt_memory( + const uint8_t* data, size_t length, const std::string decryption_name, + size_t& result_length) const { + const uint8_t* memory_ptr = data; + if (decryption_name == "NONE") { + result_length = length; + return std::shared_ptr(const_cast(memory_ptr), + [](void*) {}); + } + LITE_LOCK_GUARD(decryption_static_data().map_mutex); + auto it = decryption_static_data().decryption_methods.find(decryption_name); + if (it == decryption_static_data().decryption_methods.end()) { + LITE_THROW(ssprintf("The decryption method %s is not registed yet.", + decryption_name.c_str())); + } + auto&& func = it->second.first; + auto&& key = it->second.second; + if (func) { + auto model_vector = func(memory_ptr, length, *key); + result_length = model_vector.size(); + auto tmp_model_vector = + new std::vector(std::move(model_vector)); + return 
std::shared_ptr( + tmp_model_vector->data(), + [tmp_model_vector](void*) { delete tmp_model_vector; }); + } else { + LITE_THROW(ssprintf("No decryption function in %s method.", + decryption_name.c_str())); + } +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/parse_model/model_parser.h b/lite/src/parse_model/model_parser.h new file mode 100644 index 0000000000000000000000000000000000000000..d6edb8f0eefa12a7b32d0687f601d04483e46647 --- /dev/null +++ b/lite/src/parse_model/model_parser.h @@ -0,0 +1,75 @@ +/** + * \file src/model_parser.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once +#include "lite/global.h" +#include "../network_impl_base.h" + +#include "pack_model_generated.h" +#include + +#include + +namespace lite { + +/*! + * \brief parse the model and decyt + */ +class ModelParser { +public: + ModelParser(std::shared_ptr model_ptr, size_t model_length) + : m_model(model_ptr), m_total_length(model_length) { + //! parse the header + parse_header(); + } + + //! parse the Info part of the model, update the network_config and + //! network_io + bool parse_model_info( + Config& network_config, NetworkIO& network_io, + std::unordered_map& isolated_config_map, + std::string& extra_info) const; + + //! parse the model and decrypt the model + std::shared_ptr parse_model(size_t& model_length, + const Config& config) const; + +private: + //! parse the header of the model and store the model related information + //! to the menber data + void parse_header(); + + //! decrypt a memory with length of length and decryption method name + //! decrypt_name + std::shared_ptr decrypt_memory(const uint8_t* data, size_t length, + const std::string decryption_name, + size_t& result_length) const; + +private: + std::string m_model_name; + //! the info and model decryption method name, the + //! decryption func can be found through this name + std::string m_info_decryption_name; + std::string m_model_decryption_name; + //! the function name to parse the model info + std::string m_info_parse_func_name; + //! if a model is not added json info to the model is not crypted, the + //! 
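// Typical call sequence for ModelParser -- a hedged sketch.  Template
// arguments are stripped in this diff, so std::shared_ptr<void> for the model
// memory and LiteAny as the map's value type are assumptions inferred from the
// surrounding code.
#include <memory>
#include <string>
#include <unordered_map>
#include "model_parser.h"

inline void parse_packed_model_example(std::shared_ptr<void> model_mem,
                                       size_t size) {
    lite::Config config;
    lite::NetworkIO io;
    std::unordered_map<std::string, lite::LiteAny> separate_config;
    std::string extra_info;

    lite::ModelParser parser(model_mem, size);  // reads the "packed_model" tag
    parser.parse_model_info(config, io, separate_config, extra_info);

    size_t model_length = 0;
    auto raw_model = parser.parse_model(model_length, config);
    // raw_model / model_length now hold the (decrypted) bare model bytes.
    (void)raw_model;
}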
model is a bare model + bool m_is_bare_model = true; + + const model_parse::ModelInfo* m_info = nullptr; + const model_parse::ModelData* m_model_data = nullptr; + + std::shared_ptr m_model; + size_t m_total_length; + + static std::string sm_model_tag; +}; +} // namespace lite + // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/parse_model/pack_model.fbs b/lite/src/parse_model/pack_model.fbs new file mode 100644 index 0000000000000000000000000000000000000000..d0bc442eadafcaaf9db2bc993bc803126a9400f1 --- /dev/null +++ b/lite/src/parse_model/pack_model.fbs @@ -0,0 +1,28 @@ +namespace model_parse; + +table ModelHeader { + name:string; + info_decryption_method:string; + info_parse_method:string; + model_decryption_method:string; +} + +table ModelInfo { + data:[ubyte]; +} + +table ModelData { + data:[ubyte]; +} + +table Model { + header:ModelHeader; + info:ModelInfo; + data:ModelData; +} + +table PackModel { + models:[Model]; +} + +root_type PackModel; diff --git a/lite/src/tensor.cpp b/lite/src/tensor.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6bda653f646ec5b95e5b5b80cc94a439248fa2cf --- /dev/null +++ b/lite/src/tensor.cpp @@ -0,0 +1,339 @@ +/** + * \file src/tensor.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "lite/tensor.h" +#include "function_base.h" +#include "tensor_impl_base.h" +#if LITE_BUILD_WITH_MGE +#include "megbrain/comp_node.h" +#include "megbrain/tensor.h" +#include "mge/function_dft.h" +#include "mge/tensor_impl.h" +#endif + +#include + +using namespace lite; + +size_t Layout::get_elem_size() const { + size_t elesize = 1; + switch (data_type) { + case LiteDataType::LITE_INT64: + elesize = 8; + break; + case LiteDataType::LITE_FLOAT: + case LiteDataType::LITE_INT: + case LiteDataType::LITE_UINT: + elesize = 4; + break; + case LiteDataType::LITE_HALF: + case LiteDataType::LITE_INT16: + case LiteDataType::LITE_UINT16: + elesize = 2; + break; + case LiteDataType::LITE_INT8: + case LiteDataType::LITE_UINT8: + elesize = 1; + break; + default: + LITE_THROW("not support data type."); + } + return elesize; +} + +bool Layout::operator==(const Layout& other) const { + bool equal = true; + equal &= (ndim == other.ndim); + equal &= (data_type == other.data_type); + for (size_t i = 0; i < ndim; i++) { + equal &= (shapes[i] == other.shapes[i]); + } + return equal; +} + +Tensor::~Tensor() = default; + +Tensor::Tensor() { + LITE_ERROR_HANDLER_BEGIN + m_tensor_impl = call_func>( + "create_tensor"); + LITE_ERROR_HANDLER_END +} + +Tensor::Tensor(LiteDeviceType device_type, bool is_pinned_host) + : m_is_pinned_host(is_pinned_host), m_device_type(device_type) { + LITE_ERROR_HANDLER_BEGIN + m_tensor_impl = call_func>( + "create_tensor", device_type, is_pinned_host); + LITE_ERROR_HANDLER_END +} + +Tensor::Tensor(LiteDeviceType device_type, const Layout& layout, + bool is_pinned_host) + : m_is_pinned_host(is_pinned_host), + m_layout(layout), + m_device_type(device_type) { + LITE_ERROR_HANDLER_BEGIN + m_tensor_impl = call_func>( + "create_tensor", device_type, layout, is_pinned_host); + LITE_ERROR_HANDLER_END +} + +Tensor::Tensor(int device_id, LiteDeviceType device_type, const Layout& layout, + bool is_pinned_host) + : m_is_pinned_host(is_pinned_host), + m_device_id(device_id), + m_layout(layout), + m_device_type(device_type) { + LITE_ERROR_HANDLER_BEGIN + m_tensor_impl = call_func>( + 
"create_tensor", device_id, device_type, layout, is_pinned_host); + LITE_ERROR_HANDLER_END +} + +Tensor::Tensor(int device_id, int stream_id, LiteDeviceType device_type, + bool is_pinned_host) + : m_is_pinned_host(is_pinned_host), + m_device_id(device_id), + m_device_type(device_type) { + LITE_ERROR_HANDLER_BEGIN + m_tensor_impl = call_func>( + "create_tensor", device_id, stream_id, device_type, is_pinned_host); + LITE_ERROR_HANDLER_END +} + +Tensor::Tensor(LiteBackend backend, LiteDeviceType device_type, int device_id, + const Layout& layout, bool is_pinned_host) { + if (backend == LiteBackend::LITE_DEFAULT) { + m_tensor_impl = + call_func>( + "create_tensor", device_id, device_type, layout, + is_pinned_host); + } else { + LITE_MARK_USED_VAR(device_type); + LITE_MARK_USED_VAR(is_pinned_host); + LITE_MARK_USED_VAR(layout); + LITE_MARK_USED_VAR(device_id); + LITE_THROW("unknow backend, enum id is : %d."); + } +} + +void Tensor::reshape(const std::vector& shape) { + LITE_ASSERT(m_layout.ndim > 0, "The tensor to be reshape is empty."); + uint32_t length = shape.size(); + LITE_ASSERT(length < Layout::MAXDIM, + "The ndim of reshape input is too large."); + Layout new_layout = m_layout; + new_layout.ndim = length; + size_t total_length = + get_tensor_total_size_in_byte() / m_layout.get_elem_size(); + uint32_t unfixed_number = 0; + uint32_t unfixed_index = 0; + for (uint32_t i = 0; i < length; i++) { + if (shape[i] == -1) { + unfixed_number += 1; + unfixed_index = i; + } else { + LITE_ASSERT(shape[i] > 0, "The reshape inputs invalid."); + new_layout.shapes[i] = shape[i]; + } + } + LITE_ASSERT(unfixed_number <= 1, "The reshape inputs invalid."); + if (unfixed_number) { + size_t left = total_length; + for (uint32_t i = 0; i < length; i++) { + if (i == unfixed_index) { + continue; + } else { + LITE_ASSERT(left > 0 && (left % new_layout.shapes[i] == 0), + "The reshape inputs invalid."); + left = left / new_layout.shapes[i]; + } + } + LITE_ASSERT(left > 0, "The reshape inputs invalid."); + new_layout.shapes[unfixed_index] = left; + } + size_t new_total = 1; + for (uint32_t i = 0; i < length; i++) { + new_total *= new_layout.shapes[i]; + } + LITE_ASSERT(new_total == total_length, "The reshape inputs invalid."); + m_layout = new_layout; + m_tensor_impl->reshape(m_layout); +} + +size_t Tensor::get_tensor_total_size_in_byte() const { + LITE_ERROR_HANDLER_BEGIN + size_t elemsize = m_layout.get_elem_size(); + size_t total = m_layout.ndim == 0 ? 
0 : 1; + for (size_t i = 0; i < m_layout.ndim; i++) { + total *= m_layout.shapes[i]; + } + return total * elemsize; + LITE_ERROR_HANDLER_END +} + +void* Tensor::get_memory_ptr() const { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(m_layout.ndim != 0, + "Tensor layout is not valid when get memory ptr."); + return m_tensor_impl->get_memory_ptr(); + LITE_ERROR_HANDLER_END +} + +void* Tensor::get_memory_ptr(const std::vector& idx) const { + LITE_ERROR_HANDLER_BEGIN + return m_tensor_impl->get_memory_ptr(idx); + LITE_ERROR_HANDLER_END +} + +std::shared_ptr Tensor::slice(const std::vector& start, + const std::vector& end, + const std::vector& step) { + LITE_ERROR_HANDLER_BEGIN + auto ret = m_tensor_impl->slice(start, end, step); + ret->update_from_implement(); + return ret; + LITE_ERROR_HANDLER_END +} + +void Tensor::fill_zero() { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(m_layout.ndim > 0, + "fill_zero can't apply on a tensor with empty layout."); + m_tensor_impl->fill_zero(); + LITE_ERROR_HANDLER_END +} + +void Tensor::share_memory_with(const Tensor& src_tensor) { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(src_tensor.m_layout.ndim > 0, + "To be shared tensor with empty layout."); + m_tensor_impl->share_memory_with(src_tensor.m_tensor_impl.get()); + update_from_implement(); + LITE_ERROR_HANDLER_END +} + +void Tensor::set_layout(const Layout& layout) { + LITE_ERROR_HANDLER_BEGIN + m_layout = layout; + m_tensor_impl->set_layout(layout); + LITE_ERROR_HANDLER_END +} + +void Tensor::reset(void* prepared_data, size_t data_length_in_byte) { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(m_layout.ndim, + "Tensor layout is empty, please reset with layout"); + LITE_ASSERT(data_length_in_byte >= get_tensor_total_size_in_byte(), + "the memory reset to the tensor is too small."); + m_tensor_impl->reset(prepared_data); + LITE_ERROR_HANDLER_END +} + +void Tensor::reset(void* prepared_data, const Layout& layout) { + LITE_ERROR_HANDLER_BEGIN + m_layout = layout; + m_tensor_impl->reset(prepared_data, layout); + LITE_ERROR_HANDLER_END +} + +bool Tensor::is_continue_memory() const { + LITE_ERROR_HANDLER_BEGIN + return m_tensor_impl->is_continue_memory(); + LITE_ERROR_HANDLER_END +} + +void Tensor::copy_from(const Tensor& src) { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(src.get_layout().ndim != 0, + "when tensor copy, the src tensor layout is empty."); + m_tensor_impl->copy_from(src.m_tensor_impl.get()); + update_from_implement(); + LITE_ERROR_HANDLER_END +} + +void Tensor::update_from_implement() { + LITE_ERROR_HANDLER_BEGIN + m_layout = m_tensor_impl->get_layout(); + m_device_type = m_tensor_impl->get_device_type(); + m_device_id = m_tensor_impl->get_device_id(); + m_is_pinned_host = m_tensor_impl->is_pinned_host(); + LITE_ERROR_HANDLER_END +} + +void LiteAny::type_missmatch(size_t expect, size_t get) const { + LITE_THROW(ssprintf( + "The type store in LiteAny is not match the visit type, type of " + "storage length is %zu, type of visit length is %zu.", + expect, get)); +} + +std::shared_ptr TensorUtils::concat(const std::vector& tensors, + int dim, LiteDeviceType dst_device, + int dst_device_id) { + if (tensors.size() <= 0) { + return std::make_shared(); + } + if (dst_device == LiteDeviceType::LITE_DEVICE_DEFAULT) { + dst_device = tensors.front().get_device_type(); + } + if (dst_device_id == -1) { + dst_device_id = tensors.front().get_device_id(); + } + bool is_pinned_host = tensors.front().is_pinned_host(); + auto layout = tensors.front().get_layout(); + LITE_ASSERT(static_cast(layout.ndim) > dim, + "the dim in concat is 
error."); + size_t sum_in_dim = layout.shapes[dim]; + for (size_t i = 1; i < tensors.size(); ++i) { + auto other_layout = tensors[i].get_layout(); + LITE_ASSERT(other_layout.ndim == layout.ndim, + "the dim size of tensors is not same!"); + LITE_ASSERT(other_layout.data_type == layout.data_type, + "the dtype of tensors is not same!"); + for (size_t j = 0; j < other_layout.ndim; ++j) { + if (dim == static_cast(j)) { + sum_in_dim += other_layout.shapes[j]; + continue; + } + LITE_ASSERT(other_layout.shapes[j] == layout.shapes[j], + "the shape of tensors is not same!"); + } + } + layout.shapes[dim] = sum_in_dim; + auto result = std::make_shared(dst_device_id, dst_device, layout, + is_pinned_host); + size_t index = 0; + std::vector start(dim + 1, 0); + std::vector end(dim + 1, 0); + for (int i = 0; i < dim; i++) { + end[i] = layout.shapes[i]; + } + for (size_t i = 0; i < tensors.size(); ++i) { + auto&& tensor = tensors[i]; + auto layout = tensor.get_layout(); + if (layout.shapes[dim] == 0) + continue; + start[dim] = index; + end[dim] = index + layout.shapes[dim]; + auto&& sub_dst = result->slice(start, end); + sub_dst->copy_from(tensor); + index += layout.shapes[dim]; + } + return result; +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/tensor_impl_base.h b/lite/src/tensor_impl_base.h new file mode 100644 index 0000000000000000000000000000000000000000..a7f1bf0dbca937b248a03859bde64061826ee3d0 --- /dev/null +++ b/lite/src/tensor_impl_base.h @@ -0,0 +1,101 @@ +/** + * \file src/tensor_impl_base.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "lite/tensor.h" +#include "misc.h" +#include "type_info.h" + +#include + +namespace lite { + +/*! + * \brief implement the Tensor + */ +class Tensor::TensorImplBase : public DynTypeObj { +public: + virtual ~TensorImplBase() = default; + + virtual LiteDeviceType get_device_type() const = 0; + + virtual int get_device_id() const = 0; + + virtual LiteBackend get_backend_type() const = 0; + + virtual Layout get_layout() const = 0; + + virtual bool is_pinned_host() const = 0; + + virtual void* get_memory_ptr() const = 0; + + virtual void* get_memory_ptr(const std::vector& idx) const = 0; + + virtual void set_layout(const Layout& layout) = 0; + + //! use the user allocated data to reset the memory of the tensor, the + //! memory will not be managed by the lite, later, the user should delete + //! it. + virtual void reset(void* prepared_data) = 0; + + //! use the user allocated data and corresponding layout to reset the data + //! and layout of the tensor, the memory will not be managed by lite, later, + //! the user should delete it. + virtual void reset(void* prepared_data, const Layout& layout) = 0; + + //! reshape the tensor with new shape, keep the data_type the same + virtual void reshape(const Layout& layout) = 0; + + //! get a new tensor slice from the origin tensor + virtual std::shared_ptr slice( + const std::vector& start, const std::vector& end, + const std::vector& step = {}) = 0; + + //! set the tensor memory with zero + virtual void fill_zero() = 0; + + //! copy tensor form other tensor + //! Note: the best way for tensor copy is just set the dst device, left + //! layout empty, when copying the dst layout will be set the same with + //! src + virtual void copy_from(const TensorImplBase* src_impl) = 0; + + //! 
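// Usage sketch for TensorUtils::concat above (assumes "lite/tensor.h" and that
// Tensor is copyable): two {1, 3, 224, 224} float tensors concatenated along
// dim 0 yield {2, 3, 224, 224}; all other dimensions and the dtype must match.
#include <memory>
#include "lite/tensor.h"

inline std::shared_ptr<lite::Tensor> concat_example() {
    lite::Layout layout;
    layout.ndim = 4;
    layout.data_type = lite::LiteDataType::LITE_FLOAT;
    layout.shapes[0] = 1;
    layout.shapes[1] = 3;
    layout.shapes[2] = 224;
    layout.shapes[3] = 224;

    lite::Tensor a(lite::LiteDeviceType::LITE_CPU, layout);
    lite::Tensor b(lite::LiteDeviceType::LITE_CPU, layout);
    a.fill_zero();
    b.fill_zero();
    return lite::TensorUtils::concat({a, b}, /*dim=*/0);  // {2, 3, 224, 224}
}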
share memory with other tensor + virtual void share_memory_with(const TensorImplBase* src_impl) = 0; + + //! whether the memory of tensor is continue + virtual bool is_continue_memory() const = 0; +}; + +/*! + * \brief friend class of Tensor, for convenient accessing the Network members + */ +class TensorHelper { +public: + static inline std::shared_ptr implement( + const std::shared_ptr tensor) { + LITE_ASSERT(tensor); + return tensor->m_tensor_impl; + } + static inline std::shared_ptr implement( + const Tensor* tensor) { + LITE_ASSERT(tensor); + return tensor->m_tensor_impl; + } + static inline void implement(const std::shared_ptr tensor, + std::shared_ptr impl) { + LITE_ASSERT(tensor); + tensor->m_tensor_impl = impl; + } +}; + +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/type_info.h b/lite/src/type_info.h new file mode 100644 index 0000000000000000000000000000000000000000..28785beade407b000f9cad298c9d485c536b0cd5 --- /dev/null +++ b/lite/src/type_info.h @@ -0,0 +1,97 @@ +/** + * \file src/type_info.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "misc.h" + +namespace lite { +/*! + * \brief an object to represent a type + * + * LITE has a lightweight RTTI system. Each type is represented by the + * address of a Typeinfo object, which is stored in the .bss segment. + * + * LITE_TYPEINFO_OBJ_DECL should be placed into the definition of classes that + * need compile-time type support. + * + * For classes that need RTTI, they should be derived from DynTypeObj + */ +struct Typeinfo { + //! name of the corresponding type; nullptr if MGB_VERBOSE_TYPEINFO_NAME==0 + const char* const name; + + /*! + * \brief whether this is the type of given object + * \tparam T a class with static typeinfo() method + */ + template + bool is() const { + return T::typeinfo() == this; + } +}; + +/*! + * \brief base class to emulate RTTI without compiler support + */ +class DynTypeObj { +public: + virtual Typeinfo* dyn_typeinfo() const = 0; + + //! cast this to a final object with type check + template + T& cast_final_safe() { + LITE_ASSERT(T::typeinfo() == dyn_typeinfo(), + "can not convert type %s to %s", dyn_typeinfo()->name, + T::typeinfo()->name); + return *static_cast(this); + } + + template + const T& cast_final_safe() const { + return const_cast(this)->cast_final_safe(); + } + + //! check whether this is same to given type + template + bool same_type() const { + return dyn_typeinfo() == T::typeinfo(); + } + +protected: + ~DynTypeObj() = default; +}; + +//! put in the declaration of a final class inherited from DynTypeObj +#define LITE_DYN_TYPE_OBJ_FINAL_DECL \ +public: \ + ::lite::Typeinfo* dyn_typeinfo() const override final; \ + static inline ::lite::Typeinfo* typeinfo() { return &sm_typeinfo; } \ + \ +private: \ + static ::lite::Typeinfo sm_typeinfo + +#if LITE_ENABLE_LOGGING +//! get class name from class object +#define _LITE_TYPEINFO_CLASS_NAME(_cls) #_cls +#else +#define _LITE_TYPEINFO_CLASS_NAME(_cls) nullptr +#endif + +//! put in the impl file of a class that needs static typeinfo() +#define LITE_TYPEINFO_OBJ_IMPL(_cls) \ + ::lite::Typeinfo _cls::sm_typeinfo { _LITE_TYPEINFO_CLASS_NAME(_cls) } + +//! 
put in the impl file of a final class inherited from DynTypeObj +#define LITE_DYN_TYPE_OBJ_FINAL_IMPL(_cls) \ + ::lite::Typeinfo* _cls::dyn_typeinfo() const { return &sm_typeinfo; } \ + LITE_TYPEINFO_OBJ_IMPL(_cls) + +} // namespace lite +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/version_lite.ld b/lite/src/version_lite.ld new file mode 100644 index 0000000000000000000000000000000000000000..66f30d7694c59a2c2c8cea75b1598a1cba7f532f --- /dev/null +++ b/lite/src/version_lite.ld @@ -0,0 +1,10 @@ +{ +global: + extern "C++" {lite::*;}; + Lite*; + LITE*; + default_config; + default_network_io; + +local: *; +}; diff --git a/lite/test/CMakeLists.txt b/lite/test/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c5e9d13343ef6a097f814b86ee9dd6c4ad96714 --- /dev/null +++ b/lite/test/CMakeLists.txt @@ -0,0 +1,23 @@ +if (MGE_WITH_TEST) + file (GLOB_RECURSE SOURCES ./*.cpp main.cpp) + add_executable (lite_test ${SOURCES}) + + target_link_libraries(lite_test gtest) + target_link_libraries(lite_test lite_static) + if(LITE_BUILD_WITH_MGE) + # lite_test will depends megbrain interface + target_link_libraries(lite_test megbrain) + endif() + + if(UNIX) + if(APPLE OR ANDROID) + target_link_libraries(lite_test dl) + else() + target_link_libraries(lite_test dl rt) + endif() + endif() + + install (TARGETS lite_test + EXPORT ${LITE_EXPORT_TARGETS} + RUNTIME DESTINATION lite/bin) +endif() diff --git a/lite/test/main.cpp b/lite/test/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..af75a6c4d1f167df27fed7ace1a658a44414c87a --- /dev/null +++ b/lite/test/main.cpp @@ -0,0 +1,33 @@ +/** + * \file test/main.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include +#include "../src/misc.h" +#include "lite/global.h" + +namespace { + +class ResetSeedListener : public ::testing::EmptyTestEventListener { + void OnTestStart(const ::testing::TestInfo&) override {} +}; + +} // namespace + +int main(int argc, char** argv) { + ResetSeedListener listener; + auto&& listeners = ::testing::UnitTest::GetInstance()->listeners(); + ::testing::InitGoogleTest(&argc, argv); + listeners.Append(&listener); + lite::set_log_level(LiteLogLevel::WARN); + auto ret = RUN_ALL_TESTS(); + listeners.Release(&listener); + return ret; +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/test/npy.h b/lite/test/npy.h new file mode 100644 index 0000000000000000000000000000000000000000..552cda78f7cc203624b58a16ec0213bc65735b60 --- /dev/null +++ b/lite/test/npy.h @@ -0,0 +1,638 @@ +/* + Copyright 2017 Leon Merten Lohse + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
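// Usage sketch for the lightweight RTTI above -- illustrative only; "MyImpl"
// is a made-up final class and the sketch assumes the src/type_info.h header.
// The DECL macro goes in the class body, the IMPL macro in exactly one .cpp,
// after which cast_final_safe<>() performs a checked downcast.
class MyImpl final : public lite::DynTypeObj {
    LITE_DYN_TYPE_OBJ_FINAL_DECL;

public:
    int answer() const { return 42; }
};
LITE_DYN_TYPE_OBJ_FINAL_IMPL(MyImpl);  // defines MyImpl::sm_typeinfo

inline int rtti_demo(lite::DynTypeObj& obj) {
    if (obj.same_type<MyImpl>())
        return obj.cast_final_safe<MyImpl>().answer();
    return -1;
}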
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#ifndef NPY_H +#define NPY_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace npy { + +/* Compile-time test for byte order. + If your compiler does not define these per default, you may want to define + one of these constants manually. + Defaults to little endian order. */ +#if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN || \ + defined(__BIG_ENDIAN__) || defined(__ARMEB__) || \ + defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(_MIBSEB) || \ + defined(__MIBSEB) || defined(__MIBSEB__) +const bool big_endian = true; +#else +const bool big_endian = false; +#endif + +const char magic_string[] = "\x93NUMPY"; +const size_t magic_string_length = 6; + +const char little_endian_char = '<'; +const char big_endian_char = '>'; +const char no_endian_char = '|'; + +constexpr char host_endian_char = + (big_endian ? 
big_endian_char : little_endian_char); + +/* npy array length */ +typedef unsigned long int ndarray_len_t; + +inline void write_magic(std::ostream& ostream, unsigned char v_major = 1, + unsigned char v_minor = 0) { + ostream.write(magic_string, magic_string_length); + ostream.put(v_major); + ostream.put(v_minor); +} + +inline void read_magic(std::istream& istream, unsigned char& v_major, + unsigned char& v_minor) { + char buf[magic_string_length + 2]; + istream.read(buf, magic_string_length + 2); + + if (!istream) { + fprintf(stderr, "io error: failed reading file"); + } + + if (0 != std::memcmp(buf, magic_string, magic_string_length)) { + fprintf(stderr, "this file does not have a valid npy format."); + } + + v_major = buf[magic_string_length]; + v_minor = buf[magic_string_length + 1]; +} + +// typestring magic +struct Typestring { +private: + char c_endian; + char c_type; + int len; + +public: + inline std::string str() { + const size_t max_buflen = 16; + char buf[max_buflen]; + std::sprintf(buf, "%c%c%u", c_endian, c_type, len); + return std::string(buf); + } + + Typestring(const std::vector&) + : c_endian{host_endian_char}, c_type{'f'}, len{sizeof(float)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, c_type{'f'}, len{sizeof(double)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, + c_type{'f'}, + len{sizeof(long double)} {} + + Typestring(const std::vector&) + : c_endian{no_endian_char}, c_type{'i'}, len{sizeof(char)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(short)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(int)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(long)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(long long)} {} + + Typestring(const std::vector&) + : c_endian{no_endian_char}, + c_type{'u'}, + len{sizeof(unsigned char)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, + c_type{'u'}, + len{sizeof(unsigned short)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, + c_type{'u'}, + len{sizeof(unsigned int)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, + c_type{'u'}, + len{sizeof(unsigned long)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, + c_type{'u'}, + len{sizeof(unsigned long long)} {} + + Typestring(const std::vector>&) + : c_endian{host_endian_char}, + c_type{'c'}, + len{sizeof(std::complex)} {} + Typestring(const std::vector>&) + : c_endian{host_endian_char}, + c_type{'c'}, + len{sizeof(std::complex)} {} + Typestring(const std::vector>&) + : c_endian{host_endian_char}, + c_type{'c'}, + len{sizeof(std::complex)} {} +}; + +inline void parse_typestring(std::string typestring) { + std::regex re("'([<>|])([ifuc])(\\d+)'"); + std::smatch sm; + + std::regex_match(typestring, sm, re); + + if (sm.size() != 4) { + fprintf(stderr, "invalid typestring"); + } +} + +namespace pyparse { + +/** + Removes leading and trailing whitespaces + */ +inline std::string trim(const std::string& str) { + const std::string whitespace = " \t"; + auto begin = str.find_first_not_of(whitespace); + + if (begin == std::string::npos) + return ""; + + auto end = str.find_last_not_of(whitespace); + + return str.substr(begin, end - begin + 1); +} + +inline std::string get_value_from_map(const std::string& mapstr) { + size_t sep_pos = mapstr.find_first_of(":"); + if (sep_pos == std::string::npos) 
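// The Typestring helper above builds numpy dtype strings of the form
// "<endianness><kind><itemsize>".  A tiny standalone illustration:
#include <string>

inline std::string make_typestring(char endian, char kind, int itemsize) {
    return std::string{endian, kind} + std::to_string(itemsize);
}
// make_typestring('<', 'f', 4) -> "<f4"  (little-endian float32)
// make_typestring('|', 'u', 1) -> "|u1"  (single-byte uint8, endianness irrelevant)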
+ return ""; + + std::string tmp = mapstr.substr(sep_pos + 1); + return trim(tmp); +} + +/** + Parses the string representation of a Python dict + + The keys need to be known and may not appear anywhere else in the data. + */ +inline std::unordered_map parse_dict( + std::string in, std::vector& keys) { + std::unordered_map map; + + if (keys.size() == 0) + return map; + + in = trim(in); + + // unwrap dictionary + if ((in.front() == '{') && (in.back() == '}')) + in = in.substr(1, in.length() - 2); + else { + fprintf(stderr, "Not a Python dictionary."); + } + + std::vector> positions; + + for (auto const& value : keys) { + size_t pos = in.find("'" + value + "'"); + + if (pos == std::string::npos) { + fprintf(stderr, "Missing %s key.", value.c_str()); + } + + std::pair position_pair{pos, value}; + positions.push_back(position_pair); + } + + // sort by position in dict + std::sort(positions.begin(), positions.end()); + + for (size_t i = 0; i < positions.size(); ++i) { + std::string raw_value; + size_t begin{positions[i].first}; + size_t end{std::string::npos}; + + std::string key = positions[i].second; + + if (i + 1 < positions.size()) + end = positions[i + 1].first; + + raw_value = in.substr(begin, end - begin); + + raw_value = trim(raw_value); + + if (raw_value.back() == ',') + raw_value.pop_back(); + + map[key] = get_value_from_map(raw_value); + } + + return map; +} + +/** + Parses the string representation of a Python boolean + */ +inline bool parse_bool(const std::string& in) { + if (in == "True") + return true; + if (in == "False") + return false; + + fprintf(stderr, "Invalid python boolan."); + return false; +} + +/** + Parses the string representation of a Python str + */ +inline std::string parse_str(const std::string& in) { + if ((in.front() == '\'') && (in.back() == '\'')) + return in.substr(1, in.length() - 2); + + fprintf(stderr, "Invalid python string."); + return ""; +} + +/** + Parses the string represenatation of a Python tuple into a vector of its items + */ +inline std::vector parse_tuple(std::string in) { + std::vector v; + const char seperator = ','; + + in = trim(in); + + if ((in.front() == '(') && (in.back() == ')')) + in = in.substr(1, in.length() - 2); + else { + fprintf(stderr, "Invalid Python tuple."); + } + + std::istringstream iss(in); + + for (std::string token; std::getline(iss, token, seperator);) { + v.push_back(token); + } + + return v; +} + +template +inline std::string write_tuple(const std::vector& v) { + if (v.size() == 0) + return ""; + + std::ostringstream ss; + + if (v.size() == 1) { + ss << "(" << v.front() << ",)"; + } else { + const std::string delimiter = ", "; + // v.size() > 1 + ss << "("; + std::copy(v.begin(), v.end() - 1, + std::ostream_iterator(ss, delimiter.c_str())); + ss << v.back(); + ss << ")"; + } + + return ss.str(); +} + +inline std::string write_boolean(bool b) { + if (b) + return "True"; + else + return "False"; +} + +} // namespace pyparse + +inline void parse_header(std::string header, std::string& descr) { + /* + The first 6 bytes are a magic string: exactly "x93NUMPY". + The next 1 byte is an unsigned byte: the major version number of the file + format, e.g. x01. The next 1 byte is an unsigned byte: the minor version + number of the file format, e.g. x00. Note: the version of the file format + is not tied to the version of the numpy package. The next 2 bytes form a + little-endian unsigned short int: the length of the header data + HEADER_LEN. The next HEADER_LEN bytes form the header data describing the + array's format. 
It is an ASCII string which contains a Python literal + expression of a dictionary. It is terminated by a newline ('n') and + padded with spaces + ('x20') to make the total length of the magic string + 4 + HEADER_LEN be + evenly divisible by 16 for alignment purposes. The dictionary contains + three keys: + + "descr" : dtype.descr + An object that can be passed as an argument to the numpy.dtype() + constructor to create the array's dtype. For repeatability and + readability, this dictionary is formatted using pprint.pformat() so the + keys are in alphabetic order. + */ + + // remove trailing newline + if (header.back() != '\n') + fprintf(stderr, "invalid header"); + header.pop_back(); + + // parse the dictionary + std::vector keys{"descr"}; + auto dict_map = npy::pyparse::parse_dict(header, keys); + + if (dict_map.size() == 0) + fprintf(stderr, "invalid dictionary in header"); + + std::string descr_s = dict_map["descr"]; + parse_typestring(descr_s); + // remove + descr = npy::pyparse::parse_str(descr_s); + return; +} + +inline void parse_header(std::string header, std::string& descr, + bool& fortran_order, + std::vector& shape) { + /* + The first 6 bytes are a magic string: exactly "x93NUMPY". + The next 1 byte is an unsigned byte: the major version number of the file + format, e.g. x01. The next 1 byte is an unsigned byte: the minor version + number of the file format, e.g. x00. Note: the version of the file format + is not tied to the version of the numpy package. The next 2 bytes form a + little-endian unsigned short int: the length of the header data + HEADER_LEN. The next HEADER_LEN bytes form the header data describing the + array's format. It is an ASCII string which contains a Python literal + expression of a dictionary. It is terminated by a newline ('n') and + padded with spaces + ('x20') to make the total length of the magic string + 4 + HEADER_LEN be + evenly divisible by 16 for alignment purposes. The dictionary contains + three keys: + + "descr" : dtype.descr + An object that can be passed as an argument to the numpy.dtype() + constructor to create the array's dtype. "fortran_order" : bool Whether + the array data is Fortran-contiguous or not. Since Fortran-contiguous + arrays are a common form of non-C-contiguity, we allow them to be written + directly to disk for efficiency. "shape" : tuple of int The shape of the + array. For repeatability and readability, this dictionary is formatted + using pprint.pformat() so the keys are in alphabetic order. 
+ */ + + // remove trailing newline + if (header.back() != '\n') + fprintf(stderr, "invalid header"); + header.pop_back(); + + // parse the dictionary + std::vector keys{"descr", "fortran_order", "shape"}; + auto dict_map = npy::pyparse::parse_dict(header, keys); + + if (dict_map.size() == 0) + fprintf(stderr, "invalid dictionary in header"); + + std::string descr_s = dict_map["descr"]; + std::string fortran_s = dict_map["fortran_order"]; + std::string shape_s = dict_map["shape"]; + + // TODO: extract info from typestring + parse_typestring(descr_s); + // remove + descr = npy::pyparse::parse_str(descr_s); + + // convert literal Python bool to C++ bool + fortran_order = npy::pyparse::parse_bool(fortran_s); + + // parse the shape tuple + auto shape_v = npy::pyparse::parse_tuple(shape_s); + if (shape_v.size() == 0) + fprintf(stderr, "invalid shape tuple in header"); + + for (auto item : shape_v) { + ndarray_len_t dim = static_cast(std::stoul(item)); + shape.push_back(dim); + } +} + +inline std::string write_header_dict(const std::string& descr, + bool fortran_order, + const std::vector& shape) { + std::string s_fortran_order = npy::pyparse::write_boolean(fortran_order); + std::string shape_s = npy::pyparse::write_tuple(shape); + + return "{'descr': '" + descr + "', 'fortran_order': " + s_fortran_order + + ", 'shape': " + shape_s + ", }"; +} + +inline void write_header(std::ostream& out, const std::string& descr, + bool fortran_order, + const std::vector& shape_v) { + std::string header_dict = write_header_dict(descr, fortran_order, shape_v); + + size_t length = magic_string_length + 2 + 2 + header_dict.length() + 1; + + unsigned char version[2] = {1, 0}; + if (length >= 255 * 255) { + length = magic_string_length + 2 + 4 + header_dict.length() + 1; + version[0] = 2; + version[1] = 0; + } + size_t padding_len = 16 - length % 16; + std::string padding(padding_len, ' '); + + // write magic + write_magic(out, version[0], version[1]); + + // write header length + if (version[0] == 1 && version[1] == 0) { + char header_len_le16[2]; + uint16_t header_len = static_cast(header_dict.length() + + padding.length() + 1); + + header_len_le16[0] = (header_len >> 0) & 0xff; + header_len_le16[1] = (header_len >> 8) & 0xff; + out.write(reinterpret_cast(header_len_le16), 2); + } else { + char header_len_le32[4]; + uint32_t header_len = static_cast(header_dict.length() + + padding.length() + 1); + + header_len_le32[0] = (header_len >> 0) & 0xff; + header_len_le32[1] = (header_len >> 8) & 0xff; + header_len_le32[2] = (header_len >> 16) & 0xff; + header_len_le32[3] = (header_len >> 24) & 0xff; + out.write(reinterpret_cast(header_len_le32), 4); + } + + out << header_dict << padding << '\n'; +} + +inline std::string read_header(std::istream& istream) { + // check magic bytes an version number + unsigned char v_major, v_minor; + read_magic(istream, v_major, v_minor); + + uint32_t header_length = 0; + if (v_major == 1 && v_minor == 0) { + char header_len_le16[2]; + istream.read(header_len_le16, 2); + header_length = (header_len_le16[0] << 0) | (header_len_le16[1] << 8); + + if ((magic_string_length + 2 + 2 + header_length) % 16 != 0) { + // TODO: display warning + } + } else if (v_major == 2 && v_minor == 0) { + char header_len_le32[4]; + istream.read(header_len_le32, 4); + + header_length = (header_len_le32[0] << 0) | (header_len_le32[1] << 8) | + (header_len_le32[2] << 16) | (header_len_le32[3] << 24); + + if ((magic_string_length + 2 + 4 + header_length) % 16 != 0) { + // TODO: display warning + } + } else { + 
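// write_header() above pads the dictionary with spaces so that
// magic(6) + version(2) + length field + dict + '\n' lands on a 16-byte
// boundary.  A standalone sketch of that rule (illustrative; note the code
// above pads a full extra block of 16 spaces when the length is already
// aligned, while this helper returns 0 in that case):
#include <cstddef>

inline size_t npy_padding(size_t magic_and_len_bytes, size_t dict_len) {
    size_t total = magic_and_len_bytes + dict_len + 1;  // +1 for the trailing '\n'
    return (16 - total % 16) % 16;
}
// e.g. npy_padding(6 + 2 + 2, 50) == 3: three ' ' bytes go in before the '\n'.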
fprintf(stderr, "unsupported file format version"); + } + + auto buf_v = std::vector(); + buf_v.reserve(header_length); + istream.read(buf_v.data(), header_length); + std::string header(buf_v.data(), header_length); + + return header; +} + +inline ndarray_len_t comp_size(const std::vector& shape) { + ndarray_len_t size = 1; + for (ndarray_len_t i : shape) + size *= i; + + return size; +} + +template +inline void SaveArrayAsNumpy(const std::string& filename, bool fortran_order, + unsigned int n_dims, const unsigned long shape[], + const std::vector& data) { + Typestring typestring_o(data); + std::string typestring = typestring_o.str(); + + std::ofstream stream(filename, std::ofstream::binary); + if (!stream) { + fprintf(stderr, "io error: failed to open a file."); + } + + std::vector shape_v(shape, shape + n_dims); + write_header(stream, typestring, fortran_order, shape_v); + + auto size = static_cast(comp_size(shape_v)); + + stream.write(reinterpret_cast(data.data()), + sizeof(Scalar) * size); +} + +template +inline void LoadArrayFromNumpy(const std::string& filename, + std::vector& shape, + std::vector& data) { + bool fortran_order; + LoadArrayFromNumpy(filename, shape, fortran_order, data); +} + +template +inline void LoadArrayFromNumpy(const std::string& filename, + std::vector& shape, + bool& fortran_order, std::vector& data) { + std::ifstream stream(filename, std::ifstream::binary); + if (!stream) { + fprintf(stderr, "io error: failed to open a file."); + } + + std::string header = read_header(stream); + + // parse header + std::string typestr; + + parse_header(header, typestr, fortran_order, shape); + + // check if the typestring matches the given one + Typestring typestring_o{data}; + std::string expect_typestr = typestring_o.str(); + if (typestr != expect_typestr) { + fprintf(stderr, "formatting error: typestrings not matching"); + } + + // compute the data size based on the shape + auto size = static_cast(comp_size(shape)); + data.resize(size); + + // read the data + stream.read(reinterpret_cast(data.data()), sizeof(Scalar) * size); +} + +inline void LoadArrayFromNumpy(const std::string& filename, + std::string& type_str, + std::vector& shape, + std::vector& data) { + std::ifstream stream(filename, std::ifstream::binary); + if (!stream) { + fprintf(stderr, "io error: failed to open a file."); + } + + std::string header = read_header(stream); + bool fortran_order; + // parse header + parse_header(header, type_str, fortran_order, shape); + + // check if the typestring matches the given one + std::string size_str = type_str.substr(type_str.size() - 1); + size_t elem_size = atoi(size_str.c_str()); + + // compute the data size based on the shape + auto byte_size = elem_size * static_cast(comp_size(shape)); + data.resize(byte_size); + + // read the data + stream.read(reinterpret_cast(data.data()), byte_size); +} + +} // namespace npy + +#endif // NPY_H diff --git a/lite/test/test_common.h b/lite/test/test_common.h new file mode 100644 index 0000000000000000000000000000000000000000..dae6fb296b5f5a864c2a6bec1dcc5758c084d17d --- /dev/null +++ b/lite/test/test_common.h @@ -0,0 +1,184 @@ +/** + * \file test/test_common.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
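// Round-trip usage sketch for the npy save/load helpers above (illustrative;
// "./example.npy" is a placeholder path):
#include <vector>

inline void npy_roundtrip_example() {
    std::vector<float> data(2 * 3, 1.f);
    const unsigned long shape[] = {2, 3};
    npy::SaveArrayAsNumpy("./example.npy", /*fortran_order=*/false,
                          /*n_dims=*/2, shape, data);

    std::vector<npy::ndarray_len_t> loaded_shape;
    std::vector<float> loaded;
    npy::LoadArrayFromNumpy("./example.npy", loaded_shape, loaded);
    // loaded_shape == {2, 3}, loaded.size() == 6, all values 1.f
}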
+ */ + +#pragma once + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "../src/misc.h" +#include "../src/mge/network_impl.h" +#include "../src/mge/common.h" +#include "lite/network.h" +#include "lite/tensor.h" +#include "megbrain/tensor.h" +#include "megbrain/graph/bases.h" +#include "megbrain/plugin/opr_io_dump.h" +#include "megbrain/plugin/profiler.h" +#include "megbrain/serialization/extern_c_opr.h" +#include "megbrain/serialization/file.h" +#include "megbrain/serialization/load_dump_config.h" +#include "megbrain/serialization/serializer.h" +#include "megbrain/utils/thin/hash_table.h" +#include "npy.h" + +#include + +#include +#include +#include +#include + +namespace lite { + +template +static ::testing::AssertionResult compare_memory(const void* memory0, + const void* memory1, + size_t length, + float maxerr = 1e-3) { + const T* data_ptr0 = static_cast(memory0); + const T* data_ptr1 = static_cast(memory1); + for (size_t i = 0; i < length; i++) { + auto diff = std::abs(data_ptr0[i] - data_ptr1[i]); + if (diff > maxerr) { + return ::testing::AssertionFailure() + << "Unequal value:\n" + << "value 0 = " << data_ptr0[i] << "\n" + << "value 1 = " << data_ptr1[i] << "\n" + << "At index: " << i << "\n"; + } + } + return ::testing::AssertionSuccess(); +} + +template +void compare_lite_tensor(std::shared_ptr tensor0, + std::shared_ptr tensor1, float maxerr = 1e-3) { + size_t elemsize = tensor0->get_layout().get_elem_size(); + T* data_ptr0 = static_cast(tensor0->get_memory_ptr()); + T* data_ptr1 = static_cast(tensor1->get_memory_ptr()); + size_t length = tensor0->get_tensor_total_size_in_byte() / elemsize; + EXPECT_TRUE(compare_memory(data_ptr0, data_ptr1, length, maxerr)); +} + +__attribute__((unused)) static std::shared_ptr get_input_data( + std::string path) { + std::string type_str; + std::vector stl_shape; + std::vector raw; + npy::LoadArrayFromNumpy(path, type_str, stl_shape, raw); + + auto lite_tensor = std::make_shared(LiteDeviceType::LITE_CPU); + Layout layout; + layout.ndim = stl_shape.size(); + const std::map type_map = { + {"f4", LiteDataType::LITE_FLOAT}, + {"i4", LiteDataType::LITE_INT}, + {"i1", LiteDataType::LITE_INT8}, + {"u1", LiteDataType::LITE_UINT8}}; + layout.shapes[0] = 1; + for (size_t i = 0; i < stl_shape.size(); i++) { + layout.shapes[i] = static_cast(stl_shape[i]); + } + for (auto& item : type_map) { + if (type_str.find(item.first) != std::string::npos) { + layout.data_type = item.second; + break; + } + } + lite_tensor->set_layout(layout); + size_t length = lite_tensor->get_tensor_total_size_in_byte(); + void* dest = lite_tensor->get_memory_ptr(); + memcpy(dest, raw.data(), length); + return lite_tensor; +} + +__attribute__((unused)) static std::shared_ptr mgelite_lar( + std::string model_path, const Config& config, std::string, + std::shared_ptr input) { + std::unique_ptr network = std::make_unique(config); + + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_input_tensor(0); + + auto src_ptr = input->get_memory_ptr(); + auto src_layout = input->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + network->forward(); + network->wait(); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + Layout out_layout = output_tensor->get_layout(); + auto ret = std::make_shared(LiteDeviceType::LITE_CPU, out_layout); + void* out_data = output_tensor->get_memory_ptr(); + void* dst_data = ret->get_memory_ptr(); + memcpy(dst_data, out_data, ret->get_tensor_total_size_in_byte()); + return ret; +} + 
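+//! Reference runner: load the same bare model through MegBrain's
+//! GraphLoader and execute it directly (no Lite wrapper), returning the
+//! first output as a lite::Tensor. Tests compare Lite results against
+//! this output. Encrypted models are rejected by the assert below.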
+__attribute__((unused)) static std::shared_ptr mgb_lar( + std::string model_path, const Config& config, std::string input_name, + std::shared_ptr input) { + LITE_ASSERT(config.bare_model_cryption_name.size() == 0); + using namespace mgb; + serialization::GraphLoader::LoadConfig mgb_config; + mgb_config.comp_node_mapper = [config](CompNode::Locator& loc) { + loc = to_compnode_locator(config.device_type); + }; + mgb_config.comp_graph = ComputingGraph::make(); + auto&& graph_opt = mgb_config.comp_graph->options(); + if (config.options.weight_preprocess) { + graph_opt.graph_opt.enable_weight_preprocess(); + } + graph_opt.comp_node_seq_record_level = + config.options.comp_node_seq_record_level; + + auto inp_file = mgb::serialization::InputFile::make_fs(model_path.c_str()); + auto format = + serialization::GraphLoader::identify_graph_dump_format(*inp_file); + mgb_assert(format.valid(), + "invalid model: unknown model format, please make sure input " + "file is generated by GraphDumper"); + auto loader = + serialization::GraphLoader::make(std::move(inp_file), format.val()); + auto load_ret = loader->load(mgb_config, false); + + ComputingGraph::OutputSpec out_spec; + std::vector output_tensors(load_ret.output_var_list.size()); + for (size_t i = 0; i < load_ret.output_var_list.size(); i++) { + auto cb = [&output_tensors, i](const DeviceTensorND& dv) mutable { + output_tensors[i].copy_from(dv); + }; + out_spec.emplace_back(load_ret.output_var_list[i], std::move(cb)); + } + auto func = load_ret.graph_compile(out_spec); + + auto& in = load_ret.tensor_map.find(input_name)->second; + in->copy_from(*TensorHelper::implement(input) + ->cast_final_safe() + .host_tensor()); + func->execute(); + func->wait(); + + std::shared_ptr ret = std::make_shared( + LiteDeviceType::LITE_CPU, + to_lite_layout(output_tensors[0].layout())); + auto mge_tensor = TensorHelper::implement(ret) + ->cast_final_safe() + .host_tensor(); + mge_tensor->copy_from(output_tensors[0]); + return ret; +} +} // namespace lite + +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/test/test_misc.cpp b/lite/test/test_misc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d2a0a40f2f1efc0798d45e9cb1827a08dd50eb90 --- /dev/null +++ b/lite/test/test_misc.cpp @@ -0,0 +1,115 @@ +/** + * \file test/test_misc.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "test_common.h" +#include "../src/decryption/decrypt_base.h" +#include "../src/network_impl_base.h" + +#include "megbrain/opr/io.h" +#include "megbrain/tensor.h" +#include "megbrain/utils/metahelper.h" + +#include + +#include +#include +#include +#include + +using namespace lite; + +TEST(TestMisc, DecryptionRegister) { + size_t number = decryption_static_data().decryption_methods.size(); + //! 
At least one method is register by lite + ASSERT_GE(number, 1); + DecryptionFunc func; + register_decryption_and_key("AllForTest0", func, {}); + + ASSERT_EQ(number + 1, decryption_static_data().decryption_methods.size()); +} + +TEST(TestMisc, DecryptionUpdate) { + DecryptionFunc func; + register_decryption_and_key("AllForTest1", func, {}); + func = [](const void*, size_t, + const std::vector&) -> std::vector { + return {}; + }; + update_decryption_or_key("AllForTest1", func, {}); + ASSERT_NE(decryption_static_data().decryption_methods["AllForTest1"].first, + nullptr); + ASSERT_EQ(decryption_static_data() + .decryption_methods["AllForTest1"] + .second->size(), + 0); + update_decryption_or_key("AllForTest1", {}, {1, 2, 3}); + ASSERT_EQ(decryption_static_data() + .decryption_methods["AllForTest1"] + .second->size(), + 3); +} + +TEST(TestMisc, SharedSameDeviceTensor) { + using namespace mgb; + serialization::GraphLoader::LoadConfig mgb_config; + mgb_config.comp_node_mapper = [](CompNode::Locator& loc) { + loc = to_compnode_locator(LiteDeviceType::LITE_CPU); + }; + mgb_config.comp_graph = ComputingGraph::make(); + std::string model_path = "./shufflenet.mge"; + + auto inp_file = mgb::serialization::InputFile::make_fs(model_path.c_str()); + auto format = + serialization::GraphLoader::identify_graph_dump_format(*inp_file); + mgb_assert(format.valid(), + "invalid model: unknown model format, please make sure input " + "file is generated by GraphDumper"); + auto loader = + serialization::GraphLoader::make(std::move(inp_file), format.val()); + auto load_ret_1 = loader->load(mgb_config, true); + auto load_ret_2 = loader->load(mgb_config, true); + ASSERT_EQ(load_ret_1.output_var_list.size(), + load_ret_2.output_var_list.size()); + + ComputingGraph::OutputSpec out_spec_1, out_spec_2; + for (size_t i = 0; i < load_ret_1.output_var_list.size(); i++) { + out_spec_1.emplace_back(load_ret_1.output_var_list[i], nullptr); + out_spec_2.emplace_back(load_ret_2.output_var_list[i], nullptr); + } + auto func_1 = load_ret_1.graph_compile(out_spec_1); + auto func_2 = load_ret_2.graph_compile(out_spec_1); + std::vector oprs_1, oprs_2; + func_1->iter_opr_seq([&oprs_1](cg::OperatorNodeBase* opr) -> bool { + if (opr->try_cast_final()) { + oprs_1.push_back(opr); + } + return true; + }); + func_1->iter_opr_seq([&oprs_2](cg::OperatorNodeBase* opr) -> bool { + if (opr->try_cast_final()) { + oprs_2.push_back(opr); + } + return true; + }); + ASSERT_EQ(oprs_1.size(), oprs_2.size()); + for (size_t i = 0; i < oprs_1.size(); i++) { + auto tensor_1 = + oprs_1[i]->try_cast_final()->value(); + auto tensor_2 = + oprs_2[i]->try_cast_final()->value(); + ASSERT_EQ(tensor_1.raw_ptr(), tensor_2.raw_ptr()); + } +} + +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/test/test_network.cpp b/lite/test/test_network.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4770965a60a2973b8372a59537d54c578e7dbb84 --- /dev/null +++ b/lite/test/test_network.cpp @@ -0,0 +1,1007 @@ +/** + * \file test/test_network.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "./test_common.h" +#include "megbrain/tensor.h" + +#include +#include +#include +#include +using namespace lite; + +namespace { +class CheckAllocator : public lite::Allocator { +public: + //! 
allocate memory of size in the given device with the given align + void* allocate(LiteDeviceType device, int, size_t size, + size_t align) override { + LITE_ASSERT(device == LiteDeviceType::LITE_CPU); + m_nr_left++; + m_nr_allocated++; +#ifdef WIN32 + return _aligned_malloc(size, align); +#elif defined(__ANDROID__) || defined(ANDROID) + return memalign(align, size); +#else + void* ptr = nullptr; + auto err = posix_memalign(&ptr, align, size); + mgb_assert(!err, "failed to malloc %zubytes with align %zu", size, + align); + return ptr; +#endif + }; + + //! free the memory pointed by ptr in the given device + void free(LiteDeviceType device, int, void* ptr) override { + m_nr_left--; + LITE_ASSERT(device == LiteDeviceType::LITE_CPU); +#ifdef WIN32 + _aligned_free(ptr); +#else + ::free(ptr); +#endif + }; + std::atomic_size_t m_nr_left{0}; + std::atomic_size_t m_nr_allocated{0}; +}; +} // namespace + +TEST(TestNetWork, Basic) { + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + auto result_lite = mgelite_lar(model_path, config, "data", lite_tensor); + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor); + compare_lite_tensor(result_lite, result_mgb); +} + +TEST(TestNetWork, SetDeviceId) { + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::shared_ptr network = std::make_shared(config); + network->set_device_id(4); + network->load_model(model_path); + std::shared_ptr input_tensor = network->get_input_tensor(0); + std::shared_ptr output_tensor = network->get_output_tensor(0); + network->forward(); + network->wait(); + ASSERT_EQ(input_tensor->get_device_id(), 4); + ASSERT_EQ(output_tensor->get_device_id(), 4); +} + +TEST(TestNetWork, GetAllName) { + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::shared_ptr network = std::make_shared(config); + + network->load_model(model_path); + auto input_names = network->get_all_input_name(); + auto output_names = network->get_all_output_name(); + + ASSERT_EQ(input_names.size(), 1); + ASSERT_EQ(output_names.size(), 1); + ASSERT_TRUE(input_names[0] == "data"); + ASSERT_TRUE(output_names[0] == + "TRUE_DIV(EXP[12065],reduce0[12067])[12077]"); +} + +TEST(TestNetWork, BasicInplaceAndSingleThreadAffinity) { + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor); + + std::shared_ptr network = std::make_shared(config); + Runtime::set_cpu_inplace_mode(network); + + network->load_model(model_path); + std::shared_ptr input_tensor = network->get_input_tensor(0); + + int affinity_set = false; + Runtime::set_runtime_thread_affinity(network, [&affinity_set](int id) { + ASSERT_EQ(id, 0); + affinity_set = true; + }); + + auto src_ptr = lite_tensor->get_memory_ptr(); + auto src_layout = lite_tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + //! 
inplace mode not support async mode + ASSERT_THROW(network->set_async_callback([]() {}), std::exception); + + network->forward(); + network->wait(); + std::shared_ptr output_tensor = network->get_output_tensor(0); + + ASSERT_EQ(affinity_set, true); + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, NetworkShareWeights) { + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor); + + std::shared_ptr network = std::make_shared(config); + network->load_model(model_path); + std::shared_ptr input_tensor = network->get_input_tensor(0); + + std::shared_ptr network2 = std::make_shared(config); + Runtime::set_cpu_inplace_mode(network2); + + Runtime::shared_weight_with_network(network2, network); + + std::shared_ptr input_tensor2 = network2->get_input_tensor(0); + + auto src_ptr = lite_tensor->get_memory_ptr(); + auto src_layout = lite_tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + input_tensor2->reset(src_ptr, src_layout); + ASSERT_NE(input_tensor, input_tensor2); + + network->forward(); + network->wait(); + + network2->forward(); + network2->wait(); + std::shared_ptr output_tensor = network->get_output_tensor(0); + std::shared_ptr output_tensor2 = network2->get_output_tensor(0); + + ASSERT_NE(output_tensor->get_memory_ptr(), + output_tensor2->get_memory_ptr()); + compare_lite_tensor(output_tensor, result_mgb); + compare_lite_tensor(output_tensor2, result_mgb); +} + +TEST(TestNetWork, SharedRuntimeMem) { + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor); + + std::shared_ptr network_src = std::make_shared(config); + std::shared_ptr network_dst = std::make_shared(config); + Runtime::share_runtime_memory_with(network_dst, network_src); + network_src->load_model(model_path); + network_dst->load_model(model_path); +} + +TEST(TestNetWork, UserAllocator) { + auto allocator = std::make_shared(); + { + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor); + std::shared_ptr network = std::make_shared(config); + + Runtime::set_memory_allocator(network, allocator); + + network->load_model(model_path); + std::shared_ptr input_tensor = network->get_input_tensor(0); + + auto src_ptr = lite_tensor->get_memory_ptr(); + auto src_layout = lite_tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + network->forward(); + network->wait(); + + ASSERT_GE(allocator->m_nr_allocated, 1); + std::shared_ptr output_tensor = network->get_output_tensor(0); + + compare_lite_tensor(output_tensor, result_mgb); + } + ASSERT_EQ(allocator->m_nr_left, 0); +} + +TEST(TestNetWork, BasicMultiThread) { + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor); + + std::shared_ptr network = std::make_shared(config); + Runtime::set_cpu_threads_number(network, 2); + + network->load_model(model_path); + std::shared_ptr input_tensor = network->get_input_tensor(0); + + auto src_ptr = lite_tensor->get_memory_ptr(); + auto src_layout = lite_tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + network->forward(); + network->wait(); + 
std::shared_ptr output_tensor = network->get_output_tensor(0); + + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, ThreadAffinity) { + size_t nr_threads = 4; + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor); + + std::shared_ptr network = std::make_shared(config); + Runtime::set_cpu_threads_number(network, nr_threads); + + ASSERT_THROW(Runtime::set_runtime_thread_affinity(network, [](int) {}), + std::exception); + network->load_model(model_path); + std::vector thread_ids(nr_threads); + auto affinity = [&](int id) { + thread_ids[id] = std::this_thread::get_id(); + }; + Runtime::set_runtime_thread_affinity(network, affinity); + + std::shared_ptr input_tensor = network->get_input_tensor(0); + auto src_ptr = lite_tensor->get_memory_ptr(); + auto src_layout = lite_tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + network->forward(); + network->wait(); + + for (size_t i = 0; i < nr_threads; i++) { + for (size_t j = i + 1; j < nr_threads; j++) { + ASSERT_NE(thread_ids[i], thread_ids[j]); + } + } + + std::shared_ptr output_tensor = network->get_output_tensor(0); + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, BasicCryptAes) { + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string model_crypt_path = "./shufflenet_crypt_aes.mge"; + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor); + config.bare_model_cryption_name = "AES_default"; + auto result_lite = + mgelite_lar(model_crypt_path, config, "data", lite_tensor); + compare_lite_tensor(result_lite, result_mgb); +} + +TEST(TestNetWork, BasicCryptRc4) { + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string model_crypt_path = "./shufflenet_crypt_rc4.mge"; + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor); + config.bare_model_cryption_name = "RC4_default"; + auto result_lite = + mgelite_lar(model_crypt_path, config, "data", lite_tensor); + compare_lite_tensor(result_lite, result_mgb); +} + +TEST(TestNetWork, PackedCryptRc4) { + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string model_crypt_path = "./test_packed_model_rc4.lite"; + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor); + auto result_lite = + mgelite_lar(model_crypt_path, config, "data", lite_tensor); + compare_lite_tensor(result_lite, result_mgb); +} + +TEST(TestNetWork, BasicCryptSfRc4) { + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string model_crypt_path = "./shufflenet_crypt_sfrc4.mge"; + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor); + config.bare_model_cryption_name = "SIMPLE_FAST_RC4_default"; + auto result_lite = + mgelite_lar(model_crypt_path, config, "data", lite_tensor); + compare_lite_tensor(result_lite, result_mgb); +} + +TEST(TestNetWork, ResetInput) { + Config config; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + std::shared_ptr network = std::make_shared(config); + + network->load_model(model_path); + std::shared_ptr 
input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + network->forward(); + network->wait(); + std::shared_ptr output_tensor = network->get_output_tensor(0); + + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, ChangeInputShape) { + Config config; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + std::shared_ptr network = std::make_shared(config); + + network->load_model(model_path); + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_layout = Layout{{2, 3, 200, 200}, 4, LiteDataType::LITE_FLOAT}; + input_tensor->set_layout(src_layout); + std::shared_ptr input_tensor2 = network->get_io_tensor(input_name); + //! Check memory is equal + ASSERT_EQ(input_tensor->get_memory_ptr(), input_tensor2->get_memory_ptr()); + + network->forward(); + network->wait(); + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto output_layout = output_tensor->get_layout(); + ASSERT_EQ(output_layout.shapes[0], 2); + ASSERT_EQ(output_layout.shapes[1], 1000); +} + +TEST(TestNetWork, ResetOutput) { + Config config; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + std::shared_ptr network = std::make_shared(config); + + network->load_model(model_path); + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + network->forward(); + network->wait(); + + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, AsyncExec) { + Config config; + config.options.var_sanity_check_first_run = false; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + std::shared_ptr network = std::make_shared(config); + + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + //! 
set async mode and callback + volatile bool finished = false; + network->set_async_callback([&finished]() { finished = true; }); + + network->forward(); + size_t count = 0; + while (finished == false) { + count++; + } + ASSERT_GT(count, 0); + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, CPUDeviceInput) { + auto tensor = get_input_data("./input_data.npy"); + Layout layout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT}; + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, {}, input_name, tensor); + + NetworkIO IO; + bool is_host = false; + IO.inputs.push_back({input_name, is_host}); + std::shared_ptr network = std::make_shared(IO); + + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + input_tensor->reset(src_ptr, layout); + + network->forward(); + network->wait(); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, ShareTensorWith) { + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, {}, input_name, tensor); + + std::shared_ptr network = std::make_shared(); + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + input_tensor->share_memory_with(*tensor); + + network->forward(); + network->wait(); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, InputCallBack) { + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, {}, input_name, tensor); + + NetworkIO ios; + bool is_host = false; + ios.inputs.push_back({input_name, is_host}); + std::shared_ptr network = std::make_shared(ios); + network->load_model(model_path); + + volatile bool finised_check_input = false; + auto input_callback = + [&tensor, &finised_check_input, + input_name](const std::unordered_map< + std::string, std::pair>>& + input_map) { + ASSERT_EQ(input_map.size(), 1); + auto tensor_input = input_map.at(input_name).second; + compare_lite_tensor(tensor_input, tensor); + finised_check_input = true; + }; + + network->set_start_callback(input_callback); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + input_tensor->share_memory_with(*tensor); + + network->forward(); + network->wait(); + + ASSERT_TRUE(finised_check_input); + std::shared_ptr output_tensor = network->get_output_tensor(0); + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, OutputCallBack) { + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, {}, input_name, tensor); + + std::shared_ptr network = std::make_shared(); + network->load_model(model_path); + auto output_name = network->get_output_name(0); + + volatile bool finised_check_output = false; + auto output_callback = + [&result_mgb, &finised_check_output, + output_name](const std::unordered_map< + std::string, std::pair>>& + output_map) { + ASSERT_EQ(output_map.size(), 1); + auto tensor_output = output_map.at(output_name).second; + compare_lite_tensor(tensor_output, result_mgb); + 
finised_check_output = true; + }; + + network->set_finish_callback(output_callback); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + input_tensor->share_memory_with(*tensor); + + network->forward(); + network->wait(); + + ASSERT_TRUE(finised_check_output); + std::shared_ptr output_tensor = network->get_output_tensor(0); + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, OutputShapeOnly) { + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + std::string output_name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]"; + + NetworkIO IO; + bool is_host = true; + IO.outputs.push_back({output_name, is_host, LiteIOType::LITE_IO_SHAPE}); + Config config; + std::shared_ptr network = std::make_shared(config, IO); + + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + std::shared_ptr output_tensor = network->get_io_tensor(output_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + network->forward(); + network->wait(); + ASSERT_EQ(output_tensor->get_tensor_total_size_in_byte() / sizeof(float), + 1000); +} + +TEST(TestNetWork, ProfileIOdump) { + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + + NetworkIO IO; + Config config; + std::shared_ptr network = std::make_shared(config, IO); + network->enable_profile_performance("./profile.json"); + network->load_model(model_path); + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + network->forward(); + network->wait(); + ASSERT_TRUE(fopen("./profile.json", "r")); + + Runtime::enable_io_txt_dump(network, "./io_txt_dump.txt"); + network->forward(); + network->wait(); + ASSERT_TRUE(fopen("./io_txt_dump.txt", "r")); +} + +TEST(TestNetWork, LoadPackedModel) { + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./test_packed_model.lite"; + std::string input_name = "data"; + + NetworkIO IO; + Config config; + std::shared_ptr network = std::make_shared(config, IO); + network->load_model(model_path); + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + network->forward(); + network->wait(); +} + +TEST(TestNetWork, GetDeviceType) { + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + + Config config; + std::shared_ptr network = std::make_shared(config); + network->load_model(model_path); + ASSERT_TRUE(network->get_device_type() == LiteDeviceType::LITE_CPU); +} + +TEST(TestNetWork, GetModelExtraInfo) { + std::string model_path = "./track_640_320_pack_model_rc4_with_info.lite"; + Config config; + std::shared_ptr network = std::make_shared(config); + network->load_model(model_path); + auto& extra_info = network->get_model_extra_info(); + ASSERT_TRUE(extra_info.size() > 0); + printf("extra_info %s \n", extra_info.c_str()); +} + +#if LITE_WITH_CUDA + +TEST(TestNetWork, BasicDevice) { + auto lite_tensor = get_input_data("./input_data.npy"); + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + std::string model_path = "./shufflenet.mge"; 
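+    //! run the model through Lite and through raw MegBrain with the CUDA
+    //! config and check that the two outputs agree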
+ auto result_lite = mgelite_lar(model_path, config, "data", lite_tensor); + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor); + compare_lite_tensor(result_lite, result_mgb); +} + +TEST(TestNetWork, DeviceInput) { + auto tensor = get_input_data("./input_data.npy"); + Layout layout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT}; + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, {}, input_name, tensor); + + NetworkIO IO; + bool is_host = false; + IO.inputs.push_back({input_name, is_host}); + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + std::shared_ptr network = std::make_shared(config, IO); + + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto tensor_cuda = Tensor(LiteDeviceType::LITE_CUDA, layout); + tensor_cuda.copy_from(*tensor); + + auto src_ptr = tensor_cuda.get_memory_ptr(); + input_tensor->reset(src_ptr, layout); + + network->forward(); + network->wait(); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, ChangeInputShapeDevice) { + Config config; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + config.device_type = LiteDeviceType::LITE_CUDA; + std::shared_ptr network = std::make_shared(config); + + network->load_model(model_path); + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_layout = Layout{{2, 3, 200, 200}, 4, LiteDataType::LITE_FLOAT}; + input_tensor->set_layout(src_layout); + std::shared_ptr input_tensor2 = network->get_io_tensor(input_name); + //! 
Check memory is equal + ASSERT_EQ(input_tensor->get_memory_ptr(), input_tensor2->get_memory_ptr()); + + network->forward(); + network->wait(); + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto output_layout = output_tensor->get_layout(); + ASSERT_EQ(output_layout.shapes[0], 2); + ASSERT_EQ(output_layout.shapes[1], 1000); +} + +TEST(TestNetWork, DeviceOutput) { + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + std::string output_name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]"; + auto result_mgb = mgb_lar(model_path, {}, input_name, tensor); + + NetworkIO IO; + bool is_host = false; + IO.outputs.push_back({output_name, is_host}); + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + std::shared_ptr network = std::make_shared(config, IO); + + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + std::shared_ptr output_tensor_cuda = + network->get_io_tensor(output_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + network->forward(); + network->wait(); + auto output_tensor = std::make_shared(); + output_tensor->copy_from(*output_tensor_cuda); + + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, WrongIONameDevice) { + auto tensor = get_input_data("./input_data.npy"); + Layout layout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT}; + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + std::string input_name_wrong = "data0"; + std::string output_name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]"; + std::string output_name_wrong = + "w_TRUE_DIV(EXP[12065],reduce0[12067])[12077]"; + auto result_mgb = mgb_lar(model_path, {}, input_name, tensor); + + NetworkIO IO; + bool is_host = false; + IO.inputs.push_back({input_name, is_host}); + IO.outputs.push_back({output_name, is_host}); + IO.outputs.push_back({output_name_wrong, is_host}); + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + std::shared_ptr network = std::make_shared(config, IO); + + network->load_model(model_path); + + auto tensor_cuda = Tensor(LiteDeviceType::LITE_CUDA, layout); + tensor_cuda.copy_from(*tensor); + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + auto src_ptr = tensor_cuda.get_memory_ptr(); + auto src_layout = tensor_cuda.get_layout(); + input_tensor->reset(src_ptr, src_layout); + + std::shared_ptr output_tensor_cuda = + network->get_io_tensor(output_name); + + network->forward(); + network->wait(); + auto output_tensor = std::make_shared(); + output_tensor->copy_from(*output_tensor_cuda); + + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, ConfigIONameDevice) { + std::string model_path = "./model.mgb"; + + NetworkIO IO; + bool is_host = false; + IO.outputs.push_back({"clsfy", is_host}); + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + std::shared_ptr network = std::make_shared(config, IO); + network->compute_only_configured_output(); + network->load_model(model_path); + + ASSERT_EQ(network->get_all_output_name().size(), 1); + ASSERT_EQ(network->get_all_output_name()[0], "clsfy"); + + std::shared_ptr network2 = std::make_shared(config, IO); + network2->load_model(model_path); + + ASSERT_EQ(network2->get_all_output_name().size(), 2); +} + +TEST(TestNetWork, SetDeviceIdDeviceTest) { +#if LITE_WITH_CUDA + 
if(get_device_count(LITE_CUDA) <= 1) + return; +#endif + std::string model_path = "./model.mgb"; + + NetworkIO IO; + bool is_host = false; + IO.inputs.push_back({"data", is_host}); + IO.outputs.push_back({"clsfy", is_host}); + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + std::shared_ptr network = std::make_shared(config, IO); + network->set_device_id(1); + network->load_model(model_path); + auto inputs_names = network->get_all_input_name(); + for (auto name : inputs_names) { + auto tensor = network->get_io_tensor(name); + ASSERT_EQ(tensor->get_device_id(), 1); + if (name == "idx") { + int* index_ptr = static_cast(tensor->get_memory_ptr()); + for (int i = 0; i < 23; i++) { + index_ptr[i] = i % 3; + } + } + if (name == "landmark") { + float* landmakrk_ptr = + static_cast(tensor->get_memory_ptr()); + for (int i = 0; i < 23 * 18 * 2; i++) { + landmakrk_ptr[i] = 0.1f; + } + } + } + auto outputs_names = network->get_all_output_name(); + for (auto name : outputs_names) { + auto tensor = network->get_io_tensor(name); + ASSERT_EQ(tensor->get_device_id(), 1); + } + network->forward(); + network->wait(); +} + +TEST(TestNetWork, SetStreamIdDeviceTest) { + std::string model_path = "./model.mgb"; + + NetworkIO IO; + bool is_host = false; + IO.inputs.push_back({"data", is_host}); + IO.outputs.push_back({"clsfy", is_host}); + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + std::shared_ptr network = std::make_shared(config, IO); + network->set_stream_id(1); + network->load_model(model_path); + auto inputs_names = network->get_all_input_name(); + for (auto name : inputs_names) { + auto tensor = network->get_io_tensor(name); + if (name == "idx") { + int* index_ptr = static_cast(tensor->get_memory_ptr()); + for (int i = 0; i < 23; i++) { + index_ptr[i] = i % 3; + } + } + if (name == "landmark") { + float* landmakrk_ptr = + static_cast(tensor->get_memory_ptr()); + for (int i = 0; i < 23 * 18 * 2; i++) { + landmakrk_ptr[i] = 0.1f; + } + } + } + network->forward(); + network->wait(); +} + +#if CUDART_VERSION >= 10000 +TEST(TestNetWork, DeviceAsyncExec) { + auto tensor = get_input_data("./input_data.npy"); + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + config.options.var_sanity_check_first_run = false; + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + std::shared_ptr network = std::make_shared(config); + + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + //! 
set async mode and callback + volatile bool finished = false; + network->set_async_callback([&finished]() { finished = true; }); + + network->forward(); + size_t count = 0; + while (finished == false) { + count++; + } + + ASSERT_GT(count, 0); + compare_lite_tensor(output_tensor, result_mgb); +} + +#endif +#endif +#if MGB_ATLAS +TEST(TestNetWork, AtlasLoadNoDevice) { + lite::Config config; + config.device_type = LiteDeviceType::LITE_DEVICE_DEFAULT; + auto network = std::make_shared(config); + network->load_model("./model_atlas.mgb"); + network->forward(); + network->wait(); +} + +TEST(TestNetWork, AtlasLoadDeviceInput) { + lite::NetworkIO networkio; + lite::IO input_data_io = {}; + input_data_io.name = "data"; + input_data_io.is_host = false; + networkio.inputs.emplace_back(input_data_io); + lite::IO input_input0_io = {}; + input_input0_io.name = "input0"; + input_input0_io.is_host = false; + networkio.inputs.emplace_back(input_input0_io); + lite::Config config; + config.device_type = LiteDeviceType::LITE_DEVICE_DEFAULT; + auto network = std::make_shared(config, networkio); + network->load_model("./model_atlas.mgb"); + network->forward(); + network->wait(); +} + +TEST(TestNetWork, AtlasLoadAtlas) { + lite::Config config; + config.device_type = LiteDeviceType::LITE_ATLAS; + auto network = std::make_shared(config); + network->load_model("./model_atlas.mgb"); + network->forward(); + network->wait(); +} + +TEST(TestNetWork, AtlasLoadAtlasDeviceInput) { + lite::NetworkIO networkio; + lite::IO input_data_io = {}; + input_data_io.name = "data"; + input_data_io.is_host = false; + networkio.inputs.emplace_back(input_data_io); + lite::IO input_input0_io = {}; + input_input0_io.name = "input0"; + input_input0_io.is_host = false; + networkio.inputs.emplace_back(input_input0_io); + lite::Config config; + config.device_type = LiteDeviceType::LITE_ATLAS; + auto network = std::make_shared(config, networkio); + network->load_model("./model_atlas.mgb"); + network->forward(); + network->wait(); +} + +TEST(TestNetWork, AtlasDeviceID) { + lite::Config config; + config.device_type = LiteDeviceType::LITE_ATLAS; + auto network = std::make_shared(config); + network->set_device_id(1); + network->load_model("./model_atlas.mgb"); + std::shared_ptr input_tensor = network->get_input_tensor(0); + std::shared_ptr output_tensor = network->get_output_tensor(0); + network->forward(); + network->wait(); + ASSERT_EQ(output_tensor->get_device_id(), 1); +} +#endif +#endif +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/test/test_network_c.cpp b/lite/test/test_network_c.cpp new file mode 100644 index 0000000000000000000000000000000000000000..419bc6c7a6e9a7c115fbf27a6f10ae00dab36ba5 --- /dev/null +++ b/lite/test/test_network_c.cpp @@ -0,0 +1,895 @@ +/** + * \file test/test_network_c.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
+ */ + +#include "../src/misc.h" + +#if LITE_BUILD_WITH_MGE +#include "../src/common.h" +#include "../src/mge/network_impl.h" + +#include "../lite-c/src/common.h" +#include "lite-c/global_c.h" +#include "lite-c/network_c.h" +#include "lite-c/tensor_c.h" + +#include "./test_common.h" +#include "megbrain/tensor.h" + +#include +#include +#include +#include +#include + +namespace { + +int affinity_set = false; +int single_thread_affinity(int) { + affinity_set = true; + return 0; +} + +std::atomic_size_t m_nr_left{0}; +std::atomic_size_t m_nr_allocated{0}; + +void* allocate(LiteDeviceType device, int, size_t size, size_t align) { + LITE_ASSERT(device == LiteDeviceType::LITE_CPU); + m_nr_left++; + m_nr_allocated++; +#ifdef WIN32 + return _aligned_malloc(size, align); +#elif defined(__ANDROID__) || defined(ANDROID) + return memalign(align, size); +#else + void* ptr = nullptr; + auto err = posix_memalign(&ptr, align, size); + mgb_assert(!err, "failed to malloc %zu bytes with align %zu", size, align); + return ptr; +#endif +} + +void free(LiteDeviceType device, int, void* ptr) { + m_nr_left--; + LITE_ASSERT(device == LiteDeviceType::LITE_CPU); +#ifdef WIN32 + _aligned_free(ptr); +#else + ::free(ptr); +#endif +}; + +#define NUMBER_THREDS (4) +std::vector thread_ids(NUMBER_THREDS); +int multi_thread_affinity(int id) { + thread_ids[id] = std::this_thread::get_id(); + return 0; +}; + +volatile bool finished = false; +int finish_callback() { + finished = true; + return 0; +} + +volatile bool start_checked = false; +int start_callback(const LiteIO* inputs, const LiteTensor* input_tensors, + size_t size) { + start_checked = true; + auto check_func = [&]() { + ASSERT_EQ(size, 1); + ASSERT_EQ(std::string(inputs->name), "data"); + LiteLayout layout; + LITE_get_tensor_layout(*input_tensors, &layout); + ASSERT_EQ(layout.ndim, 4); + ASSERT_EQ(layout.shapes[1], 3); + ASSERT_EQ(layout.shapes[2], 224); + ASSERT_EQ(layout.shapes[3], 224); + }; + check_func(); + return 0; +} + +volatile bool finish_checked = false; +int finish_callback(const LiteIO* outputs, const LiteTensor* output_tensors, + size_t size) { + finish_checked = true; + auto check_func = [&]() { + ASSERT_EQ(size, 1); + ASSERT_EQ(std::string(outputs->name), + "TRUE_DIV(EXP[12065],reduce0[12067])[12077]"); + LiteLayout layout; + LITE_get_tensor_layout(*output_tensors, &layout); + ASSERT_EQ(layout.shapes[1], 1000); + }; + check_func(); + return 0; +} + +} // namespace + +#define LITE_CAPI_CHECK(_expr) \ + do { \ + int _ret = (_expr); \ + if (_ret) { \ + LITE_THROW(LITE_get_last_error()); \ + } \ + } while (0) + +#define ForwardMgb \ + lite::Config config; \ + auto lite_tensor = lite::get_input_data("./input_data.npy"); \ + size_t data_length_in_byte = lite_tensor->get_tensor_total_size_in_byte(); \ + std::string model_path = "./shufflenet.mge"; \ + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor) + +#define MakeNetwork \ + LiteNetwork c_network; \ + LITE_CAPI_CHECK(LITE_make_network(&c_network, *default_config(), \ + *default_network_io())) + +#define LoadNetwork \ + LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, model_path.c_str())) + +#define SetInput \ + LiteTensor c_input_tensor, c_output_tensor; \ + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, "data", LITE_INPUT, \ + &c_input_tensor)); \ + LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, \ + lite_tensor->get_memory_ptr(), \ + data_length_in_byte)) + +#define ForwardNetwork \ + LITE_CAPI_CHECK(LITE_forward(c_network)); \ + LITE_CAPI_CHECK(LITE_wait(c_network)) + 
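+//! Most C-API tests below chain these helper macros in order:
+//! ForwardMgb -> MakeNetwork -> LoadNetwork -> SetInput -> ForwardNetwork
+//! -> GetOutput -> CompareResult, then LITE_destroy_network.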
+#define GetOutput \ + const char* output_name; \ + LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name)); \ + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_OUTPUT, \ + &c_output_tensor)); \ + void* output_ptr; \ + LITE_CAPI_CHECK(LITE_get_tensor_memory(c_output_tensor, &output_ptr)) + +#define CompareResult \ + EXPECT_TRUE(lite::compare_memory( \ + output_ptr, result_mgb->get_memory_ptr(), \ + result_mgb->get_tensor_total_size_in_byte() / sizeof(float))) + +TEST(TestCapiNetWork, BasicResetInput) { + ForwardMgb; + LiteNetwork c_network; + LITE_CAPI_CHECK(LITE_make_default_network(&c_network)); + LoadNetwork; + SetInput; + ForwardNetwork; + GetOutput; + CompareResult; + LITE_destroy_network(c_network); +} + +TEST(TestCapiNetWork, GetAllName) { + std::string model_path = "./shufflenet.mge"; + LiteNetwork c_network; + LITE_CAPI_CHECK(LITE_make_default_network(&c_network)); + LoadNetwork; + size_t input_size, output_size; + LITE_get_all_input_name(c_network, &input_size, nullptr); + LITE_get_all_output_name(c_network, &output_size, nullptr); + + std::vector input_names(input_size); + LITE_get_all_input_name(c_network, nullptr, input_names.data()); + ASSERT_EQ(input_names.size(), 1); + ASSERT_TRUE(std::string(input_names[0]) == "data"); + + std::vector output_names(output_size); + LITE_get_all_output_name(c_network, nullptr, output_names.data()); + ASSERT_TRUE(std::string(output_names[0]) == + "TRUE_DIV(EXP[12065],reduce0[12067])[12077]"); + ASSERT_EQ(output_names.size(), 1); + LITE_destroy_network(c_network); +} + +#if LITE_BUILD_WITH_RKNPU + +static int GetTop(float* pfProb, float* pfMaxProb, uint32_t* pMaxClass, + uint32_t outputCount, uint32_t topNum) { + uint32_t i, j; + +#define MAX_TOP_NUM 20 + if (topNum > MAX_TOP_NUM) + return 0; + + memset(pfMaxProb, 0, sizeof(float) * topNum); + memset(pMaxClass, 0xff, sizeof(float) * topNum); + + for (j = 0; j < topNum; j++) { + for (i = 0; i < outputCount; i++) { + if ((i == *(pMaxClass + 0)) || (i == *(pMaxClass + 1)) || + (i == *(pMaxClass + 2)) || (i == *(pMaxClass + 3)) || + (i == *(pMaxClass + 4))) { + continue; + } + + if (pfProb[i] > *(pfMaxProb + j)) { + *(pfMaxProb + j) = pfProb[i]; + *(pMaxClass + j) = i; + } + } + } + + return 1; +} + +TEST(TestCapiNetWork, rknntest_set_info) { +#define SET_INFO_SIZE 2 +#define TENSOR_TYPE_UINT8 3 +#define TENSOR_FORMAT_NHWC 1 + LiteConfig config; + config.backend = LiteBackend::LITE_RK_NPU; + config.device_type = LiteDeviceType::LITE_NPU; + config.bare_model_cryption_name = nullptr; + auto lite_tensor = lite::get_input_data("./model/cat_224x224.npy"); + auto true_tensor = lite::get_input_data("./output_data.npy"); + auto rknn_model = "./model/mobilenet_v1.rknn"; + + LiteNetwork c_network; + LITE_CAPI_CHECK(LITE_make_network_config(&c_network, config)); + LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, rknn_model)); + + size_t input_size, output_size; + LITE_get_all_input_name(c_network, &input_size, nullptr); + LITE_get_all_output_name(c_network, &output_size, nullptr); + + std::vector input_names(input_size); + std::vector output_names(output_size); + LiteTensor c_input_tensor, c_output_tensor; + + LITE_get_all_input_name(c_network, nullptr, input_names.data()); + LITE_get_all_output_name(c_network, nullptr, output_names.data()); + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, input_names[0], LITE_IO, + &c_input_tensor)); + + size_t input_length = 0; + LITE_get_tensor_total_size_in_byte(c_input_tensor, &input_length); + + size_t data_length_in_byte = 
lite_tensor->get_tensor_total_size_in_byte(); + { + LiteLayout input_layout; + LITE_get_tensor_layout(c_input_tensor, &input_layout); + ASSERT_TRUE(input_layout.data_type == LITE_INT8); + std::vector input_shape={1,224,224,3}; + for (size_t i = 0; i < input_layout.ndim; i++) { + ASSERT_TRUE(input_layout.shapes[i]=input_shape[i]); + } + } + + { + int size_attr = 0; + LITE_CAPI_CHECK(LITE_get_tensor_attribute(c_input_tensor, nullptr, nullptr, + &size_attr)); + ASSERT_TRUE(size_attr > 0); + const char* keys[size_attr]; + void* values[size_attr]; + LITE_CAPI_CHECK(LITE_get_tensor_attribute(c_input_tensor, keys, values, + &size_attr)); + ASSERT_TRUE(size_attr > 5); + std::unordered_map result_map = { + {"zp", 0}, + {"index", 0}, + {"size_with_stride", 150528}, + {"stride", 224}, + {"n_size", 150528}, + {"n_elems", 150528}, + {"qnt_type", 2}, + {"n_dims", 4}, + {"type", 2}, + {"fmt", 1}, + {"dims0", 1}, + {"dims1", 224}, + {"dims2", 224}, + {"dims3", 3}, + }; + for (int i = 0; i < size_attr; i++) { + std::string key(keys[i]); + if (key == "names") { + ASSERT_TRUE(std::string("input") == + std::string(static_cast(values[i]))); + } else if (key == "scale") { + float scale = *static_cast(values[i]); + ASSERT_TRUE(std::fabs(scale - 0.007812) < 0.00001); + } else if (key == "fl" || key == "pass_through") { + uint8_t val = *static_cast(values[i]); + if (key == "fl") { + ASSERT_TRUE(val == 0); + } else { + ASSERT_TRUE(val == 1); + } + } else { + uint32_t val = *static_cast(values[i]); + ASSERT_TRUE(result_map[std::string(keys[i])]==val); + } + } + } + const char* keys[] = {"type", "fmt"}; + int info_size = SET_INFO_SIZE; + int type = TENSOR_TYPE_UINT8; + int fmt = TENSOR_FORMAT_NHWC; + void* values[] = {static_cast(&type), static_cast(&fmt)}; + LITE_CAPI_CHECK(LITE_set_tensor_information(c_input_tensor, keys, values, + info_size)); + ASSERT_TRUE(std::string(output_names[0]) == + std::string("MobilenetV1/Predictions/Reshape_1")); + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_names[0], LITE_IO, + &c_output_tensor)); + + LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, + lite_tensor->get_memory_ptr(), + data_length_in_byte)); + + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_names[0], LITE_IO, + &c_output_tensor)); + //LiteLayout tmp_output_layout; + //LITE_get_tensor_layout(c_output_tensor, &tmp_output_layout); + //tmp_output_layout.data_type = LiteDataType::LITE_FLOAT; + + //LITE_set_tensor_layout(c_output_tensor, tmp_output_layout); + { + const char* keys[] = {"want_float"}; + uint8_t want_float = 1; + void* values[] = {static_cast(&want_float)}; + LITE_CAPI_CHECK( + LITE_set_tensor_information(c_output_tensor, keys, values, 1)); + } + + LITE_CAPI_CHECK(LITE_forward(c_network)); + LITE_CAPI_CHECK(LITE_wait(c_network)); + + ASSERT_TRUE(std::string(output_names[0]) == "MobilenetV1/Predictions/Reshape_1"); + ASSERT_EQ(output_names.size(), 1); + { + LiteLayout output_layout; + LITE_get_tensor_layout(c_output_tensor, &output_layout); + ASSERT_TRUE(output_layout.data_type == LITE_FLOAT); + int size_attr = 0; + + LITE_CAPI_CHECK(LITE_get_tensor_attribute(c_output_tensor, nullptr, nullptr, + &size_attr)); + ASSERT_TRUE(size_attr > 0); + const char* keys[size_attr]; + void* values[size_attr]; + LITE_CAPI_CHECK(LITE_get_tensor_attribute(c_output_tensor, keys, values, + &size_attr)); + ASSERT_TRUE(size_attr > 5); + std::unordered_map result_map = { + {"zp", 0}, + {"index", 0}, + {"size_with_stride", 2002}, + {"stride", 0}, + {"n_size", 2002}, + {"n_elems", 1001}, + {"qnt_type", 2}, + 
{"n_dims", 2}, + {"type", 0}, + {"fmt", 2}, + {"dims0", 1}, + {"dims1", 1001}, + }; + for (int i = 0; i < size_attr; i++) { + std::string key(keys[i]); + if (key == "names") { + ASSERT_TRUE("MobilenetV1/Predictions/Reshape_1" == + std::string(static_cast(values[i]))); + + } else if (key == "scale") { + float scale = *static_cast(values[i]); + ASSERT_TRUE(std::fabs(scale - 1.0) < 0.00001); + } else if (key == "fl" || key == "pass_through") { + uint8_t val = *static_cast(values[i]); + ASSERT_TRUE(val == 0); + } else { + uint32_t val = *static_cast(values[i]); + ASSERT_TRUE(result_map[std::string(keys[i])]==val); + } + } + } + { + uint32_t MaxClass[5]; + float fMaxProb[5]; + void* output_ptr; + LITE_get_tensor_memory(c_output_tensor, &output_ptr); + float* buffer = (float*)output_ptr; + uint32_t sz = true_tensor->get_tensor_total_size_in_byte() / sizeof(float); + + GetTop(buffer, fMaxProb, MaxClass, sz, 5); + + std::vector result_class = { + 286, 464, 282, 357, 285, + }; + std::vector result_prob = { + 0.407227, 0.365723, 0.090454, 0.018051, 0.013069, + }; + + for (int i = 0; i < 5; i++) { + ASSERT_TRUE(result_class[i] == MaxClass[i]); + ASSERT_TRUE(std::fabs(result_prob[i] - fMaxProb[i]) < 0.0001); + } + } + + { + float* true_data = static_cast(true_tensor->get_memory_ptr()); + void* output_ptr; + LITE_get_tensor_memory(c_output_tensor, &output_ptr); + float* data1 = static_cast(output_ptr); + size_t length = + true_tensor->get_tensor_total_size_in_byte() / sizeof(float); + for (size_t i = 0; i < length; i++) { + ASSERT_LT(std::abs(data1[i] - true_data[i]), 1e-3); + } + } + LITE_destroy_network(c_network); +#undef SET_INFO_SIZE +#undef TENSOR_FORMAT_NHWC +#undef TENSOR_TYPE_UINT8 +} + +TEST(TestCapiNetWork, rknntest_set_info_two_input) { +#define SET_INFO_SIZE 2 +#define TENSOR_TYPE_UINT8 3 +#define TENSOR_FORMAT_NHWC 1 + LiteConfig config; + config.backend = LiteBackend::LITE_RK_NPU; + config.device_type = LiteDeviceType::LITE_NPU; + config.bare_model_cryption_name = nullptr; + auto lite_tensor = lite::get_input_data("./model/cat_224x224.npy"); + auto lite_tensor_dog = lite::get_input_data("./model/dog_224x224.npy"); + auto true_tensor = lite::get_input_data("./output_data.npy"); + auto rknn_model = "./model/mobilenet_v1.rknn"; + + LiteNetwork c_network; + LITE_CAPI_CHECK(LITE_make_network_config(&c_network, config)); + LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, rknn_model)); + + size_t input_size, output_size; + LITE_get_all_input_name(c_network, &input_size, nullptr); + LITE_get_all_output_name(c_network, &output_size, nullptr); + + std::vector input_names(input_size); + std::vector output_names(output_size); + LiteTensor c_input_tensor, c_output_tensor; + + LITE_get_all_input_name(c_network, nullptr, input_names.data()); + LITE_get_all_output_name(c_network, nullptr, output_names.data()); + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, input_names[0], LITE_IO, + &c_input_tensor)); + + size_t input_length = 0; + LITE_get_tensor_total_size_in_byte(c_input_tensor, &input_length); + + size_t data_length_in_byte = lite_tensor->get_tensor_total_size_in_byte(); + { + LiteLayout input_layout; + LITE_get_tensor_layout(c_input_tensor, &input_layout); + ASSERT_TRUE(input_layout.data_type == LITE_INT8); + std::vector input_shape = {1, 224, 224, 3}; + for (size_t i = 0; i < input_layout.ndim; i++) { + ASSERT_TRUE(input_layout.shapes[i] = input_shape[i]); + } + } + + const char* keys[] = {"type", "fmt"}; + int info_size = SET_INFO_SIZE; + int type = TENSOR_TYPE_UINT8; + int fmt = 
TENSOR_FORMAT_NHWC; + void* values[] = {static_cast(&type), static_cast(&fmt)}; + LITE_CAPI_CHECK(LITE_set_tensor_information(c_input_tensor, keys, values, + info_size)); + ASSERT_TRUE(std::string(output_names[0]) == + std::string("MobilenetV1/Predictions/Reshape_1")); + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_names[0], LITE_IO, + &c_output_tensor)); + + LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, + lite_tensor->get_memory_ptr(), + data_length_in_byte)); + + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_names[0], LITE_IO, + &c_output_tensor)); + { + const char* keys[] = {"want_float"}; + uint8_t want_float = 1; + void* values[] = {static_cast(&want_float)}; + LITE_CAPI_CHECK( + LITE_set_tensor_information(c_output_tensor, keys, values, 1)); + } + + LITE_CAPI_CHECK(LITE_forward(c_network)); + LITE_CAPI_CHECK(LITE_wait(c_network)); + + ASSERT_TRUE(std::string(output_names[0]) == + "MobilenetV1/Predictions/Reshape_1"); + ASSERT_EQ(output_names.size(), 1); + { + uint32_t MaxClass[5]; + float fMaxProb[5]; + void* output_ptr; + LITE_get_tensor_memory(c_output_tensor, &output_ptr); + float* buffer = (float*)output_ptr; + uint32_t sz = + true_tensor->get_tensor_total_size_in_byte() / sizeof(float); + + GetTop(buffer, fMaxProb, MaxClass, sz, 5); + + std::vector result_class = { + 286, 464, 282, 357, 285, + }; + std::vector result_prob = { + 0.407227, 0.365723, 0.090454, 0.018051, 0.013069, + }; + + for (int i = 0; i < 5; i++) { + ASSERT_TRUE(result_class[i] == MaxClass[i]); + ASSERT_TRUE(std::fabs(result_prob[i] - fMaxProb[i]) < 0.0001); + } + } + + { + float* true_data = static_cast(true_tensor->get_memory_ptr()); + void* output_ptr; + LITE_get_tensor_memory(c_output_tensor, &output_ptr); + float* data1 = static_cast(output_ptr); + size_t length = + true_tensor->get_tensor_total_size_in_byte() / sizeof(float); + for (size_t i = 0; i < length; i++) { + ASSERT_LT(std::abs(data1[i] - true_data[i]), 1e-3); + } + } + + LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, + lite_tensor_dog->get_memory_ptr(), + data_length_in_byte)); + LITE_CAPI_CHECK(LITE_forward(c_network)); + LITE_CAPI_CHECK(LITE_wait(c_network)); + ASSERT_TRUE(std::string(output_names[0]) == + "MobilenetV1/Predictions/Reshape_1"); + ASSERT_EQ(output_names.size(), 1); + { + uint32_t MaxClass[5]; + float fMaxProb[5]; + void* output_ptr; + LITE_get_tensor_memory(c_output_tensor, &output_ptr); + float* buffer = (float*)output_ptr; + uint32_t sz = + true_tensor->get_tensor_total_size_in_byte() / sizeof(float); + + GetTop(buffer, fMaxProb, MaxClass, sz, 5); + + std::vector result_prob = { + 0.407227, 0.365723, 0.090454, 0.018051, 0.013069, + }; + + for (int i = 0; i < 5; i++) { + ASSERT_FALSE(std::fabs(result_prob[i] - fMaxProb[i]) < 0.0001); + } + } + + LITE_destroy_network(c_network); +#undef SET_INFO_SIZE +#undef TENSOR_FORMAT_NHWC +#undef TENSOR_TYPE_UINT8 +} +#endif + +TEST(TestCapiNetWork, BasicResetOutput) { + ForwardMgb; + LiteNetwork c_network; + LITE_CAPI_CHECK(LITE_make_default_network(&c_network)); + LoadNetwork; + SetInput; + LiteLayout output_layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}; + std::shared_ptr ptr(new float[1000], + [](float* ptr) { delete[] ptr; }); + const char* output_name; + LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name)); + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO, + &c_output_tensor)); + LITE_CAPI_CHECK( + LITE_reset_tensor(c_output_tensor, output_layout, ptr.get())); + + ForwardNetwork; + + EXPECT_TRUE(lite::compare_memory( + 
ptr.get(), result_mgb->get_memory_ptr(), + result_mgb->get_tensor_total_size_in_byte() / sizeof(float))); + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, BasicInplaceAndSingleThreadAffinity) { + ForwardMgb; + MakeNetwork; + //! config the network with cpu inplace mode + LITE_CAPI_CHECK(LITE_set_cpu_inplace_mode(c_network)); + LoadNetwork; + //! set single thread affinith callback + LITE_CAPI_CHECK(LITE_set_runtime_thread_affinity(c_network, + single_thread_affinity)); + SetInput; + ForwardNetwork; + ASSERT_EQ(affinity_set, true); + affinity_set = false; + GetOutput; + CompareResult; + LITE_destroy_network(c_network); +} + +TEST(TestCapiNetWork, UserAllocator) { + ForwardMgb; + MakeNetwork; + LITE_CAPI_CHECK(LITE_set_memory_allocator(c_network, allocate, free)); + LoadNetwork; + SetInput; + ForwardNetwork; + + ASSERT_GE(m_nr_allocated, 1); + GetOutput; + CompareResult; + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); + ASSERT_EQ(m_nr_left, 0); +} + +TEST(TestCapiNetWork, BasicMultiThread) { + ForwardMgb; + MakeNetwork; + LITE_CAPI_CHECK(LITE_set_cpu_threads_number(c_network, NUMBER_THREDS)); + LoadNetwork; + LITE_CAPI_CHECK( + LITE_set_runtime_thread_affinity(c_network, multi_thread_affinity)); + SetInput; + ForwardNetwork; + for (size_t i = 0; i < NUMBER_THREDS; i++) { + for (size_t j = i + 1; j < NUMBER_THREDS; j++) { + ASSERT_NE(thread_ids[i], thread_ids[j]); + } + } + for (size_t i = 0; i < NUMBER_THREDS; i++) { + thread_ids[i] = std::thread::id(); + } + GetOutput; + CompareResult; + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, DeviceIO) { + ForwardMgb; + LiteNetwork c_network; + LiteIO input_io = default_io; + input_io.is_host = true; + input_io.name = "data"; + LiteNetworkIO network_io = *default_network_io(); + network_io.inputs = &input_io; + network_io.input_size = 1; + LITE_CAPI_CHECK(LITE_make_network(&c_network, *default_config(), network_io)); + LoadNetwork; + SetInput; + ForwardNetwork; + GetOutput; + CompareResult; + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, StartCallBack) { + ForwardMgb; + MakeNetwork; + LoadNetwork; + LITE_CAPI_CHECK(LITE_set_start_callback(c_network, start_callback)); + SetInput; + ForwardNetwork; + GetOutput; + CompareResult; + ASSERT_TRUE(start_checked); + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, FinishCallBack) { + ForwardMgb; + MakeNetwork; + LoadNetwork; + LITE_CAPI_CHECK(LITE_set_finish_callback(c_network, finish_callback)); + SetInput; + ForwardNetwork; + GetOutput; + CompareResult; + ASSERT_TRUE(finish_checked); + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, BasicCryptAes) { + ForwardMgb; + + LiteConfig c_config = *default_config(); + c_config.bare_model_cryption_name = "AES_default"; + LiteNetwork c_network; + LITE_CAPI_CHECK( + LITE_make_network(&c_network, c_config, *default_network_io())); + std::string model_crypt_path = "./shufflenet_crypt_aes.mge"; + + LITE_CAPI_CHECK( + LITE_load_model_from_path(c_network, model_crypt_path.c_str())); + + SetInput; + ForwardNetwork; + GetOutput; + CompareResult; + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, PackedCryptRc4) { + ForwardMgb; + MakeNetwork; + + std::string model_crypt_path = "./test_packed_model_rc4.lite"; + LITE_CAPI_CHECK( + LITE_load_model_from_path(c_network, model_crypt_path.c_str())); + + SetInput; + ForwardNetwork; + GetOutput; + CompareResult; + 
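+    //! note: the packed .lite file records its own decryption method in the
+    //! pack header (see tools/pack_model/pack_model_and_info.py), so unlike
+    //! the AES case above no bare_model_cryption_name has to be set here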
LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, AsyncExec) { + finished = false; + ForwardMgb; + LiteNetwork c_network; + LiteConfig c_config = *default_config(); + c_config.options.var_sanity_check_first_run = false; + LITE_CAPI_CHECK( + LITE_make_network(&c_network, c_config, *default_network_io())); + LITE_CAPI_CHECK(LITE_set_async_callback(c_network, finish_callback)); + LoadNetwork; + SetInput; + + LITE_forward(c_network); + size_t count = 0; + while (finished == false) { + count++; + } + ASSERT_GT(count, 0); + finished = false; + + GetOutput; + CompareResult; + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, OutputShapeOnly) { + ForwardMgb; + LiteNetwork c_network; + LiteNetworkIO c_network_io = *default_network_io(); + LiteIO io_output = default_io; + io_output.io_type = LiteIOType::LITE_IO_SHAPE; + io_output.name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]"; + c_network_io.outputs = &io_output; + c_network_io.output_size = 1; + LITE_CAPI_CHECK( + LITE_make_network(&c_network, *default_config(), c_network_io)); + LoadNetwork; + SetInput; + ForwardNetwork; + GetOutput; + size_t length = 0; + LITE_CAPI_CHECK( + LITE_get_tensor_total_size_in_byte(c_output_tensor, &length)); + ASSERT_EQ(length / sizeof(float), 1000); + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, ProfileIOdump) { + ForwardMgb; + MakeNetwork; + LITE_CAPI_CHECK( + LITE_enable_profile_performance(c_network, "./profile.json")); + LoadNetwork; + SetInput; + ForwardNetwork; + ASSERT_TRUE(fopen("./profile.json", "r")); + + LITE_CAPI_CHECK(LITE_enable_io_txt_dump(c_network, "./io_txt_dump.txt")); + ForwardNetwork; + ASSERT_TRUE(fopen("./io_txt_dump.txt", "r")); + + GetOutput; + CompareResult; + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, GetDeviceType) { + lite::Config config; + auto lite_tensor = lite::get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + MakeNetwork; + LoadNetwork; + LiteDeviceType devicetype; + LITE_CAPI_CHECK(LITE_get_device_type(c_network, &devicetype)); + ASSERT_TRUE(devicetype == LiteDeviceType::LITE_CPU); + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, GetModelExtraInfo) { + lite::Config config; + std::string model_path = "./track_640_320_pack_model_rc4_with_info.lite"; + MakeNetwork; + LITE_load_model_from_path(c_network, model_path.c_str()); + const char* info = nullptr; + int info_size = 0; + LITE_CAPI_CHECK(LITE_get_model_extra_info(c_network, &info, &info_size)); + ASSERT_TRUE(info_size > 0); + printf("info %s \n", info); + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, TestWorkSpaceLimit) { + lite::Config config; + auto lite_tensor = lite::get_input_data("./input_data.npy"); + size_t data_length_in_byte = lite_tensor->get_tensor_total_size_in_byte(); + std::string model_path = "./shufflenet.mge"; + MakeNetwork; + LoadNetwork; + printf("go to config workspace limit\n"); + LITE_CAPI_CHECK(LITE_set_network_algo_workspace_limit(c_network, 1000)); + SetInput; + ForwardNetwork; + + GetOutput; + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, TestShareWeights) { + ForwardMgb; + MakeNetwork; + LoadNetwork; + SetInput; + ForwardNetwork; + + GetOutput; + CompareResult; + + LiteNetwork c_network2; + LITE_CAPI_CHECK( + LITE_make_network(&c_network2, *default_config(), *default_network_io())); + LITE_CAPI_CHECK(LITE_set_cpu_inplace_mode(c_network2)); + 
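+    //! let the second network reuse the weights of the already loaded
+    //! network instead of reading the model file again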
LITE_CAPI_CHECK(LITE_shared_weight_with_network(c_network2, c_network)); + int is_cpu_inplace_mode = false; + LITE_CAPI_CHECK(LITE_is_cpu_inplace_mode(c_network2, &is_cpu_inplace_mode)); + ASSERT_EQ(is_cpu_inplace_mode, true); + + LiteTensor c_input_tensor2, c_output_tensor2; + LITE_CAPI_CHECK( + LITE_get_io_tensor(c_network2, "data", LITE_IO, &c_input_tensor2)); + LITE_CAPI_CHECK(LITE_reset_tensor_memory( + c_input_tensor2, lite_tensor->get_memory_ptr(), + lite_tensor->get_tensor_total_size_in_byte())); + LITE_CAPI_CHECK(LITE_forward(c_network2)); + LITE_CAPI_CHECK(LITE_wait(c_network2)); + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network2, output_name, LITE_IO, + &c_output_tensor2)); + void* output_ptr2; + LITE_CAPI_CHECK(LITE_get_tensor_memory(c_output_tensor2, &output_ptr2)); + + EXPECT_TRUE(lite::compare_memory( + output_ptr2, result_mgb->get_memory_ptr(), + result_mgb->get_tensor_total_size_in_byte() / sizeof(float))); + + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); + LITE_CAPI_CHECK(LITE_destroy_network(c_network2)); +} + +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/test/test_network_options.cpp b/lite/test/test_network_options.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0ff344ff2254f587ec50cecf3e130bc84ce8bfba --- /dev/null +++ b/lite/test/test_network_options.cpp @@ -0,0 +1,351 @@ +/** + * \file test/test_network_options.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "../src/common.h" +#include "../src/misc.h" +#include "../src/mge/network_impl.h" +#include "lite/global.h" + +#include "megbrain/tensor.h" +#include "test_common.h" + +#include +#include +#include +#include + +using namespace lite; + +TEST(TestNetWorkOptions, no_var_sanity_check_and_record) { + Config config; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + config.options.var_sanity_check_first_run = false; + config.options.comp_node_seq_record_level = 1; + + std::shared_ptr network = std::make_shared(config); + network->load_model(model_path); + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + network->forward(); + network->wait(); + + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWorkOptions, const_shape) { + Config config; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + config.options.var_sanity_check_first_run = false; + config.options.const_shape = true; + std::shared_ptr network = std::make_shared(config); + + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + 
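+    //! reset() points the input tensor at the host buffer directly, so the
+    //! input data is consumed without an extra copy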
auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + network->forward(); + network->wait(); + + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWorkOptions, NCHW44) { + Config config; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + config.options.var_sanity_check_first_run = false; + config.options.enable_nchw44 = true; + std::shared_ptr network = std::make_shared(config); + + Runtime::set_network_algo_policy( + network, LiteAlgoSelectStrategy::LITE_ALGO_PROFILE | + LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE); + + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + network->forward(); + network->wait(); + + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWorkOptions, test_cache) { + Config config; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + std::shared_ptr network = std::make_shared(config); + + set_persistent_cache("./algo_cache.txt", true); + network->load_model(model_path); + Runtime::set_network_algo_policy( + network, LiteAlgoSelectStrategy::LITE_ALGO_PROFILE | + LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + network->forward(); + network->wait(); + + compare_lite_tensor(output_tensor, result_mgb); + + dump_persistent_cache("./algo_cache.txt"); + ASSERT_TRUE(fopen("./algo_cache.txt", "r")); + + set_persistent_cache("./algo_cache.txt"); + network->forward(); + network->wait(); + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWorkOptions, FastRunIgnorBatch) { + Config config; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + std::shared_ptr network = std::make_shared(config); + + set_persistent_cache("./algo_cache.txt"); + network->load_model(model_path); + Runtime::set_network_algo_policy( + network, + 
LiteAlgoSelectStrategy::LITE_ALGO_PROFILE | + LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE, + 1, true); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + network->forward(); + network->wait(); + + compare_lite_tensor(output_tensor, result_mgb); + + dump_persistent_cache("./algo_cache.txt"); + ASSERT_TRUE(fopen("./algo_cache.txt", "r")); +} + +#if LITE_WITH_CUDA +TEST(TestNetWorkOptions, NCHW4) { + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + config.options.enable_nchw4 = 1; + std::shared_ptr network = std::make_shared(config); + + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + network->forward(); + network->wait(); + + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWorkOptions, NCHW32) { + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + config.options.enable_nchw32 = 1; + std::shared_ptr network = std::make_shared(config); + Runtime::set_network_algo_policy( + network, LiteAlgoSelectStrategy::LITE_ALGO_PROFILE | + LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE); + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + network->forward(); + network->wait(); + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWorkOptions, jit_level) { + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + config.options.jit_level = 1; + std::shared_ptr network = std::make_shared(config); + + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + 
auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + network->forward(); + network->wait(); + + compare_lite_tensor(output_tensor, result_mgb); +} +#endif + +#if MGB_ENABLE_TENSOR_RT && LITE_WITH_CUDA +TEST(TestNetWorkOptions, TensorRT) { + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + std::shared_ptr network = std::make_shared(config); + Runtime::use_tensorrt(network); + + set_tensor_rt_cache("./tensorrt_cache.txt"); + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + network->forward(); + network->wait(); + dump_tensor_rt_cache(); + ASSERT_TRUE(fopen("./tensorrt_cache.txt", "r")); + compare_lite_tensor(output_tensor, result_mgb); +} +#endif +#endif +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/test/test_tensor.cpp b/lite/test/test_tensor.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b5ce61b0e986be3deb1cc3ed12d18ec2c95a46ce --- /dev/null +++ b/lite/test/test_tensor.cpp @@ -0,0 +1,589 @@ +/** + * \file test/test_tensor.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "../src/misc.h" +#include "../src/mge/common.h" +#include "../src/mge/network_impl.h" +#include "lite/tensor.h" + +#include + +#include +#include + +using namespace lite; + +TEST(TestTensor, Basic) { + Layout layout{{1, 3, 224, 224}, 4}; + Tensor tensor1(LiteDeviceType::LITE_CPU); + Tensor tensor2(LiteDeviceType::LITE_CPU, layout); + Tensor tensor3(LiteDeviceType::LITE_CPU, layout); + //! mge tensor has created + ASSERT_TRUE(TensorHelper::implement(&tensor1)); + ASSERT_TRUE(TensorHelper::implement(&tensor2)); + ASSERT_TRUE(TensorHelper::implement(&tensor3)); + //! check member + ASSERT_EQ(tensor2.get_device_type(), LiteDeviceType::LITE_CPU); + ASSERT_EQ(tensor2.get_layout(), layout); + ASSERT_EQ(tensor3.get_layout(), layout); + //! 
check the real tensor + ASSERT_EQ(tensor2.get_tensor_total_size_in_byte(), 1 * 3 * 224 * 224 * 4); + ASSERT_EQ(tensor3.get_tensor_total_size_in_byte(), 1 * 3 * 224 * 224 * 4); + + ASSERT_TRUE(TensorHelper::implement(&tensor1) + ->cast_final_safe() + .host_tensor()); + + ASSERT_FALSE(TensorHelper::implement(&tensor1) + ->cast_final_safe() + .dev_tensor()); + ASSERT_FALSE(TensorHelper::implement(&tensor1) + ->cast_final_safe() + .dev_tensor()); + ASSERT_TRUE(TensorHelper::implement(&tensor1) + ->cast_final_safe() + .host_tensor()); +} + +TEST(TestTensor, SetLayoutReAlloc) { + Layout layout{{1, 3, 224, 224}, 4}; + Tensor tensor1; + Tensor tensor2(LiteDeviceType::LITE_CPU, layout); + Tensor tensor3(LiteDeviceType::LITE_CPU, layout); + auto old_ptr2 = tensor2.get_memory_ptr(); + auto old_ptr3 = tensor3.get_memory_ptr(); + + //! layout set through + Layout layout1{{1, 3, 100, 100}, 4, LiteDataType::LITE_INT8}; + tensor1.set_layout(layout1); + tensor2.set_layout(layout1); + tensor3.set_layout(layout1); + ASSERT_EQ(tensor2.get_tensor_total_size_in_byte(), 1 * 3 * 100 * 100); + ASSERT_EQ(tensor3.get_tensor_total_size_in_byte(), 1 * 3 * 100 * 100); + auto layout2 = TensorHelper::implement(&tensor2) + ->cast_final_safe() + .host_tensor() + ->layout(); + auto layout3 = TensorHelper::implement(&tensor3) + ->cast_final_safe() + .host_tensor() + ->layout(); + ASSERT_EQ(to_lite_layout(layout2), layout1); + ASSERT_EQ(to_lite_layout(layout3), layout1); + + auto new_ptr2 = tensor2.get_memory_ptr(); + auto new_ptr3 = tensor3.get_memory_ptr(); + + ASSERT_EQ(old_ptr2, new_ptr2); + ASSERT_EQ(old_ptr3, new_ptr3); +} + +TEST(TestTensor, Reset) { + Layout layout{{3, 20}, 2, LiteDataType::LITE_FLOAT}; + Tensor tensor1; + Tensor tensor2(LiteDeviceType::LITE_CPU, layout); + Tensor tensor3(LiteDeviceType::LITE_CPU, layout); + + auto old_ptr2 = tensor2.get_memory_ptr(); + auto old_ptr3 = tensor3.get_memory_ptr(); + //! make sure memory is allocted + ASSERT_NO_THROW(memcpy(old_ptr2, old_ptr3, 3 * 20 * 2)); + + std::shared_ptr new_ptr2(new float[3 * 20], + [](float* ptr) { delete[] ptr; }); + std::shared_ptr new_ptr3(new float[3 * 20], + [](float* ptr) { delete[] ptr; }); + tensor1.reset(new_ptr2.get(), layout); + tensor2.reset(new_ptr2.get(), 3 * 20 * 4); + tensor3.reset(new_ptr3.get(), 3 * 20 * 4); + //! After reset the original mem is freed + /*ASSERT_EXIT((memcpy(old_ptr2, old_ptr3, 3 * 20 * 2), exit(0)), + ::testing::KilledBySignal(SIGSEGV), ".*");*/ + + ASSERT_EQ(tensor2.get_memory_ptr(), new_ptr2.get()); + ASSERT_EQ(tensor3.get_memory_ptr(), new_ptr3.get()); + + ASSERT_NO_THROW(memcpy(new_ptr2.get(), new_ptr3.get(), 3 * 20 * 2)); + + Layout layout1{{6, 20}, 2, LiteDataType::LITE_FLOAT}; + std::shared_ptr ptr2(new float[6 * 20], + [](float* ptr) { delete[] ptr; }); + std::shared_ptr ptr3(new float[6 * 20], + [](float* ptr) { delete[] ptr; }); + tensor2.reset(ptr2.get(), layout1); + tensor3.reset(ptr3.get(), layout1); + + //! 
memory is not freed by Tensor reset + ASSERT_NO_THROW(memcpy(new_ptr2.get(), new_ptr3.get(), 3 * 20 * 2)); + auto host_layout2 = TensorHelper::implement(&tensor2) + ->cast_final_safe() + .host_tensor() + ->layout(); + auto host_layout3 = TensorHelper::implement(&tensor3) + ->cast_final_safe() + .host_tensor() + ->layout(); + + ASSERT_EQ(to_lite_layout(host_layout2), layout1); + ASSERT_EQ(to_lite_layout(host_layout3), layout1); +} + +TEST(TestTensor, CrossCNCopy) { + Layout layout{{1, 3, 224, 224}, 4}; + Tensor tensor1(LiteDeviceType::LITE_CPU); + Tensor tensor2(LiteDeviceType::LITE_CPU, layout); + Tensor tensor3(LiteDeviceType::LITE_CPU, layout); + tensor2.copy_from(tensor3); + tensor3.copy_from(tensor2); + auto old_ptr2 = tensor2.get_memory_ptr(); + auto old_ptr3 = tensor3.get_memory_ptr(); + + //! test source tenor is empty + ASSERT_THROW(tensor2.copy_from(tensor1), std::exception); + tensor1.copy_from(tensor2); + tensor2.copy_from(tensor3); + tensor3.copy_from(tensor2); + + ASSERT_EQ(tensor2.get_memory_ptr(), old_ptr2); + ASSERT_EQ(tensor3.get_memory_ptr(), old_ptr3); +} + +TEST(TestTensor, SharedTensorMemory) { + Layout layout{{1, 3, 224, 224}, 4}; + Tensor tensor1(LiteDeviceType::LITE_CPU); + { + Tensor tensor2(LiteDeviceType::LITE_CPU, layout); + tensor1.share_memory_with(tensor2); + auto ptr1 = tensor1.get_memory_ptr(); + auto ptr2 = tensor2.get_memory_ptr(); + ASSERT_EQ(ptr1, ptr2); + } + // check after tensor2 destroy, tensor1 can also visit + auto ptr1 = static_cast(tensor1.get_memory_ptr()); + size_t length = tensor1.get_tensor_total_size_in_byte() / + tensor1.get_layout().get_elem_size(); + for (size_t i = 0; i < length; i++) { + ptr1[i] = i; + } +} + +TEST(TestTensor, Reshape) { + Layout layout{{1, 3, 224, 224}, 4}; + Tensor tensor2(LiteDeviceType::LITE_CPU, layout); + auto ptr = tensor2.get_memory_ptr(); + + //! test wrong case + ASSERT_THROW(tensor2.reshape({-1, -1, 3 * 224 * 224}), std::exception); + ASSERT_THROW(tensor2.reshape({-1, 3, 3 * 224 * 224}), std::exception); + ASSERT_THROW(tensor2.reshape({1, 3, 3 * 224 * 224}), std::exception); + ASSERT_THROW(tensor2.reshape({3, 3, 3 * 224 * 224}), std::exception); + + tensor2.reshape({3 * 224 * 224}); + ASSERT_EQ(tensor2.get_layout().ndim, 1); + ASSERT_EQ(tensor2.get_layout().data_type, LiteDataType::LITE_FLOAT); + ASSERT_EQ(tensor2.get_layout().shapes[0], 3 * 224 * 224); + tensor2.reshape({-1, 224, 224}); + ASSERT_EQ(tensor2.get_layout().ndim, 3); + ASSERT_EQ(tensor2.get_layout().shapes[0], 3); + ASSERT_EQ(tensor2.get_layout().shapes[1], 224); + + ASSERT_EQ(tensor2.get_memory_ptr(), ptr); +} + +TEST(TestTensor, Slice) { + Layout layout{{20, 20}, 2}; + Tensor tensor2(LiteDeviceType::LITE_CPU, layout); + auto ptr = tensor2.get_memory_ptr(); + + //! 
test source tenor is empty + ASSERT_THROW(tensor2.slice({5, 10, 10}, {10, 15}), std::exception); + ASSERT_THROW(tensor2.slice({5, 10}, {10, 15}, {5}), std::exception); + ASSERT_THROW(tensor2.slice({5, 10}, {10, 15, 10}), std::exception); + for (int i = 0; i < 20 * 20; i++) { + *(static_cast(ptr) + i) = i; + } + auto check = [&](size_t start, size_t end, size_t step) { + Tensor tensor3; + tensor3.copy_from( + *tensor2.slice({start, start}, {end, end}, {step, step})); + float* new_ptr = static_cast(tensor3.get_memory_ptr()); + for (size_t i = start; i < end; i += step) { + for (size_t j = start; j < end; j += step) { + ASSERT_EQ(float(i * 20 + j), *new_ptr); + ++new_ptr; + } + } + }; + check(5, 10, 1); + check(5, 11, 2); + check(2, 18, 4); + + Tensor tensor3; + tensor3.copy_from(*tensor2.slice({3}, {9}, {2})); + float* new_ptr = static_cast(tensor3.get_memory_ptr()); + for (size_t i = 3; i < 9; i += 2) { + for (size_t j = 0; j < 20; j++) { + ASSERT_EQ(float(i * 20 + j), *new_ptr); + ++new_ptr; + } + } +} + +TEST(TestTensor, SliceCopy) { + Layout layout{{20, 20}, 2}; + Tensor tensor(LiteDeviceType::LITE_CPU, layout); + //! alloc memory + auto ptr = static_cast(tensor.get_memory_ptr()); + + Layout layout_slice{{20, 10}, 2}; + Tensor tensor0(LiteDeviceType::LITE_CPU, layout_slice); + auto ptr0 = tensor0.get_memory_ptr(); + for (int i = 0; i < 10 * 20; i++) { + *(static_cast(ptr0) + i) = i; + } + Tensor tensor1(LiteDeviceType::LITE_CPU, layout_slice); + auto ptr1 = tensor1.get_memory_ptr(); + for (int i = 0; i < 10 * 20; i++) { + *(static_cast(ptr1) + i) = i + 200; + } + + auto slice0 = tensor.slice({0, 0}, {20, 10}); + auto slice1 = tensor.slice({0, 10}, {20, 20}); + + slice0->copy_from(tensor0); + slice1->copy_from(tensor1); + + ASSERT_FALSE(slice0->is_continue_memory()); + ASSERT_FALSE(slice1->is_continue_memory()); + + for (size_t i = 0; i < 20; i++) { + for (size_t j = 0; j < 10; j++) { + ASSERT_EQ(float(i * 10 + j), *ptr); + ++ptr; + } + for (size_t j = 0; j < 10; j++) { + ASSERT_EQ(float(i * 10 + j + 200), *ptr); + ++ptr; + } + } + slice0->fill_zero(); + Tensor tmp; + tmp.copy_from(*slice0); + float* tmp_ptr = static_cast(tmp.get_memory_ptr()); + for (size_t i = 0; i < 20; i++) { + for (size_t j = 0; j < 10; j++) { + ASSERT_EQ(float(0), *tmp_ptr); + ++tmp_ptr; + } + } +} + +TEST(TestTensor, GetPtrOffset) { + Layout layout{{20, 20}, 2}; + Tensor tensor(LiteDeviceType::LITE_CPU, layout); + //! 
alloc memory + auto ptr = static_cast(tensor.get_memory_ptr()); + + auto ptr_offset = tensor.get_memory_ptr({10, 10}); + ASSERT_EQ(ptr_offset, ptr + 10 * 20 + 10); + + auto slice0 = tensor.slice({0, 0}, {20, 10}); + auto slice1 = tensor.slice({0, 10}, {20, 20}); + + ASSERT_FALSE(slice0->is_continue_memory()); + ASSERT_FALSE(slice1->is_continue_memory()); + + auto ptr_offset_slice0 = slice0->get_memory_ptr({6, 5}); + auto ptr_offset_slice1 = slice1->get_memory_ptr({2, 5}); + + ASSERT_EQ(ptr_offset_slice0, ptr + 6 * 20 + 5); + ASSERT_EQ(ptr_offset_slice1, ptr + 2 * 20 + 10 + 5); +} + +TEST(TestTensor, Concat) { + Layout layout{{5, 5, 5}, 3}; + std::vector tensors; + for (int i = 0; i < 4; i++) { + Tensor tensor(LiteDeviceType::LITE_CPU, layout); + auto ptr = static_cast(tensor.get_memory_ptr()); + for (int n = 0; n < 5 * 5 * 5; n++) { + ptr[n] = i; + } + tensors.push_back(tensor); + } + auto check = [&](int dim) { + auto new_tensor = TensorUtils::concat(tensors, dim); + auto ptr = static_cast(new_tensor->get_memory_ptr()); + size_t stride = std::pow(5, (3 - dim)); + for (int i = 0; i < 4; i++) { + for (size_t j = 0; j < stride; j++) { + ASSERT_EQ(ptr[i * stride + j], i); + } + } + }; + check(0); + check(1); + check(2); +} + +#if LITE_WITH_CUDA +TEST(TestTensor, BasicDevice) { + Layout layout{{1, 3, 224, 224}, 4}; + Tensor tensor1(LiteDeviceType::LITE_CUDA, layout); + Tensor tensor2(LiteDeviceType::LITE_CPU, layout); + //! mge tensor has created + ASSERT_TRUE(TensorHelper::implement(&tensor1)); + ASSERT_TRUE(TensorHelper::implement(&tensor2)); + + //! check member + ASSERT_EQ(tensor1.get_device_type(), LiteDeviceType::LITE_CUDA); + ASSERT_EQ(tensor2.get_device_type(), LiteDeviceType::LITE_CPU); + ASSERT_EQ(tensor2.get_layout(), layout); + //! check the real tensor + ASSERT_EQ(tensor1.get_tensor_total_size_in_byte(), 1 * 3 * 224 * 224 * 4); + ASSERT_EQ(tensor2.get_tensor_total_size_in_byte(), 1 * 3 * 224 * 224 * 4); + + ASSERT_TRUE(TensorHelper::implement(&tensor2) + ->cast_final_safe() + .host_tensor()); + + ASSERT_FALSE(TensorHelper::implement(&tensor2) + ->cast_final_safe() + .dev_tensor()); + ASSERT_TRUE(TensorHelper::implement(&tensor1) + ->cast_final_safe() + .dev_tensor()); + ASSERT_FALSE(TensorHelper::implement(&tensor1) + ->cast_final_safe() + .host_tensor()); +} + +TEST(TestTensor, SetLayoutReAllocDevice) { + Layout layout{{1, 3, 224, 224}, 4}; + Tensor tensor2(LiteDeviceType::LITE_CUDA, layout); + auto old_ptr2 = tensor2.get_memory_ptr(); + + //! 
layout set through + Layout layout1{{1, 3, 100, 100}, 4, LiteDataType::LITE_INT8}; + tensor2.set_layout(layout1); + ASSERT_EQ(tensor2.get_tensor_total_size_in_byte(), 1 * 3 * 100 * 100); + auto layout2 = TensorHelper::implement(&tensor2) + ->cast_final_safe() + .dev_tensor() + ->layout(); + ASSERT_EQ(to_lite_layout(layout2), layout1); + + auto new_ptr2 = tensor2.get_memory_ptr(); + + ASSERT_EQ(old_ptr2, new_ptr2); +} + +TEST(TestTensor, CrossCNCopyDevice) { + Layout layout{{1, 3, 224, 224}, 4}; + Tensor tensor0; + Tensor tensor1(LiteDeviceType::LITE_CPU); + Tensor tensor2(LiteDeviceType::LITE_CPU, layout); + Tensor tensor3(LiteDeviceType::LITE_CUDA, layout); + + tensor2.copy_from(tensor3); + tensor3.copy_from(tensor2); + + auto old_ptr2 = tensor2.get_memory_ptr(); + auto old_ptr3 = tensor3.get_memory_ptr(); + ASSERT_THROW(tensor3.copy_from(tensor1), std::exception); + + tensor1.copy_from(tensor3); + tensor0.copy_from(tensor3); + + tensor2.copy_from(tensor3); + tensor3.copy_from(tensor2); + + ASSERT_EQ(tensor2.get_memory_ptr(), old_ptr2); + ASSERT_EQ(tensor3.get_memory_ptr(), old_ptr3); +} + +TEST(TestTensor, PinnedHostMem) { + Layout layout{{1, 3, 224, 224}, 4}; + Tensor tensor1(LiteDeviceType::LITE_CPU); + bool is_pinned_host = true; + Tensor tensor2(LiteDeviceType::LITE_CUDA, layout, is_pinned_host); + Tensor tensor3(LiteDeviceType::LITE_CUDA, layout); + tensor2.copy_from(tensor3); + tensor3.copy_from(tensor2); + + ASSERT_EQ(tensor2.is_pinned_host(), true); + ASSERT_EQ(tensor3.is_pinned_host(), false); + + auto old_ptr2 = tensor2.get_memory_ptr(); + auto old_ptr3 = tensor3.get_memory_ptr(); + + //! test source tenor is empty + ASSERT_THROW(tensor2.copy_from(tensor1), std::exception); + tensor1.copy_from(tensor2); + tensor2.copy_from(tensor3); + tensor3.copy_from(tensor2); + + ASSERT_EQ(tensor2.get_memory_ptr(), old_ptr2); + ASSERT_EQ(tensor3.get_memory_ptr(), old_ptr3); +} + +TEST(TestTensor, DeviceId) { + if(get_device_count(LITE_CUDA) <= 1) + return; + Layout layout{{1, 3, 224, 224}, 4}; + Tensor tensor2(0, LiteDeviceType::LITE_CUDA, layout); + Tensor tensor3(1, LiteDeviceType::LITE_CUDA, layout); + + tensor2.copy_from(tensor3); + tensor3.copy_from(tensor2); + + Tensor tensor1; + tensor1.copy_from(tensor2); + tensor1.copy_from(tensor3); +} + +TEST(TestTensor, SliceDevice) { + Layout layout{{20, 20}, 2}; + Tensor host_tensor0; + Tensor dev_tensor0(LiteDeviceType::LITE_CUDA, layout); + host_tensor0.copy_from(dev_tensor0); + auto ptr = host_tensor0.get_memory_ptr(); + + for (int i = 0; i < 20 * 20; i++) { + *(static_cast(ptr) + i) = i; + } + dev_tensor0.copy_from(host_tensor0); + + auto check = [&](size_t start, size_t end, size_t step) { + Tensor host_tensor; + host_tensor.copy_from( + *dev_tensor0.slice({start, start}, {end, end}, {step, step})); + float* new_ptr = static_cast(host_tensor.get_memory_ptr()); + for (size_t i = start; i < end; i += step) { + for (size_t j = start; j < end; j += step) { + ASSERT_EQ(float(i * 20 + j), *new_ptr); + ++new_ptr; + } + } + }; + check(5, 10, 1); + check(5, 11, 2); + check(2, 18, 4); +} + +TEST(TestTensor, MemSetDevice) { + Layout layout{{20, 20}, 2, LiteDataType::LITE_INT8}; + Tensor host_tensor0(LiteDeviceType::LITE_CPU, layout); + Tensor dev_tensor0(LiteDeviceType::LITE_CUDA, layout); + auto check = [&](uint8_t val, const Tensor& tensor) { + auto ptr = static_cast(tensor.get_memory_ptr()); + for (int i = 0; i < 20 * 20; i++) { + ASSERT_EQ(val, *(ptr + i)); + } + }; + host_tensor0.fill_zero(); + check(0, host_tensor0); + + Tensor host_tensor1; + 
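+    //! zero the CUDA tensor on the device, then copy it back into a host
+    //! tensor so the values can be verified on the CPU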
dev_tensor0.fill_zero(); + host_tensor1.copy_from(dev_tensor0); + check(0, host_tensor1); +} + +TEST(TestTensor, DeviceSliceCopy) { + Layout layout{{20, 20}, 2}; + Tensor tensor(LiteDeviceType::LITE_CUDA, layout); + //! alloc memory + tensor.get_memory_ptr(); + + Layout layout_slice{{20, 10}, 2}; + Tensor tensor0(LiteDeviceType::LITE_CPU, layout_slice); + auto ptr0 = tensor0.get_memory_ptr(); + for (int i = 0; i < 10 * 20; i++) { + *(static_cast(ptr0) + i) = i; + } + Tensor tensor1(LiteDeviceType::LITE_CPU, layout_slice); + auto ptr1 = tensor1.get_memory_ptr(); + for (int i = 0; i < 10 * 20; i++) { + *(static_cast(ptr1) + i) = i + 200; + } + + auto slice0 = tensor.slice({0, 0}, {20, 10}); + auto slice1 = tensor.slice({0, 10}, {20, 20}); + + slice0->copy_from(tensor0); + slice1->copy_from(tensor1); + + ASSERT_FALSE(slice0->is_continue_memory()); + ASSERT_FALSE(slice1->is_continue_memory()); + + Tensor host_tensor; + host_tensor.copy_from(tensor); + auto ptr = static_cast(host_tensor.get_memory_ptr()); + + for (size_t i = 0; i < 20; i++) { + for (size_t j = 0; j < 10; j++) { + ASSERT_EQ(float(i * 10 + j), *ptr); + ++ptr; + } + for (size_t j = 0; j < 10; j++) { + ASSERT_EQ(float(i * 10 + j + 200), *ptr); + ++ptr; + } + } + slice0->fill_zero(); + Tensor tmp; + tmp.copy_from(*slice0); + float* tmp_ptr = static_cast(tmp.get_memory_ptr()); + for (size_t i = 0; i < 20; i++) { + for (size_t j = 0; j < 10; j++) { + ASSERT_EQ(float(0), *tmp_ptr); + ++tmp_ptr; + } + } +} + +TEST(TestTensor, ConcatDevice) { + Layout layout{{5, 5, 5}, 3}; + std::vector tensors; + for (int i = 0; i < 4; i++) { + Tensor tensor(LiteDeviceType::LITE_CPU, layout); + auto ptr = static_cast(tensor.get_memory_ptr()); + for (int n = 0; n < 5 * 5 * 5; n++) { + ptr[n] = i; + } + tensors.push_back(tensor); + } + auto check = [&](int dim) { + auto new_tensor = + TensorUtils::concat(tensors, dim, LiteDeviceType::LITE_CUDA, 0); + + Tensor tensor(LiteDeviceType::LITE_CPU); + tensor.copy_from(*new_tensor); + auto ptr = static_cast(tensor.get_memory_ptr()); + size_t stride = std::pow(5, (3 - dim)); + for (int i = 0; i < 4; i++) { + for (size_t j = 0; j < stride; j++) { + ASSERT_EQ(ptr[i * stride + j], i); + } + } + ASSERT_EQ(new_tensor->get_device_type(), LiteDeviceType::LITE_CUDA); + ASSERT_EQ(new_tensor->get_device_id(), 0); + }; + check(0); + check(1); + check(2); +} +#endif +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/test/test_tensor_c.cpp b/lite/test/test_tensor_c.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2c206043516ff36d38730451e2b5a989d0989d66 --- /dev/null +++ b/lite/test/test_tensor_c.cpp @@ -0,0 +1,316 @@ +/** + * \file test/test_tensor_c.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
+ */ + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "../src/misc.h" +#include "lite-c/global_c.h" +#include "lite-c/tensor_c.h" + +#include +#include + +TEST(TestCapiTensor, Basic) { + LiteTensor c_tensor0, c_tensor1; + LiteTensorDesc description = default_desc; + LITE_make_tensor(description, &c_tensor0); + int is_pinned_host = false; + LITE_is_pinned_host(c_tensor0, &is_pinned_host); + ASSERT_FALSE(is_pinned_host); + LiteDeviceType device_type; + LITE_get_tensor_device_type(c_tensor0, &device_type); + ASSERT_EQ(device_type, LiteDeviceType::LITE_CPU); + size_t length = 0; + LITE_get_tensor_total_size_in_byte(c_tensor0, &length); + ASSERT_EQ(length, 0); + + LiteLayout layout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT}; + description.device_type = LiteDeviceType::LITE_CPU; + description.layout = layout; + description.is_pinned_host = true; + LITE_make_tensor(description, &c_tensor1); + LITE_is_pinned_host(c_tensor1, &is_pinned_host); + ASSERT_TRUE(is_pinned_host); + LITE_get_tensor_total_size_in_byte(c_tensor1, &length); + ASSERT_EQ(length, 1 * 3 * 224 * 224 * 4); + + LiteLayout get_layout; + LITE_get_tensor_layout(c_tensor1, &get_layout); + ASSERT_EQ(get_layout.ndim, layout.ndim); + ASSERT_EQ(get_layout.data_type, layout.data_type); + ASSERT_EQ(get_layout.shapes[0], layout.shapes[0]); + ASSERT_EQ(get_layout.shapes[1], layout.shapes[1]); + ASSERT_EQ(get_layout.shapes[2], layout.shapes[2]); + ASSERT_EQ(get_layout.shapes[3], layout.shapes[3]); + + //! test error + ASSERT_EQ(LITE_is_pinned_host(c_tensor0, nullptr), -1); + ASSERT_NE(strlen(LITE_get_last_error()), 0); + printf("The last error is: %s\n", LITE_get_last_error()); + + LITE_destroy_tensor(c_tensor0); + LITE_destroy_tensor(c_tensor1); +} + +TEST(TestCapiTensor, SetLayoutReAlloc) { + LiteTensor c_tensor0; + LiteTensorDesc description = default_desc; + description.layout = + LiteLayout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT}; + LITE_make_tensor(description, &c_tensor0); + void *old_ptr, *new_ptr; + LITE_get_tensor_memory(c_tensor0, &old_ptr); + + LiteLayout new_layout = + LiteLayout{{1, 3, 100, 100}, 4, LiteDataType::LITE_INT8}; + LITE_set_tensor_layout(c_tensor0, new_layout); + LITE_get_tensor_memory(c_tensor0, &new_ptr); + + size_t length = 0; + LITE_get_tensor_total_size_in_byte(c_tensor0, &length); + + ASSERT_EQ(length, 1 * 3 * 100 * 100); + ASSERT_EQ(old_ptr, new_ptr); +} + +TEST(TestCapiTensor, Reset) { + LiteTensor c_tensor0, c_tensor1; + LiteTensorDesc description = default_desc; + description.layout = LiteLayout{{3, 20}, 2, LiteDataType::LITE_FLOAT}; + LITE_make_tensor(description, &c_tensor0); + LITE_make_tensor(description, &c_tensor1); + void *old_ptr0, *old_ptr1; + LITE_get_tensor_memory(c_tensor0, &old_ptr0); + LITE_get_tensor_memory(c_tensor1, &old_ptr1); + //! 
make sure memory is allocted + ASSERT_NO_THROW(memcpy(old_ptr0, old_ptr1, 3 * 20 * 4)); + + std::shared_ptr new_ptr0(new float[3 * 20], + [](float* ptr) { delete[] ptr; }); + std::shared_ptr new_ptr1(new float[3 * 20], + [](float* ptr) { delete[] ptr; }); + LITE_reset_tensor_memory(c_tensor0, new_ptr0.get(), 3 * 20 * 4); + LITE_reset_tensor_memory(c_tensor1, new_ptr1.get(), 3 * 20 * 4); + void *tmp_ptr0, *tmp_ptr1; + LITE_get_tensor_memory(c_tensor0, &tmp_ptr0); + LITE_get_tensor_memory(c_tensor1, &tmp_ptr1); + ASSERT_EQ(tmp_ptr0, new_ptr0.get()); + ASSERT_EQ(tmp_ptr1, new_ptr1.get()); + + ASSERT_NO_THROW(memcpy(new_ptr0.get(), new_ptr1.get(), 3 * 20 * 4)); + + LiteLayout layout1{{6, 20}, 2, LiteDataType::LITE_FLOAT}; + std::shared_ptr ptr2(new float[6 * 20], + [](float* ptr) { delete[] ptr; }); + std::shared_ptr ptr3(new float[6 * 20], + [](float* ptr) { delete[] ptr; }); + LITE_reset_tensor(c_tensor0, layout1, new_ptr0.get()); + LITE_reset_tensor(c_tensor1, layout1, new_ptr1.get()); + + //! memory is not freed by Tensor reset + ASSERT_NO_THROW(memcpy(new_ptr0.get(), new_ptr1.get(), 3 * 20 * 4)); + + LiteLayout tmp_layout0, tmp_layout1; + LITE_get_tensor_layout(c_tensor0, &tmp_layout0); + LITE_get_tensor_layout(c_tensor1, &tmp_layout1); + ASSERT_EQ(tmp_layout0.ndim, tmp_layout1.ndim); + ASSERT_EQ(tmp_layout0.data_type, tmp_layout1.data_type); + ASSERT_EQ(tmp_layout0.shapes[0], tmp_layout1.shapes[0]); + ASSERT_EQ(tmp_layout0.shapes[1], tmp_layout1.shapes[1]); + + LITE_destroy_tensor(c_tensor0); + LITE_destroy_tensor(c_tensor1); +} + +TEST(TestCapiTensor, CrossCNCopy) { + LiteTensor c_tensor0, c_tensor1, c_tensor2; + LiteTensorDesc description = default_desc; + LITE_make_tensor(description, &c_tensor0); + + description.layout = + LiteLayout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT}; + LITE_make_tensor(description, &c_tensor1); + LITE_make_tensor(description, &c_tensor2); + + LITE_tensor_copy(c_tensor1, c_tensor2); + LITE_tensor_copy(c_tensor2, c_tensor1); + void *old_ptr1, *old_ptr2, *new_ptr1, *new_ptr2; + LITE_get_tensor_memory(c_tensor1, &old_ptr1); + LITE_get_tensor_memory(c_tensor2, &old_ptr2); + + //! 
test source tenor is empty + ASSERT_EQ(LITE_tensor_copy(c_tensor1, c_tensor0), -1); + ASSERT_NE(strlen(LITE_get_last_error()), 0); + printf("The last error is: %s\n", LITE_get_last_error()); + + LITE_tensor_copy(c_tensor0, c_tensor1); + LITE_tensor_copy(c_tensor1, c_tensor2); + LITE_tensor_copy(c_tensor2, c_tensor0); + + LITE_get_tensor_memory(c_tensor1, &new_ptr1); + LITE_get_tensor_memory(c_tensor2, &new_ptr2); + + ASSERT_EQ(old_ptr1, new_ptr1); + ASSERT_EQ(old_ptr2, new_ptr2); + + LITE_destroy_tensor(c_tensor0); + LITE_destroy_tensor(c_tensor1); + LITE_destroy_tensor(c_tensor2); +} + +TEST(TestCapiTensor, ShareMemoryWith) { + LiteTensor c_tensor0, c_tensor1; + LiteTensorDesc description = default_desc; + LITE_make_tensor(description, &c_tensor0); + + description.layout = + LiteLayout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT}; + LITE_make_tensor(description, &c_tensor1); + + ASSERT_EQ(LITE_tensor_share_memory_with(c_tensor1, c_tensor0), -1); + LITE_tensor_share_memory_with(c_tensor0, c_tensor1); + void *ptr0, *ptr1; + LITE_get_tensor_memory(c_tensor0, &ptr0); + LITE_get_tensor_memory(c_tensor1, &ptr1); + + ASSERT_EQ(ptr0, ptr1); + + LITE_destroy_tensor(c_tensor0); + LITE_destroy_tensor(c_tensor1); +} + +TEST(TestCapiTensor, Reshape) { + LiteTensor c_tensor0; + LiteTensorDesc description = default_desc; + description.layout = + LiteLayout{{8, 8, 100, 100}, 4, LiteDataType::LITE_FLOAT}; + LITE_make_tensor(description, &c_tensor0); + void* old_ptr; + LITE_get_tensor_memory(c_tensor0, &old_ptr); + + auto check = [&](std::vector expect, const LiteTensor& tensor) { + LiteLayout get_layout; + LITE_get_tensor_layout(tensor, &get_layout); + ASSERT_EQ(get_layout.ndim, expect.size()); + for (size_t i = 0; i < expect.size(); i++) { + ASSERT_EQ(get_layout.shapes[i], expect[i]); + } + void* new_ptr; + LITE_get_tensor_memory(tensor, &new_ptr); + ASSERT_EQ(old_ptr, new_ptr); + }; + { + int shape[2] = {-1, 50}; + LITE_tensor_reshape(c_tensor0, shape, 2); + check({8 * 8 * 100 * 2, 50}, c_tensor0); + } + { + int shape[3] = {64, 100, 100}; + LITE_tensor_reshape(c_tensor0, shape, 3); + check({8 * 8, 100, 100}, c_tensor0); + } + { + int shape[3] = {16, 100, -1}; + LITE_tensor_reshape(c_tensor0, shape, 3); + check({16, 100, 400}, c_tensor0); + } + LITE_destroy_tensor(c_tensor0); +} + +TEST(TestCapiTensor, Slice) { + LiteTensor c_tensor0; + LiteTensorDesc description = default_desc; + description.layout = LiteLayout{{20, 20}, 2, LiteDataType::LITE_FLOAT}; + LITE_make_tensor(description, &c_tensor0); + void* old_ptr; + LITE_get_tensor_memory(c_tensor0, &old_ptr); + for (size_t i = 0; i < 20 * 20; i++) { + *(static_cast(old_ptr) + i) = i; + } + auto check = [&](size_t start, size_t end, size_t step, bool have_step) { + LiteTensor tensor, slice_tensor; + LITE_make_tensor(default_desc, &tensor); + size_t start_ptr[2] = {start, start}; + size_t end_ptr[2] = {end, end}; + size_t step_ptr[2] = {step, step}; + + if (have_step) { + LITE_tensor_slice(c_tensor0, start_ptr, end_ptr, step_ptr, 2, + &slice_tensor); + } else { + LITE_tensor_slice(c_tensor0, start_ptr, end_ptr, nullptr, 2, + &slice_tensor); + } + int is_continue = true; + LITE_is_memory_continue(slice_tensor, &is_continue); + ASSERT_FALSE(is_continue); + + LITE_tensor_copy(tensor, slice_tensor); + void* new_ptr; + LITE_get_tensor_memory(tensor, &new_ptr); + float* ptr = static_cast(new_ptr); + for (size_t i = start; i < end; i += step) { + for (size_t j = start; j < end; j += step) { + ASSERT_EQ(float(i * 20 + j), *ptr); + ++ptr; + } + } + 
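+        //! free the temporary tensor that received the contiguous copy of the slice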
LITE_destroy_tensor(tensor); + }; + check(1, 8, 1, true); + check(1, 8, 1, false); + check(2, 10, 2, true); + check(10, 18, 4, true); + check(10, 18, 1, false); + LITE_destroy_tensor(c_tensor0); +} + +TEST(TestCapiTensor, Memset) { + LiteTensor c_tensor0; + LiteTensorDesc description = default_desc; + description.layout = LiteLayout{{20, 20}, 2, LiteDataType::LITE_FLOAT}; + LITE_make_tensor(description, &c_tensor0); + void* ptr; + uint8_t* uint8_ptr; + LITE_get_tensor_memory(c_tensor0, &ptr); + LITE_tensor_fill_zero(c_tensor0); + uint8_ptr = static_cast(ptr); + for (size_t i = 0; i < 20 * 20; i++) { + ASSERT_EQ(0, *uint8_ptr); + uint8_ptr++; + } + + LITE_destroy_tensor(c_tensor0); +} + +TEST(TestCapiTensor, GetMemoryByIndex) { + LiteTensor c_tensor0; + LiteTensorDesc description = default_desc; + description.layout = LiteLayout{{20, 20}, 2, LiteDataType::LITE_FLOAT}; + LITE_make_tensor(description, &c_tensor0); + void *ptr0, *ptr1, *ptr2, *ptr3; + LITE_get_tensor_memory(c_tensor0, &ptr0); + size_t index0[] = {3, 4}; + LITE_get_tensor_memory_with_index(c_tensor0, &index0[0], 2, &ptr1); + size_t index1[] = {5, 7}; + LITE_get_tensor_memory_with_index(c_tensor0, &index1[0], 2, &ptr2); + size_t index2[] = {5}; + LITE_get_tensor_memory_with_index(c_tensor0, &index2[0], 1, &ptr3); + + ASSERT_EQ(ptr1, static_cast(ptr0) + 3 * 20 + 4); + ASSERT_EQ(ptr2, static_cast(ptr0) + 5 * 20 + 7); + ASSERT_EQ(ptr3, static_cast(ptr0) + 5 * 20); + + LITE_destroy_tensor(c_tensor0); +} + +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/tools/aes_encrypt.sh b/lite/tools/aes_encrypt.sh new file mode 100755 index 0000000000000000000000000000000000000000..37900e1a25ae1e4ae69ed76b161fe064cec3b5b8 --- /dev/null +++ b/lite/tools/aes_encrypt.sh @@ -0,0 +1,26 @@ +#! /bin/bash -e +set -e + +if [ $# -lt 2 ] ; then +echo "USAGE: $0 src dst" +echo " e.g.: $0 ~/xxx.mdl ~/xxx.encrypted.mdl" +echo " e.g.: $0 ~/xxx.mdl ~/xxx.encrypted.mdl key" +exit 1; +fi + +IV=`openssl rand -hex 16` + +Key=000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F +if [ $# == 3 ] ; then +Key=$3 +fi + +# get file size +size=`wc -c $1` + +echo "encrypt aes-256-cbc ..." +openssl enc -e -aes-256-cbc -in $1 -out $1.tmp -K $Key -iv $IV +echo $IV | xxd -r -p | cat - $1.tmp > $2 +# write size into file +printf "%016x" ${size%\ *} | xxd -r -p >> $2 +rm -f $1.tmp diff --git a/lite/tools/dump_model_mgb.py b/lite/tools/dump_model_mgb.py new file mode 100755 index 0000000000000000000000000000000000000000..0f34d1b8c4dac9f14fbb46d3862720ad480a7e6b --- /dev/null +++ b/lite/tools/dump_model_mgb.py @@ -0,0 +1,134 @@ +#!/usr/bin/env mdl +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2020-2021 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
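+#
+# Dump a pickled megskull/MegBrain model to a serialized MegBrain graph file,
+# optionally applying the inference optimizations selected by the flags below.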
+ +from megskull.graph import NodeFilter, FpropEnv +from megskull.opr.all import AssertEqual, DataProvider, BatchNormalization +from megskull.utils.logconf import get_logger +from meghair.utils import io +import megbrain as mgb + +import argparse +import struct +import re +import os + +import numpy as np +import cv2 + +logger = get_logger(__name__) + +def optimize_for_inference(args, outputs): + args_map = { + 'enable_io16xc32': 'f16_io_f32_comp', + 'enable_ioc16': 'f16_io_comp', + 'enable_hwcd4': 'use_nhwcd4', + 'enable_nchw4': 'use_nchw4', + 'enable_nchw88': 'use_nchw88', + 'enable_nchw44': 'use_nchw44', + 'enable_nchw44_dot': 'use_nchw44_dot', + 'enable_nchw32': 'use_nchw32', + 'enable_chwn4': 'use_chwn4', + 'enable_fuse_conv_bias_nonlinearity': 'fuse_conv_bias_nonlinearity', + 'enable_fuse_conv_bias_with_z': 'fuse_conv_bias_with_z', + } + kwargs = {} + for k, v in args_map.items(): + if getattr(args, k): + assert args.optimize_for_inference, ( + 'optimize_for_inference should be set when {} is given'.format( + k)) + kwargs[v] = True + + if args.optimize_for_inference: + return mgb.optimize_for_inference(outputs, **kwargs) + + return outputs + +def main(): + parser = argparse.ArgumentParser( + description='Dump the Python Megbrain model to C++ model, by the way ' + 'optimizing for inference', + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument('input', help='input pkl model file ') + parser.add_argument('-o', '--output', help='output file', required=True) + parser.add_argument('--init-bn', action='store_true', + help='initialize untrained batch-normalization, to ' + 'avoid NaN or Inf results') + parser.add_argument('--silent', action='store_true', + help='set verbose to False in AssertEqual opr') + parser.add_argument('--optimize-for-inference', action='store_true', + help='enbale optimization for inference') + parser.add_argument('--discard-var-name', action='store_true', + help='discard variable and param names in the ' + 'generated output') + parser.add_argument('--output-strip-info', action='store_true', + help='output code strip information') + parser.add_argument('--enable-io16xc32', action='store_true', + help='transform the mode to float16 io float32 compute') + parser.add_argument('--enable-ioc16', action='store_true', + help='transform the dtype of the model to float16 io ' + 'and compute') + parser.add_argument('--enable-fuse-conv-bias-nonlinearity', + action='store_true', + help='fuse convolution bias and nonlinearity opr to a ' + 'conv_bias opr and compute') + parser.add_argument('--enable-hwcd4', action='store_true', + help='transform the model format from NCHW to NHWCD4 ' + 'for inference; you may need to disable CUDA and set ' + 'MGB_USE_MEGDNN_DBG=2') + parser.add_argument('--enable-nchw4', action='store_true', + help='transform the model format from NCHW to NCHW4 ' + 'for inference') + parser.add_argument('--enable-nchw88', action='store_true', + help='transform the model format from NCHW to NCHW88 ' + 'for inference') + parser.add_argument('--enable-nchw44', action='store_true', + help='transform the model format from NCHW to NCHW44 ' + 'for inference') + parser.add_argument('--enable-nchw44-dot', action='store_true', + help='transform the model format from NCHW to NCHW44_DOT ' + 'for optimizing armv8.2 dot in inference') + parser.add_argument('--enable-chwn4', action='store_true', + help='transform the model format to CHWN4 ' + 'for inference, mainly used for nvidia tensorcore') + parser.add_argument('--enable-nchw32', 
action='store_true', + help='transform the model format from NCHW4 to NCHW32 ' + 'for inference on nvidia TensoCore') + parser.add_argument('--enable-fuse-conv-bias-with-z', action='store_true', + help='fuse conv_bias with z input for inference on ' + 'nvidia GPU (this optimization pass will result in mismatch ' + 'of the precision of output of training and inference)') + args = parser.parse_args() + + env = FpropEnv(verbose_fprop=False) + + + outputs = io.load_network(args.input).outputs + + output_mgbvars = list(map(env.get_mgbvar, outputs)) + + output_mgbvars = optimize_for_inference(args, output_mgbvars) + + if args.discard_var_name: + sereg_kwargs = dict(keep_var_name=0, keep_param_name=False) + else: + sereg_kwargs = dict(keep_var_name=2, keep_param_name=True) + + stat = mgb.serialize_comp_graph_to_file( + args.output, output_mgbvars, append=False, + output_strip_info=args.output_strip_info, + **sereg_kwargs) + logger.info('graph dump sizes: tot_size={:.3f}KiB overhead={:.3f}KiB'. + format(stat.tot_bytes / 1024, + (stat.tot_bytes - stat.tensor_value_bytes) / 1024)) + +if __name__ == '__main__': + main() diff --git a/lite/tools/pack_model/encrypt_info_and_model.sh b/lite/tools/pack_model/encrypt_info_and_model.sh new file mode 100755 index 0000000000000000000000000000000000000000..b1e18fa519045077acc876d4ed5c9cceb32ea501 --- /dev/null +++ b/lite/tools/pack_model/encrypt_info_and_model.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +set -e + +function usage() { + echo "$0 args1 args2 .." + echo "available args detail:" + echo "-i info.json : input info.json file" + echo "-m model: model name" + echo "-e encryption mode: encryption mode rc4 encrypt_predefined_rc4 " + echo "-o output name: output name" + echo "-n input model name: input model name match with info.json" + echo "-h : show usage" + exit -1 +} + +while getopts "i:m:e:o:n:h" arg +do + case $arg in + i) + INFO_NAME=$OPTARG + ;; + m) + MODEL_NAME=$OPTARG + ;; + n) + INPUT_MODEL_NAME=$OPTARG + ;; + e) + ENCRYPT_MODE=$OPTARG + ;; + o) + OUTPUT_NAME=$OPTARG + ;; + h) + usage + ;; + \?) + echo "show usage" + usage + ;; + esac +done +echo "----------------------------------------------------" +echo "commad args summary:" +echo "INFO_NAME: $INFO_NAME" +echo "MODEL_NAME: $MODEL_NAME" +echo "ENCRYPT_MODE: $ENCRYPT_MODE" +echo "OUTPUT_NAME: $OUTPUT_NAME" +echo "INPUT_MODEL_NAME: $INPUT_MODEL_NAME" +echo "----------------------------------------------------" + +if [[ $INFO_NAME == '' ]]; then + echo "INFO_NAME is NULL,exit now..." + exit -1 +fi +if [[ $MODEL_NAME == '' ]]; then + echo "MODEL_NAME is NULL,exit now..." + exit -1 +fi +if [[ $INPUT_MODEL_NAME == '' ]]; then + echo "INPUT_MODEL_NAME is NULL,exit now..." + exit -1 +fi +if [[ $OUTPUT_NAME == '' ]]; then + echo "OUTPUT_NAME is NULL,exit now..." 
+ exit -1 +fi +ENCRYPT_INFO_NAME=$INFO_NAME.pr_rc4.emod +ENCRYPT_MODEL_NAME=$MODEL_NAME.pr_rc4.emod +./rc4_encryptor $ENCRYPT_MODE $INFO_NAME $INFO_NAME.pr_rc4.emod +./rc4_encryptor $ENCRYPT_MODE $MODEL_NAME $MODEL_NAME.pr_rc4.emod + + +ENCRYPT_INFO_NAME=$INFO_NAME.pr_rc4.emod +python3 pack_model_and_info.py --input-model=$ENCRYPT_MODEL_NAME --model-name=$INPUT_MODEL_NAME --model-cryption="RC4_default" --info-cryption="RC4_default" --input-info=$ENCRYPT_INFO_NAME --info-parser="LITE_default" -o $OUTPUT_NAME diff --git a/lite/tools/pack_model/pack_model_and_info.py b/lite/tools/pack_model/pack_model_and_info.py new file mode 100644 index 0000000000000000000000000000000000000000..06378f05f06a0491c98e6111a787920e47e96930 --- /dev/null +++ b/lite/tools/pack_model/pack_model_and_info.py @@ -0,0 +1,135 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +# +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + +import argparse +import struct +import os +import subprocess + +import flatbuffers + +def generate_flatbuffer(): + status, path = subprocess.getstatusoutput('which flatc') + if not status: + cwd = os.path.dirname(os.path.dirname(__file__)) + fbs_file = os.path.abspath(os.path.join(cwd, + "../../src/parse_model/pack_model.fbs")) + cmd = path + ' -p -b '+fbs_file + ret, _ = subprocess.getstatusoutput(str(cmd)) + if ret: + raise Exception("flatc generate error!") + else: + raise Exception('no flatc in current environment, please build flatc ' + 'and put in the system PATH!') + +def main(): + parser = argparse.ArgumentParser( + description='load a encrypted or not encrypted model and a ' + 'json format of the infomation of the model, pack them to a file ' + 'which can be loaded by lite.') + parser.add_argument('--input-model', help='input a encrypted or not encrypted model') + parser.add_argument('--input-info', help='input a encrypted or not encrypted ' + 'json format file.') + parser.add_argument('--model-name', help='the model name, this must match ' + 'with the model name in model info', default = 'NONE') + parser.add_argument('--model-cryption', help='the model encryption method ' + 'name, this is used to find the right decryption method. e.g. ' + '--model_cryption = "AES_default", default is NONE.', default = + 'NONE') + parser.add_argument('--info-cryption', help='the info encryption method ' + 'name, this is used to find the right decryption method. e.g. ' + '--model_cryption = "AES_default", default is NONE.', default = + 'NONE') + parser.add_argument('--info-parser', help='The information parse method name ' + 'default is "LITE_default". ', default = 'LITE_default') + parser.add_argument('--append', '-a', help='append another model to a ' + 'packed model.') + parser.add_argument('--output', '-o', help='output file of packed model.') + + args = parser.parse_args() + + generate_flatbuffer() + assert not args.append, ('--append is not support yet') + assert args.input_model, ('--input_model must be given') + with open(args.input_model, 'rb') as fin: + raw_model = fin.read() + + model_length = len(raw_model) + + if args.input_info: + with open(args.input_info, 'rb') as fin: + raw_info = fin.read() + info_length = len(raw_info) + else: + raw_info = None + info_length = 0 + + # Generated by `flatc`. 
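+    # (the `model_parse` package is produced by the generate_flatbuffer() call
+    #  above, which runs `flatc -p` on pack_model.fbs, so this import only
+    #  works after that step has succeeded)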
+    from model_parse import Model, ModelData, ModelHeader, ModelInfo, PackModel
+
+    builder = flatbuffers.Builder(1024)
+
+    model_name = builder.CreateString(args.model_name)
+    model_cryption = builder.CreateString(args.model_cryption)
+    info_cryption = builder.CreateString(args.info_cryption)
+    info_parser = builder.CreateString(args.info_parser)
+
+    # byte vectors must be created before the tables that reference them
+    info_data = builder.CreateByteVector(raw_info) if raw_info is not None else None
+    arr_data = builder.CreateByteVector(raw_model)
+
+    # model header
+    ModelHeader.ModelHeaderStart(builder)
+    ModelHeader.ModelHeaderAddName(builder, model_name)
+    ModelHeader.ModelHeaderAddModelDecryptionMethod(builder, model_cryption)
+    ModelHeader.ModelHeaderAddInfoDecryptionMethod(builder, info_cryption)
+    ModelHeader.ModelHeaderAddInfoParseMethod(builder, info_parser)
+    model_header = ModelHeader.ModelHeaderEnd(builder)
+
+    # model info
+    ModelInfo.ModelInfoStart(builder)
+    if info_data is not None:
+        ModelInfo.ModelInfoAddData(builder, info_data)
+    model_info = ModelInfo.ModelInfoEnd(builder)
+
+    # model data
+    ModelData.ModelDataStart(builder)
+    ModelData.ModelDataAddData(builder, arr_data)
+    model_data = ModelData.ModelDataEnd(builder)
+
+    Model.ModelStart(builder)
+    Model.ModelAddHeader(builder, model_header)
+    Model.ModelAddData(builder, model_data)
+    Model.ModelAddInfo(builder, model_info)
+    model = Model.ModelEnd(builder)
+
+    PackModel.PackModelStartModelsVector(builder, 1)
+    builder.PrependUOffsetTRelative(model)
+    models = builder.EndVector(1)
+
+    PackModel.PackModelStart(builder)
+    PackModel.PackModelAddModels(builder, models)
+    packed_model = PackModel.PackModelEnd(builder)
+
+    builder.Finish(packed_model)
+    buff = builder.Output()
+
+    # prefix the flatbuffer output with the "packed_model" tag
+    result = struct.pack(str(len("packed_model")) + 's', "packed_model".encode('ascii'))
+    result += buff
+
+    assert args.output, ('--output must be given')
+    with open(args.output, 'wb') as fout:
+        fout.write(result)
+
+    print("Model packed successfully!")
+    print("model name is: {}.".format(args.model_name))
+    print("model encryption method is: {}.".format(args.model_cryption))
+    print("model json information encryption method is: {}.".format(args.info_cryption))
+    print("model json information parse method is: {}.".format(args.info_parser))
+    print("packed model is written to {}".format(args.output))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/lite/tools/rc4_encrypt.cpp b/lite/tools/rc4_encrypt.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..75d3ff9c9ffa03bb4890e5d9acccb38e5b448207
--- /dev/null
+++ b/lite/tools/rc4_encrypt.cpp
@@ -0,0 +1,211 @@
+/** \file tools/rc4_encrypt.cpp
+ *
+ * This file is part of MegEngine, a deep learning framework developed by
+ * Megvii.
+ *
+ * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "../src/decryption/rc4/rc4_cryption_base.h"
+#include "../src/decryption/rc4_cryption.h"
+
+using namespace lite;
+
+// read a whole file into a self-freeing heap buffer and report its size
+std::shared_ptr<void> read_file(std::string file_path, size_t& size) {
+    FILE* fin = fopen(file_path.c_str(), "rb");
+    if (!fin) {
+        printf("failed to open %s.\n", file_path.c_str());
+        exit(1);
+    }
+    fseek(fin, 0, SEEK_END);
+    size = ftell(fin);
+    fseek(fin, 0, SEEK_SET);
+    void* ptr = malloc(size);
+    std::shared_ptr<void> buf{ptr, ::free};
+    fread(buf.get(), 1, size, fin);
+    fclose(fin);
+    return buf;
+}
+
+void write_file(std::string file_path, const std::vector<uint8_t>& data) {
+    FILE* fout = fopen(file_path.c_str(), "wb");
+    if (!fout) {
+        printf("failed to open %s.\n", file_path.c_str());
+        exit(1);
+    }
+    fwrite(data.data(), 1, data.size(), fout);
+    fclose(fout);
+}
+
+typedef int (*CommandHandler)(int, char**);
+
+const char* usage =
+        "Usage:\n"
+        "  rc4_encryptor encrypt_predefined_rc4 <input file> <output file>\n"
+        "  rc4_encryptor encrypt_rc4 <hash key> <enc key> <input file> <output file>\n"
+        "  rc4_encryptor encrypt_predefined_sfrc4 <input file> <output file>\n"
+        "  rc4_encryptor encrypt_sfrc4 <hash key> <enc key> <input file> <output file>\n"
+        "  rc4_encryptor hash <input file>\n";
+
+int command_encrypt_predefined_rc4(int argc, char** argv) {
+    if (argc != 4) {
+        printf("Invalid encrypt_predefined_rc4 arguments.\n");
+        return 1;
+    }
+
+    const char* input_file_path = argv[2];
+    const char* output_file_path = argv[3];
+
+    size_t size = 0;
+    auto keys = RC4::get_decrypt_key();
+    auto input = read_file(input_file_path, size);
+    printf("Reading input file ...\n");
+    auto output = RC4::encrypt_model(input.get(), size, keys);
+
+    write_file(output_file_path, output);
+
+    printf("Done.\n");
+    return 0;
+}
+
+int command_encrypt_rc4(int argc, char** argv) {
+    if (argc != 6) {
+        printf("Invalid encrypt_rc4 arguments.\n");
+        return 1;
+    }
+
+    uint64_t hash_key = std::stoull(argv[2], 0, 0);
+    uint64_t enc_key = std::stoull(argv[3], 0, 0);
+    const char* input_file_path = argv[4];
+    const char* output_file_path = argv[5];
+
+    // first 8 bytes hold the hash key, the next 8 bytes the encryption key
+    std::vector<uint8_t> keys(128, 0);
+    uint64_t* data = reinterpret_cast<uint64_t*>(keys.data());
+    data[0] = hash_key;
+    data[1] = enc_key;
+
+    size_t size = 0;
+    auto input = read_file(input_file_path, size);
+    printf("Reading input file ...\n");
+    auto output = RC4::encrypt_model(input.get(), size, keys);
+
+    printf("Encrypting ...\n");
+    write_file(output_file_path, output);
+
+    printf("Done.\n");
+    return 0;
+}
+
+int command_encrypt_predefined_sfrc4(int argc, char** argv) {
+    if (argc != 4) {
+        printf("Invalid encrypt_predefined_sfrc4 arguments.\n");
+        return 1;
+    }
+
+    const char* input_file_path = argv[2];
+    const char* output_file_path = argv[3];
+
+    size_t size = 0;
+    auto keys = SimpleFastRC4::get_decrypt_key();
+    auto input = read_file(input_file_path, size);
+    printf("Reading input file ...\n");
+    auto output = SimpleFastRC4::encrypt_model(input.get(), size, keys);
+
+    write_file(output_file_path, output);
+
+    printf("Done.\n");
+    return 0;
+}
+
+int command_encrypt_sfrc4(int argc, char** argv) {
+    if (argc != 6) {
+        printf("Invalid encrypt_sfrc4 arguments.\n");
+        return 1;
+    }
+
+    uint64_t hash_key = std::stoull(argv[2], 0, 0);
+    uint64_t enc_key = std::stoull(argv[3], 0, 0);
+    const char* input_file_path = argv[4];
+    const char* output_file_path = argv[5];
+
+    std::vector<uint8_t> keys(128, 0);
+    uint64_t* data = reinterpret_cast<uint64_t*>(keys.data());
+    data[0] = hash_key;
+    data[1] = enc_key;
+
+    size_t size = 0;
+    auto input = read_file(input_file_path, size);
+    printf("Reading input file ...\n");
+    auto output = SimpleFastRC4::encrypt_model(input.get(), size, keys);
+
+    printf("Encrypting ...\n");
+    write_file(output_file_path, output);
+
+    printf("Done.\n");
+    return 0;
+}
+
+int command_hash(int argc, char** argv) {
+    if (argc != 3) {
+        printf("Invalid hash arguments.\n");
+        return 1;
+    }
+
+    const char* input_file_path = argv[2];
+
+    size_t len = 0;
+    auto input = read_file(input_file_path, len);
+
+    rc4::FastHash64 hasher(rc4::key_gen_hash_key());
+    auto start = static_cast<const char*>(input.get());
+
+    // feed the input to the hasher 8 bytes at a time
+    auto ptr = reinterpret_cast<const uint64_t*>(start);
+    while (reinterpret_cast<const char*>(ptr + 1) <= start + len) {
+        hasher.feed(*ptr);
+        ++ptr;
+    }
+
+    // feed any trailing bytes, zero-padded to 8 bytes
+    auto cptr = reinterpret_cast<const char*>(ptr);
+    if (cptr < start + len) {
+        uint64_t v = 0;
+        std::copy(cptr, start + len, reinterpret_cast<char*>(&v));
+        hasher.feed(v);
+    }
+
+    printf("%llx\n", static_cast<unsigned long long>(hasher.get()));
+    return 0;
+}
+
+std::unordered_map<std::string, CommandHandler> commands = {
+        {"encrypt_predefined_rc4", command_encrypt_predefined_rc4},
+        {"encrypt_rc4", command_encrypt_rc4},
+        {"encrypt_predefined_sfrc4", command_encrypt_predefined_sfrc4},
+        {"encrypt_sfrc4", command_encrypt_sfrc4},
+        {"hash", command_hash},
+};
+
+int main(int argc, char** argv) {
+    if (argc == 1) {
+        printf("%s", usage);
+        return 1;
+    }
+
+    auto it = commands.find(argv[1]);
+    if (it == commands.end()) {
+        printf("Invalid command arguments.\n");
+        printf("%s", usage);
+        return 1;
+    }
+    return it->second(argc, argv);
+}
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/scripts/whl/macos/macos_build_whl.sh b/scripts/whl/macos/macos_build_whl.sh
index e1bafb7448c823fdcb543d6cd7807dcf087545e9..86f3aaa35570233de977ec83f51d6ac6759acb58 100755
--- a/scripts/whl/macos/macos_build_whl.sh
+++ b/scripts/whl/macos/macos_build_whl.sh
@@ -209,6 +209,35 @@ function do_build() {
     echo "comapt whl name: ${compat_whl_name}"
     cp ${BUILD_DIR}/staging/dist/Meg*.whl ${MACOS_WHL_HOME}/${compat_whl_name}
 
+    # handle megenginelite
+    cd ${BUILD_DIR}
+    rm -rf lite_staging
+    mkdir -p lite_staging/megenginelite
+    cp ${SRC_DIR}/lite/pylite/megenginelite/* lite_staging/megenginelite/
+    cp ${SRC_DIR}/lite/pylite/setup.py lite_staging/
+    cp ${SRC_DIR}/lite/pylite/requires.txt lite_staging/
+    VER_FILE=${SRC_DIR}/imperative/python/megengine/version.py
+    if [ -f ${VER_FILE} ];then
+        cp ${VER_FILE} lite_staging/megenginelite
+    else
+        echo "ERROR: can not find version file"
+        exit -1
+    fi
+    mkdir -p ${BUILD_DIR}/lite_staging/megenginelite/libs
+    LITE_LIB=${BUILD_DIR}/lite_staging/megenginelite/libs/liblite_shared.dylib
+    cp ${SRC_DIR}/build_dir/host/MGE_WITH_CUDA_OFF/MGE_INFERENCE_ONLY_OFF/Release/build/lite/liblite_shared.dylib ${LITE_LIB}
+    llvm-strip -s ${LITE_LIB}
+
+    cd ${BUILD_DIR}/lite_staging/
+    ${PYTHON_DIR}/bin/python3 setup.py bdist_wheel
+    cd ${BUILD_DIR}/lite_staging/dist/
+    org_whl_name=`ls Meg*.whl`
+    index=`awk -v a="${org_whl_name}" -v b="-macosx" 'BEGIN{print index(a,b)}'`
+    compat_whl_name=`echo ${org_whl_name} |cut -b -$index`macosx_10_14_x86_64.whl
+    echo "megenginelite org whl name: ${org_whl_name}"
+    echo "megenginelite compat whl name: ${compat_whl_name}"
+    cp ${BUILD_DIR}/lite_staging/dist/Meg*.whl ${MACOS_WHL_HOME}/${compat_whl_name}
+
     cd ${SRC_DIR}
     echo ""
     echo "##############################################################################################"
diff --git a/scripts/whl/manylinux2014/do_build_common.sh b/scripts/whl/manylinux2014/do_build_common.sh
index 0f1fc771e3359f907b92e344586c298989eab789..57e6b391857cf7bfc805045ff2530d445d544616 100755
--- a/scripts/whl/manylinux2014/do_build_common.sh
+++ b/scripts/whl/manylinux2014/do_build_common.sh
@@ -155,6 +155,33 @@ do
     echo "comapt whl name: ${compat_whl_name}"
    mv ${org_whl_name} ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${SDK_NAME}/${compat_whl_name}
 
+    # handle megenginelite
+    cd ${BUILD_DIR}
+    rm -rf lite_staging
+    mkdir -p lite_staging/megenginelite
+    cp ${SRC_DIR}/lite/pylite/megenginelite/* lite_staging/megenginelite/
+    cp ${SRC_DIR}/lite/pylite/setup.py lite_staging/
+    cp ${SRC_DIR}/lite/pylite/requires.txt lite_staging/
+    VER_FILE=${SRC_DIR}/imperative/python/megengine/version.py
+    if [ -f ${VER_FILE} ];then
+        cp ${VER_FILE} lite_staging/megenginelite
+    else
+        echo "ERROR: can not find version file"
+        exit -1
+    fi
+    patch_elf_depend_lib_megenginelite
+
+    cd ${BUILD_DIR}/lite_staging/
+    ${PYTHON_DIR}/bin/python setup.py bdist_wheel
+    cd /home/output
+    mkdir -p ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${SDK_NAME}
+    cd ${BUILD_DIR}/lite_staging/dist/
+    org_whl_name=`ls Meg*${ver}*.whl`
+    compat_whl_name=`echo ${org_whl_name} | sed 's/linux/manylinux2014/'`
+    echo "megenginelite org whl name: ${org_whl_name}"
+    echo "megenginelite compat whl name: ${compat_whl_name}"
+    mv ${org_whl_name} ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${SDK_NAME}/${compat_whl_name}
+
     cd /home/output
     chown -R ${UID}.${UID} .
     # compat for root-less docker env to remove output at host side
diff --git a/scripts/whl/windows/windows_build_whl.sh b/scripts/whl/windows/windows_build_whl.sh
index c917c4895c2432f2bb8f7ab3c6686f5682ed42a7..4896071c5cd70e7be21955ba15e12cdf172148b9 100755
--- a/scripts/whl/windows/windows_build_whl.sh
+++ b/scripts/whl/windows/windows_build_whl.sh
@@ -106,6 +106,23 @@ function copy_more_dll() {
         depend_real_copy ${CP_WHL_DST_IMP}
     fi
 }
+
+function lite_copy_more_dll() {
+    # for python whl real use
+    echo "config megenginelite core lib dir"
+    CP_WHL_DST_IMP=${BUILD_DIR}/lite_staging/megenginelite/libs
+
+    if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then
+        echo "copy nvidia lib to whl use...."
+        depend_real_copy ${CP_WHL_DST_IMP}
+        if [ ${IN_CI} = "true" ]; then
+            echo "copy lib for lite for ci test"
+            IMP_TEST_DST=${SRC_DIR}/build_dir/host/build/lite/test/
+            depend_real_copy ${IMP_TEST_DST}
+        fi
+    fi
+}
+
 BUILD_DIR=${SRC_DIR}/build_dir/host/build/
 
 # here we just treat cu file should not in the increment build file list
@@ -196,6 +213,32 @@ function do_build() {
         ${PYTHON_DIR}/python3 setup.py bdist_wheel
         cp ${BUILD_DIR}/staging/dist/Meg*.whl ${WINDOWS_WHL_HOME}/
 
+        # handle megenginelite
+        cd ${BUILD_DIR}
+        rm -rf lite_staging
+        mkdir -p lite_staging/megenginelite
+        cp ${SRC_DIR}/lite/pylite/megenginelite/* lite_staging/megenginelite/
+        cp ${SRC_DIR}/lite/pylite/setup.py lite_staging/
+        cp ${SRC_DIR}/lite/pylite/requires.txt lite_staging/
+        VER_FILE=${SRC_DIR}/imperative/python/megengine/version.py
+        if [ -f ${VER_FILE} ];then
+            cp ${VER_FILE} lite_staging/megenginelite
+        else
+            echo "ERROR: can not find version file"
+            exit -1
+        fi
+
+        LITE_CORE_LIB_DIR=${BUILD_DIR}/lite_staging/megenginelite/libs/
+        mkdir -p ${LITE_CORE_LIB_DIR}
+        cd ${LITE_CORE_LIB_DIR}
+        cp ${BUILD_DIR}/lite/lite_shared.dll liblite_shared.dll
+        llvm-strip -s liblite_shared.dll
+        lite_copy_more_dll
+
+        cd ${BUILD_DIR}/lite_staging/
+        ${PYTHON_DIR}/python3 setup.py bdist_wheel
+        cp ${BUILD_DIR}/lite_staging/dist/Meg*.whl ${WINDOWS_WHL_HOME}/
+
         echo ""
         echo "##############################################################################################"
         echo "windows whl package location: ${WINDOWS_WHL_HOME}"
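
For context, the packing tools added above (`rc4_encryptor`, `pack_model_and_info.py`, `encrypt_info_and_model.sh`) are meant to be chained: encrypt the model and its JSON info first, then pack both into a single file that lite can load. The sketch below is illustrative only; the file names are placeholders, and it assumes `rc4_encryptor` has already been built and that `flatc` is on the PATH (required by `pack_model_and_info.py`).

```bash
# 1. Encrypt the model and its JSON info with the predefined RC4 key.
./rc4_encryptor encrypt_predefined_rc4 model.mge model.mge.pr_rc4.emod
./rc4_encryptor encrypt_predefined_rc4 info.json info.json.pr_rc4.emod

# 2. Pack the encrypted files into one model loadable by lite.
python3 pack_model_and_info.py \
    --input-model=model.mge.pr_rc4.emod \
    --input-info=info.json.pr_rc4.emod \
    --model-name=model_name_in_info_json \
    --model-cryption="RC4_default" \
    --info-cryption="RC4_default" \
    --info-parser="LITE_default" \
    -o packed_model.lite

# Or run both steps at once with the wrapper script:
./encrypt_info_and_model.sh -i info.json -m model.mge \
    -e encrypt_predefined_rc4 -n model_name_in_info_json -o packed_model.lite
```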