Commit 71230e9a authored by Megvii Engine Team

feat(lite): open source for lite

GitOrigin-RevId: f442431381cc9eb3bb92eb03f744ba6ffa7e2b64
Parent 5fe789ab
test/resource/input_data.npy filter=lfs diff=lfs merge=lfs -text
test/resource/lite/shufflenet.mge filter=lfs diff=lfs merge=lfs -text
test/resource/lite/shufflenet_crypt_aes.mge filter=lfs diff=lfs merge=lfs -text
test/resource/lite/test_packed_model.lite filter=lfs diff=lfs merge=lfs -text
test/resource/lite/test_packed_model_rc4.lite filter=lfs diff=lfs merge=lfs -text
test/resource/lite/output_data.npy filter=lfs diff=lfs merge=lfs -text
test/resource/lite/model.mgb filter=lfs diff=lfs merge=lfs -text
test/resource/lite/liveness_rgb_nosub128.rknn filter=lfs diff=lfs merge=lfs -text
third_party/librknn_api filter=lfs diff=lfs merge=lfs -text
test/resource/lite/model_atlas.mgb filter=lfs diff=lfs merge=lfs -text
option(LITE_BUILD_WITH_MGE "Build lite with MegEngine." ON)
# config lite_build_config.h.in
set(LITE_WITH_OPENCL ${MGE_WITH_OPENCL})
set(LITE_WITH_CUDA ${MGE_WITH_CUDA})
set(LITE_ENABLE_LOGGING ${MGE_ENABLE_LOGGING})
set(LITE_ENABLE_EXCEPTION ${MGE_ENABLE_EXCEPTIONS})
set(LITE_ASSERT_LOC ${MGB_ASSERT_LOC})
if(NOT MGB_WITH_FLATBUFFERS)
include(../cmake/flatbuffers.cmake)
endif()
file(GLOB_RECURSE SRC_FBS src/**/*.fbs)
build_flatbuffers(
"${SRC_FBS}"
""
lite_fbs_generate
""
"${CMAKE_CURRENT_BINARY_DIR}"
""
""
)
file(GLOB_RECURSE SOURCES_LITE src/*.cpp src/*.cc lite-c/*.cpp)
if(MGE_WITH_MINIMUM_SIZE)
set(LITE_ENABLE_LOGGING OFF)
set(LITE_ENABLE_EXCEPTION OFF)
endif()
# Write out lite_build_config.h
# It defines macros needed by lite
configure_file(src/lite_build_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/genfiles/lite_build_config.h)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/genfiles/lite_build_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
# begin config lite
if(LITE_BUILD_WITH_MGE AND LITE_WITH_CUDA AND NOT WIN32)
# FIXME: third_party cpp_redis does not support building with clang-cl
file(GLOB_RECURSE SOURCES_CPP_REDIS ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/sources/*.cpp)
list(APPEND SOURCES_LITE ${SOURCES_CPP_REDIS})
file(GLOB_RECURSE SOURCES_TACOPIE ${PROJECT_SOURCE_DIR}/third_party/tacopie/sources/*.cpp)
list(APPEND SOURCES_LITE ${SOURCES_TACOPIE})
endif()
add_library(lite_static STATIC ${SOURCES_LITE})
add_dependencies(lite_static lite_fbs_generate)
include_directories($<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/genfiles>)
if(LITE_BUILD_WITH_MGE)
target_link_libraries(lite_static PRIVATE megbrain megdnn ${MGE_CUDA_LIBS})
add_compile_definitions(LITE_BUILD_WITH_MGE=1)
message(STATUS "build lite with MegEngine.")
else()
target_link_libraries(lite_static PUBLIC flatbuffers)
endif()
include_directories(
PUBLIC $<INSTALL_INTERFACE:${CMAKE_INSTALL_PREFIX}/lite/include>
PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lite/include>
PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lite/include/lite>
PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lite/lite-c/include>
PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lite/src>
PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/third_party/Json/include>
)
# end config lite
# define a shared lib
add_library(lite_shared SHARED $<TARGET_OBJECTS:lite_static>)
if(LITE_BUILD_WITH_MGE)
target_link_libraries(lite_shared PRIVATE megbrain megdnn ${MGE_CUDA_LIBS})
endif()
if(ANDROID)
link_libraries(log)
target_link_libraries(lite_static PRIVATE log)
target_link_libraries(lite_shared PRIVATE log)
endif()
if(LITE_BUILD_WITH_MGE AND LITE_WITH_CUDA AND NOT WIN32)
# FIXME: third_party cpp_redis does not support building with clang-cl
target_include_directories(lite_static PRIVATE ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/includes)
target_include_directories(lite_static PRIVATE ${PROJECT_SOURCE_DIR}/third_party/tacopie/includes)
target_include_directories(lite_shared PRIVATE ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/includes)
target_include_directories(lite_shared PRIVATE ${PROJECT_SOURCE_DIR}/third_party/tacopie/includes)
endif()
set(LITE_VERSION_SCRIPT ${PROJECT_SOURCE_DIR}/lite/src/version_lite.ld CACHE INTERNAL "Path to linker version script")
add_custom_target(_lite_version_ld SOURCES ${LITE_VERSION_SCRIPT})
if(NOT MSVC AND NOT WIN32)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden")
endif()
# TODO: implement the version script for other OSes
if (UNIX AND NOT APPLE)
target_link_options(lite_shared PRIVATE -Wl,--version-script=${LITE_VERSION_SCRIPT})
set_target_properties(lite_shared PROPERTIES LINK_DEPENDS ${LITE_VERSION_SCRIPT})
endif()
# config install
install(TARGETS lite_static
LIBRARY DESTINATION lite/lib/${MGE_ARCH}
FRAMEWORK DESTINATION lite/lib/${MGE_ARCH}
ARCHIVE DESTINATION lite/lib/${MGE_ARCH})
install(TARGETS lite_shared
LIBRARY DESTINATION lite/lib/${MGE_ARCH}
FRAMEWORK DESTINATION lite/lib/${MGE_ARCH}
ARCHIVE DESTINATION lite/lib/${MGE_ARCH}
)
install(FILES ${PROJECT_SOURCE_DIR}/lite/include/lite/common_enum_c.h
DESTINATION ${CMAKE_INSTALL_PREFIX}/lite/include/lite-c)
install(DIRECTORY ${PROJECT_SOURCE_DIR}/lite/include
DESTINATION ${CMAKE_INSTALL_PREFIX}/lite FILES_MATCHING PATTERN "*.h")
install(DIRECTORY ${PROJECT_SOURCE_DIR}/lite/lite-c/include
DESTINATION ${CMAKE_INSTALL_PREFIX}/lite FILES_MATCHING PATTERN "*.h")
add_subdirectory(example)
if(MGE_WITH_TEST)
add_subdirectory(test)
endif()
# tools and example
add_executable(rc4_encryptor tools/rc4_encrypt.cpp)
target_link_libraries(rc4_encryptor lite_static)
if(LITE_BUILD_WITH_MGE AND MGE_WITH_ROCM)
# FIXME: hip objects cannot find the cpp objects through lite_static alone
target_link_libraries(rc4_encryptor megdnn)
endif()
target_include_directories(rc4_encryptor PRIVATE
${PROJECT_SOURCE_DIR}/lite/src/decryption)
install (TARGETS rc4_encryptor
EXPORT ${LITE_EXPORT_TARGETS}
RUNTIME DESTINATION lite/tools)
# Lite
Lite is a lightweight wrapper of MegEngine that makes it easy to integrate
MegEngine into a user's SDK.
## bazel build
Lite currently supports both the internal bazel build and the CMake build, and provides C++/C and Python interfaces.
The following shows how to build the lite_shared target with bazel and can serve as a reference for building other targets.
This build depends on the internal bazel setup and megvii3.
### Set up the build environment
The bazel build must be done from the megvii3 workspace.
#### Clone megvii3 and install bazel
```bash
git clone git@git-core.megvii-inc.com:brain-sdk/megvii3.git
./utils/bazel/get_bazel.sh
```
#### Clone megbrain
```
git submodule update brain/megbrain brain/midout
```
### Build the x86 CUDA version
```bash
./bazel build //brain/megbrain/lite:lite_shared --cpu="k8" \
--compiler="gcc7_cuda10" -c opt
```
### Build the x86 CPU version
```bash
./bazel build //brain/megbrain/lite:lite_shared --cpu="k8" \
--compiler="gcc9" -c opt
```
### Build the arm OpenCL version
```bash
./bazel build //brain/megbrain/lite:lite_shared_shared --cpu=android_aarch64 \
-c opt --define enable_opencl=1 --define enable_opencl_search=1
```
### Build the arm OpenCL lite_examples
```bash
bazel-3.0.0-megvii2 build //brain/megbrain/lite:lite_shared_examples \
--cpu=android_aarch64 --define enable_opencl=1 --define enable_opencl_search=1
```
#### How to run the snpe_loader lite_examples: see the wiki below
https://wiki.megvii-inc.com/pages/viewpage.action?pageId=268786906
### Build the armv7 CPU version
```bash
./bazel build //brain/megbrain/lite:lite_shared --cpu=android_armv7 \
-c opt
```
### Build the arm64 CPU version
```bash
./bazel build //brain/megbrain/lite:lite_shared --cpu=android_aarch64 \
-c opt
```
### Build the arm64 CPU v8.2 version
```bash
./bazel build //brain/megbrain/lite:lite_shared --cpu=android_aarch64 \
--copt -march=armv8.2-a+fp16+dotprod -c opt
```
## CMake build
CMake builds are also supported; see scripts/cmake-build/BUILD_README.md. The example below builds a release version with both the MegEngine and RKNPU backends enabled and OpenCL turned on.
```bash
EXTRA_CMAKE_ARGS="-DANDROID_NATIVE_API_LEVEL=24 -DLITE_BUILD_WITH_RKNPU=ON -DMGE_WITH_OPENCL=ON \
-DMGE_OPENCL_SEARCH_ALGO=ON -DCUSTOM_C_OPR_INIT_FUNC=custom_loader_func" ./scripts/cmake-build/cross_build_android_arm_inference.sh
```
* To enable the profiling feature, add --copt -DMGB_ENABLE_JSON=1 when building.
* To enable the fast-run feature, add --copt -DMGB_ENABLE_FASTRUN=1.
* When building for arm64, --copt -mcpu=cortex-a53 can be added for optimization.
### Trimmed build with midout
For how midout-based trimming works, see the midout documentation in megbrain; for the trimming procedure, refer to the trimming guides of MegBrain and MegEngine.
## Models
### Supported models
Lite currently only supports the model format dumped by MegEngine. Loadable model files include the original model file, the original encrypted model, and packed models (encrypted or not). The encryption algorithm and the encryption key can be user-defined and registered into lite; see the encryption/decryption part of the examples for details.
* Unencrypted original model: the model produced by dumping a trained model in the MegEngine environment.
* Encrypted original model: the dumped model encrypted with an encryption algorithm. Lite provides two default encryption tools under tools, aes and rc4, implemented by aes_encrypt.sh and rc4_encrypt.cpp respectively (rc4_encrypt.cpp must be compiled into an executable). A model encrypted this way needs its encryption method configured in Config when it is loaded, as shown in the sketch after this list.
* Packed model: the model structure is described below. An encrypted or unencrypted model from above can be packed together with the json config file defined below into a packed model, using the pack_model_and_info.py tool under tools; see the help output of pack_model_and_info.py for usage.
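A minimal sketch of loading an encrypted bare model, based on the user_cryption examples later in this repository; the model path is a placeholder and "AES_default" is one of the predefined decryption algorithm names documented in the examples:
```cpp
#include <memory>
#include "lite/network.h"

int main() {
    // Tell lite which registered decryption algorithm the bare model was
    // encrypted with; "AES_default" is one of the built-in names.
    lite::Config config;
    config.bare_model_cryption_name = "AES_default";

    auto network = std::make_shared<lite::Network>(config);
    network->load_model("shufflenet_encrypted.mge");  // placeholder path
    return 0;
}
```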
### Model structure
The different model files are distinguished mainly by the model_tag in the packed model file.
* Packed file:
The packing step can be done with the pack_model_and_info.py script, which packs a model info file (any format; JSON is recommended, encrypted or not) together with an encrypted or unencrypted model file, and prepends a Header to help parsing.
* The original file and the original encrypted file have no Header and no model info part; the information needed to load the model can be passed through Config and NetworkIO.
### Header
The Header starts with a fixed plaintext model_tag, currently defined as the string "packed_model". It is followed by information about each part of the model file and the encryption method of each part, so that the corresponding decryption method can be called on each part when the model is loaded, as well as the parsing method of the model information part.
See lite/src/parse_model/pack_model.fbs for the details.
### Info part
The Info part describes the model: information the user cares about, such as the format of the model's input data and the platform the model runs on. It can also be used by the user to check whether the model is being run under the specified conditions. Since different users need different things from this Info part and the information they want to carry cannot be unified, Lite currently allows it to be customized: the user defines the content of their own Info part, specifies the **name of the Info parsing method** in the Header, and registers a parsing function under that name with Lite; in this way a user-defined Info format is supported. Lite also provides a predefined format named "LITE_default" with the corresponding parsing function already implemented. This info is in JSON format and is defined as follows:
```json
{
"name": "shufflenet_test",
"valid": true,
"version": "8.9999.0",
"has_compression": false,
"device": {
"type": "CPU",
"device_id": 0,
"number_threads": 1,
"use_tensor_rt": false,
"enable_inplace_model": false
},
"options":{
"weight_preprocess": false,
"var_sanity_check_first_run": true,
"const_shape": false,
"jit_level": 0,
"record_level": 0
},
"IO":{
"inputs":[
{
"name": "data",
"io_type": "value",
"is_host": true,
"dtype": "float32",
"shape": {
"dim0": 1,
"dim1": 3,
"dim2": 224,
"dim3": 224
}
}
],
"outputs":[
{
"name": "TRUE_DIV(EXP[12065],reduce0[12067])[12077]",
"io_type": "value",
"is_host": true,
"dtype": "float32",
"shape": {
"dim0": 1,
"dim1": 1000,
"dim2": 0,
"dim3": 0
}
}
]
}
}
```
* model_name: the name of this model; the user can check that the correct model is being run by comparing it with the one in the Header.
* valid: whether the settings in this info file affect the model's Config.
* version: the megbrain version the model corresponds to; it is checked when the model is loaded.
* has_compression: whether the tensor data in this model file has been compressed.
* device: currently supported values are "CPU", "CUDA", "OPENCL" and "ATLAS".
* number_threads and is_inplace_model: only take effect when device is CPU.
* IO::inputs::type: value or shape; see include "network.h" for details.
* IO::inputs::is_host: whether the input data comes from the device side or the host side.
* IO::outputs::is_host: whether the output data will be stored on the device side or the host side.
* IO::outputs::shape::dimx: a value of 0 means that dim is invalid.
### Model part
The Model part can be an encrypted or an unencrypted model file.
## Usage
See the documentation in the example directory and the corresponding examples for the full range of usage patterns.
## Tools
Lite currently ships three tools in the tools directory (other megbrain tools are not included):
* pack_model_and_info.py is the model packing tool mentioned above. It is a python script that packs an existing model and a model information file into the format described above. The user can specify the model name, the model encryption method, the info file encryption method, the info parsing method, and so on, for example:
```bash
python3 pack_model_and_info.py --input-model xxx.mge \
--model-name="shufflenet_test" \
--model-cryption="RC4_default" \
--input-info xxx.json \
--info-cryption="RC4_default" \
--info-parser="LITE_default" \
-o xxx.lite
```
* aes_encrypt.sh is an AES encryption script. It encrypts a file with the given key into an AES-encrypted file, where the key is a 32-byte value written as hexadecimal digits.
```bash
aes_encrypt.sh xxx.mdl xxx_encrypted.mdl \
000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F
```
* rc4_encrypt.cpp can be compiled into an RC4 encryption tool. It encrypts a given file with either a specified key or the default key, supports both the rc4 and simple_fast_rc4 methods, and supports custom keys.
* The bazel command to build it for x86 is:
```bash
bazel build //brain/megbrain/lite:rc4_encryptor \
--cpu='k8' --compiler='gcc9'
```
* Encrypt a file (see the help output for detailed usage):
```bash
rc4_encryptor encrypt_predefined_rc4 \
to_be_encrypt.file encrypted.file
```
/**
* \file lite/build_config/lite_build_config.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#ifndef _HEADER_LITE_BUILD_CONFIG
#define _HEADER_LITE_BUILD_CONFIG
#ifndef LITE_ENABLE_LOGGING
#define LITE_ENABLE_LOGGING 1
#endif
#ifndef LITE_ENABLE_EXCEPTION
#if __cpp_exceptions || __EXCEPTIONS || \
(defined(_MSC_VER) && defined(_CPPUNWIND))
#define LITE_ENABLE_EXCEPTION 1
#else
#define LITE_ENABLE_EXCEPTION 0
#endif
#endif
#ifndef LITE_WITH_CUDA
#define LITE_WITH_CUDA 0
#endif
#ifndef LITE_ASSERT_LOC
#define LITE_ASSERT_LOC 1
#endif
#endif // _HEADER_LITE_BUILD_CONFIG
file (GLOB_RECURSE SOURCES ./*.cpp)
add_executable(lite_examples ${SOURCES})
if(LITE_BUILD_WITH_RKNPU)
# rknn sdk 1.0.0 depends on libc++_shared; use gold to remove the NEEDED shared-object symbol check
target_link_options(lite_examples PRIVATE "-fuse-ld=gold")
endif()
target_link_libraries(lite_examples lite_static)
if(LITE_BUILD_WITH_MGE AND MGE_WITH_ROCM)
# FIXME: hip objects cannot find the cpp objects through lite_static alone
target_link_libraries(lite_examples megdnn)
endif()
if(UNIX)
if(APPLE OR ANDROID)
target_link_libraries(lite_examples dl)
else()
target_link_libraries(lite_examples dl rt)
endif()
endif()
install (TARGETS lite_examples
EXPORT ${LITE_EXPORT_TARGETS}
RUNTIME DESTINATION lite/bin)
# add lite_examples_depends_shared for CI check symbol export valid
add_executable(lite_examples_depends_shared ${SOURCES})
if(LITE_BUILD_WITH_RKNPU)
# rknn sdk 1.0.0 depends on libc++_shared; use gold to remove the NEEDED shared-object symbol check
target_link_options(lite_examples_depends_shared PRIVATE "-fuse-ld=gold")
endif()
target_link_libraries(lite_examples_depends_shared lite_shared)
if(UNIX)
if(APPLE OR ANDROID)
target_link_libraries(lite_examples_depends_shared dl)
else()
target_link_libraries(lite_examples_depends_shared dl rt)
endif()
endif()
install (TARGETS lite_examples_depends_shared
EXPORT ${LITE_EXPORT_TARGETS}
RUNTIME DESTINATION lite/bin)
/**
* \file example/example.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#pragma once
#include <lite_build_config.h>
#include "lite/global.h"
#include "lite/network.h"
#include "lite/tensor.h"
#include "npy.h"
#include <string.h>
#include <memory>
#include <unordered_map>
#include <vector>
namespace lite {
namespace example {
void set_cpu_affinity(const std::vector<int>& cpuset);
struct Args {
int args_parse_ret = 0;
std::string example_name;
std::string model_path;
std::string input_path;
std::string output_path;
std::string loader_path;
static Args from_argv(int argc, char** argv);
};
std::shared_ptr<Tensor> parse_npy(
const std::string& path,
LiteBackend backend = LiteBackend::LITE_DEFAULT);
using ExampleFunc = std::function<bool(const Args&)>;
using ExampleFuncMap = std::unordered_map<std::string, ExampleFunc>;
ExampleFuncMap* get_example_function_map();
bool register_example(std::string example_name, const ExampleFunc& function);
template <int>
struct Register;
#if LITE_BUILD_WITH_MGE
#if LITE_WITH_CUDA
bool load_from_path_run_cuda(const Args& args);
#endif
bool basic_load_from_path(const Args& args);
bool basic_load_from_path_with_loader(const Args& args);
bool basic_load_from_memory(const Args& args);
bool cpu_affinity(const Args& args);
bool network_share_same_weights(const Args& args);
bool reset_input(const Args& args);
bool reset_input_output(const Args& args);
bool config_user_allocator(const Args& args);
bool register_cryption_method(const Args& args);
bool update_cryption_key(const Args& args);
bool async_forward(const Args& args);
#if LITE_WITH_CUDA
bool device_input(const Args& args);
bool device_input_output(const Args& args);
bool pinned_host_input(const Args& args);
#endif
#endif
} // namespace example
} // namespace lite
#if LITE_BUILD_WITH_MGE
bool basic_c_interface(const lite::example::Args& args);
bool device_io_c_interface(const lite::example::Args& args);
bool async_c_interface(const lite::example::Args& args);
#endif
#define CONCAT_IMPL(a, b) a##b
#define MACRO_CONCAT(a, b) CONCAT_IMPL(a, b)
#define REGIST_EXAMPLE(name_, func_) \
REGIST_EXAMPLE_WITH_NUM(__COUNTER__, name_, func_)
#define REGIST_EXAMPLE_WITH_NUM(number_, name_, func_) \
template <> \
struct Register<number_> { \
Register() { register_example(name_, func_); } \
}; \
namespace { \
Register<number_> MACRO_CONCAT(example_function_, number_); \
}
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file example/example.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include "lite/global.h"
#include "lite/network.h"
#include "lite/tensor.h"
#include "example.h"
#include "npy.h"
#include <string.h>
#include <map>
#include <memory>
#include <vector>
using namespace lite;
using namespace example;
Args Args::from_argv(int argc, char** argv) {
Args ret;
if (argc < 4) {
printf("usage: lite_examples <example_name> <model file> <input "
"file> <output file>.\n");
printf("*********The output file is optional.*************\n");
printf("The registered examples include:\n");
size_t index = 0;
for (auto it : *get_example_function_map()) {
printf("%zu : %s\n", index, it.first.c_str());
index++;
}
ret.args_parse_ret = -1;
return ret;
}
ret.example_name = argv[1];
ret.model_path = argv[2];
ret.input_path = argv[3];
if (argc > 4) {
ret.output_path = argv[4];
}
if (argc > 5) {
ret.loader_path = argv[5];
}
return ret;
}
ExampleFuncMap* lite::example::get_example_function_map() {
static ExampleFuncMap static_map;
return &static_map;
}
bool lite::example::register_example(std::string example_name,
const ExampleFunc& function) {
auto map = get_example_function_map();
if (map->find(example_name) != map->end()) {
printf("Error!!! This example is already registered\n");
return false;
}
(*map)[example_name] = function;
return true;
}
std::shared_ptr<Tensor> lite::example::parse_npy(const std::string& path,
LiteBackend backend) {
std::string type_str;
std::vector<npy::ndarray_len_t> stl_shape;
std::vector<int8_t> raw;
npy::LoadArrayFromNumpy(path, type_str, stl_shape, raw);
auto lite_tensor =
std::make_shared<Tensor>(backend, LiteDeviceType::LITE_CPU);
Layout layout;
layout.ndim = stl_shape.size();
const std::map<std::string, LiteDataType> type_map = {
{"f4", LiteDataType::LITE_FLOAT},
{"i4", LiteDataType::LITE_INT},
{"i1", LiteDataType::LITE_INT8},
{"u1", LiteDataType::LITE_UINT8}};
layout.shapes[0] = 1;
for (size_t i = 0; i < layout.ndim; i++) {
layout.shapes[i] = static_cast<size_t>(stl_shape[i]);
}
for (auto& item : type_map) {
if (type_str.find(item.first) != std::string::npos) {
layout.data_type = item.second;
break;
}
}
lite_tensor->set_layout(layout);
size_t length = lite_tensor->get_tensor_total_size_in_byte();
void* dest = lite_tensor->get_memory_ptr();
memcpy(dest, raw.data(), length);
//! rknn does not support reshape now
if (layout.ndim == 3) {
lite_tensor->reshape({1, static_cast<int>(layout.shapes[0]),
static_cast<int>(layout.shapes[1]),
static_cast<int>(layout.shapes[2])});
}
return lite_tensor;
}
void lite::example::set_cpu_affinity(const std::vector<int>& cpuset) {
#if defined(__APPLE__) || defined(WIN32)
#pragma message("set_cpu_affinity not enabled on apple and windows platform")
#else
cpu_set_t mask;
CPU_ZERO(&mask);
for (auto i : cpuset) {
CPU_SET(i, &mask);
}
auto err = sched_setaffinity(0, sizeof(mask), &mask);
if (err) {
printf("failed to sched_setaffinity: %s (error ignored)",
strerror(errno));
}
#endif
}
int main(int argc, char** argv) {
set_log_level(LiteLogLevel::WARN);
auto&& args = Args::from_argv(argc, argv);
if (args.args_parse_ret)
return -1;
auto map = get_example_function_map();
auto example = (*map)[args.example_name];
if (example) {
printf("Begin to run %s example.\n", args.example_name.c_str());
return example(args);
} else {
printf("The example of %s is not registed.", args.example_name.c_str());
return -1;
}
}
namespace lite {
namespace example {
#if LITE_BUILD_WITH_MGE
#if LITE_WITH_CUDA
REGIST_EXAMPLE("load_from_path_run_cuda", load_from_path_run_cuda);
#endif
REGIST_EXAMPLE("basic_load_from_path", basic_load_from_path);
REGIST_EXAMPLE("basic_load_from_path_with_loader", basic_load_from_path_with_loader);
REGIST_EXAMPLE("basic_load_from_memory", basic_load_from_memory);
REGIST_EXAMPLE("cpu_affinity", cpu_affinity);
REGIST_EXAMPLE("register_cryption_method", register_cryption_method);
REGIST_EXAMPLE("update_cryption_key", update_cryption_key);
REGIST_EXAMPLE("network_share_same_weights", network_share_same_weights);
REGIST_EXAMPLE("reset_input", reset_input);
REGIST_EXAMPLE("reset_input_output", reset_input_output);
REGIST_EXAMPLE("config_user_allocator", config_user_allocator);
REGIST_EXAMPLE("async_forward", async_forward);
REGIST_EXAMPLE("basic_c_interface", basic_c_interface);
REGIST_EXAMPLE("device_io_c_interface", device_io_c_interface);
REGIST_EXAMPLE("async_c_interface", async_c_interface);
#if LITE_WITH_CUDA
REGIST_EXAMPLE("device_input", device_input);
REGIST_EXAMPLE("device_input_output", device_input_output);
REGIST_EXAMPLE("pinned_host_input", pinned_host_input);
#endif
#endif
} // namespace example
} // namespace lite
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
# Example
This example directory implements a series of examples that call the lite interfaces to run inference; they mainly demonstrate how the different lite interfaces are used to run inference in different situations. All of the examples here use shufflenet for demonstration.
## Building and running the examples with bazel
* Follow the README.md in the top-level directory to set up the megvii3 bazel build environment, then build the CPU version:
```bash
./bazel build //brain/megbrain/lite:lite_examples --cpu="k8" \
--compiler="gcc9" -c opt
```
* At run time, specify the name of the example to run, the model to run, and the input data for the model.
* List all the example names:
```
bazel-bin/brain/megbrain/lite/lite_examples
```
* Run an example; the command below runs basic_load_from_memory:
```
bazel-bin/brain/megbrain/lite/lite_examples \
basic_load_from_memory \
path-to-megbrain/lite/test/resource/lite/shufflenet.mge \
path-to-megbrain/lite/test/resource/lite/input_data.npy
```
## Basic usage
* **Implemented in basic.cpp, including basic_load_from_path and
basic_load_from_memory.**
* These examples use lite to run basic inference. The model is loaded with the default configuration, the input data is copied into the input tensor before forward, and after forward the result is copied from the output tensor into the user's memory. Both the input tensor and the output tensor are obtained from the Network by name, and the layout of the input and output tensors can also be read directly from the corresponding tensor. **The layout of the output tensor is only correct when read after forward has finished.** A minimal sketch of this flow is given below.
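A minimal sketch of this basic flow, assuming a MegEngine-dumped model with a single input and a single output; the function name and parameters are illustrative and the calls follow basic.cpp:
```cpp
#include <cstdio>
#include <cstring>
#include <memory>
#include <string>
#include "lite/network.h"
#include "lite/tensor.h"

// Basic inference: load a model, copy the input data in, run forward, read the output.
bool run_basic(const std::string& model_path, const void* input, size_t input_bytes) {
    auto network = std::make_shared<lite::Network>();
    network->load_model(model_path);

    // Copy user data into the input tensor obtained from the network.
    auto input_tensor = network->get_input_tensor(0);
    std::memcpy(input_tensor->get_memory_ptr(), input, input_bytes);

    network->forward();
    network->wait();

    // Read the output; its layout is only valid after forward has finished.
    auto output_tensor = network->get_output_tensor(0);
    const float* out = static_cast<const float*>(output_tensor->get_memory_ptr());
    size_t out_elems = output_tensor->get_tensor_total_size_in_byte() /
                       output_tensor->get_layout().get_elem_size();
    printf("first output value: %f (%zu elements)\n", out[0], out_elems);
    return true;
}
```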
## User-specified memory for input and output
* **Implemented in reset_io.cpp, including two examples: reset_input and
reset_input_output.**
* These examples show that the input tensor's memory can be memory specified by the user (with the input data already stored in it), and the output tensor can also be user-specified memory, so that after the Network finishes forward the result is written into the specified output memory. This avoids unnecessary memory copies.
* This is done mainly through the tensor's reset interface, which re-points the tensor to new memory and a corresponding layout; if no layout is given, the tensor's original layout is kept.
* **Because the memory is allocated by the user in this approach, the user must know the layout of the input and output tensors in advance and allocate memory according to it. In addition, the lifetime of memory set into a tensor via reset is not managed by the tensor; it is managed by the external user.** A sketch is given below.
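A minimal sketch of redirecting the input and output tensors to user-owned memory with reset, following reset_io.cpp; the 1x1000 float output layout is taken from the shufflenet example and the function signature is illustrative:
```cpp
#include <memory>
#include <string>
#include <vector>
#include "lite/network.h"
#include "lite/tensor.h"

// Run inference reading from and writing into user-owned buffers, avoiding copies.
bool run_with_user_buffers(const std::string& model_path, void* input_data,
                           std::vector<float>& output_buffer) {
    auto network = std::make_shared<lite::Network>();
    network->load_model(model_path);

    // Point the input tensor at the user's buffer; the tensor's own layout is kept.
    auto input_tensor = network->get_input_tensor(0);
    input_tensor->reset(input_data, input_tensor->get_layout());

    // Let the network write the result directly into caller-owned memory.
    output_buffer.resize(1000);
    auto output_tensor = network->get_output_tensor(0);
    output_tensor->reset(output_buffer.data(),
                         lite::Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT});

    network->forward();
    network->wait();
    // output_buffer now holds the result; its lifetime is managed by the caller, not the tensor.
    return true;
}
```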
## Device memory for input and output
* **Implemented in device_io.cpp: the device_input and device_input_output examples.**
* These examples configure the model to run on a device (CUDA) and use device memory allocated in advance by the user as the model's input and output. Whether the input and output live on the device must be specified when the Network is constructed; by default they are on the CPU. Everything else is the same as **user-specified memory for input and output** above.
* The tensor's is_host() interface tells whether a tensor is on the device side or the host side. A sketch is given below.
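A minimal sketch of feeding the network from device memory, following the device_input example; the input name "data" matches the shufflenet model used by the examples and the function signature is illustrative:
```cpp
#include <memory>
#include <string>
#include "lite/network.h"
#include "lite/tensor.h"

// Run a CUDA network whose input tensor is backed by device memory.
bool run_with_device_input(const std::string& model_path,
                           const std::shared_ptr<lite::Tensor>& host_input) {
    // Run the model on CUDA and declare the input "data" as device-side IO.
    lite::Config config{LiteDeviceType::LITE_CUDA};
    lite::NetworkIO network_io;
    bool is_host = false;
    lite::IO device_input{"data", is_host};
    network_io.inputs.push_back(device_input);

    auto network = std::make_shared<lite::Network>(config, network_io);
    network->load_model(model_path);

    // Allocate device memory, copy the host data into it, and hand it to the input tensor.
    auto input_tensor = network->get_input_tensor(0);
    lite::Layout input_layout = input_tensor->get_layout();
    auto tensor_device = lite::Tensor(LiteDeviceType::LITE_CUDA, input_layout);
    tensor_device.copy_from(*host_input);
    input_tensor->reset(tensor_device.get_memory_ptr(), input_layout);

    network->forward();
    network->wait();
    return true;
}
```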
## Pinned host memory as input
* **Implemented in device_io.cpp, function name pinned_host_input.**
* In this example the model runs on a device (CUDA) while the input and output are on the CPU. To speed up the host2device copy, the memory of the input tensor on the CPU is allocated in advance as cuda pinned memory. Currently, when the output tensor is not on the device, it is pinned host memory by default.
* Pinned host memory is allocated by specifying the device, the layout and the is_pinned_host parameter when constructing the tensor; memory allocated this way is pinned host memory.
```C
bool is_pinned_host = true;
auto tensor_pinned_input =
Tensor(LiteDeviceType::LITE_CUDA, input_layout, is_pinned_host);
```
## User-specified memory allocator
* **Implemented in user_allocator.cpp, function name config_user_allocator.**
* This example uses a user-defined CPU memory allocator to show how to set a custom Allocator. A user-defined allocator must inherit from the Allocator base class in lite and implement the allocate and free interfaces. This has currently been verified to be correct on CPU; other devices remain to be tested. A sketch of such an allocator follows the interface below.
* The interface for setting a custom memory allocator is the following Network interface:
```C
Network& set_memory_allocator(std::shared_ptr<Allocator> user_allocator);
```
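A minimal sketch of a custom allocator, modeled on the CheckAllocator in user_allocator.cpp; only the POSIX allocation path is shown and the includes follow what example.h pulls in:
```cpp
#include <cstdio>
#include <cstdlib>
#include <memory>
#include "lite/network.h"
#include "lite/tensor.h"

// An aligned CPU allocator that lite will call for tensor allocations.
class AlignedAllocator : public lite::Allocator {
public:
    void* allocate(LiteDeviceType, int, size_t size, size_t align) override {
        void* ptr = nullptr;
        if (posix_memalign(&ptr, align, size) != 0) {
            printf("failed to allocate %zu bytes with align %zu\n", size, align);
            return nullptr;
        }
        return ptr;
    }
    void free(LiteDeviceType, int, void* ptr) override { ::free(ptr); }
};

// Usage (as in config_user_allocator): install the allocator before loading the model.
// auto network = std::make_shared<lite::Network>();
// lite::Runtime::set_memory_allocator(network, std::make_shared<AlignedAllocator>());
// network->load_model(model_path);
```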
## Multiple Networks sharing the same weights
* **Implemented in network_share_weights.cpp, function name network_share_same_weights.**
* In many cases users want several Networks to share the same weights. Since the weights in a model are read-only, this reduces the runtime memory usage of the model. This example shows how lite implements it: a new Network is created first, and the user can specify a new Config and NetworkIO and other options so that the newly created Network serves a different purpose.
* The interface for loading a new Network from an existing Network is the following Network interface:
```C
static void shared_weight_with_network(
std::shared_ptr<Network> dst_network,
const std::shared_ptr<Network> src_network);
```
* dst_network: the newly loaded Network
* src_network: the old Network that has already been loaded. A usage sketch is given below.
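A minimal sketch of creating a second network that shares the weights of an already-loaded one, following network_share_weights.cpp; the const_shape option is taken from that example and the function name is illustrative:
```cpp
#include <memory>
#include "lite/network.h"

// Build a new network that reuses the read-only weights of loaded_network.
std::shared_ptr<lite::Network> make_weight_sharing_network(
        const std::shared_ptr<lite::Network>& loaded_network) {
    lite::Config config_new;
    config_new.options.const_shape = true;
    lite::NetworkIO network_io_new;
    auto shared_network =
            std::make_shared<lite::Network>(config_new, network_io_new);
    // Share the weights instead of loading the model file again.
    lite::Runtime::shared_weight_with_network(shared_network, loaded_network);
    return shared_network;
}
```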
## CPU core binding
* **Implemented in cpu_affinity.cpp, function name cpu_affinity.**
* This example configures the model to run on multiple CPU threads and then uses the Network's set_runtime_thread_affinity to set a core-binding callback. The callback receives the id of the current thread, and the user decides the concrete binding behavior based on that id; with multiple threads, if the total number of threads is n, the thread with id n-1 is the main thread. A sketch is given below.
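A minimal sketch of binding worker threads to cores, following cpu_affinity.cpp; the thread count of 4 and the core ids are assumptions, and set_cpu_affinity is the helper defined in the examples' example.h:
```cpp
#include <memory>
#include <string>
#include <vector>
#include "../example.h"  // for lite::example::set_cpu_affinity

// Load a model that runs on 4 CPU threads and pin thread i to core_ids[i].
std::shared_ptr<lite::Network> load_with_affinity(const std::string& model_path) {
    auto network = std::make_shared<lite::Network>();
    // As in the cpu_affinity example, the thread number is set before loading.
    lite::Runtime::set_cpu_threads_number(network, 4);
    network->load_model(model_path);

    std::vector<int> core_ids = {0, 1, 2, 3};
    auto affinity = [core_ids](int thread_id) {
        // Called on each worker thread; with n threads, id n-1 is the main thread.
        lite::example::set_cpu_affinity({core_ids[thread_id]});
    };
    lite::Runtime::set_runtime_thread_affinity(network, affinity);
    return network;
}
```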
## Registering a user-defined decryption algorithm and key
* **Implemented in user_cryption.cpp, function names register_cryption_method and update_cryption_key.**
* These two examples use lite's interfaces for custom decryption algorithms and for updating them, loading a model with a user-defined decryption algorithm. A custom decryption method is defined (it does not actually do anything: it XORs the model with the key twice and returns it, which is the original model unchanged) and registered with lite; when the Network is later created, the name of the decryption algorithm is specified in bare_model_cryption_name of its config. The second example shows how to update the key of an algorithm.
lite currently predefines several decryption algorithms:
* AES_default: its key consists of 32 unsigned chars, 0 to 31 by default
* RC4_default: its key consists of 8 unsigned chars made up of a hash key and an enc_key, with the hash key first and the enc_key after it.
* SIMPLE_FAST_RC4_default: its key has the same composition as RC4_default.
The rough naming rule is: the upper-case part at the front is the name of the algorithm, and the lower-case part after '_' denotes the decryption key.
The concrete interfaces are:
```C
bool register_decryption_and_key(std::string decrypt_name,
const DecryptionFunc& func,
const std::vector<uint8_t>& key);
bool update_decryption_or_key(std::string decrypt_name,
const DecryptionFunc& func,
const std::vector<uint8_t>& key);
```
In the register interface all three arguments must be valid values; in update, decrypt_name must be an existing decryption algorithm, and the non-empty parts of func and key are used to update that algorithm. A usage sketch is given below.
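A minimal sketch of registering a custom decryption function and loading a model with it, following user_cryption.cpp; the algorithm name "just_for_test" and the single-byte key {15} are taken from that example:
```cpp
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
#include "lite/global.h"
#include "lite/network.h"

// A do-nothing "decryption": XORing with the key twice returns the model unchanged.
std::vector<uint8_t> decrypt_model(const void* model_mem, size_t size,
                                   const std::vector<uint8_t>& key) {
    const uint8_t* ptr = static_cast<const uint8_t*>(model_mem);
    std::vector<uint8_t> ret(size, 0);
    for (size_t i = 0; i < size; i++) {
        ret[i] = ptr[i] ^ key[0] ^ key[0];
    }
    return ret;
}

bool load_with_custom_cryption(const std::string& model_path) {
    // Register the algorithm under a name, then reference that name in Config.
    lite::register_decryption_and_key("just_for_test", decrypt_model, {15});
    lite::Config config;
    config.bare_model_cryption_name = "just_for_test";
    auto network = std::make_shared<lite::Network>(config);
    network->load_model(model_path);
    return true;
}
```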
## Asynchronous execution mode
* **Implemented in basic.cpp, function name async_forward.**
* By registering an asynchronous callback through the interface, the user switches the Network's forward into asynchronous execution mode. Asynchronous execution is currently only supported on CPU and on CUDA 10.0 and above. In asynchronous inference mode, the main thread can do other work while the worker thread is computing, avoiding long waits; on some single-core processors there is no benefit. A sketch is given below.
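A minimal sketch of running forward asynchronously, following the async_forward example; the busy-wait on a volatile flag mirrors that example and only illustrates the callback (the example also sets options.var_sanity_check_first_run = false in its config before loading):
```cpp
#include <cstdio>
#include <memory>
#include "lite/network.h"

// Run forward asynchronously and poll a flag that the completion callback sets.
bool run_async(const std::shared_ptr<lite::Network>& network) {
    volatile bool finished = false;
    network->set_async_callback([&finished]() {
        // Invoked from the worker thread once forward has completed.
        finished = true;
    });

    network->forward();  // returns immediately in asynchronous mode

    size_t spins = 0;
    while (!finished) {
        spins++;  // the main thread could do useful work here instead of spinning
    }
    printf("forward finished after %zu polls\n", spins);
    return true;
}
```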
## Pure C example
* **Implemented in lite_c_interface.cpp, function names basic_c_interface,
device_io_c_interface and async_c_interface.**
* Lite wraps its C++ interface and exposes a pure C interface. Users who do not depend on Lite at the source level should integrate through the pure C interface.
* All pure C interfaces return an int; if this int is non-zero, an error has occurred and LITE_get_last_error should be called to get the error message.
* All pure C get functions require the caller to define the corresponding object first and pass a pointer to it into the interface; Lite writes the result to the address pointed to. A sketch is given below.
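A minimal sketch of the C API call pattern and its error convention, following lite_c_interface.cpp; the error-checking macro is adapted from that example (printing instead of throwing) and the input name "data" matches the shufflenet model:
```cpp
#include <cstdio>
#include <cstring>
#include "lite-c/global_c.h"
#include "lite-c/network_c.h"
#include "lite-c/tensor_c.h"

// Every C API call returns an int; non-zero means an error whose message can be
// read with LITE_get_last_error().
#define LITE_CAPI_CHECK(_expr)                                 \
    do {                                                       \
        int _ret = (_expr);                                    \
        if (_ret) {                                            \
            printf("lite error: %s\n", LITE_get_last_error()); \
            return false;                                      \
        }                                                      \
    } while (0)

bool run_c_interface(const char* model_path, const void* input, size_t input_bytes) {
    LiteNetwork c_network;
    LITE_CAPI_CHECK(LITE_make_network(&c_network, *default_config(),
                                      *default_network_io()));
    LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, model_path));

    // Get functions write their result through an out-pointer supplied by the caller.
    LiteTensor c_input_tensor;
    LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, "data", LITE_IO, &c_input_tensor));
    void* dst_ptr;
    LITE_CAPI_CHECK(LITE_get_tensor_memory(c_input_tensor, &dst_ptr));
    memcpy(dst_ptr, input, input_bytes);

    LITE_CAPI_CHECK(LITE_forward(c_network));
    LITE_CAPI_CHECK(LITE_wait(c_network));
    return true;
}
```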
/**
* \file example/basic.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include <thread>
#include "../example.h"
#if LITE_BUILD_WITH_MGE
#include <cstdio>
#include "misc.h"
using namespace lite;
using namespace example;
namespace {
void output_info(std::shared_ptr<Network> network, size_t output_size) {
for (size_t index = 0; index < output_size; index++) {
printf("output[%zu] names %s \n", index,
network->get_all_output_name()[index].c_str());
std::shared_ptr<Tensor> output_tensor =
network->get_output_tensor(index);
size_t ndim = output_tensor->get_layout().ndim;
for (size_t i = 0; i < ndim; i++) {
printf("output[%zu] tensor.shape[%zu] %zu \n", index, i,
output_tensor->get_layout().shapes[i]);
}
}
}
void output_data_info(std::shared_ptr<Network> network, size_t output_size) {
for (size_t index = 0; index < output_size; index++) {
auto output_tensor = network->get_output_tensor(index);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
LiteDataType dtype = output_tensor->get_layout().data_type;
float max = -1000.0f;
float min = 1000.0f;
int max_idx = 0;
int min_idx = 0;
float sum = 0.0f;
#define cb(_dtype, _real_dtype) \
case LiteDataType::_dtype: { \
for (size_t i = 0; i < out_length; i++) { \
_real_dtype data = static_cast<_real_dtype*>(out_data)[i]; \
sum += data; \
if (max < data) { \
max = data; \
max_idx = i; \
} \
if (min > data) { \
min = data; \
min_idx = i; \
} \
} \
} break;
switch (dtype) {
cb(LITE_FLOAT, float);
cb(LITE_INT, int);
cb(LITE_INT8, int8_t);
cb(LITE_UINT8, uint8_t);
default:
printf("unknow datatype");
}
printf("output_length %zu index %zu max=%e , max idx=%d, min=%e , min_idx=%d, sum=%e\n",
out_length, index, max, max_idx, min, min_idx, sum);
}
#undef cb
}
} // namespace
#if LITE_WITH_CUDA
bool lite::example::load_from_path_run_cuda(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
set_log_level(LiteLogLevel::DEBUG);
//! config the network running in CUDA device
lite::Config config{false, -1, LiteDeviceType::LITE_CUDA};
//! set NetworkIO
NetworkIO network_io;
std::string input_name = "img0_comp_fullface";
bool is_host = false;
IO device_input{input_name, is_host};
network_io.inputs.push_back(device_input);
//! create and load the network
std::shared_ptr<Network> network =
std::make_shared<Network>(config, network_io);
network->load_model(network_path);
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
Layout input_layout = input_tensor->get_layout();
//! read data from numpy data file
auto src_tensor = parse_npy(input_path);
//! malloc the device memory
auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);
//! copy to the device memory
tensor_device.copy_from(*src_tensor);
//! Now the device memory is filled with user input data, set it to the
//! input tensor
input_tensor->reset(tensor_device.get_memory_ptr(), input_layout);
//! forward
{
lite::Timer ltimer("warmup");
network->forward();
network->wait();
ltimer.print_used_time(0);
}
lite::Timer ltimer("forward_iter");
for (int i = 0; i < 10; i++) {
ltimer.reset_start();
network->forward();
network->wait();
ltimer.print_used_time(i);
}
//! get the output data or read tensor set in network_in
size_t output_size = network->get_all_output_name().size();
output_info(network, output_size);
output_data_info(network, output_size);
return true;
}
#endif
bool lite::example::basic_load_from_path(const Args& args) {
set_log_level(LiteLogLevel::DEBUG);
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>();
network->load_model(network_path);
//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
auto layout = input_tensor->get_layout();
for (size_t i = 0; i < layout.ndim; i++) {
printf("model input shape[%zu]=%zu \n", i, layout.shapes[i]);
}
//! copy or forward data to network
size_t length = input_tensor->get_tensor_total_size_in_byte();
void* dst_ptr = input_tensor->get_memory_ptr();
auto src_tensor = parse_npy(input_path);
auto layout0 = src_tensor->get_layout();
for (size_t i = 0; i < layout0.ndim; i++) {
printf("src shape[%zu]=%zu \n", i, layout0.shapes[i]);
}
void* src = src_tensor->get_memory_ptr();
memcpy(dst_ptr, src, length);
//! forward
{
lite::Timer ltimer("warmup");
network->forward();
network->wait();
ltimer.print_used_time(0);
}
lite::Timer ltimer("forward_iter");
for (int i = 0; i < 10; i++) {
network->forward();
network->wait();
ltimer.print_used_time(i);
}
//! forward
{
lite::Timer ltimer("warmup");
network->forward();
network->wait();
ltimer.print_used_time(0);
}
for (int i = 0; i < 10; i++) {
ltimer.reset_start();
network->forward();
network->wait();
ltimer.print_used_time(i);
}
//! get the output data or read tensor set in network_in
size_t output_size = network->get_all_output_name().size();
output_info(network, output_size);
output_data_info(network, output_size);
return true;
}
bool lite::example::basic_load_from_path_with_loader(const Args& args) {
set_log_level(LiteLogLevel::DEBUG);
lite::set_loader_lib_path(args.loader_path);
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>();
network->load_model(network_path);
//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
auto input_layout = input_tensor->get_layout();
//! copy or forward data to network
auto src_tensor = parse_npy(input_path);
auto src_layout = src_tensor->get_layout();
if (src_layout.ndim != input_layout.ndim) {
printf("src dim is not equal model input dim\n");
}
//! pay attention the input shape can change
for (size_t i = 0; i < input_layout.ndim; i++) {
if (input_layout.shapes[i] != src_layout.shapes[i]) {
printf("src shape not equal input shape");
}
}
input_tensor->set_layout(src_tensor->get_layout());
//! reset or forward data to network
input_tensor->reset(src_tensor->get_memory_ptr(), src_tensor->get_layout());
//! forward
network->forward();
network->wait();
//! forward
{
lite::Timer ltimer("warmup");
network->forward();
network->wait();
ltimer.print_used_time(0);
}
lite::Timer ltimer("forward_iter");
for (int i = 0; i < 10; i++) {
ltimer.reset_start();
network->forward();
network->wait();
ltimer.print_used_time(i);
}
//! get the output data or read tensor set in network_in
size_t output_size = network->get_all_output_name().size();
output_info(network, output_size);
output_data_info(network, output_size);
return true;
}
bool lite::example::basic_load_from_memory(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>();
FILE* fin = fopen(network_path.c_str(), "rb");
if (!fin) {
printf("failed to open %s.", network_path.c_str());
}
fseek(fin, 0, SEEK_END);
size_t size = ftell(fin);
fseek(fin, 0, SEEK_SET);
void* ptr = malloc(size);
std::shared_ptr<void> buf{ptr, ::free};
auto len = fread(buf.get(), 1, size, fin);
if (len < 1) {
printf("read file failed.\n");
}
fclose(fin);
network->load_model(buf.get(), size);
//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
//! copy or forward data to network
size_t length = input_tensor->get_tensor_total_size_in_byte();
void* dst_ptr = input_tensor->get_memory_ptr();
auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
memcpy(dst_ptr, src, length);
//! forward
network->forward();
network->wait();
//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
printf("length=%zu\n", length);
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
bool lite::example::async_forward(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
Config config;
config.options.var_sanity_check_first_run = false;
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>(config);
network->load_model(network_path);
//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
//! copy or forward data to network
size_t length = input_tensor->get_tensor_total_size_in_byte();
void* dst_ptr = input_tensor->get_memory_ptr();
auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
memcpy(dst_ptr, src, length);
//! set async mode and callback
volatile bool finished = false;
network->set_async_callback([&finished]() {
#if !__DEPLOY_ON_XP_SP2__
std::cout << "worker thread_id:" << std::this_thread::get_id()
<< std::endl;
#endif
finished = true;
});
#if !__DEPLOY_ON_XP_SP2__
std::cout << "out thread_id:" << std::this_thread::get_id() << std::endl;
#endif
//! forward
network->forward();
size_t count = 0;
while (finished == false) {
count++;
}
printf("Forward finish, count is %zu\n", count);
//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
printf("length=%zu\n", length);
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file example/cpu_affinity.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include "../example.h"
#if LITE_BUILD_WITH_MGE
using namespace lite;
using namespace example;
bool lite::example::cpu_affinity(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>();
//! run with multi theads
Runtime::set_cpu_threads_number(network, 4);
network->load_model(network_path);
std::vector<int> core_ids = {0, 1, 2, 3};
auto affinity = [core_ids](int id) {
//! add user define affinity function
set_cpu_affinity({core_ids[id]});
printf("set thread id = %d with the affinity of core %d.\n", id,
core_ids[id]);
};
Runtime::set_runtime_thread_affinity(network, affinity);
//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
//! copy or forward data to network
size_t length = input_tensor->get_tensor_total_size_in_byte();
void* dst_ptr = input_tensor->get_memory_ptr();
auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
memcpy(dst_ptr, src, length);
//! forward
network->forward();
network->wait();
//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
printf("length=%zu\n", length);
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file example/device_io.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include <thread>
#include "../example.h"
#if LITE_BUILD_WITH_MGE
using namespace lite;
using namespace example;
#if LITE_WITH_CUDA
bool lite::example::device_input(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! config the network running in CUDA device
lite::Config config{LiteDeviceType::LITE_CUDA};
//! set NetworkIO
NetworkIO network_io;
std::string input_name = "data";
bool is_host = false;
IO device_input{input_name, is_host};
network_io.inputs.push_back(device_input);
//! create and load the network
std::shared_ptr<Network> network =
std::make_shared<Network>(config, network_io);
network->load_model(network_path);
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
Layout input_layout = input_tensor->get_layout();
//! read data from numpy data file
auto src_tensor = parse_npy(input_path);
//! malloc the device memory
auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);
//! copy to the device memory
tensor_device.copy_from(*src_tensor);
//! Now the device memory if filled with user input data, set it to the
//! input tensor
input_tensor->reset(tensor_device.get_memory_ptr(), input_layout);
//! forward
network->forward();
network->wait();
//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
bool lite::example::device_input_output(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! config the network running in CUDA device
lite::Config config{LiteDeviceType::LITE_CUDA};
//! set NetworkIO include input and output
NetworkIO network_io;
std::string input_name = "data";
std::string output_name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]";
bool is_host = false;
IO device_input{input_name, is_host};
IO device_output{output_name, is_host};
network_io.inputs.push_back(device_input);
network_io.outputs.push_back(device_output);
//! create and load the network
std::shared_ptr<Network> network =
std::make_shared<Network>(config, network_io);
network->load_model(network_path);
std::shared_ptr<Tensor> input_tensor_device = network->get_input_tensor(0);
Layout input_layout = input_tensor_device->get_layout();
//! read data from numpy data file
auto src_tensor = parse_npy(input_path);
//! malloc the device memory
auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);
//! copy to the device memory
tensor_device.copy_from(*src_tensor);
//! Now the device memory is filled with user input data, set it to the
//! input tensor
input_tensor_device->reset(tensor_device.get_memory_ptr(), input_layout);
//! forward
network->forward();
network->wait();
//! output is in device, should copy it to host
std::shared_ptr<Tensor> output_tensor_device =
network->get_io_tensor(output_name);
auto output_tensor = std::make_shared<Tensor>();
output_tensor->copy_from(*output_tensor_device);
//! get the output data or read tensor set in network_in
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
bool lite::example::pinned_host_input(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! config the network running in CUDA device
lite::Config config{LiteDeviceType::LITE_CUDA};
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>(config);
network->load_model(network_path);
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
Layout input_layout = input_tensor->get_layout();
//! read data from numpy data file
auto src_tensor = parse_npy(input_path);
//! malloc the pinned host memory
bool is_pinned_host = true;
auto tensor_pinned_input =
Tensor(LiteDeviceType::LITE_CUDA, input_layout, is_pinned_host);
//! copy to the pinned memory
tensor_pinned_input.copy_from(*src_tensor);
//! set the pinned host memory to the network as input
input_tensor->reset(tensor_pinned_input.get_memory_ptr(), input_layout);
//! forward
network->forward();
network->wait();
//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
#endif
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file example/basic_c_interface.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include "../example.h"
#include "misc.h"
#if LITE_BUILD_WITH_MGE
#include "lite-c/global_c.h"
#include "lite-c/network_c.h"
#include "lite-c/tensor_c.h"
#include <thread>
#define LITE_CAPI_CHECK(_expr) \
do { \
int _ret = (_expr); \
if (_ret) { \
LITE_THROW(LITE_get_last_error()); \
} \
} while (0)
bool basic_c_interface(const lite::example::Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! read input data to lite::tensor
auto src_tensor = lite::example::parse_npy(input_path);
void* src_ptr = src_tensor->get_memory_ptr();
//! create and load the network
LiteNetwork c_network;
LITE_CAPI_CHECK(
LITE_make_network(&c_network, *default_config(), *default_network_io()));
LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, network_path.c_str()));
//! set input data to input tensor
LiteTensor c_input_tensor;
LITE_CAPI_CHECK(
LITE_get_io_tensor(c_network, "data", LITE_IO, &c_input_tensor));
void* dst_ptr;
size_t length_in_byte;
LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_input_tensor,
&length_in_byte));
LITE_CAPI_CHECK(LITE_get_tensor_memory(c_input_tensor, &dst_ptr));
//! copy or forward data to network
memcpy(dst_ptr, src_ptr, length_in_byte);
//! forward
LITE_CAPI_CHECK(LITE_forward(c_network));
LITE_CAPI_CHECK(LITE_wait(c_network));
//! get the output data or read tensor data
const char* output_name;
LiteTensor c_output_tensor;
//! get the first output tensor name
LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name));
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO,
&c_output_tensor));
void* output_ptr;
size_t length_output_in_byte;
LITE_CAPI_CHECK(LITE_get_tensor_memory(c_output_tensor, &output_ptr));
LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_output_tensor,
&length_output_in_byte));
size_t out_length = length_output_in_byte / sizeof(float);
printf("length=%zu\n", out_length);
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(output_ptr)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
bool device_io_c_interface(const lite::example::Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! read input data to lite::tensor
auto src_tensor = lite::example::parse_npy(input_path);
void* src_ptr = src_tensor->get_memory_ptr();
size_t length_read_in = src_tensor->get_tensor_total_size_in_byte();
//! create and load the network
LiteNetwork c_network;
LITE_CAPI_CHECK(
LITE_make_network(&c_network, *default_config(), *default_network_io()));
LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, network_path.c_str()));
//! set input data to input tensor
LiteTensor c_input_tensor;
size_t length_tensor_in;
LITE_CAPI_CHECK(
LITE_get_io_tensor(c_network, "data", LITE_IO, &c_input_tensor));
LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_input_tensor,
&length_tensor_in));
if (length_read_in != length_tensor_in) {
LITE_THROW("The input data size is not match the network input tensro "
"size,\n");
}
LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, src_ptr,
length_tensor_in));
//! reset the output tensor memory with user allocated memory
size_t out_length = 1000;
LiteLayout output_layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT};
std::shared_ptr<float> ptr(new float[out_length],
[](float* ptr) { delete[] ptr; });
const char* output_name;
LiteTensor c_output_tensor;
LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name));
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO,
&c_output_tensor));
LITE_CAPI_CHECK(
LITE_reset_tensor(c_output_tensor, output_layout, ptr.get()));
//! forward
LITE_CAPI_CHECK(LITE_forward(c_network));
LITE_CAPI_CHECK(LITE_wait(c_network));
printf("length=%zu\n", out_length);
float max = -1.0f;
float sum = 0.0f;
void* out_data = ptr.get();
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
namespace {
volatile bool finished = false;
int async_callback(void) {
#if !__DEPLOY_ON_XP_SP2__
std::cout << "worker thread_id:" << std::this_thread::get_id() << std::endl;
#endif
finished = true;
return 0;
}
} // namespace
bool async_c_interface(const lite::example::Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! read input data to lite::tensor
auto src_tensor = lite::example::parse_npy(input_path);
void* src_ptr = src_tensor->get_memory_ptr();
LiteNetwork c_network;
LiteConfig config = *default_config();
config.options.var_sanity_check_first_run = false;
LITE_CAPI_CHECK(LITE_make_network(&c_network, config, *default_network_io()));
LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, network_path.c_str()));
//! set input data to input tensor
LiteTensor c_input_tensor;
size_t length_tensor_in;
LITE_CAPI_CHECK(
LITE_get_io_tensor(c_network, "data", LITE_IO, &c_input_tensor));
LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_input_tensor,
&length_tensor_in));
LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, src_ptr,
length_tensor_in));
#if !__DEPLOY_ON_XP_SP2__
std::cout << "user thread_id:" << std::this_thread::get_id() << std::endl;
#endif
LITE_CAPI_CHECK(LITE_set_async_callback(c_network, async_callback));
//! forward
LITE_CAPI_CHECK(LITE_forward(c_network));
size_t count = 0;
while (finished == false) {
count++;
}
printf("The count is %zu\n", count);
finished = false;
//! get the output data or read tensor data
const char* output_name;
LiteTensor c_output_tensor;
//! get the first output tensor name
LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name));
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO,
&c_output_tensor));
void* output_ptr;
size_t length_output_in_byte;
LITE_CAPI_CHECK(LITE_get_tensor_memory(c_output_tensor, &output_ptr));
LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_output_tensor,
&length_output_in_byte));
size_t out_length = length_output_in_byte / sizeof(float);
printf("length=%zu\n", out_length);
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(output_ptr)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file example/network_share_weights.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include "../example.h"
#if LITE_BUILD_WITH_MGE
using namespace lite;
using namespace example;
bool lite::example::network_share_same_weights(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>();
network->load_model(network_path);
//! load a new network from the created network and share the same weights,
Config config_new;
config_new.options.const_shape = true;
NetworkIO network_io_new;
std::shared_ptr<Network> weight_shared_network =
std::make_shared<Network>(config_new, network_io_new);
Runtime::shared_weight_with_network(weight_shared_network, network);
//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
void* dst_ptr = input_tensor->get_memory_ptr();
std::shared_ptr<Tensor> input_tensor2 =
weight_shared_network->get_input_tensor(0);
void* dst_ptr2 = input_tensor2->get_memory_ptr();
//! copy or forward data to network
size_t length = input_tensor->get_tensor_total_size_in_byte();
auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
memcpy(dst_ptr, src, length);
memcpy(dst_ptr2, src, length);
//! forward
network->forward();
network->wait();
weight_shared_network->forward();
weight_shared_network->wait();
//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
std::shared_ptr<Tensor> output_tensor2 =
weight_shared_network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
void* out_data2 = output_tensor2->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
printf("length=%zu\n", length);
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
float data2 = static_cast<float*>(out_data2)[i];
if (data != data2) {
printf("the result between the origin network and weight share "
"netwrok is different.\n");
}
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file example/reset_io.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include "../example.h"
#if LITE_BUILD_WITH_MGE
using namespace lite;
using namespace example;
bool lite::example::reset_input(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
lite::Config config;
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>(config);
network->load_model(network_path);
//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
auto layout = input_tensor->get_layout();
auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
input_tensor->reset(src, layout);
//! forward
network->forward();
network->wait();
//! 6. get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
bool lite::example::reset_input_output(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
lite::Config config;
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>(config);
network->load_model(network_path);
//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
auto layout = input_tensor->get_layout();
auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
input_tensor->reset(src, layout);
//! set output ptr to store the network output
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
auto result_tensor = std::make_shared<Tensor>(
LiteDeviceType::LITE_CPU,
Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT});
void* out_data = result_tensor->get_memory_ptr();
output_tensor->reset(out_data, result_tensor->get_layout());
network->forward();
network->wait();
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < 1000; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file example/user_allocator.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include "../example.h"
#if LITE_BUILD_WITH_MGE
using namespace lite;
using namespace example;
namespace {
class CheckAllocator : public lite::Allocator {
public:
//! allocate memory of size in the given device with the given align
void* allocate(LiteDeviceType, int, size_t size, size_t align) override {
#ifdef WIN32
return _aligned_malloc(size, align);
#elif defined(__ANDROID__) || defined(ANDROID)
return memalign(align, size);
#else
void* ptr = nullptr;
auto err = posix_memalign(&ptr, align, size);
if (err) {
printf("failed to malloc %zu bytes with align %zu\n", size, align);
}
return ptr;
#endif
};
//! free the memory pointed by ptr in the given device
void free(LiteDeviceType, int, void* ptr) override {
#ifdef WIN32
_aligned_free(ptr);
#else
::free(ptr);
#endif
};
};
} // namespace
bool lite::example::config_user_allocator(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
auto allocator = std::make_shared<CheckAllocator>();
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>();
Runtime::set_memory_allocator(network, allocator);
network->load_model(network_path);
//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
//! copy or forward data to network
size_t length = input_tensor->get_tensor_total_size_in_byte();
void* dst_ptr = input_tensor->get_memory_ptr();
auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
memcpy(dst_ptr, src, length);
//! forward
network->forward();
network->wait();
//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
printf("length=%zu\n", length);
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file example/user_cryption.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include "../example.h"
#if LITE_BUILD_WITH_MGE
using namespace lite;
using namespace example;
namespace {
std::vector<uint8_t> decrypt_model(const void* model_mem, size_t size,
const std::vector<uint8_t>& key) {
if (key.size() == 1) {
std::vector<uint8_t> ret(size, 0);
const uint8_t* ptr = static_cast<const uint8_t*>(model_mem);
uint8_t key_data = key[0];
        for (size_t i = 0; i < size; i++) {
            //! XOR with the same key twice is an identity transform, this fake
            //! decryption only demonstrates how a decryption method is used
            ret[i] = ptr[i] ^ key_data ^ key_data;
        }
return ret;
} else {
printf("the user define decrypt method key length is wrong.\n");
return {};
}
}
} // namespace
bool lite::example::register_cryption_method(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! register the decryption method
register_decryption_and_key("just_for_test", decrypt_model, {15});
lite::Config config;
config.bare_model_cryption_name = "just_for_test";
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>(config);
network->load_model(network_path);
//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
auto layout = input_tensor->get_layout();
auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
input_tensor->reset(src, layout);
//! forward
network->forward();
network->wait();
//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
bool lite::example::update_cryption_key(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
//! update the decryption method key
std::vector<uint8_t> key(32, 0);
for (size_t i = 0; i < 32; i++) {
key[i] = 31 - i;
}
update_decryption_or_key("AES_default", nullptr, key);
lite::Config config;
config.bare_model_cryption_name = "AES_default";
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>(config);
network->load_model(network_path);
//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
auto layout = input_tensor->get_layout();
auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
input_tensor->reset(src, layout);
//! forward
network->forward();
network->wait();
//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
 * \file include/lite/common_enum_c.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#ifndef LITE_COMMON_ENUM_C_H_
#define LITE_COMMON_ENUM_C_H_
/*!
* \brief The log level.
*/
typedef enum LiteLogLevel {
    DEBUG = 0, /*!< The lowest level and most verbose */
    INFO = 1,  /*!< Print information, warnings and errors */
    WARN = 2,  /*!< Print only warnings and errors */
    ERROR = 3, /*!< Print only errors */
} LiteLogLevel;
typedef enum LiteBackend {
LITE_DEFAULT = 0, //! default backend is mge
} LiteBackend;
typedef enum LiteDeviceType {
LITE_CPU = 0,
LITE_CUDA = 1,
LITE_ATLAS = 3,
LITE_NPU = 4,
    //! when the device information is set in the model, set LITE_DEVICE_DEFAULT
    //! in lite
LITE_DEVICE_DEFAULT = 5,
} LiteDeviceType;
typedef enum LiteDataType {
LITE_FLOAT = 0,
LITE_HALF = 1,
LITE_INT = 2,
LITE_INT16 = 3,
LITE_INT8 = 4,
LITE_UINT8 = 5,
LITE_UINT = 6,
LITE_UINT16 = 7,
LITE_INT64 = 8,
} LiteCDataType;
typedef enum LiteTensorPhase {
//! Tensor maybe input or output
LITE_IO = 0,
//! Tensor is input
LITE_INPUT = 1,
//! Tensor is output
LITE_OUTPUT = 2,
} LiteTensorPhase;
/*!
 * \brief the input and output type, include SHAPE and VALUE
 * sometimes the user only needs the shape of the output tensor
*/
typedef enum LiteIOType {
LITE_IO_VALUE = 0,
LITE_IO_SHAPE = 1,
} LiteIOType;
/*!
 * \brief operation algorithm selection strategy type, some operations have
 * multiple algorithms, different algorithms have different attributes;
 * according to the strategy, the best algorithm will be selected.
 *
 * Note: These strategies can be combined
 *
 * 1. LITE_ALGO_HEURISTIC | LITE_ALGO_PROFILE means: if the profile cache is not
 * valid, use heuristic instead
 *
 * 2. LITE_ALGO_HEURISTIC | LITE_ALGO_REPRODUCIBLE means: heuristically choose a
 * reproducible algorithm
 *
 * 3. LITE_ALGO_PROFILE | LITE_ALGO_REPRODUCIBLE means: profile the best
 * algorithm from the reproducible algorithms set
 *
 * 4. LITE_ALGO_PROFILE | LITE_ALGO_OPTIMIZED means: profile the best
 * algorithm from the optimized algorithms, thus profiling will run faster
 *
 * 5. LITE_ALGO_PROFILE | LITE_ALGO_OPTIMIZED | LITE_ALGO_REPRODUCIBLE means:
 * profile the best algorithm from the optimized and reproducible algorithms
*/
typedef enum LiteAlgoSelectStrategy {
LITE_ALGO_HEURISTIC = 1 << 0,
LITE_ALGO_PROFILE = 1 << 1,
LITE_ALGO_REPRODUCIBLE = 1 << 2,
LITE_ALGO_OPTIMIZED = 1 << 3,
} LiteAlgoSelectStrategy;
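/*!
 * A minimal sketch of combining the strategy flags (the consumer, e.g.
 * Runtime::set_network_algo_policy in the C++ API, is assumed here for
 * illustration):
 * \code
 *     LiteAlgoSelectStrategy strategy = (LiteAlgoSelectStrategy)(
 *             LITE_ALGO_PROFILE | LITE_ALGO_REPRODUCIBLE);
 *     //! profile only among the reproducible algorithms
 * \endcode
 */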
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
 * \file include/lite/global.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#pragma once
#include "macro.h"
#include "network.h"
#include <functional>
#include <memory>
#include <vector>
namespace lite {
/**
* \brief Model decryption function
*
 * \param[in] const void* is the encrypted model memory pointer
 * \param[in] size_t the size of the encrypted model memory in bytes
* \param[in] const std::vector<uint8_t>& the decryption key vector
*/
using DecryptionFunc = std::function<std::vector<uint8_t>(
const void*, size_t, const std::vector<uint8_t>&)>;
/**
* \brief register a custom decryption method and key to lite.
*
* \param[in] decrypt_name the name of the decryption, which will act as the
* hash key to find the decryption method.
*
* \param[in] func the decryption function, which will decrypt the model with
* the registered key, return a vector that contain the decrypted model.
*
* \param[in] key the decryption key of the method
*/
LITE_API bool register_decryption_and_key(std::string decrypt_name,
const DecryptionFunc& func,
const std::vector<uint8_t>& key);
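/*!
 * A minimal usage sketch; the method name "my_xor" and the single-byte key
 * are illustrative assumptions:
 * \code
 *     DecryptionFunc func = [](const void* mem, size_t size,
 *                              const std::vector<uint8_t>& key) {
 *         std::vector<uint8_t> ret(size);
 *         const uint8_t* ptr = static_cast<const uint8_t*>(mem);
 *         for (size_t i = 0; i < size; ++i)
 *             ret[i] = ptr[i] ^ key[0];
 *         return ret;
 *     };
 *     register_decryption_and_key("my_xor", func, {0x5a});
 * \endcode
 */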
/**
* \brief update decryption function or key of a custom decryption method.
*
* \param[in] decrypt_name the name of the decryption, which will act as the
* hash key to find the decryption method.
*
* \param[in] func the decryption function, which will decrypt the model with
* the registered key, return a vector that contain the decrypted model. if
* function is nullptr, it will not be updated.
*
* \param[in] key the decryption key of the method, if the size of key is zero,
* it will not be updated
*/
LITE_API bool update_decryption_or_key(std::string decrypt_name,
const DecryptionFunc& func,
const std::vector<uint8_t>& key);
/**
* \brief Model information parse function
*
* \param[in] const void* is the information memory
 * \param[in] size_t the size of the information memory
 * \param[in] const std::string the model name used to check whether the
 * information matches the model
 * \param[in] Config the model config, ParseInfoFunc can fill it with the
 * information in json, the config will influence Network loading later
 * \param[in] NetworkIO the model IO, ParseInfoFunc can fill it with the
 * information in json, the networkio will influence Network forwarding later
 * \param[in] std::unordered_map<std::string, LiteAny>& isolated_config_map, the
 * other config not included in config and networkIO, ParseInfoFunc can fill it
* with the information in json, now support:
* "device_id" : int, default 0
* "number_threads" : size_t, default 1
* "is_inplace_model" : bool, default false
* "use_tensorrt" : bool, default false
*/
using ParseInfoFunc = std::function<bool(
const void*, size_t, const std::string model_name, Config& config,
NetworkIO& network_io,
std::unordered_map<std::string, LiteAny>& isolated_config_map,
std::string& extra_info)>;
/**
* \brief register a custom parser function to lite.
*
* \param[in] info_type the name of the parser function, which will act as the
* hash key to find the parser method.
*
* \param[in] parse_func the parser function, which will parse the given
* information and modify the Network Config and IO.
*
*/
LITE_API bool register_parse_info_func(std::string info_type,
const ParseInfoFunc& parse_func);
/*! \brief Get version
*/
LITE_API void get_version(int& major, int& minor, int& patch);
/*! \brief Set the current log level.
* \param[in] level The new log level
*/
LITE_API void set_log_level(LiteLogLevel level);
/*! \brief Get the current log level.
* \return The current log level
*/
LITE_API LiteLogLevel get_log_level();
/*! \brief Get device count
* \param[in] device_type device type
* \return the device count
*/
LITE_API size_t get_device_count(LiteDeviceType device_type);
/*! \brief try to coalesce all free memory in megengine
*/
LITE_API void try_coalesce_all_free_memory();
/*!
* \brief Set the loader to the lite
 * \param loader_path is the file path of the loader library
*/
LITE_API void set_loader_lib_path(const std::string& loader_path);
/*!
* \brief Set the algo policy cache file for CPU/CUDA ...
 * \param cache_path is the file path which stores the cache
 * \param always_sync whether to sync the cache every time the model runs
*/
LITE_API void set_persistent_cache(const std::string& cache_path,
bool always_sync = false);
/*!
 * \brief dump the PersistentCache policy cache to file, if the network is set
 * to profile when forwarding, the algo policy will be dumped to this file
*/
LITE_API void dump_persistent_cache(const std::string& cache_path);
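/*!
 * A minimal sketch of the persistent cache workflow; the cache path is an
 * illustrative assumption:
 * \code
 *     set_persistent_cache("./algo_cache.bin");
 *     //! ... load the network, enable the profiling strategy and forward ...
 *     dump_persistent_cache("./algo_cache.bin");
 * \endcode
 */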
/*!
* \brief Set the TensorRT engine cache path for serialized prebuilt ICudaEngine
*/
LITE_API void set_tensor_rt_cache(std::string tensorrt_cache_path);
/*!
* \brief dump the TensorRT cache to the file set in set_tensor_rt_cache
*/
LITE_API void dump_tensor_rt_cache();
} // namespace lite
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file include/lite/macro.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#ifndef LITE_MACRO_H_
#define LITE_MACRO_H_
#if defined(_WIN32)
#define LITE_API __declspec(dllexport)
#else
#define LITE_API __attribute__((visibility("default")))
#endif
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
 * \file include/lite/network.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#pragma once
#include "macro.h"
#include "tensor.h"
#include <functional>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
namespace lite {
LITE_API inline LiteAlgoSelectStrategy operator|(LiteAlgoSelectStrategy x,
LiteAlgoSelectStrategy y) {
return static_cast<LiteAlgoSelectStrategy>(static_cast<uint32_t>(x) |
static_cast<uint32_t>(y));
}
/*!
 * \brief the inference options which will be translated to megengine
 *
 * \param weight_preprocess is the option which optimizes the inference
 * performance by preprocessing the const weights
 *
 * \param fuse_preprocess fuse preprocess pattern, like astype + pad_channel +
* dimshuffle
*
* \param fake_next_exec whether only to perform non-computing tasks (like
* memory allocation and queue initialization) for next exec. This would be
* reset to false when the graph is executed.
*
* \param var_sanity_check_first_run Disable var sanity check on the first run.
* Var sanity check is enabled on the first-time execution by default, and can
* be used to find some potential memory access errors in the operator
* implementation.
*
* \param const_shape This can be used to reduce memory usage since some
* static inference data structures can be omitted.
*
* \param force_dynamic_alloc force dynamic memory alloc for all vars
*
* \param force_output_dynamic_alloc force dynamic memory alloc for output vars
* which are used as CallbackCaller input when call compile() function
*
* \param no_profiling_on_shape_change do not re-profile to select best impl
* algo when input shape changes (use previous algo)
*
* \param jit_level Execute supported operators with JIT (support MLIR,
* NVRTC). Can only be used on Nvidia GPUs, this value indicates JIT level:
* 1 for basic elemwise opr;
* 2 for including reduce operator
*
 * \param record_level flag to optimize the inference performance by recording
 * the kernel tasks in the first run, afterwards the inference only needs to
 * execute the recorded tasks.
* level = 0 means the normal inference,
* level = 1 means use record inference,
* level = 2 means record inference with free the extra memory
*
* \param graph_opt_level optimization level:
* 0: disable
* 1: level-1: inplace arith transformations during graph
* construction
* 2: level-2: level-1, plus global optimization before graph
* compiling
* 3: also enable JIT
* <0: corresponding level, with result check for debug
*
* \param async_exec_level exec: dispatch on separate threads for different
* comp_node.
* 0: do not perform async dispatch
* 1: dispatch async if there are more than one comp node with limited queue
* mask 0b10: async if there are multiple comp nodes with
* mask 0b100: always async
*/
struct LITE_API Options {
bool weight_preprocess = false;
bool fuse_preprocess = false;
bool fake_next_exec = false;
bool var_sanity_check_first_run = true;
bool const_shape = false;
bool force_dynamic_alloc = false;
bool force_output_dynamic_alloc = false;
bool no_profiling_on_shape_change = false;
uint8_t jit_level = 0;
uint8_t comp_node_seq_record_level = 0;
uint8_t graph_opt_level = 2;
uint16_t async_exec_level = 1;
//! layout transform options
bool enable_nchw44 = false;
bool enable_nchw44_dot = false;
bool enable_nchw88 = false;
bool enable_nhwcd4 = false;
bool enable_nchw4 = false;
bool enable_nchw32 = false;
bool enable_nchw64 = false;
};
/*!
 * \brief Configuration when loading and compiling the graph
 *
 * \param bare_model_cryption_name is the bare model cryption method name, a
 * bare model does not pack json info inside
 *
 * \param has_compression flag whether the model is compressed, the compression
 * method will be read from the model
*/
struct LITE_API Config {
bool has_compression = false;
int device_id = 0;
LiteDeviceType device_type = LiteDeviceType::LITE_CPU;
LiteBackend backend = LiteBackend::LITE_DEFAULT;
std::string bare_model_cryption_name = {};
Options options = {};
};
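/*!
 * A minimal sketch of building a Config; the chosen field values are
 * illustrative assumptions:
 * \code
 *     Config config;
 *     config.device_type = LiteDeviceType::LITE_CPU;
 *     config.options.weight_preprocess = true;
 *     config.options.comp_node_seq_record_level = 1;
 *     std::shared_ptr<Network> network = std::make_shared<Network>(config);
 * \endcode
 */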
/*!
* \brief config the network input and output item
*
*/
struct LITE_API IO {
//! the tensor name in the graph corresponding to the IO
std::string name;
    //! Used to mark where the input tensor comes from and where the output
    //! copies to, if is_host is true, the input is from host and the output
    //! copies to host, otherwise device. Sometimes the input is from device
    //! and the output does not need copy to host, default is true.
    bool is_host = true;
    //! The IO type, it can be SHAPE or VALUE, when SHAPE is set, the input or
    //! output tensor value is invalid, only shape will be set, default is VALUE
    LiteIOType io_type = LiteIOType::LITE_IO_VALUE;
    //! The layout of the config from user, if another layout is set before
    //! forward or gotten after forward by input tensor reset, this layout will
    //! be bypassed. If no other layout is set before forward, this layout will
    //! work. If this layout is not set, the model will forward with its origin
    //! layout. For output, it will be used to check.
Layout config_layout = {};
};
/*!
* \brief the input and output information when load the network
* the NetworkIO will remain in the network until the network is destroyed
*/
struct LITE_API NetworkIO {
std::vector<IO> inputs = {};
std::vector<IO> outputs = {};
};
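/*!
 * A minimal sketch of configuring NetworkIO; the tensor name "data" is an
 * illustrative assumption:
 * \code
 *     NetworkIO io;
 *     IO input;
 *     input.name = "data";
 *     input.is_host = true;
 *     io.inputs.push_back(input);
 *     std::shared_ptr<Network> network =
 *             std::make_shared<Network>(Config{}, io);
 * \endcode
 */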
/*!
* \brief A user-implemented allocator interface
*/
class LITE_API Allocator {
public:
virtual ~Allocator() = default;
//! allocate memory of size in the given device with the given align
virtual void* allocate(LiteDeviceType device_type, int device_id,
size_t size, size_t align) = 0;
//! free the memory pointed by ptr in the given device
virtual void free(LiteDeviceType device_type, int device_id, void* ptr) = 0;
};
/*!
 * \brief the thread affinity callback type
 * \param thread_id thread_id is a number from 0 to (nr_threads - 1),
* thread_id of (nr_threads - 1) is the main worker thread.
*/
using ThreadAffinityCallback = std::function<void(int thread_id)>;
using AsyncCallback = std::function<void(void)>;
/*!
* \brief the start/finish callback function
 * \param unordered_map map from the io tensor name to a pair consisting of the
 * corresponding user-configured IO and the real input or output tensor.
*/
using StartCallback = std::function<void(
const std::unordered_map<std::string,
std::pair<IO, std::shared_ptr<Tensor>>>&)>;
using FinishCallback = std::function<void(
const std::unordered_map<std::string,
std::pair<IO, std::shared_ptr<Tensor>>>&)>;
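/*!
 * A minimal sketch of a start callback; the body only prints the input
 * tensor names and is an illustrative assumption:
 * \code
 *     StartCallback start_cb =
 *             [](const std::unordered_map<
 *                     std::string,
 *                     std::pair<IO, std::shared_ptr<Tensor>>>& inputs) {
 *                 for (auto&& item : inputs)
 *                     printf("input tensor name: %s\n", item.first.c_str());
 *             };
 *     //! network->set_start_callback(start_cb);
 * \endcode
 */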
/*!
 * \brief The network is constructed from a model, it implements model load,
 * init, forward, and displays some model information
*/
class LITE_API Network {
public:
class NetworkImplBase;
~Network();
Network(const Config& config = {}, const NetworkIO& networkio = {});
Network(const NetworkIO& networkio, const Config& config = {});
    //! load the model from memory
void load_model(void* model_mem, size_t size);
//! load the model from a model path
void load_model(std::string model_path);
    //! only compute the output tensors that the user configured
void compute_only_configured_output();
    //! get the network input and output tensor, the layout of which is
    //! synced from the mge tensor, when the names of an input and an output
    //! tensor are the same, use LiteTensorPhase to separate them
std::shared_ptr<Tensor> get_io_tensor(
std::string io_name,
LiteTensorPhase phase = LiteTensorPhase::LITE_IO);
//! get the network input by index
std::shared_ptr<Tensor> get_input_tensor(size_t index);
//! get the network output tensor by index
std::shared_ptr<Tensor> get_output_tensor(size_t index);
//! set the network forward in async mode and set the async callback
//! function
Network& set_async_callback(const AsyncCallback& async_callback);
    //! set the start forward callback function, which will be executed before
//! forward. this can be used to check network input or dump model inputs
//! for debug
Network& set_start_callback(const StartCallback& start_callback);
    //! set the finish forward callback function, which will be executed after
//! forward. this can be used to dump model outputs for debug
Network& set_finish_callback(const FinishCallback& finish_callback);
//! forward the network with filled input data and fill the output data
//! to the output tensor
void forward();
    //! wait until forward finishes in sync mode
void wait();
//! get the input tensor name in the order in load return
std::string get_input_name(size_t index) const;
//! get the output tensor name in the order in load return
std::string get_output_name(size_t index) const;
//! get all the input tensor name in the order in load return
std::vector<std::string> get_all_input_name() const;
//! get all the output tensor name in the order in load return
std::vector<std::string> get_all_output_name() const;
//! set/get device id, default device id = 0
Network& set_device_id(int device_id);
int get_device_id() const;
//! set/get stream id, default stream id = 0
Network& set_stream_id(int stream_id);
int get_stream_id() const;
//! enable profile the network, a file will be generated
void enable_profile_performance(std::string profile_file_path);
//! get model extra info
const std::string& get_model_extra_info();
//! get device type
LiteDeviceType get_device_type() const;
public:
friend class NetworkHelper;
private:
//! update member from implement
void update_from_implement();
//! decrypt and parse the model file
void prase_model(std::shared_ptr<void> model_data, size_t size);
private:
bool m_loaded = false;
Config m_config;
NetworkIO m_network_io;
std::unique_ptr<NetworkImplBase> m_impl;
std::string m_extra_info;
};
/*********************** MGE special network function ***************/
class LITE_API Runtime {
public:
    //! When device is CPU, this interface will set the to-be-loaded model
    //! to run in multi-thread mode with the given thread number.
static void set_cpu_threads_number(std::shared_ptr<Network> dst_network,
size_t nr_threads);
static size_t get_cpu_threads_number(std::shared_ptr<Network> dst_network);
//! set threads affinity callback;
static void set_runtime_thread_affinity(
std::shared_ptr<Network> network,
const ThreadAffinityCallback& thread_affinity_callback);
    //! Set cpu inplace mode when device is CPU, on some low-computation
    //! devices or single-core devices, this mode will get good performance
static void set_cpu_inplace_mode(std::shared_ptr<Network> dst_network);
static bool is_cpu_inplace_mode(std::shared_ptr<Network> dst_network);
//! Set use tensorrt forward
static void use_tensorrt(std::shared_ptr<Network> dst_network);
//! set opr algorithm selection strategy in the network
//! shared_batch_size: the batch size used by fastrun,
//! Non-zero value means that fastrun use this batch size
//! regardless of the batch size of the model. Zero means
//! fastrun use batch size of the model
    //! binary_equal_between_batch: if the content of each input batch is
    //! binary equal, whether the content of each output batch is promised
    //! to be equal
static void set_network_algo_policy(
std::shared_ptr<Network> dst_network,
LiteAlgoSelectStrategy strategy, uint32_t shared_batch_size = 0,
bool binary_equal_between_batch = false);
//! set workspace_limit for oprs with multiple algorithms, set
//! workspace limitation can save memory but may influence the performance
static void set_network_algo_workspace_limit(
std::shared_ptr<Network> dst_network, size_t workspace_limit);
    //! set the network memory allocator, the allocator is defined by the user
static void set_memory_allocator(std::shared_ptr<Network> dst_network,
std::shared_ptr<Allocator> user_allocator);
    //! share the runtime memory with another network, the weights are not shared
static void share_runtime_memory_with(std::shared_ptr<Network> dst_network,
std::shared_ptr<Network> src_network);
//! Dump input/output values of all internal variables to output
//! file, in txt format
static void enable_io_txt_dump(std::shared_ptr<Network> dst_network,
std::string io_txt_out_file);
//! Dump input/output values of all internal variables to output
//! directory, in binary format
static void enable_io_bin_dump(std::shared_ptr<Network> dst_network,
std::string io_bin_out_dir);
//! load a new network which will share weights with src network
static void shared_weight_with_network(
std::shared_ptr<Network> dst_network,
const std::shared_ptr<Network> src_network);
};
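/*!
 * A minimal sketch of typical Runtime configuration before loading; the
 * thread number and model path are illustrative assumptions:
 * \code
 *     std::shared_ptr<Network> network = std::make_shared<Network>();
 *     Runtime::set_cpu_threads_number(network, 4);
 *     Runtime::set_network_algo_policy(
 *             network, LITE_ALGO_PROFILE | LITE_ALGO_REPRODUCIBLE);
 *     network->load_model("./model.lite");
 * \endcode
 */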
} // namespace lite
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
 * \file include/lite/tensor.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#pragma once
#include "common_enum_c.h"
#include "macro.h"
#include <memory>
#include <unordered_map>
#include <vector>
namespace lite {
/*!
* \brief the simple layout description
*/
struct LITE_API Layout {
static constexpr uint32_t MAXDIM = 7;
size_t shapes[MAXDIM];
size_t ndim = 0;
LiteDataType data_type = LiteDataType::LITE_FLOAT;
    //! get the size in bytes of one element of the layout's data type
    size_t get_elem_size() const;
    //! compare whether the two layouts are equal
bool operator==(const Layout& other) const;
};
/*!
 * \brief wrapper of the MegEngine Tensor
 *
 * The memory is not allocated directly, when get_memory_ptr() is called the
 * memory will be allocated in the tensor implementation, and will be deleted
 * automatically
 *
 * Note: if the tensor memory is set through the reset() interface, the memory
 * is managed by the user, it will not be freed by the tensor
 *
 * If the device or layout is not set, when copying from another source tensor,
 * its device and layout will be copied from the source tensor
 *
 * if is_pinned_host is set, the storage memory of the tensor is pinned memory,
 * this is used to optimize the H2D or D2H memory copy, if the device or layout
 * is not set, when copying from another device (CUDA) tensor, this tensor
 * will be automatically set to a pinned tensor
*/
class LITE_API Tensor {
class TensorImpl;
public:
class TensorImplBase;
Tensor();
Tensor(LiteDeviceType device_type, bool is_pinned_host = false);
Tensor(LiteDeviceType device_type, const Layout& layout,
bool is_pinned_host = false);
Tensor(int device_id, LiteDeviceType device_type, const Layout& layout = {},
bool is_pinned_host = false);
Tensor(int device_id, int stream_id, LiteDeviceType device_type,
bool is_pinned_host = false);
Tensor(LiteBackend backend,
LiteDeviceType device_type = LiteDeviceType::LITE_CPU,
int device_id = 0, const Layout& layout = {},
bool is_pinned_host = false);
~Tensor();
LiteDeviceType get_device_type() const { return m_device_type; };
int get_device_id() const { return m_device_id; };
Layout get_layout() const { return m_layout; };
bool is_pinned_host() const { return m_is_pinned_host; };
//! set layout will change the layout and reallocate memory of the tensor
void set_layout(const Layout& layout);
//! which will trigger memory alloc in tensor implement
void* get_memory_ptr() const;
//! get the memory with the offset describe in idx
void* get_memory_ptr(const std::vector<size_t>& idx) const;
//! get the tensor capacity in byte
size_t get_tensor_total_size_in_byte() const;
//! use the user allocated data to reset the memory of the tensor, the
//! memory will not be managed by the lite, later, the user should delete
//! it.
void reset(void* prepared_data, size_t data_length_in_byte);
//! use the user allocated data and corresponding layout to reset the data
//! and layout of the tensor, the memory will not be managed by lite, later,
//! the user should delete it.
void reset(void* prepared_data, const Layout& layout);
//! reshape the tensor with new shape, keep the data_type the same
void reshape(const std::vector<int>& shape);
//! get a new tensor slice from the origin tensor
std::shared_ptr<Tensor> slice(const std::vector<size_t>& start,
const std::vector<size_t>& end,
const std::vector<size_t>& step = {});
//! set the tensor memory with zero
void fill_zero();
    //! copy tensor from another tensor
    //! Note: the best way for tensor copy is to just set the dst device and
    //! leave the layout empty, when copying, the dst layout will be set the
    //! same as src
void copy_from(const Tensor& src);
//! share memory with other tensor
void share_memory_with(const Tensor& src_tensor);
    //! whether the memory of the tensor is contiguous
    bool is_continue_memory() const;
    //! update the members from the implementation
void update_from_implement();
public:
friend class TensorHelper;
private:
std::shared_ptr<TensorImplBase> m_tensor_impl;
//! flag whether the storage of the tensor is pinned, this is only used
//! when the compnode is not in CPU
bool m_is_pinned_host = false;
int m_device_id = 0;
Layout m_layout;
//! the device of the tensor should not be changed after the tensor has
//! constructed
LiteDeviceType m_device_type = LiteDeviceType::LITE_CPU;
};
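/*!
 * A minimal sketch of creating and filling a host tensor; the shape and the
 * fill value are illustrative assumptions:
 * \code
 *     Layout layout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT};
 *     Tensor tensor(LiteDeviceType::LITE_CPU, layout);
 *     float* ptr = static_cast<float*>(tensor.get_memory_ptr());
 *     size_t nr_elem = tensor.get_tensor_total_size_in_byte() / sizeof(float);
 *     for (size_t i = 0; i < nr_elem; ++i)
 *         ptr[i] = 0.f;
 * \endcode
 */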
/**
 * \brief a class that can hold data of any type, but does not check whether
 * the visiting type is valid
*/
class LITE_API LiteAny {
public:
LiteAny() = default;
template <class T>
LiteAny(T value) : m_holder(new AnyHolder<T>(value)) {
m_is_string = std::is_same<std::string, T>();
}
LiteAny(const LiteAny& any) {
m_holder = any.m_holder->clone();
m_is_string = any.is_string();
}
LiteAny& operator=(const LiteAny& any) {
m_holder = any.m_holder->clone();
m_is_string = any.is_string();
return *this;
}
bool is_string() const { return m_is_string; }
class HolderBase {
public:
virtual ~HolderBase() = default;
virtual std::shared_ptr<HolderBase> clone() = 0;
virtual size_t type_length() const = 0;
};
template<class T>
class AnyHolder : public HolderBase {
public:
AnyHolder(const T value) :
m_value(value) {
}
virtual std::shared_ptr<HolderBase> clone() override {
return std::make_shared<AnyHolder>(m_value);
}
virtual size_t type_length() const override { return sizeof(T); }
public:
T m_value;
};
    //! if the type mismatches, it will throw
void type_missmatch(size_t expect, size_t get) const;
//! only check the storage type and the visit type length, so it's not safe
template <class T>
T unsafe_cast() const {
if (sizeof(T) != m_holder->type_length()) {
type_missmatch(m_holder->type_length(), sizeof(T));
}
return static_cast<LiteAny::AnyHolder<T>*>(m_holder.get())->m_value;
}
//! only check the storage type and the visit type length, so it's not safe
void* cast_void_ptr() const {
return &static_cast<LiteAny::AnyHolder<char>*>(m_holder.get())->m_value;
}
private:
std::shared_ptr<HolderBase> m_holder;
bool m_is_string = false;
};
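/*!
 * A minimal sketch of LiteAny usage; the stored value is an illustrative
 * assumption:
 * \code
 *     LiteAny any_threads(static_cast<size_t>(4));
 *     size_t nr_threads = any_threads.unsafe_cast<size_t>();
 * \endcode
 */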
/*********************** special tensor function ***************/
class LITE_API TensorUtils {
public:
    //! concat all the input tensors into one on the specified dim, the result
    //! tensor resides in dst_device_id of dst_device, if dst_device is
    //! LITE_DEVICE_DEFAULT, the device will be taken from the first tensor
static std::shared_ptr<Tensor> concat(
const std::vector<Tensor>& tensors, int dim,
LiteDeviceType dst_device = LiteDeviceType::LITE_DEVICE_DEFAULT,
int dst_device_id = -1);
};
} // namespace lite
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file lite-c/include/lite-c/global-c.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#ifndef LITE_C_GLOBAL_H_
#define LITE_C_GLOBAL_H_
#include "macro.h"
#include "network_c.h"
#ifdef __cplusplus
extern "C" {
#endif
/*! \brief Get version
*/
LITE_API int LITE_get_version(int* major, int* minor, int* patch);
/*! \brief Get the last error message.
* \return the message pointer
*/
LITE_API const char* LITE_get_last_error();
/*! \brief Get device count
* \param[in] device_type device type
* \return the device count
*/
LITE_API int LITE_get_device_count(LiteDeviceType device_type, size_t* count);
/*! \brief try to coalesce all free memory in megengine
*/
LITE_API int LITE_try_coalesce_all_free_memory();
/**
* \brief Model decryption function
*
 * \param[in] input_data is the encrypted model memory pointer
 * \param[in] input_size the size of the encrypted model memory in bytes
 * \param[in] key_data decryption key data
 * \param[in] key_size the size of decryption key data
 * \param[out] output_data the buffer for the decrypted data, if output_data is
 * nullptr, just query the output memory length, else write the decrypted data
 * to output_data
* \return size of decrypted data
*/
typedef size_t (*LiteDecryptionFunc)(const void* input_data, size_t input_size,
const uint8_t* key_data, size_t key_size,
const void* output_data);
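/**
 * A minimal sketch of a LiteDecryptionFunc implementation (single-byte-cycle
 * XOR, an illustrative assumption); note output_data is declared const void*
 * above, so the sketch casts it before writing:
 * \code
 *     size_t example_decrypt(const void* input_data, size_t input_size,
 *                            const uint8_t* key_data, size_t key_size,
 *                            const void* output_data) {
 *         if (output_data) {
 *             uint8_t* out = (uint8_t*)output_data;
 *             const uint8_t* in = (const uint8_t*)input_data;
 *             for (size_t i = 0; i < input_size; ++i)
 *                 out[i] = in[i] ^ key_data[i % key_size];
 *         }
 *         return input_size;
 *     }
 * \endcode
 */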
/**
* \brief Model information parse function
*
* \param[in] info_data is the information memory
 * \param[in] info_size the size of the information memory
 * \param[in] model_name the model name used to check whether the
 * information matches the model
* \param[in] config the model config, ParseInfoFunc can fill it with the
* information in json, the config will influence Network loading later
* \param[in] network_io the model IO, ParseInfoFunc can fill it with the
* information in json, the networkio will influence Network forwarding later
* \param[in] device_id the address to store device_id, default 0
* \param[in] nr_threads the address to store nr_threads, default 1
 * \param[in] is_inplace_model the address to store is_cpu_inplace_mode,
 * default false
 * \param[in] use_tensorrt the address to store use_tensorrt, default false
*/
typedef int (*LiteParseInfoFunc)(const void* info_data, size_t info_size,
const char* model_name, LiteConfig* config,
LiteNetworkIO* network_io, int* device_id,
size_t* nr_threads, int* is_cpu_inplace_mode,
int* use_tensorrt);
/**
* \brief register a custom decryption method and key to lite.
*
* \param[in] decrypt_name the name of the decryption, which will act as the
* hash key to find the decryption method.
*
* \param[in] func the decryption function, which will decrypt the model with
* the registered key, return a vector that contain the decrypted model.
* \param[in] key_data the decryption key of the method
* \param[in] key_size the size of decryption key
*/
LITE_API int LITE_register_decryption_and_key(const char* decrypt_name,
const LiteDecryptionFunc func,
const uint8_t* key_data,
size_t key_size);
/**
* \brief update decryption function or key of a custom decryption method.
*
* \param[in] decrypt_name the name of the decryption, which will act as the
* hash key to find the decryption method.
*
* \param[in] func the decryption function, which will decrypt the model with
* the registered key, return a vector that contain the decrypted model. if
* function is nullptr, it will not be updated.
*
* \param[in] key the decryption key of the method, if the size of key is zero,
* it will not be updated
*/
LITE_API int LITE_update_decryption_or_key(const char* decrypt_name,
const LiteDecryptionFunc func,
const uint8_t* key_data,
size_t key_size);
/**
* \brief register a custom parser function to lite.
*
* \param[in] info_type the name of the parser function, which will act as the
* hash key to find the parser method.
*
* \param[in] parse_func the parser function, which will parse the given
* information and modify the Network Config and IO.
*
*/
LITE_API int LITE_register_parse_info_func(const char* info_type,
const LiteParseInfoFunc parse_func);
/*!
* \brief Set the loader to the lite
 * \param[in] loader_path is the file path of the loader library
*/
LITE_API int LITE_set_loader_lib_path(const char* loader_path);
/*!
* \brief Set the algo policy cache file for CPU/CUDA ...
 * \param[in] cache_path is the file path which stores the cache
 * \param[in] always_sync whether to sync the cache when the cache is updated
*/
LITE_API int LITE_set_persistent_cache(const char* cache_path, int always_sync);
/*!
 * \brief Set the TensorRT engine cache file path
 * \param[in] cache_path is the file path which stores the cache
*/
LITE_API int LITE_set_tensor_rt_cache(const char* cache_path);
/*! \brief Set the current log level.
* \param[in] level The new log level
*/
LITE_API int LITE_set_log_level(LiteLogLevel level);
/*! \brief Get the current log level.
* \param[in] level The pointer to log level
*/
LITE_API int LITE_get_log_level(LiteLogLevel* level);
/*!
 * \brief dump the algo policy cache to file, if the network is set to profile
 * when forwarding, the algo policy will be dumped to this file
* \param[in] cache_path is the file path which store the cache
*/
LITE_API int LITE_dump_persistent_cache(const char* cache_path);
/*!
* \brief dump the tensorrt policy cache to file
*/
LITE_API int LITE_dump_tensor_rt_cache();
#ifdef __cplusplus
}
#endif
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file lite-c/include/lite-c/network_c.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#ifndef LITE_C_NETWORK_H_
#define LITE_C_NETWORK_H_
#include "tensor_c.h"
#ifdef __cplusplus
extern "C" {
#endif
/*!
 * \brief the inference options which will be translated to megengine
 *
 * \param weight_preprocess is the option which optimizes the inference
 * performance by preprocessing the const weights
 *
 * \param fuse_preprocess fuse preprocess pattern, like astype + pad_channel +
* dimshuffle
*
* \param fake_next_exec whether only to perform non-computing tasks (like
* memory allocation and queue initialization) for next exec. This would be
* reset to false when the graph is executed.
*
* \param var_sanity_check_first_run Disable var sanity check on the first run.
* Var sanity check is enabled on the first-time execution by default, and can
* be used to find some potential memory access errors in the operator
* implementation.
*
* \param const_shape This can be used to reduce memory usage since some
* static inference data structures can be omitted.
*
* \param force_dynamic_alloc force dynamic memory alloc for all vars
*
* \param force_output_dynamic_alloc force dynamic memory alloc for output vars
* which are used as CallbackCaller input when call compile() function
*
* \param no_profiling_on_shape_change do not re-profile to select best impl
* algo when input shape changes (use previous algo)
*
* \param jit_level Execute supported operators with JIT (support MLIR,
* NVRTC). Can only be used on Nvidia GPUs, this value indicates JIT level:
* 1 for basic elemwise opr;
* 2 for including reduce operator
*
 * \param record_level flag to optimize the inference performance by recording
 * the kernel tasks in the first run, afterwards the inference only needs to
 * execute the recorded tasks.
* level = 0 means the normal inference,
* level = 1 means use record inference,
* level = 2 means record inference with free the extra memory
*
* \param graph_opt_level optimization level:
* 0: disable
* 1: level-1: inplace arith transformations during graph
* construction
* 2: level-2: level-1, plus global optimization before graph
* compiling
* 3: also enable JIT
* <0: corresponding level, with result check for debug
*
* \param async_exec_level exec: dispatch on separate threads for different
* comp_node.
* 0: do not perform async dispatch
* 1: dispatch async if there are more than one comp node with limited queue
* mask 0b10: async if there are multiple comp nodes with
* mask 0b100: always async
*/
typedef struct Options {
int weight_preprocess;
int fuse_preprocess;
int fake_next_exec;
int var_sanity_check_first_run;
int const_shape;
int force_dynamic_alloc;
int force_output_dynamic_alloc;
int no_profiling_on_shape_change;
int jit_level;
int comp_node_seq_record_level;
int graph_opt_level;
int async_exec_level;
//! layout transform options
int enable_nchw44;
int enable_nchw44_dot;
int enable_nchw88;
int enable_nhwcd4;
int enable_nchw4;
int enable_nchw32;
int enable_nchw64;
} LiteOptions;
//! define a default Options
extern LITE_API const LiteOptions default_option;
/*!
 * \brief Configuration when loading and compiling the graph
 *
 * \param bare_model_cryption_name is the bare model cryption method name, a
 * bare model does not pack json info inside
 *
 * \param has_compression flag whether the model is compressed, the compression
 * method will be read from the model
*/
typedef struct LiteConfig {
int has_compression;
int device_id;
LiteDeviceType device_type;
LiteBackend backend;
const char* bare_model_cryption_name;
LiteOptions options;
} LiteConfig;
//! get default config
LITE_API LiteConfig* default_config();
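/**
 * A minimal sketch of adjusting the default config; the chosen field values
 * are illustrative assumptions:
 * \code
 *     LiteConfig config = *default_config();
 *     config.device_type = LITE_CPU;
 *     config.options.weight_preprocess = 1;
 * \endcode
 */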
/*!
* \brief config the network input and output item
*
*/
typedef struct LiteIO {
//! the tensor name in the graph corresponding to the IO
const char* name;
    //! Used to mark where the input tensor comes from and where the output
    //! copies to, if is_host is true, the input is from host and the output
    //! copies to host, otherwise device. Sometimes the input is from device
    //! and the output does not need copy to host, default is true.
    int is_host;
    //! The IO type, it can be SHAPE or VALUE, when SHAPE is set, the input or
    //! output tensor value is invalid, only shape will be set, default is VALUE
    LiteIOType io_type;
    //! The layout of the config from user, if another layout is set before
    //! forward or gotten after forward, this layout will be bypassed. If no
    //! other layout is set before forward, this layout will work. If this
    //! layout is not set, the model will forward with its origin layout. For
    //! output, it will be used to check.
LiteLayout config_layout;
} LiteIO;
//! define a default IO
extern LITE_API const LiteIO default_io;
/*!
* \brief the input and output information when load the network
* the NetworkIO will remain in the network until the network is destroyed
*/
typedef struct LiteNetworkIO {
LiteIO* inputs;
LiteIO* outputs;
    size_t input_size;   //!< the number of IOs in inputs
    size_t output_size;  //!< the number of IOs in outputs
} LiteNetworkIO;
//! get default NetworkIO
LITE_API LiteNetworkIO* default_network_io();
/*!
* \brief A user-implemented allocator function
*/
//! allocate memory of size in the given device with the given align
typedef void* (*LiteAllocate)(LiteDeviceType device_type, int device_id,
size_t size, size_t align);
//! free the memory pointed by ptr in the given device
typedef void (*LiteFree)(LiteDeviceType device_type, int device_id, void* ptr);
/*!
 * \brief the thread affinity callback type
 * \param thread_id thread_id is a number from 0 to (nr_threads - 1),
* thread_id of (nr_threads - 1) is the main worker thread.
*/
typedef int (*LiteThreadAffinityCallback)(int thread_id);
typedef int (*LiteAsyncCallback)();
/*!
* \brief the start/finish callback function
 * \param unordered_map map from the io tensor name to a pair consisting of the
 * corresponding user-configured IO and the real input or output tensor.
*/
typedef int (*LiteStartCallback)(const LiteIO* inputs,
const LiteTensor* input_tensors, size_t size);
typedef int (*LiteFinishCallback)(const LiteIO* outputs,
const LiteTensor* output_tensors,
size_t size);
/*!
 * \brief The network is constructed from a model, it implements model load,
 * init, forward, and displays some model information
*/
typedef void* LiteNetwork;
/**
* \brief Create a lite Network object with default config and networkIO.
 * \param[out] network The network pointer
* \return int if the return is not zero, error happened, the error message
* can get by LITE_get_last_error
*/
LITE_API int LITE_make_default_network(LiteNetwork* network);
/**
* \brief Create a lite Network object from the given config and networkIO.
 * \param[in] config The configuration to create the network
 * \param[in] network_io The configuration io to create the network
* \param[out] network The network pointer
*/
LITE_API int LITE_make_network(LiteNetwork* network, const LiteConfig config,
const LiteNetworkIO network_io);
/**
* \brief Create a lite Network object from the given config and networkIO.
 * \param[in] config The configuration to create the network
* \param[out] network The network pointer
*/
LITE_API int LITE_make_network_config(LiteNetwork* network, const LiteConfig config);
/**
 * \brief load the model to network from memory
* \param[in] model_mem The model in memory
* \param[in] size The size of the model memory
* \param[out] network The network to be load model in
*/
LITE_API int LITE_load_model_from_mem(LiteNetwork network, void* model_mem,
size_t size);
/**
 * \brief load the model to network from the given path
* \param[in] model_path The model path
* \param[out] network The network to be load model in
*/
LITE_API int LITE_load_model_from_path(LiteNetwork network,
const char* model_path);
/**
* \brief load a new network which will share weights with src network
* \param[in] origin_network The origin network pointer
* \param[out] network The network pointer
*/
LITE_API int LITE_shared_weight_with_network(LiteNetwork dst_network,
const LiteNetwork src_network);
/**
* \brief Destroy a lite network object.
* \param[in] network The network pointer
* \return int if the return is not zero, error happened, the error message
* can get by LITE_get_last_error
*/
LITE_API int LITE_destroy_network(LiteNetwork network);
/**
* \brief forward the network with filled input data and fill the output data
* to the output tensor
* \param[in] network The loaded model
*/
LITE_API int LITE_forward(const LiteNetwork network);
/**
 * \brief wait until forward finishes in sync mode
* \param[in] network The loaded model
*/
LITE_API int LITE_wait(const LiteNetwork network);
/**
 * \brief get the network input and output tensor, the layout of which is
 * read from the model
* \param[in] network The loaded model
* \param[in] io_name The input or output name
* \param[in] phase The tensor phase
* \param[out] tensor The IO tensor get from the network
*/
LITE_API int LITE_get_io_tensor(LiteNetwork network, const char* io_name,
LiteTensorPhase phase, LiteTensor* tensor);
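/**
 * A minimal sketch of the C API inference flow; the model path and the
 * tensor name "data" are illustrative assumptions, and error codes are not
 * checked here:
 * \code
 *     LiteNetwork network;
 *     LITE_make_default_network(&network);
 *     LITE_load_model_from_path(network, "./model.lite");
 *     LiteTensor input;
 *     LITE_get_io_tensor(network, "data", LITE_INPUT, &input);
 *     // ... fill the input tensor memory ...
 *     LITE_forward(network);
 *     LITE_wait(network);
 *     LITE_destroy_network(network);
 * \endcode
 */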
/**
* \brief get the input tensor name in the order in loaded model
* \param[in] network The loaded model
* \param[in] index The index of input tensor
* \param[out] name The input tensor name
*/
LITE_API int LITE_get_input_name(const LiteNetwork network, size_t index,
const char** name);
/**
* \brief get the output tensor name in the order in loaded model
* \param[in] network The loaded model
* \param[in] index The index of output tensor
* \param[out] name The output tensor name
*/
LITE_API int LITE_get_output_name(const LiteNetwork network, size_t index,
const char** name);
/**
* \brief get all the input tensor name in the order in loaded model
* \param[in] network The loaded model
* \param[in] size The number of the input tensor
* \param[out] name The input tensor names
*/
LITE_API int LITE_get_all_input_name(const LiteNetwork network, size_t* size,
const char** name);
/**
* \brief get all the output tensor name in the order in loaded model
* \param[in] network The loaded model
* \param[in] size The number of output tensor
* \param[out] name The output tensor name
*/
LITE_API int LITE_get_all_output_name(const LiteNetwork network, size_t* size,
const char** name);
/**
* \brief get whether the model is running in cpu inplace mode
* \param[in] network The loaded model
* \param[out] is_cpu_inplace_mode whether is in cpu inplace mode
*/
LITE_API int LITE_is_cpu_inplace_mode(const LiteNetwork network,
int* is_cpu_inplace_mode);
/**
* \brief get the number of thread the network will run with
* \param[in] network The loaded model
* \param[out] nr_threads the thread number when the network running
*/
LITE_API int LITE_get_cpu_threads_number(const LiteNetwork network,
size_t* nr_threads);
/**
* \brief get the device id the network will run with
* \param[in] network The loaded model
* \param[out] device_id the device id of the network will run
*/
LITE_API int LITE_get_device_id(const LiteNetwork network, int* device_id);
/**
* \brief get the stream id the network will run with
* \param[in] network The loaded model
* \param[out] stream_id the stream id of the network will run
*/
LITE_API int LITE_get_stream_id(const LiteNetwork network, int* stream_id);
/**
* \brief get the device type the network will run with
* \param[in] network The loaded model
* \param[out] device_type the device type of the network will run
*/
LITE_API int LITE_get_device_type(const LiteNetwork network,
LiteDeviceType* device_type);
/**
* \brief get the device type the network will run with
* \param[in] network The loaded model
* \param[out] info : the json format memory
* \param[out] info_size: the json format memory size
*/
LITE_API int LITE_get_model_extra_info(const LiteNetwork network,
const char** info, int* info_size);
/**
 * \brief Set cpu inplace mode when device is CPU, on some low-computation
 * devices or single-core devices, this mode will get good performance
* \param[in] network The loaded model
*/
LITE_API int LITE_set_cpu_inplace_mode(LiteNetwork network);
/**
 * \brief When device is CPU, this interface will set the to-be-loaded model
 * to run in multi-thread mode with the given thread number.
* \param[in] network The loaded model
* \param[in] nr_threads The threads number
*/
LITE_API int LITE_set_cpu_threads_number(LiteNetwork network,
size_t nr_threads);
/**
* \brief set device id, default device id = 0
* \param[in] network The loaded model
* \param[in] device_id The device id to be set
*/
LITE_API int LITE_set_device_id(LiteNetwork network, int device_id);
/**
* \brief set stream id, default stream id = 0
* \param[in] network The loaded model
* \param[in] stream_id The stream id to be set
*/
LITE_API int LITE_set_stream_id(LiteNetwork network, int stream_id);
/**
* \brief enable tensorrt
* \param[in] network The loaded model
*/
LITE_API int LITE_use_tensorrt(LiteNetwork network);
/**
* \brief set opr algorithm selection strategy in the network
* \param[in] network The loaded model
* \param[in] select_strategy The operator algorithm selection strategy
*/
LITE_API int LITE_set_network_algo_policy(LiteNetwork network,
LiteAlgoSelectStrategy strategy);
/**
* \brief set opr algorithm selection strategy in the network
* \param[in] network The loaded model
* \param[in] shared_batch_size: the batch size used by fastrun,
* Non-zero value means that fastrun use this batch size
* regardless of the batch size of the model. Zero means
* fastrun use batch size of the model
* \param[in] binary_equal_between_batch: if the content of each input batch is
* binary equal,whether the content of each output batch is
* promised to be equal
*/
LITE_API int LITE_set_network_algo_fastrun_config(
LiteNetwork network, unsigned int shared_batch_size,
int binary_equal_between_batch);
/**
* \brief set workspace_limit for oprs with multiple algorithms, set
* workspace limit can save memory but may influence the performance
* \param[in] network The loaded model
* \param[in] workspace_limit The operator algorithm workspace limit
*/
LITE_API int LITE_set_network_algo_workspace_limit(LiteNetwork network,
size_t workspace_limit);
/**
* \brief set the network forward in async mode and set the async callback
* function
* \param[in] network The loaded model
 * \param[in] async_callback when the network finishes forwarding, the callback
* will be called
*/
LITE_API int LITE_set_async_callback(LiteNetwork network,
const LiteAsyncCallback async_callback);
/**
 * \brief set the start forward callback function, which will be executed before
* forward, this can be used to check network input or dump model inputs
* for debug
* \param[in] network The loaded model
 * \param[in] start_callback when the network starts forwarding, the callback
* will be called
*/
LITE_API int LITE_set_start_callback(LiteNetwork network,
const LiteStartCallback start_callback);
/**
 * \brief set the finish forward callback function, which will be executed after
* forward, this can be used to dump model outputs for debug
* \param[in] network The loaded model
 * \param[in] finish_callback when the network finishes forwarding, the callback
* will be called
*/
LITE_API int LITE_set_finish_callback(LiteNetwork network,
const LiteFinishCallback finish_callback);
/**
* \brief set threads affinity callback
* \param[in] network The loaded model
* \param[in] thread_affinity_callback
*/
LITE_API int LITE_set_runtime_thread_affinity(
LiteNetwork network,
const LiteThreadAffinityCallback thread_affinity_callback);
/**
 * \brief set the network memory allocator, the allocator is defined by the user
* \param[in] network The loaded model
* \param[in] allocate_fun The allocate function of the user defined allocator
* \param[in] free_fun The free function of the user defined allocator
*/
LITE_API int LITE_set_memory_allocator(LiteNetwork network,
const LiteAllocate allocate_fun,
const LiteFree free_fun);
/**
* \brief the dst_network share the runtime memory with src_network
* \param[in] src_network The source network
* \param[in] dst_network The dst network to shared memory with src_network
*/
LITE_API int LITE_share_runtime_memroy(LiteNetwork src_network,
LiteNetwork dst_network);
/**
* \brief enable profile the network, a JSON format file will be generated
* \param[in] network The loaded model
* \param[in] profile_json_file_path The profile result file path
*/
LITE_API int LITE_enable_profile_performance(
LiteNetwork network, const char* profile_json_file_path);
/**
* \brief Dump input/output values of all internal variables to output file,
* in text format
* \param[in] network The loaded model
* \param[in] io_txt_out_file The dumped txt file name
*/
LITE_API int LITE_enable_io_txt_dump(LiteNetwork network,
const char* io_txt_out_file);
/**
* \brief Dump input/output values of all internal variables to output
* directory, in binary format
* \param[in] network The loaded model
* \param[in] io_bin_out_dir The dumped bin file directory
*/
LITE_API int LITE_enable_io_bin_dump(LiteNetwork network,
const char* io_bin_out_dir);
#ifdef __cplusplus
}
#endif
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file lite-c/include/lite-c/tensor_c.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#ifndef LITE_TENSOR_C_H_
#define LITE_TENSOR_C_H_
#include "common_enum_c.h"
#include "macro.h"
#ifdef __cplusplus
extern "C" {
#endif
#include "stddef.h"
#include "stdint.h"
#define LAYOUT_MAX_DIM (7)
/*!
* \brief the simple layout description
*/
typedef struct LiteLayout {
size_t shapes[LAYOUT_MAX_DIM];
size_t ndim;
LiteDataType data_type;
} LiteLayout;
//! define a default LiteLayout
extern LITE_API const LiteLayout default_layout;
/*!
 * \brief wrapper of the MegEngine Tensor
 *
 * if is_pinned_host is set, the storage memory of the tensor is pinned memory,
 * this is used to optimize the H2D or D2H memory copy, if the device or layout
 * is not set, when copying from another device (CUDA, OpenCL) tensor, this
 * tensor will be automatically set to a pinned tensor
*/
typedef struct LiteTensorDesc {
//! flag whether the storage of the tensor is pinned, this is only used when
//! the compnode is not in CPU
int is_pinned_host;
//! the layout of the tensor
LiteLayout layout;
//! the device of the tensor should not be changed after the tensor has
//! constructed
LiteDeviceType device_type;
//! device id of the tensor
int device_id;
} LiteTensorDesc;
//! define a default TensorDesc
extern LITE_API const LiteTensorDesc default_desc;
/*!
* \brief The pointer to a Lite Tensor object
*/
typedef void* LiteTensor;
/**
* \brief Create a lite tensor object from the given describe.
* \param[in] tensor_describe The description to create the Tensor
* \param[out] tensor The Tensor pointer
* \return int if the return is not zero, error happened, the error message
* can get by LITE_get_last_error
*/
LITE_API int LITE_make_tensor(const LiteTensorDesc tensor_describe,
LiteTensor* tensor);
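/**
 * A minimal sketch of creating and releasing a tensor; the shape values are
 * illustrative assumptions:
 * \code
 *     LiteTensorDesc desc = default_desc;
 *     desc.layout.ndim = 2;
 *     desc.layout.shapes[0] = 1;
 *     desc.layout.shapes[1] = 1000;
 *     desc.layout.data_type = LITE_FLOAT;
 *     LiteTensor tensor;
 *     LITE_make_tensor(desc, &tensor);
 *     // ... use the tensor ...
 *     LITE_destroy_tensor(tensor);
 * \endcode
 */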
/**
* \brief Destroy a lite tensor object.
* \param[in] tensor The Tensor pointer
* \return int if the return is not zero, error happened, the error message
* can get by LITE_get_last_error
*/
LITE_API int LITE_destroy_tensor(LiteTensor tensor);
/**
* \brief change the layout of a Tensor object.
* \param[in] tensor The Tensor
* \param[out] layout The Layout to be set to a tensor
*/
LITE_API int LITE_set_tensor_layout(LiteTensor tensor, const LiteLayout layout);
/**
* \brief use the user allocated data to reset the memory of the tensor, the
* memory will not be managed by the lite, later, the user should delete
* it.
* \param[in] tensor The Tensor
 * \param[in] prepared_data The allocated memory which satisfies the Tensor
 * layout
 * \param[in] data_length_in_byte The length of the allocated memory
*/
LITE_API int LITE_reset_tensor_memory(LiteTensor tensor, void* prepared_data,
size_t data_length_in_byte);
/**
 * \brief Use the user-allocated data and the corresponding layout to reset
 * the data and layout of the tensor. The memory will not be managed by lite;
 * the user should free it later.
 * \param[in] tensor The Tensor
 * \param[in] layout The Layout to be set to the tensor
 * \param[in] prepared_data The allocated memory which satisfies the layout to
 * be set
*/
LITE_API int LITE_reset_tensor(LiteTensor tensor, const LiteLayout layout,
void* prepared_data);
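/**
 * Usage sketch (illustrative, not part of the original header): hand a
 * caller-owned buffer to an existing tensor. Lite does not take ownership of
 * prepared_data, so the buffer must stay alive while the tensor uses it and
 * must be freed by the caller afterwards. LITE_FLOAT is assumed from
 * common_enum_c.h.
 * \code
 *     LiteLayout layout = {{1, 8}, 2, LITE_FLOAT};
 *     float* buf = (float*)malloc(8 * sizeof(float));
 *     LITE_reset_tensor(tensor, layout, buf);
 *     // ... run the network / read the results ...
 *     free(buf);  // only after the tensor no longer references it
 * \endcode
 */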
/**
 * \brief Reshape a tensor without changing its memory. The total number of
 * elements in the reshaped tensor must equal that of the origin tensor; the
 * input shape may contain at most one -1 to flag that the dimension can be
 * deduced automatically.
 * \param[in] tensor The Tensor to be reshaped
 * \param[in] shape The user input shape
 * \param[in] size The number of dimensions in shape
*/
LITE_API int LITE_tensor_reshape(LiteTensor tensor, const int* shape, int size);
/**
 * \brief Slice a tensor with the input parameters.
 * \param[in] tensor The Tensor to be sliced
 * \param[in] start The start index of every axis to be sliced
 * \param[in] end The end index of every axis to be sliced
 * \param[in] step The step of every axis to be sliced; if nullptr, the step
 * will be 1
 * \param[in] size The number of axes to be sliced
 * \param[out] slice_tensor The result tensor sliced from the origin tensor
*/
LITE_API int LITE_tensor_slice(const LiteTensor tensor, const size_t* start,
const size_t* end, const size_t* step,
size_t size, LiteTensor* slice_tensor);
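/**
 * Usage sketch (illustrative, not part of the original header): reshape a
 * 16-element tensor to 4x4 (the -1 dimension is deduced), then slice the
 * top-left 2x2 block with the default step of 1.
 * \code
 *     int shape[2] = {-1, 4};
 *     LITE_tensor_reshape(tensor, shape, 2);
 *     size_t start[2] = {0, 0};
 *     size_t end[2] = {2, 2};
 *     LiteTensor sliced = NULL;
 *     LITE_tensor_slice(tensor, start, end, NULL, 2, &sliced);
 * \endcode
 */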
/**
 * \brief Fill the tensor with zeros.
 * \param[in] tensor The Tensor to be memset
*/
LITE_API int LITE_tensor_fill_zero(LiteTensor tensor);
/**
 * \brief Copy a tensor from another tensor.
* \param[out] dst_tensor The Tensor to copy into
* \param[in] src_tensor The Tensor to copy from
*/
LITE_API int LITE_tensor_copy(LiteTensor dst_tensor,
const LiteTensor src_tensor);
/**
 * \brief Share memory from another tensor.
* \param[out] dst_tensor The Tensor to share into
* \param[in] src_tensor The Tensor to be shared
*/
LITE_API int LITE_tensor_share_memory_with(LiteTensor dst_tensor,
const LiteTensor src_tensor);
/**
* \brief get the memory pointer of a Tensor object.
* \param[in] tensor The input Tensor
* \param[out] data a pointer to void pointer
*/
LITE_API int LITE_get_tensor_memory(const LiteTensor tensor, void** data);
/**
* \brief get the memory pointer of a Tensor object.
* \param[in] tensor The input Tensor
* \param[in] index The coordinate in the tensor
 * \param[in] size The length of the coordinate
* \param[out] data a pointer to void pointer
*/
LITE_API int LITE_get_tensor_memory_with_index(const LiteTensor tensor,
const size_t* index, size_t size,
void** data);
/**
* \brief get the tensor capacity in byte of a Tensor object.
* \param[in] tensor The input Tensor
 * \param[out] size a pointer to the returned size
*/
LITE_API int LITE_get_tensor_total_size_in_byte(const LiteTensor tensor,
size_t* size);
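/**
 * Usage sketch (illustrative, not part of the original header): read back the
 * contents of a tensor by querying its raw pointer and byte size. For a
 * non-CPU tensor the pointer may refer to device memory, so a copy into a
 * host tensor via LITE_tensor_copy may be needed first; user_buffer below is
 * a caller-provided host buffer.
 * \code
 *     void* ptr = NULL;
 *     size_t nbytes = 0;
 *     LITE_get_tensor_memory(tensor, &ptr);
 *     LITE_get_tensor_total_size_in_byte(tensor, &nbytes);
 *     memcpy(user_buffer, ptr, nbytes);
 * \endcode
 */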
/**
* \brief get the tensor layout of a Tensor object.
* \param[in] tensor The input Tensor
 * \param[out] layout a pointer that will be written with the layout of the
 * tensor
*/
LITE_API int LITE_get_tensor_layout(const LiteTensor tensor,
LiteLayout* layout);
/**
* \brief get the tensor device of a Tensor object.
* \param[in] tensor The input Tensor
 * \param[out] device_type a pointer that will be written with the device type
 * of the tensor
*/
LITE_API int LITE_get_tensor_device_type(const LiteTensor tensor,
LiteDeviceType* device_type);
/**
* \brief get the tensor device id of a Tensor object.
* \param[in] tensor The input Tensor
 * \param[out] device_id a pointer that will be written with the device id of
 * the tensor
*/
LITE_API int LITE_get_tensor_device_id(const LiteTensor tensor, int* device_id);
/**
 * \brief Query whether the tensor is pinned-host.
 * \param[in] tensor The input Tensor
 * \param[out] is_pinned_host an int pointer that will be written with whether
 * the tensor is pinned host
*/
LITE_API int LITE_is_pinned_host(const LiteTensor tensor, int* is_pinned_host);
/**
 * \brief Query whether the tensor memory is contiguous.
 * \param[in] tensor The input Tensor
 * \param[out] is_continue an int pointer that will be written with whether
 * the tensor memory is contiguous
*/
LITE_API int LITE_is_memory_continue(const LiteTensor tensor, int* is_continue);
/**
 * \brief Concat the input tensors into one big tensor.
 * \param[in] tensors Pointer to the input Tensors
 * \param[in] nr_tensor The number of input Tensors
 * \param[in] dim The dim the concat acts on
 * \param[in] dst_device The device type of the result tensor; when
 * LITE_DEVICE_DEFAULT, the result tensor device type is taken from the first
 * tensor
 * \param[in] device_id The device id of the result tensor; when -1, the
 * result tensor device id is taken from the first tensor
 * \param[out] result_tensor The result tensor after concat
*/
LITE_API int LITE_tensor_concat(LiteTensor* tensors, int nr_tensor, int dim,
LiteDeviceType dst_device, int device_id,
LiteTensor* result_tensor);
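/**
 * Usage sketch (illustrative, not part of the original header): concatenate
 * two tensors along dim 0; LITE_DEVICE_DEFAULT and device_id -1 make the
 * result follow the device of the first input.
 * \code
 *     LiteTensor inputs[2] = {tensor_a, tensor_b};
 *     LiteTensor concated = NULL;
 *     LITE_tensor_concat(inputs, 2, 0, LITE_DEVICE_DEFAULT, -1, &concated);
 * \endcode
 */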
#ifdef __cplusplus
}
#endif
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file lite-c/src/common.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#ifndef LITE_C_COMMON_H_
#define LITE_C_COMMON_H_
#include "../src/misc.h"
#include "lite-c/network_c.h"
#include "lite-c/tensor_c.h"
#include "lite/network.h"
#include <exception>
#include <stdexcept>
//! convert c Layout to lite::Layout
lite::Layout convert_to_layout(const LiteLayout& layout);
//! convert lite::Layout to C Layout
LiteLayout convert_to_clayout(const lite::Layout& layout);
//! convert C config to lite::Config
lite::Config convert_to_lite_config(const LiteConfig c_config);
//! convert C NetworkIO io to lite::NetworkIO
lite::NetworkIO convert_to_lite_io(const LiteNetworkIO c_network_io);
/*!
* \brief handle exception
* \param e the exception
* \return the return value of the error
*/
int LiteHandleException(const std::exception& e);
#if LITE_ENABLE_EXCEPTION
/*! \brief macro to guard a function */
#define LITE_CAPI_BEGIN() try {
/*! \brief every function starts with LITE_CAPI_BEGIN();
* ends with LITE_CAPI_END or LITE_CAPI_END_WITH_STMS
*/
#define LITE_CAPI_END() \
} \
catch (std::exception & _except_) { \
return LiteHandleException(_except_); \
} \
return 0;
#else
/*! \brief macro to guard a function */
#define LITE_CAPI_BEGIN() {
/*! \brief every function starts with LITE_CAPI_BEGIN();
* ends with LITE_CAPI_END or LITE_CAPI_END_WITH_STMS
*/
#define LITE_CAPI_END() \
} \
return 0;
#endif
/*!
 * \brief catch the exception and execute the given statements (_stms)
*/
#define LITE_CAPI_END_WITH_STMS(_stms) \
} \
catch (std::exception & _except_) { \
_stms; \
return LiteHandleException(_except_); \
} \
return 0;
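/*!
 * Typical usage (illustrative sketch; LITE_example_query is not a real API):
 * every C entry point opens with LITE_CAPI_BEGIN(), validates its arguments,
 * and closes with LITE_CAPI_END(), so that C++ exceptions are translated into
 * the non-zero-return / LITE_get_last_error convention.
 *
 *     int LITE_example_query(int* out) {
 *         LITE_CAPI_BEGIN();
 *         LITE_ASSERT(out, "The ptr pass to LITE api is null");
 *         *out = 0;
 *         LITE_CAPI_END();
 *     }
 */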
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
 * \file lite-c/src/global.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#include "lite/global.h"
#include "common.h"
#include "lite-c/global_c.h"
#include <exception>
#include <mutex>
namespace {
class ErrorMsg {
public:
std::string& get_error_msg() { return error_msg; }
void set_error_msg(const std::string& msg) { error_msg = msg; }
private:
std::string error_msg;
};
ErrorMsg& get_global_error() {
static thread_local ErrorMsg error_msg;
return error_msg;
}
} // namespace
int LiteHandleException(const std::exception& e) {
get_global_error().set_error_msg(e.what());
return -1;
}
const char* LITE_get_last_error() {
return get_global_error().get_error_msg().c_str();
}
int LITE_get_version(int* major, int* minor, int* patch) {
LITE_ASSERT(major && minor && patch, "The ptr pass to LITE api is null");
lite::get_version(*major, *minor, *patch);
return 0;
}
int LITE_get_device_count(LiteDeviceType device_type, size_t* count) {
LITE_CAPI_BEGIN();
LITE_ASSERT(count, "The ptr pass to LITE api is null");
*count = lite::get_device_count(device_type);
LITE_CAPI_END();
}
int LITE_try_coalesce_all_free_memory() {
LITE_CAPI_BEGIN();
lite::try_coalesce_all_free_memory();
LITE_CAPI_END();
}
int LITE_register_decryption_and_key(const char* decrypt_name,
const LiteDecryptionFunc func,
const uint8_t* key_data, size_t key_size) {
LITE_CAPI_BEGIN();
LITE_ASSERT(decrypt_name && key_data && func,
"The ptr pass to LITE api is null");
std::vector<uint8_t> key;
for (size_t i = 0; i < key_size; i++) {
key.push_back(key_data[i]);
}
auto decrypt_func = [func](const void* input_data, size_t input_size,
const std::vector<uint8_t>& key) {
auto size =
func(input_data, input_size, key.data(), key.size(), nullptr);
std::vector<uint8_t> output(size, 0);
func(input_data, input_size, key.data(), key.size(), output.data());
return output;
};
lite::register_decryption_and_key(decrypt_name, decrypt_func, key);
LITE_CAPI_END();
}
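/* Usage sketch (illustrative): a caller-side decryption callback matching the
 * way `func` is invoked above -- when the output pointer is null it only
 * reports the decrypted size. The exact LiteDecryptionFunc typedef lives in
 * the C global header and is assumed here; the XOR "cipher" is purely for
 * illustration.
 *
 *     size_t my_decrypt(const void* input, size_t input_size,
 *                       const uint8_t* key, size_t key_size, void* output) {
 *         if (!output)
 *             return input_size;  // size-query pass
 *         for (size_t i = 0; i < input_size; ++i)
 *             ((uint8_t*)output)[i] =
 *                     ((const uint8_t*)input)[i] ^ key[i % key_size];
 *         return input_size;
 *     }
 *     // LITE_register_decryption_and_key("my_xor", my_decrypt, key, key_len);
 */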
int LITE_update_decryption_or_key(const char* decrypt_name,
const LiteDecryptionFunc func,
const uint8_t* key_data, size_t key_size) {
LITE_CAPI_BEGIN();
std::vector<uint8_t> key;
for (size_t i = 0; i < key_size; i++) {
key.push_back(key_data[i]);
}
lite::DecryptionFunc decrypt_func = nullptr;
if (func) {
decrypt_func = [func](const void* input_data, size_t input_size,
const std::vector<uint8_t>& key) {
auto size = func(input_data, input_size, key.data(), key.size(),
nullptr);
std::vector<uint8_t> output(size, 0);
func(input_data, input_size, key.data(), key.size(), output.data());
return output;
};
}
lite::update_decryption_or_key(decrypt_name, decrypt_func, key);
LITE_CAPI_END();
}
int LITE_register_parse_info_func(const char* info_type,
const LiteParseInfoFunc parse_func) {
LITE_CAPI_BEGIN();
LITE_ASSERT(info_type && parse_func, "The ptr pass to LITE api is null");
auto lite_func = [parse_func](
const void* info_data, size_t info_size,
const std::string model_name, lite::Config& config,
lite::NetworkIO& network_io,
std::unordered_map<std::string, lite::LiteAny>&
separate_config_map,
std::string& extra_info) {
LITE_MARK_USED_VAR(extra_info);
size_t nr_threads = 1;
int device_id = 0, is_cpu_inplace_mode = false, use_tensorrt = false;
LiteNetworkIO c_io;
LiteConfig c_config;
auto ret = parse_func(info_data, info_size, model_name.c_str(),
&c_config, &c_io, &device_id, &nr_threads,
&is_cpu_inplace_mode, &use_tensorrt);
config = convert_to_lite_config(c_config);
network_io = convert_to_lite_io(c_io);
if (device_id != 0) {
separate_config_map["device_id"] = device_id;
}
if (nr_threads != 1) {
separate_config_map["nr_threads"] = nr_threads;
}
if (is_cpu_inplace_mode != false) {
separate_config_map["is_inplace_mode"] = is_cpu_inplace_mode;
}
if (use_tensorrt != false) {
separate_config_map["use_tensorrt"] = use_tensorrt;
}
return ret;
};
lite::register_parse_info_func(info_type, lite_func);
LITE_CAPI_END();
}
int LITE_set_loader_lib_path(const char* loader_path) {
LITE_CAPI_BEGIN();
LITE_ASSERT(loader_path, "The ptr pass to LITE api is null");
lite::set_loader_lib_path(loader_path);
LITE_CAPI_END();
}
int LITE_set_persistent_cache(const char* cache_path, int always_sync) {
LITE_CAPI_BEGIN();
LITE_ASSERT(cache_path, "The ptr pass to LITE api is null");
lite::set_persistent_cache(cache_path, always_sync);
LITE_CAPI_END();
}
int LITE_set_tensor_rt_cache(const char* cache_path) {
LITE_CAPI_BEGIN();
LITE_ASSERT(cache_path, "The ptr pass to LITE api is null");
lite::set_tensor_rt_cache(cache_path);
LITE_CAPI_END();
}
int LITE_set_log_level(LiteLogLevel level) {
LITE_CAPI_BEGIN();
lite::set_log_level(level);
LITE_CAPI_END();
}
int LITE_get_log_level(LiteLogLevel* level) {
LITE_CAPI_BEGIN();
LITE_ASSERT(level, "The ptr pass to LITE api is null");
*level = lite::get_log_level();
LITE_CAPI_END();
}
int LITE_dump_persistent_cache(const char* cache_path) {
LITE_CAPI_BEGIN();
LITE_ASSERT(cache_path, "The ptr pass to LITE api is null");
lite::dump_persistent_cache(cache_path);
LITE_CAPI_END();
}
int LITE_dump_tensor_rt_cache() {
LITE_CAPI_BEGIN();
lite::dump_tensor_rt_cache();
LITE_CAPI_END();
}
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
# -*- coding: utf-8 -*-
# This file is part of MegEngine, a deep learning framework developed by
# Megvii.
#
# Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
from .base import *
from .global_setting import *
from .network import *
from .struct import *
from .tensor import *
from .utils import *
# -*- coding: utf-8 -*-
# This file is part of MegEngine, a deep learning framework developed by
# Megvii.
#
# Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
from ctypes import *
import numpy as np
from .base import _Ctensor, _lib, _LiteCObjBase
from .network import *
from .struct import LiteDataType, LiteDeviceType, LiteIOType, Structure
from .tensor import *
LiteDecryptionFunc = CFUNCTYPE(
c_size_t, c_void_p, c_size_t, POINTER(c_uint8), c_size_t, c_void_p
)
class _GlobalAPI(_LiteCObjBase):
"""
get the api from the lib
"""
_api_ = [
("LITE_get_device_count", [c_int, POINTER(c_size_t)]),
("LITE_try_coalesce_all_free_memory", []),
(
"LITE_register_decryption_and_key",
[c_char_p, LiteDecryptionFunc, POINTER(c_uint8), c_size_t],
),
(
"LITE_update_decryption_or_key",
[c_char_p, c_void_p, POINTER(c_uint8), c_size_t],
),
("LITE_set_loader_lib_path", [c_char_p]),
("LITE_set_persistent_cache", [c_char_p, c_int]),
        ("LITE_set_tensor_rt_cache", [c_char_p]),
("LITE_dump_persistent_cache", [c_char_p]),
("LITE_dump_tensor_rt_cache", [c_char_p]),
]
def decryption_func(func):
"""the decryption function decorator
:type func: a function accept three array, in_arr, key_arr and out_arr, if out_arr is None, just query the out array lenght in byte
"""
@CFUNCTYPE(c_size_t, c_void_p, c_size_t, POINTER(c_uint8), c_size_t, c_void_p)
def wrapper(c_in_data, in_length, c_key_data, key_length, c_out_data):
in_arr = np.frombuffer(c_in_data, dtype=np.uint8, count=in_length)
key_arr = np.frombuffer(c_key_data, dtype=np.uint8, count=key_length)
if c_out_data:
            out_length = func(in_arr, key_arr, None)
out_arr = np.frombuffer(c_out_data, dtype=np.uint8, count=out_length)
return func(in_arr, key_arr, out_arr)
# just query the output length
else:
return func(in_arr, key_arr, None)
return wrapper
class LiteGlobal(object):
"""
some global config in lite
"""
_api = _GlobalAPI()._lib
@staticmethod
def register_decryption_and_key(decryption_name, decryption_func, key):
c_name = c_char_p(decryption_name.encode("utf-8"))
key_length = len(key)
c_key = (c_uint8 * key_length)(*key)
LiteGlobal._api.LITE_register_decryption_and_key(
c_name, decryption_func, c_key, key_length
)
@staticmethod
def update_decryption_key(decryption_name, key):
c_name = c_char_p(decryption_name.encode("utf-8"))
key_length = len(key)
c_key = (c_uint8 * key_length)(*key)
LiteGlobal._api.LITE_update_decryption_or_key(c_name, None, c_key, key_length)
@staticmethod
def set_loader_lib_path(path):
c_path = c_char_p(path.encode("utf-8"))
LiteGlobal._api.LITE_set_loader_lib_path(c_path)
@staticmethod
def set_persistent_cache(path, always_sync=False):
c_path = c_char_p(path.encode("utf-8"))
LiteGlobal._api.LITE_set_persistent_cache(c_path, always_sync)
@staticmethod
def set_tensorrt_cache(path):
c_path = c_char_p(path.encode("utf-8"))
        LiteGlobal._api.LITE_set_tensor_rt_cache(c_path)
@staticmethod
def dump_persistent_cache(path):
c_path = c_char_p(path.encode("utf-8"))
LiteGlobal._api.LITE_dump_persistent_cache(c_path)
@staticmethod
def dump_tensorrt_cache():
        LiteGlobal._api.LITE_dump_tensor_rt_cache()
@staticmethod
def get_device_count(device_type):
count = c_size_t()
LiteGlobal._api.LITE_get_device_count(device_type, byref(count))
return count.value
@staticmethod
def try_coalesce_all_free_memory():
LiteGlobal._api.LITE_try_coalesce_all_free_memory()