diff --git a/lite/.gitattributes b/lite/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..c268a9ab651063425e1c50893566b10bed7f84fc --- /dev/null +++ b/lite/.gitattributes @@ -0,0 +1,10 @@ +test/resource/input_data.npy filter=lfs diff=lfs merge=lfs -text +test/resource/lite/shufflenet.mge filter=lfs diff=lfs merge=lfs -text +test/resource/lite/shufflenet_crypt_aes.mge filter=lfs diff=lfs merge=lfs -text +test/resource/lite/test_packed_model.lite filter=lfs diff=lfs merge=lfs -text +test/resource/lite/test_packed_model_rc4.lite filter=lfs diff=lfs merge=lfs -text +test/resource/lite/output_data.npy filter=lfs diff=lfs merge=lfs -text +test/resource/lite/model.mgb filter=lfs diff=lfs merge=lfs -text +test/resource/lite/liveness_rgb_nosub128.rknn filter=lfs diff=lfs merge=lfs -text +third_party/librknn_api filter=lfs diff=lfs merge=lfs -text +test/resource/lite/model_atlas.mgb filter=lfs diff=lfs merge=lfs -text diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc61418870f59f4dc75931c6ac86ffdebb644024 --- /dev/null +++ b/lite/CMakeLists.txt @@ -0,0 +1,135 @@ +option(LITE_BUILD_WITH_MGE "Build lite with MegEngine." ON) + +# config lite_build_config.h.in +set(LITE_WITH_OPENCL ${MGE_WITH_OPENCL}) +set(LITE_WITH_CUDA ${MGE_WITH_CUDA}) +set(LITE_ENABLE_LOGGING ${MGE_ENABLE_LOGGING}) +set(LITE_ENABLE_EXCEPTION ${MGE_ENABLE_EXCEPTIONS}) +set(LITE_ASSERT_LOC ${MGB_ASSERT_LOC}) + +if(NOT MGB_WITH_FLATBUFFERS) + include(../cmake/flatbuffers.cmake) +endif() + +file(GLOB_RECURSE SRC_FBS src/**/*.fbs) +build_flatbuffers( + "${SRC_FBS}" + "" + lite_fbs_generate + "" + "${CMAKE_CURRENT_BINARY_DIR}" + "" + "" + ) + +file(GLOB_RECURSE SOURCES_LITE src/*.cpp src/*.cc lite-c/*.cpp) + +if(MGE_WITH_MINIMUM_SIZE) + set(LITE_ENABLE_LOGGING OFF) + set(LITE_ENABLE_EXCEPTION OFF) +endif() + +# Write out lite_build_config.h +# It defines macros needed by lite +configure_file(src/lite_build_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/genfiles/lite_build_config.h) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/genfiles/lite_build_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + +# begin config lite +if(LITE_BUILD_WITH_MGE AND LITE_WITH_CUDA AND NOT WIN32) + # FXIME third_party cpp redis do not support build with clang-cl + file(GLOB_RECURSE SOURCES_CPP_REDIS ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/sources/*.cpp) + list(APPEND SOURCES_LITE ${SOURCES_CPP_REDIS}) + file(GLOB_RECURSE SOURCES_TACOPIE ${PROJECT_SOURCE_DIR}/third_party/tacopie/sources/*.cpp) + list(APPEND SOURCES_LITE ${SOURCES_TACOPIE}) +endif() +add_library(lite_static STATIC ${SOURCES_LITE}) +add_dependencies(lite_static lite_fbs_generate) +include_directories($) + +if(LITE_BUILD_WITH_MGE) + target_link_libraries(lite_static PRIVATE megbrain megdnn ${MGE_CUDA_LIBS}) + add_compile_definitions(LITE_BUILD_WITH_MGE=1) + message(STATUS "build lite with MegEngine.") +else() + target_link_libraries(lite_static PUBLIC flatbuffers) +endif() + +include_directories( + PUBLIC $ + PUBLIC $ + PUBLIC $ + PUBLIC $ + PUBLIC $ + PUBLIC $ + ) +# end config lite + +# define a shared lib +add_library(lite_shared SHARED $) +if(LITE_BUILD_WITH_MGE) + target_link_libraries(lite_shared PRIVATE megbrain megdnn ${MGE_CUDA_LIBS}) +endif() +if(ANDROID) + link_libraries(log) + target_link_libraries(lite_static PRIVATE log) + target_link_libraries(lite_shared PRIVATE log) +endif() + +if(LITE_BUILD_WITH_MGE AND LITE_WITH_CUDA AND NOT WIN32) + # FXIME third_party cpp redis do not 
support build with clang-cl + target_include_directories(lite_static PRIVATE ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/includes) + target_include_directories(lite_static PRIVATE ${PROJECT_SOURCE_DIR}/third_party/tacopie/includes) + target_include_directories(lite_shared PRIVATE ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/includes) + target_include_directories(lite_shared PRIVATE ${PROJECT_SOURCE_DIR}/third_party/tacopie/includes) +endif() +set(LITE_VERSION_SCRIPT ${PROJECT_SOURCE_DIR}/lite/src/version_lite.ld CACHE INTERNAL "Path to linker version script") +add_custom_target(_lite_version_ld SOURCES ${LITE_VERSION_SCRIPT}) +if(NOT MSVC AND NOT WIN32) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden") +endif() +#TODO: implemente version script for other OS +if (UNIX AND NOT APPLE) + target_link_options(lite_shared PRIVATE -Wl,--version-script=${LITE_VERSION_SCRIPT}) + set_target_properties(lite_shared PROPERTIES LINK_DEPENDS ${LITE_VERSION_SCRIPT}) +endif() + +# config install +install(TARGETS lite_static + LIBRARY DESTINATION lite/lib/${MGE_ARCH} + FRAMEWORK DESTINATION lite/lib/${MGE_ARCH} + ARCHIVE DESTINATION lite/lib/${MGE_ARCH}) + +install(TARGETS lite_shared + LIBRARY DESTINATION lite/lib/${MGE_ARCH} + FRAMEWORK DESTINATION lite/lib/${MGE_ARCH} + ARCHIVE DESTINATION lite/lib/${MGE_ARCH} + ) + +install(FILES ${PROJECT_SOURCE_DIR}/lite/include/lite/common_enum_c.h + DESTINATION ${CMAKE_INSTALL_PREFIX}/lite/include/lite-c) + +install(DIRECTORY ${PROJECT_SOURCE_DIR}/lite/include + DESTINATION ${CMAKE_INSTALL_PREFIX}/lite FILES_MATCHING PATTERN "*.h") + +install(DIRECTORY ${PROJECT_SOURCE_DIR}/lite/lite-c/include + DESTINATION ${CMAKE_INSTALL_PREFIX}/lite FILES_MATCHING PATTERN "*.h") + +add_subdirectory(example) +if(MGE_WITH_TEST) + add_subdirectory(test) +endif() + +# tools and example +add_executable(rc4_encryptor tools/rc4_encrypt.cpp) + +target_link_libraries(rc4_encryptor lite_static) +if(LITE_BUILD_WITH_MGE AND MGE_WITH_ROCM) + # FIXME: hip obj can not find cpp obj only through lite_static + target_link_libraries(rc4_encryptor megdnn) +endif() +target_include_directories(rc4_encryptor PRIVATE + {PROJECT_SOURCE_DIR}/lite/src/decryption) +install (TARGETS rc4_encryptor + EXPORT ${LITE_EXPORT_TARGETS} + RUNTIME DESTINATION lite/tools) diff --git a/lite/README.md b/lite/README.md new file mode 100755 index 0000000000000000000000000000000000000000..8720a1057f228b643abcd07c1cc03c0ff526cee2 --- /dev/null +++ b/lite/README.md @@ -0,0 +1,251 @@ +# Lite + +It is a lite warper of MegEngine, to enable MegEngine easy to be integrated in +user's SDK + +## bazel build + +目前支持内部 bazel 和 CMake 编译,支持 C++/C, Python 接口, +下面是 bazel 中 lite_shared 目标的编译,可以作为其他目标的编译的参考, +该编译依赖内部 bazel 编译以及 megvii3。 + +### 配置编译环境 + +需要使用 megvii3 workspace 来完成 bazel 的编译 + +#### Clone megvii3 安装 bazel + +```bash + git clone git@git-core.megvii-inc.com:brain-sdk/megvii3.git + ./utils/bazel/get_bazel.sh +``` + +#### Clone megbrain +``` + git submodule update brain/megbrain brain/midout +``` + +### 编译 x86 CUDA 版本 + +```bash + ./bazel build //brain/megbrain/lite:lite_shared --cpu="k8" \ + --compiler="gcc7_cuda10" -c opt +``` + +### 编译 x86 CPU 版本 + +```bash + ./bazel build //brain/megbrain/lite:lite_shared --cpu="k8" \ + --compiler="gcc9" -c opt +``` + +### 编译 arm OpenCL 版本 + +```bash + ./bazel build //brain/megbrain/lite:lite_shared_shared --cpu=android_aarch64 \ + -c opt --define enable_opencl=1 --define enable_opencl_search=1 +``` +### 编译 arm opencl 
lite_examples +bazel-3.0.0-megvii2 build //brain/megbrain/lite:lite_shared_examples \ +--cpu=android_aarch64 --define enable_opencl=1 --define enable_opencl_search=1 +####如何运行snpe_loder 的lite_exampes 请查看下面的wiki +https://wiki.megvii-inc.com/pages/viewpage.action?pageId=268786906 + +### 编译 armv7 CPU 版本 + +```bash + ./bazel build //brain/megbrain/lite:lite_shared --cpu=android_armv7 \ + -c opt +``` + +### 编译 arm64 CPU 版本 + +```bash + ./bazel build //brain/megbrain/lite:lite_shared --cpu=android_aarch64 \ + -c opt +``` + +### 编译 arm64 CPU v8.2 版本 + +```bash + ./bazel build //brain/megbrain/lite:lite_shared --cpu=android_aarch64 \ + --copt -march=armv8.2-a+fp16+dotprod -c opt +``` + +## 同时支持cmake构建 +cmake构建参考scripts/cmake-build/BUILD_README.md,下面example表示同时支持编译megengine +和RKNPU后端且打开OpenCL的release模式 +```bash +EXTRA_CMAKE_ARGS="-DANDROID_NATIVE_API_LEVEL=24 -DLITE_BUILD_WITH_RKNPU=ON -DMGE_WITH_OPENCL=ON \ +-DMGE_OPENCL_SEARCH_ALGO=ON -DCUSTOM_C_OPR_INIT_FUNC=custom_loader_func" ./scripts/cmake-build/cross_build_android_arm_inference.sh" +``` +* 如果需要支持性能分析的 profile 功能,则需要在编译时候加上 + --copt -DMGB_ENABLE_JSON=1 该参数 +* 如果需要支持 fast-run 功能则需要加上 + --copt -DMGB_ENABLE_FASTRUN=1,开启 fast-run 功能 +* 如果编译 arm64,可以加上 --copt -mcpu=cortex-a53 选项进行优化。 + +### midout 裁减编译 +具体 midout 的裁减原理见 megbrain 中 midout 裁减,裁减方法见 MegBrain +和 MegEngine 的裁减方法 + +## 模型 + +### 支持的模型 + +lite 目前支持只支持 MegEngine dump 的模型格式,可以加载的模型文件包括原始 +的模型文件,原始的加密模型,pack 之后的加密或者非加密模型。加密算法以及 +加密的秘钥可以用户自定义,然后注册到 lite 中,详见 example 中加解密部分。 + +* 原始模型未加密:直接将完成训练的模型在 MegEngine 环境中进行 dump 生成的模型 +* 原始加密模型:将上述 dump 的模型通过加密算法进行加密,lite 提供两种默认 +的加密算法,在 tools 中,分别为 aes 和 rc4. 对应为:aes_encypt.sh 和 +rc4_encrypt.cpp,rc4_encrypt.cpp 需要编译生成可执行文件。这种方式加密的模型在 +加载时候需要在 Config 中配置模型的加密方式。 +* pack 之后的模型:模型结构将在下面介绍,可以将上面加密或者未加密的模型,和下面 +定义的 json config 文件一同打包为一个 pack 之后的模型,可以使用 tools 下面 +的 pack_model_and_info.py 工具中完成,pack_model_and_info.py 的使用详见其中 +的 help 输出。 + +### 模型结构 + +不同的模型文件主要是通过 pack 之后的模型文件中的 model_tag 来区分. 
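Whichever of these model flavors is used, loading looks the same on the API side; only the `Config` changes. Below is a minimal C++ sketch, not part of this patch, that loads a bare model encrypted with the built-in `RC4_default` scheme; the helper name and include set are assumptions based on the examples later in this diff (`user_cryption.cpp` shows the same flow with a user-registered scheme).

```C++
// Minimal sketch (assumed helper, not part of this patch): load a bare model
// that was encrypted with the bundled RC4 tool. For a user-defined scheme,
// call register_decryption_and_key(name, func, key) first and put that name
// into bare_model_cryption_name instead.
#include <memory>
#include <string>

#include "lite/global.h"
#include "lite/network.h"

std::shared_ptr<lite::Network> load_rc4_encrypted_model(const std::string& path) {
    lite::Config config;
    // Name of the decryption scheme; "AES_default" and
    // "SIMPLE_FAST_RC4_default" are the other built-in choices.
    config.bare_model_cryption_name = "RC4_default";
    auto network = std::make_shared<lite::Network>(config);
    network->load_model(path);  // the model is decrypted while it is loaded
    return network;
}
```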
+ +* 打包处理之后的文件: + 模型打包过程可以通过脚本 pack_model_and_json.py 来完成,其将模型info文件( + 可以是任意格式,推荐使用JSON,可以加密也可以不加密)和加密或者未加密的模型文件 + 一同打包在一起,并在文件开头加上 Header 来帮助解析。 +* 原始文件和原始的加密文件没有 Header 和模型 info部分,模型加载需要的信息 + 可以通过 Config 和 NetworkIO 进行传递。 + +### Header + +Header 部分最开始为一个明文固定model_tag,目前定义为"packed_model"字符串, +后面主要包含模型文件各个部分的信息,每个部分的加密方式,load 模型时候可以 +调用相应的解密方法对各个部分进行解密,以及model infomation 部分的解析方法。 +具体细节参考lite/src/parse_model/pack_model.fbs + +### Info部分 + +Info 部分主要用来解释模型,如用户关心的:模型的输入数据的格式,模型运行的平台 +等信息,这部分信息也可以用于用户进行 check 运行的模型是否在指定的条件下运行。 +由于这个 Info 部分不同的用户需求不一致,想传递的信息也无法统一,所以目前 +Lite 中提供自定义的方式,用户可以自定义自己 Info 部分的类容,并在 Header 中 +指定 **Info 解析方式名字** ,并注册以该名字为 key 的解析函数到 Lite 中, +以这样方式来可以实现用户自定义 Info 格式。同时,Lite 中也提供了一套定义好的 +格式,其名字为 "LITE_default",并已经实现了对应的解析函数,该 info +为 JSON 格式,具体内容定义如下: + +```json +{ + "name": "shufflenet_test", + "valid": true, + "version": "8.9999.0", + "has_compression": false, + "device": { + "type": "CPU", + "device_id": 0, + "number_threads": 1, + "use_tensor_rt": false, + "enable_inplace_model": false + }, + "options":{ + "weight_preprocess": false, + "var_sanity_check_first_run": true, + "const_shape": false, + "jit_level": 0, + "record_level": 0 + }, + "IO":{ + "inputs":[ + { + "name": "data", + "io_type": "value", + "is_host": true, + "dtype": "float32", + "shape": { + "dim0": 1, + "dim1": 3, + "dim2": 224, + "dim3": 224 + } + } + ], + "outputs":[ + { + "name": "TRUE_DIV(EXP[12065],reduce0[12067])[12077]", + "io_type": "value", + "is_host": true, + "dtype": "float32", + "shape": { + "dim0": 1, + "dim1": 1000, + "dim2": 0, + "dim3": 0 + } + } + ] + } +} +``` + +* model_name: 指这个模型的名字,用户可以用来验证是否运行了正确的模型, +和 Header 部分中的进行对比 check +* valid: 指在这个 info 文件中的设置是否影响模型的 Config +* version: 指模型对应的 megbrain 的版本号,load 模型时候会进行 check +* has_compression: 标识这个模型文件中 tensor 的数据是否压缩过 +* device: 目前支持字段包括:"CPU","CUDA","OPENCL","ATLAS" +* number_threads 和 is_inplace_model : 只有在 device 为 CPU 的情况下才生效 +* IO::inputs::type: 包括 value,shape,详见 include"network.h" +* IO::inputs::is_host: 值输入数据来自 device 或者来自 host 端 +* IO::outputs::is_host: 值输出数据将保存在 device 或者 host 端 +* IO::outputs::shape::dimx: 如果为0,则便是该 dim 无效 + +### Model部分 + +可以是加密的模型文件或者未加密的模型文件 + +## 使用 + +丰富的使用方法详见文件 example 中文档和对应的 example。 + +## 工具 + +目前 lite 中有三个工具保存在 tools 目录中,其他 megbrain 工具 +没有包含在内,分别为: + +* pack_model_and_info.py 为上面提到的模型打包工具,其为一个 + python 脚本,可以直接用其对已有的模型和模型 information 的文件,按照上面 + 的格式进行打包模型,用户可以指定模型名字,模型加密方式,模型信息 + 文件加密方式,解析方式等,如下: + + ```bash + python3 pack_model_and_info.py --input-model xxx.mge \ + --model-name="shufflenet_test" \ + --model-cryption="RC4_default" \ + --input-info xxx.json \ + --info-cryption="RC4_default" \ + --info-parser="LITE_default" \ + -o xxx.lite + ``` +* aes_encrypt.sh 为一个 aes 加密方式的加密脚本,可以将一个文件, +通过指定的的 key 加密成一个 aes 加密的文件,其中 key 为 32 个字节 +16进制数。 + ```bash + aes_encrypt.sh xxx.mdl xxx_encrypted.mdl \ + 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F + ``` + +* rc4_encypt.cpp 可以被编译成为一个 rc4 加密的工具,这个工具可以通过 + 制定的 key 或者默认的 key 加密制定的文件,支持 rc4 方法和 + simple_fast_rc4 两种方法,支持自定义 key。 + * bazel 编译 x86 命令为: + ```bash + bazel build //brain/megbrain/lite:rc4_encryptor \ + --cpu='k8' --compiler='gcc9' + ``` + * 加密文件,具体用法见 help + ```bash + rc4_encryptor encrypt_predefined_rc4 \ + to_be_encrypt.file encrypted.file + ``` diff --git a/lite/build_config/lite_build_config.h b/lite/build_config/lite_build_config.h new file mode 100644 index 0000000000000000000000000000000000000000..8a606c9fda2f5cf2dc8cfe63cf903da74a4b776e --- /dev/null +++ b/lite/build_config/lite_build_config.h @@ -0,0 +1,32 @@ +/** + * \file 
lite/build_config/lite_build_config.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ +#ifndef _HEADER_LITE_BUILD_CONFIG +#define _HEADER_LITE_BUILD_CONFIG + +#ifndef LITE_ENABLE_LOGGING +#define LITE_ENABLE_LOGGING 1 +#endif + +#ifndef LITE_ENABLE_EXCEPTION +#if __cpp_exceptions || __EXCEPTIONS || \ + (defined(_MSC_VER) && defined(_CPPUNWIND)) +#define LITE_ENABLE_EXCEPTION 1 +#else +#define LITE_ENABLE_EXCEPTION 0 +#endif +#endif + +#ifndef LITE_WITH_CUDA +#define LITE_WITH_CUDA 0 +#endif + +#ifndef LITE_ASSERT_LOC +#define LITE_ASSERT_LOC 1 +#endif +#endif // _HEADER_LITE_BUILD_CONFIG diff --git a/lite/example/CMakeLists.txt b/lite/example/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..9012fded9a2d8bef2b43b0feb07c8d5203628708 --- /dev/null +++ b/lite/example/CMakeLists.txt @@ -0,0 +1,47 @@ +file (GLOB_RECURSE SOURCES ./*.cpp) +add_executable(lite_examples ${SOURCES}) + +if(LITE_BUILD_WITH_RKNPU) + #rknn sdk1.0.0 depend on libc++_shared, use gold to remove NEEDED so symbol check + target_link_options(lite_examples PRIVATE "-fuse-ld=gold") +endif() + +target_link_libraries(lite_examples lite_static) +if(LITE_BUILD_WITH_MGE AND MGE_WITH_ROCM) + # FIXME: hip obj can not find cpp obj only through lite_static + target_link_libraries(lite_examples megdnn) +endif() + +if(UNIX) + if(APPLE OR ANDROID) + target_link_libraries(lite_examples dl) + else() + target_link_libraries(lite_examples dl rt) + endif() +endif() + +install (TARGETS lite_examples + EXPORT ${LITE_EXPORT_TARGETS} + RUNTIME DESTINATION lite/bin) + +# add lite_examples_depends_shared for CI check symbol export valid +add_executable(lite_examples_depends_shared ${SOURCES}) + +if(LITE_BUILD_WITH_RKNPU) + #rknn sdk1.0.0 depend on libc++_shared, use gold to remove NEEDED so symbol check + target_link_options(lite_examples_depends_shared PRIVATE "-fuse-ld=gold") +endif() + +target_link_libraries(lite_examples_depends_shared lite_shared) + +if(UNIX) + if(APPLE OR ANDROID) + target_link_libraries(lite_examples_depends_shared dl) + else() + target_link_libraries(lite_examples_depends_shared dl rt) + endif() +endif() + +install (TARGETS lite_examples_depends_shared + EXPORT ${LITE_EXPORT_TARGETS} + RUNTIME DESTINATION lite/bin) diff --git a/lite/example/example.h b/lite/example/example.h new file mode 100644 index 0000000000000000000000000000000000000000..410ec0e610c6ed4757ada4bfc30bb81402aa300d --- /dev/null +++ b/lite/example/example.h @@ -0,0 +1,101 @@ +/** + * \file example/example.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
+ */ + +#pragma once + +#include + +#include "lite/global.h" +#include "lite/network.h" +#include "lite/tensor.h" + +#include "npy.h" + +#include +#include +#include +#include + +namespace lite { +namespace example { + +void set_cpu_affinity(const std::vector& cpuset); + +struct Args { + int args_parse_ret = 0; + std::string example_name; + std::string model_path; + std::string input_path; + std::string output_path; + std::string loader_path; + static Args from_argv(int argc, char** argv); +}; + +std::shared_ptr parse_npy( + const std::string& path, + LiteBackend backend = LiteBackend::LITE_DEFAULT); + +using ExampleFunc = std::function; +using ExampleFuncMap = std::unordered_map; + +ExampleFuncMap* get_example_function_map(); + +bool register_example(std::string example_name, const ExampleFunc& fuction); + +template +struct Register; + +#if LITE_BUILD_WITH_MGE +#if LITE_WITH_CUDA +bool load_from_path_run_cuda(const Args& args); +#endif +bool basic_load_from_path(const Args& args); +bool basic_load_from_path_with_loader(const Args& args); +bool basic_load_from_memory(const Args& args); +bool cpu_affinity(const Args& args); +bool network_share_same_weights(const Args& args); +bool reset_input(const Args& args); +bool reset_input_output(const Args& args); +bool config_user_allocator(const Args& args); +bool register_cryption_method(const Args& args); +bool update_cryption_key(const Args& args); +bool async_forward(const Args& args); + +#if LITE_WITH_CUDA +bool device_input(const Args& args); +bool device_input_output(const Args& args); +bool pinned_host_input(const Args& args); +#endif +#endif + +} // namespace example +} // namespace lite + +#if LITE_BUILD_WITH_MGE +bool basic_c_interface(const lite::example::Args& args); +bool device_io_c_interface(const lite::example::Args& args); +bool async_c_interface(const lite::example::Args& args); +#endif + +#define CONCAT_IMPL(a, b) a##b +#define MACRO_CONCAT(a, b) CONCAT_IMPL(a, b) + +#define REGIST_EXAMPLE(name_, func_) \ + REGIST_EXAMPLE_WITH_NUM(__COUNTER__, name_, func_) + +#define REGIST_EXAMPLE_WITH_NUM(number_, name_, func_) \ + template <> \ + struct Register { \ + Register() { register_example(name_, func_); } \ + }; \ + namespace { \ + Register MACRO_CONCAT(example_function_, number_); \ + } + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/example/main.cpp b/lite/example/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9e9400083060e11195d22055ad970c75fe384739 --- /dev/null +++ b/lite/example/main.cpp @@ -0,0 +1,172 @@ +/** + * \file example/example.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
+ */ + +#include "lite/global.h" +#include "lite/network.h" +#include "lite/tensor.h" + +#include "example.h" +#include "npy.h" + +#include +#include +#include +#include + +using namespace lite; +using namespace example; + +Args Args::from_argv(int argc, char** argv) { + Args ret; + if (argc < 4) { + printf("usage: lite_examples .\n"); + printf("*********The output file is optional.*************\n"); + printf("The registered examples include:\n"); + size_t index = 0; + for (auto it : *get_example_function_map()) { + printf("%zu : %s\n", index, it.first.c_str()); + index++; + } + ret.args_parse_ret = -1; + return ret; + } + ret.example_name = argv[1]; + ret.model_path = argv[2]; + ret.input_path = argv[3]; + if (argc > 4) { + ret.output_path = argv[4]; + } + if (argc > 5) { + ret.loader_path = argv[5]; + } + return ret; +} + +ExampleFuncMap* lite::example::get_example_function_map() { + static ExampleFuncMap static_map; + return &static_map; +} + +bool lite::example::register_example(std::string example_name, + const ExampleFunc& fuction) { + auto map = get_example_function_map(); + if (map->find(example_name) != map->end()) { + printf("Error!!! This example is registed yet\n"); + return false; + } + (*map)[example_name] = fuction; + return true; +} + +std::shared_ptr lite::example::parse_npy(const std::string& path, + LiteBackend backend) { + std::string type_str; + std::vector stl_shape; + std::vector raw; + npy::LoadArrayFromNumpy(path, type_str, stl_shape, raw); + + auto lite_tensor = + std::make_shared(backend, LiteDeviceType::LITE_CPU); + Layout layout; + layout.ndim = stl_shape.size(); + const std::map type_map = { + {"f4", LiteDataType::LITE_FLOAT}, + {"i4", LiteDataType::LITE_INT}, + {"i1", LiteDataType::LITE_INT8}, + {"u1", LiteDataType::LITE_UINT8}}; + layout.shapes[0] = 1; + for (size_t i = 0; i < layout.ndim; i++) { + layout.shapes[i] = static_cast(stl_shape[i]); + } + + for (auto& item : type_map) { + if (type_str.find(item.first) != std::string::npos) { + layout.data_type = item.second; + break; + } + } + lite_tensor->set_layout(layout); + size_t length = lite_tensor->get_tensor_total_size_in_byte(); + void* dest = lite_tensor->get_memory_ptr(); + memcpy(dest, raw.data(), length); + //! 
rknn not support reshape now + if (layout.ndim == 3) { + lite_tensor->reshape({1, static_cast(layout.shapes[0]), + static_cast(layout.shapes[1]), + static_cast(layout.shapes[2])}); + } + return lite_tensor; +} + +void lite::example::set_cpu_affinity(const std::vector& cpuset) { +#if defined(__APPLE__) || defined(WIN32) +#pragma message("set_cpu_affinity not enabled on apple and windows platform") +#else + cpu_set_t mask; + CPU_ZERO(&mask); + for (auto i : cpuset) { + CPU_SET(i, &mask); + } + auto err = sched_setaffinity(0, sizeof(mask), &mask); + if (err) { + printf("failed to sched_setaffinity: %s (error ignored)", + strerror(errno)); + } +#endif +} + +int main(int argc, char** argv) { + set_log_level(LiteLogLevel::WARN); + auto&& args = Args::from_argv(argc, argv); + if (args.args_parse_ret) + return -1; + auto map = get_example_function_map(); + auto example = (*map)[args.example_name]; + if (example) { + printf("Begin to run %s example.\n", args.example_name.c_str()); + return example(args); + } else { + printf("The example of %s is not registed.", args.example_name.c_str()); + return -1; + } +} +namespace lite { +namespace example { + +#if LITE_BUILD_WITH_MGE +#if LITE_WITH_CUDA +REGIST_EXAMPLE("load_from_path_run_cuda", load_from_path_run_cuda); +#endif +REGIST_EXAMPLE("basic_load_from_path", basic_load_from_path); +REGIST_EXAMPLE("basic_load_from_path_with_loader", basic_load_from_path_with_loader); +REGIST_EXAMPLE("basic_load_from_memory", basic_load_from_memory); +REGIST_EXAMPLE("cpu_affinity", cpu_affinity); +REGIST_EXAMPLE("register_cryption_method", register_cryption_method); +REGIST_EXAMPLE("update_cryption_key", update_cryption_key); +REGIST_EXAMPLE("network_share_same_weights", network_share_same_weights); +REGIST_EXAMPLE("reset_input", reset_input); +REGIST_EXAMPLE("reset_input_output", reset_input_output); +REGIST_EXAMPLE("config_user_allocator", config_user_allocator); +REGIST_EXAMPLE("async_forward", async_forward); + +REGIST_EXAMPLE("basic_c_interface", basic_c_interface); +REGIST_EXAMPLE("device_io_c_interface", device_io_c_interface); +REGIST_EXAMPLE("async_c_interface", async_c_interface); + +#if LITE_WITH_CUDA +REGIST_EXAMPLE("device_input", device_input); +REGIST_EXAMPLE("device_input_output", device_input_output); +REGIST_EXAMPLE("pinned_host_input", pinned_host_input); +#endif +#endif +} // namespace example +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/example/mge/README.md b/lite/example/mge/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f41115d22cadd7b147103015a43566e1e098ee7b --- /dev/null +++ b/lite/example/mge/README.md @@ -0,0 +1,166 @@ +# Example + +在该 example 目录中实现了一系列调用 lite 接口来实现 inference 的例子,主要 +是演示 lite 中不同接口的调用来实现不同情况下的 inference 功能。这里所有的 example +都是使用 shufflenet 来进行演示。 + +## Example bazel 的编译和运行 + +* 参考主目录下面的 README.md 搭建 megvii3 bazel 的编译环境,编译 CPU 版本 +```bash + ./bazel build //brain/megbrain/lite:lite_examples --cpu="k8" \ + --compiler="gcc9" -c opt +``` +* 运行时需要指定运行的具体 example 名字,运行的模型,模型运行的数据 + * 获取所有的 example 名字 +``` + bazel-bin/brain/megbrain/lite/lite_examples +``` + * 运行 example,下面命令运行 basic_load_from_memory +``` + bazel-bin/brain/megbrain/lite/lite_examples \ + basic_load_from_memory \ + path-to-megbrain/lite/test/resource/lite/shufflenet.mge \ + path-to-megbrain/lite/test/resource/lite/input_data.npy +``` + +## basic 使用 + +* **实现在文件 basic.cpp 中, 包括 basic_load_from_path 和 + basic_load_from_memory** + +* 该 example 使用 lite 来完成基本的 inference 功能,load 
模型使用默认的配置, +进行 forward 之前将输入数据 copy 到输入 tensor 中,完成 forward 之后,再将 +数据从输出 tensor 中 copy 到用户的内存中,输入 tensor 和输出 tensor 都是从 +Network 中通过 name 来获取的,输入输出 tensor 的 layout 也可以从对应的 tensor +中直接获取获取,**输出 tensor 的 layout 必须在 forward 完成之后获取才是正确的。** + +## 输入输出指定的内存 + +* **实现在 reset_io.cpp 中,包括两个 example,reset_input 和 reset_input_output +两个 example。** + +* 该 example 中演示输入 tensor 的内存为用户指定的内存(该内存中已经保存好输入 +数据),输出 tensor 也可以是用户指定的内存,这样 Network 完成 Forward 之后就会将数据 +保存在指定的输出内存中。如此减少不必要的 memory copy 的操作。 + +* 主要是通过 tensor 中的 reset 接口,该接口可以重新指定 tensor 的内存和对应的 +layout,如果 layout 没有指定,默认为 tensor 中原来的 layout。 + +* **该方法中由于内存是用户申请,需要用户提前知道输入,输出 tensor 对应的 layout,然后 +根据 layout 来申请内存,另外通过 reset 设置到 tensor 中的内存,生命周期不由 tensor +管理,由外部用户来管理。** + +## 输入输出指定 device 上内存 + +* **实现在 device_io.cpp 中,device_input 和 device_input_output 两个 example。** + +* 该 example 中配置模型运行在 device(CUDA) 上,并且使用用户提前申请的 device 上的内存 +作为模型运行的输入和输出。需要在 Network 构建的时候指定输入输出的在 device 上,不设置默认 +在 CPU 上,其他地方和**输入输出为用户指定的内存**的使用相同 + +* 可以通过 tensor 的 is_host() 接口来判断该 tensor 在 device 端还是 host 端 + +## 申请 pinned host 内存作为输入 + +* **实现在 device_io.cpp 中,函数名字为 pinned_host_input。** + +* 这个 example 中模型运行在 device(CUDA) 上,但是输入输出在 CPU 上,为了加速 host2device 的 +copy,将 CPU 上的 input tensor 的内存指定提前申请为 cuda pinned 内存。目前如果输出 +output tensor 不是 device 上的时候,默认就是 pinned host 的。 + +* 申请 pinned host 内存的方法是:构建 tensor 的时候指定 device,layout,以及 is_host_pinned +参数,这样申请的内存就是 pinned host 的内存。 + + ```C + bool is_pinned_host = true; + auto tensor_pinned_input = + Tensor(LiteDeviceType::LITE_CUDA, input_layout, is_pinned_host); + ``` + +## 用户指定内存分配器 + +* **实现在 user_allocator.cpp 中,函数名为:config_user_allocator。** + +* 这个例子中使用用户自定义的 CPU 内存分配器演示了用户设置自定义的 Allocator 的方法,用户自定义 +内存分配器需要继承自 lite 中的 Allocator 基类,并实现 allocate 和 free 两个接口。目前在 CPU +上验证是正确的,其他设备上有待测试。 + +* 设置自定定义内存分配器的接口为 Network 中如下接口: + ```C + Network& set_memory_allocator(std::shared_ptr user_allocator); + ``` + +## 多个 Network 共享同一份模型 weights + +* **实现在 network_share_weights.cpp 中,函数名为:network_share_same_weights。** + +* 很多情况用户希望多个 Network 共享同一份 weights,因为模型中 weights 是只读的,这样可以节省 +模型的运行时内存使用量。这个例子主要演示了 lite 中如何实现这个功能,首先创建一个新的 Network, +用户可以指定新的 Config 和 NetworkIO 以及其他一些配置,使得新创建出来的 Network 完成不同的 +功能。 + +* 通过已有的 NetWork load 一个新的 Network 的接口为 Network 中如下接口: + ```C + static void shared_weight_with_network( + std::shared_ptr dst_network, + const std::shared_ptr src_network); + ``` + * dst_network: 指新 load 出来的 Network + * src_network:已经 load 的老的 Network + +## CPU 绑核 + +* **实现在 cpu_affinity.cpp 中,函数名为:cpu_affinity。** + +* 该 example 之中指定模型运行在 CPU 多线程上,然后使用 Network 中的 +set_runtime_thread_affinity 来设置绑核回调函数。该回调函数中会传递当前线程的 id 进来,用户可以 +根据该 id 决定具体绑核行为,在多线程中,如果线程总数为 n,则 id 为 n-1 的线程为主线程。 + +## 用户注册自定义解密算法和 key + +* **实现在 user_cryption.cpp 中,函数名为:register_cryption_method 和 update_aes_key 。** + +* 这两个 example 主要使用 lite 自定义解密算法和更新解密算法的接口,实现了使用用户自定的解密算法 +实现模型的 load 操作。在这个 example 中,自定义了一个解密方法,(其实没有做任何事情, +将模型两次异或上 key 之后返回,等于将原始模型直接返回),然后将其注册到 lite 中,后面创建 Network 时候在其 +config 中的 bare_model_cryption_name 指定具体的解密算法名字。在第二个 example 展示了对其 +key 的更新操作。 +目前 lite 里面定义好了几种解密算法: + * AES_default : 其 key 是由 32 个 unsighed char 组成,默认为0到31 + * RC4_default : 其 key 由 hash key 和 enc_key 组成的8个 unsigned char,hash + key 在前,enc_key 在后。 + * SIMPLE_FAST_RC4_default : 其 key 组成同 RC4_default。 +大概命名规则为:前面大写是具体算法的名字,'_'后面的小写,代表解密 key。 +具体的接口为: + ```C + bool register_decryption_and_key(std::string decrypt_name, + const DecryptionFunc& func, + const std::vector& key); + bool update_decryption_or_key(std::string decrypt_name, + const DecryptionFunc& func, + const std::vector& key); + ``` +register 
接口中必须要求三个参数都是正确的值,update中 decrypt_nam 必须为已有的解密算法, +将使用 func 和 key 中不为空的部分对 decrypt_nam 解密算法进行更新 + +## 异步执行模式 + +* **实现在 basic.cpp 中,函数名为:async_forward。** + +* 用户通过接口注册异步回调函数将设置 Network 的 Forward 模式为异步执行模式, +目前异步执行模式只有在 CPU 和 CUDA 10.0 以上才支持,在 inference 时异步模式, +主线程可以在工作线程正在执行计算的同时做一些其他的运算,避免长时间等待,但是 +在一些单核处理器上没有收益。 + +## 纯 C example + +* **实现在 lite_c_interface.cpp,函数名为:basic_c_interface, +device_io_c_interface,async_c_interface** + +* Lite 完成对 C++ 接口的封装,对外暴露了纯 C 的接口,用户如果不是源码依赖 Lite +的情况下,应该使用纯 C 接口来完成集成。 +* 纯 C 的所有接口都是返回一个 int,如果这个 int 的数值不为 0,则又错误产生,需要 +调用 LITE_get_last_error 来获取错误信息。 +* 纯 C 的所有 get 函数都需要先定义一个对应的对象,然后将该对象的指针传递进接口, +Lite 会将结果写入到 对应指针的地址里面。 diff --git a/lite/example/mge/basic.cpp b/lite/example/mge/basic.cpp new file mode 100644 index 0000000000000000000000000000000000000000..986f1fc304f1afff9bab6d04c6f1efdffdd75c62 --- /dev/null +++ b/lite/example/mge/basic.cpp @@ -0,0 +1,370 @@ +/** + * \file example/basic.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include +#include "../example.h" +#if LITE_BUILD_WITH_MGE +#include + +#include "misc.h" + +using namespace lite; +using namespace example; + +namespace { +void output_info(std::shared_ptr network, size_t output_size) { + for (size_t index = 0; index < output_size; index++) { + printf("output[%zu] names %s \n", index, + network->get_all_output_name()[index].c_str()); + std::shared_ptr output_tensor = + network->get_output_tensor(index); + size_t ndim = output_tensor->get_layout().ndim; + for (size_t i = 0; i < ndim; i++) { + printf("output[%zu] tensor.shape[%zu] %zu \n", index, i, + output_tensor->get_layout().shapes[i]); + } + } +} + +void output_data_info(std::shared_ptr network, size_t output_size) { + for (size_t index = 0; index < output_size; index++) { + auto output_tensor = network->get_output_tensor(index); + void* out_data = output_tensor->get_memory_ptr(); + size_t out_length = output_tensor->get_tensor_total_size_in_byte() / + output_tensor->get_layout().get_elem_size(); + LiteDataType dtype = output_tensor->get_layout().data_type; + float max = -1000.0f; + float min = 1000.0f; + int max_idx = 0; + int min_idx = 0; + float sum = 0.0f; +#define cb(_dtype, _real_dtype) \ + case LiteDataType::_dtype: { \ + for (size_t i = 0; i < out_length; i++) { \ + _real_dtype data = static_cast<_real_dtype*>(out_data)[i]; \ + sum += data; \ + if (max < data) { \ + max = data; \ + max_idx = i; \ + } \ + if (min > data) { \ + min = data; \ + min_idx = i; \ + } \ + } \ + } break; + + switch (dtype) { + cb(LITE_FLOAT, float); + cb(LITE_INT, int); + cb(LITE_INT8, int8_t); + cb(LITE_UINT8, uint8_t); + default: + printf("unknow datatype"); + } + printf("output_length %zu index %zu max=%e , max idx=%d, min=%e , min_idx=%d, sum=%e\n", + out_length, index, max, max_idx, min, min_idx, sum); + } +#undef cb +} +} // namespace + +#if LITE_WITH_CUDA +bool lite::example::load_from_path_run_cuda(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + set_log_level(LiteLogLevel::DEBUG); + //! config the network running in CUDA device + lite::Config config{false, -1, LiteDeviceType::LITE_CUDA}; + //! set NetworkIO + NetworkIO network_io; + std::string input_name = "img0_comp_fullface"; + bool is_host = false; + IO device_input{input_name, is_host}; + network_io.inputs.push_back(device_input); + //! 
create and load the network + std::shared_ptr network = + std::make_shared(config, network_io); + network->load_model(network_path); + + std::shared_ptr input_tensor = network->get_input_tensor(0); + Layout input_layout = input_tensor->get_layout(); + + //! read data from numpy data file + auto src_tensor = parse_npy(input_path); + + //! malloc the device memory + auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout); + + //! copy to the device memory + tensor_device.copy_from(*src_tensor); + + //! Now the device memory if filled with user input data, set it to the + //! input tensor + input_tensor->reset(tensor_device.get_memory_ptr(), input_layout); + + //! forward + { + lite::Timer ltimer("warmup"); + network->forward(); + network->wait(); + ltimer.print_used_time(0); + } + lite::Timer ltimer("forward_iter"); + for (int i = 0; i < 10; i++) { + ltimer.reset_start(); + network->forward(); + network->wait(); + ltimer.print_used_time(i); + } + //! get the output data or read tensor set in network_in + size_t output_size = network->get_all_output_name().size(); + output_info(network, output_size); + output_data_info(network, output_size); + return true; +} +#endif +bool lite::example::basic_load_from_path(const Args& args) { + set_log_level(LiteLogLevel::DEBUG); + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! create and load the network + std::shared_ptr network = std::make_shared(); + network->load_model(network_path); + //! set input data to input tensor + std::shared_ptr input_tensor = network->get_input_tensor(0); + + auto layout = input_tensor->get_layout(); + for (size_t i = 0; i < layout.ndim; i++) { + printf("model input shape[%zu]=%zu \n", i, layout.shapes[i]); + } + + //! copy or forward data to network + size_t length = input_tensor->get_tensor_total_size_in_byte(); + void* dst_ptr = input_tensor->get_memory_ptr(); + auto src_tensor = parse_npy(input_path); + auto layout0 = src_tensor->get_layout(); + for (size_t i = 0; i < layout0.ndim; i++) { + printf("src shape[%zu]=%zu \n", i, layout0.shapes[i]); + } + void* src = src_tensor->get_memory_ptr(); + memcpy(dst_ptr, src, length); + + //! forward + { + lite::Timer ltimer("warmup"); + network->forward(); + network->wait(); + ltimer.print_used_time(0); + } + lite::Timer ltimer("forward_iter"); + for (int i = 0; i < 10; i++) { + network->forward(); + network->wait(); + ltimer.print_used_time(i); + } + + //! forward + { + lite::Timer ltimer("warmup"); + network->forward(); + network->wait(); + ltimer.print_used_time(0); + } + for (int i = 0; i < 10; i++) { + ltimer.reset_start(); + network->forward(); + network->wait(); + ltimer.print_used_time(i); + } + + //! get the output data or read tensor set in network_in + size_t output_size = network->get_all_output_name().size(); + output_info(network, output_size); + output_data_info(network, output_size); + return true; +} + +bool lite::example::basic_load_from_path_with_loader(const Args& args) { + set_log_level(LiteLogLevel::DEBUG); + lite::set_loader_lib_path(args.loader_path); + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! create and load the network + std::shared_ptr network = std::make_shared(); + network->load_model(network_path); + + //! set input data to input tensor + std::shared_ptr input_tensor = network->get_input_tensor(0); + + auto input_layout = input_tensor->get_layout(); + + //! 
copy or forward data to network + auto src_tensor = parse_npy(input_path); + auto src_layout = src_tensor->get_layout(); + if (src_layout.ndim != input_layout.ndim) { + printf("src dim is not equal model input dim\n"); + } + //! pay attention the input shape can change + for (size_t i = 0; i < input_layout.ndim; i++) { + if (input_layout.shapes[i] != src_layout.shapes[i]) { + printf("src shape not equal input shape"); + } + } + input_tensor->set_layout(src_tensor->get_layout()); + + //! reset or forward data to network + input_tensor->reset(src_tensor->get_memory_ptr(), src_tensor->get_layout()); + + //! forward + network->forward(); + network->wait(); + + //! forward + { + lite::Timer ltimer("warmup"); + network->forward(); + network->wait(); + ltimer.print_used_time(0); + } + lite::Timer ltimer("forward_iter"); + for (int i = 0; i < 10; i++) { + ltimer.reset_start(); + network->forward(); + network->wait(); + ltimer.print_used_time(i); + } + + //! get the output data or read tensor set in network_in + size_t output_size = network->get_all_output_name().size(); + output_info(network, output_size); + output_data_info(network, output_size); + return true; +} + +bool lite::example::basic_load_from_memory(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! create and load the network + std::shared_ptr network = std::make_shared(); + + FILE* fin = fopen(network_path.c_str(), "rb"); + if (!fin) { + printf("failed to open %s.", network_path.c_str()); + } + + fseek(fin, 0, SEEK_END); + size_t size = ftell(fin); + fseek(fin, 0, SEEK_SET); + void* ptr = malloc(size); + std::shared_ptr buf{ptr, ::free}; + auto len = fread(buf.get(), 1, size, fin); + if (len < 1) { + printf("read file failed.\n"); + } + fclose(fin); + + network->load_model(buf.get(), size); + + //! set input data to input tensor + std::shared_ptr input_tensor = network->get_input_tensor(0); + //! copy or forward data to network + size_t length = input_tensor->get_tensor_total_size_in_byte(); + void* dst_ptr = input_tensor->get_memory_ptr(); + auto src_tensor = parse_npy(input_path); + void* src = src_tensor->get_memory_ptr(); + memcpy(dst_ptr, src, length); + + //! forward + network->forward(); + network->wait(); + + //! get the output data or read tensor set in network_in + std::shared_ptr output_tensor = network->get_output_tensor(0); + void* out_data = output_tensor->get_memory_ptr(); + size_t out_length = output_tensor->get_tensor_total_size_in_byte() / + output_tensor->get_layout().get_elem_size(); + printf("length=%zu\n", length); + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(out_data)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} + +bool lite::example::async_forward(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + Config config; + config.options.var_sanity_check_first_run = false; + + //! create and load the network + std::shared_ptr network = std::make_shared(config); + + network->load_model(network_path); + + //! set input data to input tensor + std::shared_ptr input_tensor = network->get_input_tensor(0); + //! 
copy or forward data to network + size_t length = input_tensor->get_tensor_total_size_in_byte(); + void* dst_ptr = input_tensor->get_memory_ptr(); + auto src_tensor = parse_npy(input_path); + void* src = src_tensor->get_memory_ptr(); + memcpy(dst_ptr, src, length); + + //! set async mode and callback + volatile bool finished = false; + network->set_async_callback([&finished]() { +#if !__DEPLOY_ON_XP_SP2__ + std::cout << "worker thread_id:" << std::this_thread::get_id() + << std::endl; +#endif + finished = true; + }); + +#if !__DEPLOY_ON_XP_SP2__ + std::cout << "out thread_id:" << std::this_thread::get_id() << std::endl; +#endif + + //! forward + network->forward(); + size_t count = 0; + while (finished == false) { + count++; + } + printf("Forward finish, count is %zu\n", count); + + //! get the output data or read tensor set in network_in + std::shared_ptr output_tensor = network->get_output_tensor(0); + void* out_data = output_tensor->get_memory_ptr(); + size_t out_length = output_tensor->get_tensor_total_size_in_byte() / + output_tensor->get_layout().get_elem_size(); + printf("length=%zu\n", length); + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(out_data)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/example/mge/cpu_affinity.cpp b/lite/example/mge/cpu_affinity.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0c740b53327b8e536f0e0b4eadb71b0d99cd1511 --- /dev/null +++ b/lite/example/mge/cpu_affinity.cpp @@ -0,0 +1,69 @@ +/** + * \file example/cpu_affinity.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "../example.h" +#if LITE_BUILD_WITH_MGE + +using namespace lite; +using namespace example; + +bool lite::example::cpu_affinity(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! create and load the network + std::shared_ptr network = std::make_shared(); + + //! run with multi theads + Runtime::set_cpu_threads_number(network, 4); + + network->load_model(network_path); + + std::vector core_ids = {0, 1, 2, 3}; + auto affinity = [core_ids](int id) { + //! add user define affinity function + set_cpu_affinity({core_ids[id]}); + printf("set thread id = %d with the affinity of core %d.\n", id, + core_ids[id]); + }; + Runtime::set_runtime_thread_affinity(network, affinity); + + //! set input data to input tensor + std::shared_ptr input_tensor = network->get_input_tensor(0); + //! copy or forward data to network + size_t length = input_tensor->get_tensor_total_size_in_byte(); + void* dst_ptr = input_tensor->get_memory_ptr(); + auto src_tensor = parse_npy(input_path); + void* src = src_tensor->get_memory_ptr(); + memcpy(dst_ptr, src, length); + + //! forward + network->forward(); + network->wait(); + + //! 
get the output data or read tensor set in network_in + std::shared_ptr output_tensor = network->get_output_tensor(0); + void* out_data = output_tensor->get_memory_ptr(); + size_t out_length = output_tensor->get_tensor_total_size_in_byte() / + output_tensor->get_layout().get_elem_size(); + printf("length=%zu\n", length); + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(out_data)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/example/mge/device_io.cpp b/lite/example/mge/device_io.cpp new file mode 100644 index 0000000000000000000000000000000000000000..321bf388bd15ca377824803d8b4f9541db8780e1 --- /dev/null +++ b/lite/example/mge/device_io.cpp @@ -0,0 +1,189 @@ +/** + * \file example/device_io.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include +#include "../example.h" +#if LITE_BUILD_WITH_MGE + +using namespace lite; +using namespace example; + +#if LITE_WITH_CUDA + +bool lite::example::device_input(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! config the network running in CUDA device + lite::Config config{LiteDeviceType::LITE_CUDA}; + + //! set NetworkIO + NetworkIO network_io; + std::string input_name = "data"; + bool is_host = false; + IO device_input{input_name, is_host}; + network_io.inputs.push_back(device_input); + + //! create and load the network + std::shared_ptr network = + std::make_shared(config, network_io); + network->load_model(network_path); + + std::shared_ptr input_tensor = network->get_input_tensor(0); + Layout input_layout = input_tensor->get_layout(); + + //! read data from numpy data file + auto src_tensor = parse_npy(input_path); + + //! malloc the device memory + auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout); + + //! copy to the device memory + tensor_device.copy_from(*src_tensor); + + //! Now the device memory if filled with user input data, set it to the + //! input tensor + input_tensor->reset(tensor_device.get_memory_ptr(), input_layout); + + //! forward + network->forward(); + network->wait(); + + //! get the output data or read tensor set in network_in + std::shared_ptr output_tensor = network->get_output_tensor(0); + void* out_data = output_tensor->get_memory_ptr(); + size_t out_length = output_tensor->get_tensor_total_size_in_byte() / + output_tensor->get_layout().get_elem_size(); + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(out_data)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} + +bool lite::example::device_input_output(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! config the network running in CUDA device + lite::Config config{LiteDeviceType::LITE_CUDA}; + + //! 
set NetworkIO include input and output + NetworkIO network_io; + std::string input_name = "data"; + std::string output_name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]"; + bool is_host = false; + IO device_input{input_name, is_host}; + IO device_output{output_name, is_host}; + network_io.inputs.push_back(device_input); + network_io.outputs.push_back(device_output); + + //! create and load the network + std::shared_ptr network = + std::make_shared(config, network_io); + network->load_model(network_path); + + std::shared_ptr input_tensor_device = network->get_input_tensor(0); + Layout input_layout = input_tensor_device->get_layout(); + + //! read data from numpy data file + auto src_tensor = parse_npy(input_path); + + //! malloc the device memory + auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout); + + //! copy to the device memory + tensor_device.copy_from(*src_tensor); + + //! Now the device memory is filled with user input data, set it to the + //! input tensor + input_tensor_device->reset(tensor_device.get_memory_ptr(), input_layout); + + //! forward + network->forward(); + network->wait(); + + //! output is in device, should copy it to host + std::shared_ptr output_tensor_device = + network->get_io_tensor(output_name); + + auto output_tensor = std::make_shared(); + output_tensor->copy_from(*output_tensor_device); + + //! get the output data or read tensor set in network_in + void* out_data = output_tensor->get_memory_ptr(); + size_t out_length = output_tensor->get_tensor_total_size_in_byte() / + output_tensor->get_layout().get_elem_size(); + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(out_data)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} + +bool lite::example::pinned_host_input(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! config the network running in CUDA device + lite::Config config{LiteDeviceType::LITE_CUDA}; + + //! create and load the network + std::shared_ptr network = std::make_shared(config); + network->load_model(network_path); + + std::shared_ptr input_tensor = network->get_input_tensor(0); + Layout input_layout = input_tensor->get_layout(); + + //! read data from numpy data file + auto src_tensor = parse_npy(input_path); + //! malloc the pinned host memory + bool is_pinned_host = true; + auto tensor_pinned_input = + Tensor(LiteDeviceType::LITE_CUDA, input_layout, is_pinned_host); + //! copy to the pinned memory + tensor_pinned_input.copy_from(*src_tensor); + //! set the pinned host memory to the network as input + input_tensor->reset(tensor_pinned_input.get_memory_ptr(), input_layout); + + //! forward + network->forward(); + network->wait(); + + //! 
get the output data or read tensor set in network_in + std::shared_ptr output_tensor = network->get_output_tensor(0); + void* out_data = output_tensor->get_memory_ptr(); + size_t out_length = output_tensor->get_tensor_total_size_in_byte() / + output_tensor->get_layout().get_elem_size(); + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(out_data)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} + +#endif +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/example/mge/lite_c_interface.cpp b/lite/example/mge/lite_c_interface.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2a917877a5a213c63e9be48fd16795a60c508878 --- /dev/null +++ b/lite/example/mge/lite_c_interface.cpp @@ -0,0 +1,224 @@ +/** + * \file example/basic_c_interface.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "../example.h" +#include "misc.h" +#if LITE_BUILD_WITH_MGE +#include "lite-c/global_c.h" +#include "lite-c/network_c.h" +#include "lite-c/tensor_c.h" + +#include + +#define LITE_CAPI_CHECK(_expr) \ + do { \ + int _ret = (_expr); \ + if (_ret) { \ + LITE_THROW(LITE_get_last_error()); \ + } \ + } while (0) + +bool basic_c_interface(const lite::example::Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! read input data to lite::tensor + auto src_tensor = lite::example::parse_npy(input_path); + void* src_ptr = src_tensor->get_memory_ptr(); + + //! create and load the network + LiteNetwork c_network; + LITE_CAPI_CHECK( + LITE_make_network(&c_network, *default_config(), *default_network_io())); + + LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, network_path.c_str())); + + //! set input data to input tensor + LiteTensor c_input_tensor; + LITE_CAPI_CHECK( + LITE_get_io_tensor(c_network, "data", LITE_IO, &c_input_tensor)); + void* dst_ptr; + size_t length_in_byte; + LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_input_tensor, + &length_in_byte)); + LITE_CAPI_CHECK(LITE_get_tensor_memory(c_input_tensor, &dst_ptr)); + //! copy or forward data to network + memcpy(dst_ptr, src_ptr, length_in_byte); + + //! forward + LITE_CAPI_CHECK(LITE_forward(c_network)); + LITE_CAPI_CHECK(LITE_wait(c_network)); + + //! get the output data or read tensor data + const char* output_name; + LiteTensor c_output_tensor; + //! get the first output tensor name + LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name)); + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO, + &c_output_tensor)); + void* output_ptr; + size_t length_output_in_byte; + LITE_CAPI_CHECK(LITE_get_tensor_memory(c_output_tensor, &output_ptr)); + LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_output_tensor, + &length_output_in_byte)); + + size_t out_length = length_output_in_byte / sizeof(float); + printf("length=%zu\n", out_length); + + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(output_ptr)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} + +bool device_io_c_interface(const lite::example::Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! 
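//! Each pure C entry point returns an int status code: zero means success,
//! any non-zero value means the call failed and LITE_get_last_error() yields
//! the corresponding message. The LITE_CAPI_CHECK macro defined above turns
//! such failures into exceptions so the example stops at the first error.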
read input data to lite::tensor + auto src_tensor = lite::example::parse_npy(input_path); + void* src_ptr = src_tensor->get_memory_ptr(); + size_t length_read_in = src_tensor->get_tensor_total_size_in_byte(); + + //! create and load the network + LiteNetwork c_network; + LITE_CAPI_CHECK( + LITE_make_network(&c_network, *default_config(), *default_network_io())); + LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, network_path.c_str())); + + //! set input data to input tensor + LiteTensor c_input_tensor; + size_t length_tensor_in; + LITE_CAPI_CHECK( + LITE_get_io_tensor(c_network, "data", LITE_IO, &c_input_tensor)); + LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_input_tensor, + &length_tensor_in)); + if (length_read_in != length_tensor_in) { + LITE_THROW("The input data size is not match the network input tensro " + "size,\n"); + } + LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, src_ptr, + length_tensor_in)); + + //! reset the output tensor memory with user allocated memory + size_t out_length = 1000; + LiteLayout output_layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}; + std::shared_ptr ptr(new float[out_length], + [](float* ptr) { delete[] ptr; }); + const char* output_name; + LiteTensor c_output_tensor; + LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name)); + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO, + &c_output_tensor)); + LITE_CAPI_CHECK( + LITE_reset_tensor(c_output_tensor, output_layout, ptr.get())); + + //! forward + LITE_CAPI_CHECK(LITE_forward(c_network)); + LITE_CAPI_CHECK(LITE_wait(c_network)); + + printf("length=%zu\n", out_length); + + float max = -1.0f; + float sum = 0.0f; + void* out_data = ptr.get(); + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(out_data)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} + +namespace { +volatile bool finished = false; +int async_callback(void) { +#if !__DEPLOY_ON_XP_SP2__ + std::cout << "worker thread_id:" << std::this_thread::get_id() << std::endl; +#endif + finished = true; + return 0; +} +} // namespace + +bool async_c_interface(const lite::example::Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! read input data to lite::tensor + auto src_tensor = lite::example::parse_npy(input_path); + void* src_ptr = src_tensor->get_memory_ptr(); + + LiteNetwork c_network; + LiteConfig config = *default_config(); + config.options.var_sanity_check_first_run = false; + LITE_CAPI_CHECK(LITE_make_network(&c_network, config, *default_network_io())); + LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, network_path.c_str())); + + //! set input data to input tensor + LiteTensor c_input_tensor; + size_t length_tensor_in; + LITE_CAPI_CHECK( + LITE_get_io_tensor(c_network, "data", LITE_IO, &c_input_tensor)); + LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_input_tensor, + &length_tensor_in)); + LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, src_ptr, + length_tensor_in)); + +#if !__DEPLOY_ON_XP_SP2__ + std::cout << "user thread_id:" << std::this_thread::get_id() << std::endl; +#endif + + LITE_CAPI_CHECK(LITE_set_async_callback(c_network, async_callback)); + //! forward + LITE_CAPI_CHECK(LITE_forward(c_network)); + size_t count = 0; + while (finished == false) { + count++; + } + printf("The count is %zu\n", count); + finished = false; + + //! 
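//! The spin loop above merely demonstrates that the calling thread keeps
//! running while the worker thread executes the network; a real application
//! would do useful work here instead of busy-waiting on the flag.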
get the output data or read tensor data + const char* output_name; + LiteTensor c_output_tensor; + //! get the first output tensor name + LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name)); + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO, + &c_output_tensor)); + void* output_ptr; + size_t length_output_in_byte; + LITE_CAPI_CHECK(LITE_get_tensor_memory(c_output_tensor, &output_ptr)); + LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_output_tensor, + &length_output_in_byte)); + + size_t out_length = length_output_in_byte / sizeof(float); + printf("length=%zu\n", out_length); + + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(output_ptr)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} +#endif +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/example/mge/network_share_weights.cpp b/lite/example/mge/network_share_weights.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d2b6e741fffa9c7760453aa5650e79412cc55a57 --- /dev/null +++ b/lite/example/mge/network_share_weights.cpp @@ -0,0 +1,78 @@ +/** + * \file example/network_share_weights.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "../example.h" +#if LITE_BUILD_WITH_MGE + +using namespace lite; +using namespace example; + +bool lite::example::network_share_same_weights(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! create and load the network + std::shared_ptr network = std::make_shared(); + network->load_model(network_path); + + //! load a new network from the created network and share the same weights, + Config config_new; + config_new.options.const_shape = true; + NetworkIO network_io_new; + std::shared_ptr weight_shared_network = + std::make_shared(config_new, network_io_new); + Runtime::shared_weight_with_network(weight_shared_network, network); + + //! set input data to input tensor + std::shared_ptr input_tensor = network->get_input_tensor(0); + void* dst_ptr = input_tensor->get_memory_ptr(); + std::shared_ptr input_tensor2 = + weight_shared_network->get_input_tensor(0); + void* dst_ptr2 = input_tensor2->get_memory_ptr(); + //! copy or forward data to network + size_t length = input_tensor->get_tensor_total_size_in_byte(); + auto src_tensor = parse_npy(input_path); + void* src = src_tensor->get_memory_ptr(); + memcpy(dst_ptr, src, length); + memcpy(dst_ptr2, src, length); + + //! forward + network->forward(); + network->wait(); + + weight_shared_network->forward(); + weight_shared_network->wait(); + + //! 
get the output data or read tensor set in network_in + std::shared_ptr output_tensor = network->get_output_tensor(0); + std::shared_ptr output_tensor2 = + weight_shared_network->get_output_tensor(0); + void* out_data = output_tensor->get_memory_ptr(); + void* out_data2 = output_tensor2->get_memory_ptr(); + size_t out_length = output_tensor->get_tensor_total_size_in_byte() / + output_tensor->get_layout().get_elem_size(); + printf("length=%zu\n", length); + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(out_data)[i]; + float data2 = static_cast(out_data2)[i]; + if (data != data2) { + printf("the result between the origin network and weight share " + "netwrok is different.\n"); + } + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} +#endif +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/example/mge/reset_io.cpp b/lite/example/mge/reset_io.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d95d834ad31b0303a1295eb7bd59637829c794ac --- /dev/null +++ b/lite/example/mge/reset_io.cpp @@ -0,0 +1,95 @@ +/** + * \file example/reset_io.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "../example.h" +#if LITE_BUILD_WITH_MGE + +using namespace lite; +using namespace example; + +bool lite::example::reset_input(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + lite::Config config; + + //! create and load the network + std::shared_ptr network = std::make_shared(config); + network->load_model(network_path); + + //! set input data to input tensor + std::shared_ptr input_tensor = network->get_input_tensor(0); + auto layout = input_tensor->get_layout(); + + auto src_tensor = parse_npy(input_path); + void* src = src_tensor->get_memory_ptr(); + input_tensor->reset(src, layout); + + //! forward + network->forward(); + network->wait(); + + //! 6. get the output data or read tensor set in network_in + std::shared_ptr output_tensor = network->get_output_tensor(0); + void* out_data = output_tensor->get_memory_ptr(); + size_t out_length = output_tensor->get_tensor_total_size_in_byte() / + output_tensor->get_layout().get_elem_size(); + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(out_data)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} + +bool lite::example::reset_input_output(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + lite::Config config; + + //! create and load the network + std::shared_ptr network = std::make_shared(config); + network->load_model(network_path); + + //! set input data to input tensor + std::shared_ptr input_tensor = network->get_input_tensor(0); + auto layout = input_tensor->get_layout(); + + auto src_tensor = parse_npy(input_path); + void* src = src_tensor->get_memory_ptr(); + input_tensor->reset(src, layout); + + //! 
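//! Memory handed to a tensor via reset() stays owned by the caller: the
//! tensor neither frees it nor manages its lifetime, so the buffer must stay
//! valid for as long as forward() reads from or writes into it.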
set output ptr to store the network output + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + network->forward(); + network->wait(); + + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < 1000; i++) { + float data = static_cast(out_data)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} +#endif +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/example/mge/user_allocator.cpp b/lite/example/mge/user_allocator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2fd76e6b6bd1ff8853e3e6d8b19ab3bef42f018b --- /dev/null +++ b/lite/example/mge/user_allocator.cpp @@ -0,0 +1,89 @@ +/** + * \file example/user_allocator.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "../example.h" +#if LITE_BUILD_WITH_MGE +using namespace lite; +using namespace example; + +namespace { +class CheckAllocator : public lite::Allocator { +public: + //! allocate memory of size in the given device with the given align + void* allocate(LiteDeviceType, int, size_t size, size_t align) override { +#ifdef WIN32 + return _aligned_malloc(size, align); +#elif defined(__ANDROID__) || defined(ANDROID) + return memalign(align, size); +#else + void* ptr = nullptr; + auto err = posix_memalign(&ptr, align, size); + if (!err) { + printf("failed to malloc %zu bytes with align %zu", size, align); + } + return ptr; +#endif + }; + + //! free the memory pointed by ptr in the given device + void free(LiteDeviceType, int, void* ptr) override { +#ifdef WIN32 + _aligned_free(ptr); +#else + ::free(ptr); +#endif + }; +}; +} // namespace + +bool lite::example::config_user_allocator(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + auto allocator = std::make_shared(); + + //! create and load the network + std::shared_ptr network = std::make_shared(); + + Runtime::set_memory_allocator(network, allocator); + + network->load_model(network_path); + + //! set input data to input tensor + std::shared_ptr input_tensor = network->get_input_tensor(0); + //! copy or forward data to network + size_t length = input_tensor->get_tensor_total_size_in_byte(); + void* dst_ptr = input_tensor->get_memory_ptr(); + auto src_tensor = parse_npy(input_path); + void* src = src_tensor->get_memory_ptr(); + memcpy(dst_ptr, src, length); + + //! forward + network->forward(); + network->wait(); + + //! 
get the output data or read tensor set in network_in + std::shared_ptr output_tensor = network->get_output_tensor(0); + void* out_data = output_tensor->get_memory_ptr(); + size_t out_length = output_tensor->get_tensor_total_size_in_byte() / + output_tensor->get_layout().get_elem_size(); + printf("length=%zu\n", length); + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(out_data)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} +#endif +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/example/mge/user_cryption.cpp b/lite/example/mge/user_cryption.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1b6f2f3343255d89ab1ad4fda81b70b8691c1652 --- /dev/null +++ b/lite/example/mge/user_cryption.cpp @@ -0,0 +1,122 @@ +/** + * \file example/user_cryption.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "../example.h" +#if LITE_BUILD_WITH_MGE + +using namespace lite; +using namespace example; + +namespace { +std::vector decrypt_model(const void* model_mem, size_t size, + const std::vector& key) { + if (key.size() == 1) { + std::vector ret(size, 0); + const uint8_t* ptr = static_cast(model_mem); + uint8_t key_data = key[0]; + for (size_t i = 0; i < size; i++) { + ret[i] = ptr[i] ^ key_data ^ key_data; + } + return ret; + } else { + printf("the user define decrypt method key length is wrong.\n"); + return {}; + } +} +} // namespace + +bool lite::example::register_cryption_method(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! register the decryption method + register_decryption_and_key("just_for_test", decrypt_model, {15}); + + lite::Config config; + config.bare_model_cryption_name = "just_for_test"; + //! create and load the network + std::shared_ptr network = std::make_shared(config); + network->load_model(network_path); + + //! set input data to input tensor + std::shared_ptr input_tensor = network->get_input_tensor(0); + auto layout = input_tensor->get_layout(); + + auto src_tensor = parse_npy(input_path); + void* src = src_tensor->get_memory_ptr(); + input_tensor->reset(src, layout); + + //! forward + network->forward(); + network->wait(); + + //! get the output data or read tensor set in network_in + std::shared_ptr output_tensor = network->get_output_tensor(0); + void* out_data = output_tensor->get_memory_ptr(); + size_t out_length = output_tensor->get_tensor_total_size_in_byte() / + output_tensor->get_layout().get_elem_size(); + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(out_data)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} + +bool lite::example::update_cryption_key(const Args& args) { + std::string network_path = args.model_path; + std::string input_path = args.input_path; + + //! update the decryption method key + std::vector key(32, 0); + for (size_t i = 0; i < 32; i++) { + key[i] = 31 - i; + } + update_decryption_or_key("AES_default", nullptr, key); + + lite::Config config; + config.bare_model_cryption_name = "AES_default"; + //! create and load the network + std::shared_ptr network = std::make_shared(config); + network->load_model(network_path); + + //! 
set input data to input tensor + std::shared_ptr input_tensor = network->get_input_tensor(0); + auto layout = input_tensor->get_layout(); + + auto src_tensor = parse_npy(input_path); + void* src = src_tensor->get_memory_ptr(); + input_tensor->reset(src, layout); + + //! forward + network->forward(); + network->wait(); + + //! get the output data or read tensor set in network_in + std::shared_ptr output_tensor = network->get_output_tensor(0); + void* out_data = output_tensor->get_memory_ptr(); + size_t out_length = output_tensor->get_tensor_total_size_in_byte() / + output_tensor->get_layout().get_elem_size(); + float max = -1.0f; + float sum = 0.0f; + for (size_t i = 0; i < out_length; i++) { + float data = static_cast(out_data)[i]; + sum += data; + if (max < data) + max = data; + } + printf("max=%e, sum=%e\n", max, sum); + return true; +} +#endif +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/example/npy.h b/lite/example/npy.h new file mode 100644 index 0000000000000000000000000000000000000000..552cda78f7cc203624b58a16ec0213bc65735b60 --- /dev/null +++ b/lite/example/npy.h @@ -0,0 +1,638 @@ +/* + Copyright 2017 Leon Merten Lohse + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#ifndef NPY_H +#define NPY_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace npy { + +/* Compile-time test for byte order. + If your compiler does not define these per default, you may want to define + one of these constants manually. + Defaults to little endian order. */ +#if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN || \ + defined(__BIG_ENDIAN__) || defined(__ARMEB__) || \ + defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(_MIBSEB) || \ + defined(__MIBSEB) || defined(__MIBSEB__) +const bool big_endian = true; +#else +const bool big_endian = false; +#endif + +const char magic_string[] = "\x93NUMPY"; +const size_t magic_string_length = 6; + +const char little_endian_char = '<'; +const char big_endian_char = '>'; +const char no_endian_char = '|'; + +constexpr char host_endian_char = + (big_endian ? 
big_endian_char : little_endian_char); + +/* npy array length */ +typedef unsigned long int ndarray_len_t; + +inline void write_magic(std::ostream& ostream, unsigned char v_major = 1, + unsigned char v_minor = 0) { + ostream.write(magic_string, magic_string_length); + ostream.put(v_major); + ostream.put(v_minor); +} + +inline void read_magic(std::istream& istream, unsigned char& v_major, + unsigned char& v_minor) { + char buf[magic_string_length + 2]; + istream.read(buf, magic_string_length + 2); + + if (!istream) { + fprintf(stderr, "io error: failed reading file"); + } + + if (0 != std::memcmp(buf, magic_string, magic_string_length)) { + fprintf(stderr, "this file does not have a valid npy format."); + } + + v_major = buf[magic_string_length]; + v_minor = buf[magic_string_length + 1]; +} + +// typestring magic +struct Typestring { +private: + char c_endian; + char c_type; + int len; + +public: + inline std::string str() { + const size_t max_buflen = 16; + char buf[max_buflen]; + std::sprintf(buf, "%c%c%u", c_endian, c_type, len); + return std::string(buf); + } + + Typestring(const std::vector&) + : c_endian{host_endian_char}, c_type{'f'}, len{sizeof(float)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, c_type{'f'}, len{sizeof(double)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, + c_type{'f'}, + len{sizeof(long double)} {} + + Typestring(const std::vector&) + : c_endian{no_endian_char}, c_type{'i'}, len{sizeof(char)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(short)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(int)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(long)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(long long)} {} + + Typestring(const std::vector&) + : c_endian{no_endian_char}, + c_type{'u'}, + len{sizeof(unsigned char)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, + c_type{'u'}, + len{sizeof(unsigned short)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, + c_type{'u'}, + len{sizeof(unsigned int)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, + c_type{'u'}, + len{sizeof(unsigned long)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, + c_type{'u'}, + len{sizeof(unsigned long long)} {} + + Typestring(const std::vector>&) + : c_endian{host_endian_char}, + c_type{'c'}, + len{sizeof(std::complex)} {} + Typestring(const std::vector>&) + : c_endian{host_endian_char}, + c_type{'c'}, + len{sizeof(std::complex)} {} + Typestring(const std::vector>&) + : c_endian{host_endian_char}, + c_type{'c'}, + len{sizeof(std::complex)} {} +}; + +inline void parse_typestring(std::string typestring) { + std::regex re("'([<>|])([ifuc])(\\d+)'"); + std::smatch sm; + + std::regex_match(typestring, sm, re); + + if (sm.size() != 4) { + fprintf(stderr, "invalid typestring"); + } +} + +namespace pyparse { + +/** + Removes leading and trailing whitespaces + */ +inline std::string trim(const std::string& str) { + const std::string whitespace = " \t"; + auto begin = str.find_first_not_of(whitespace); + + if (begin == std::string::npos) + return ""; + + auto end = str.find_last_not_of(whitespace); + + return str.substr(begin, end - begin + 1); +} + +inline std::string get_value_from_map(const std::string& mapstr) { + size_t sep_pos = mapstr.find_first_of(":"); + if (sep_pos == std::string::npos) 
+ return ""; + + std::string tmp = mapstr.substr(sep_pos + 1); + return trim(tmp); +} + +/** + Parses the string representation of a Python dict + + The keys need to be known and may not appear anywhere else in the data. + */ +inline std::unordered_map parse_dict( + std::string in, std::vector& keys) { + std::unordered_map map; + + if (keys.size() == 0) + return map; + + in = trim(in); + + // unwrap dictionary + if ((in.front() == '{') && (in.back() == '}')) + in = in.substr(1, in.length() - 2); + else { + fprintf(stderr, "Not a Python dictionary."); + } + + std::vector> positions; + + for (auto const& value : keys) { + size_t pos = in.find("'" + value + "'"); + + if (pos == std::string::npos) { + fprintf(stderr, "Missing %s key.", value.c_str()); + } + + std::pair position_pair{pos, value}; + positions.push_back(position_pair); + } + + // sort by position in dict + std::sort(positions.begin(), positions.end()); + + for (size_t i = 0; i < positions.size(); ++i) { + std::string raw_value; + size_t begin{positions[i].first}; + size_t end{std::string::npos}; + + std::string key = positions[i].second; + + if (i + 1 < positions.size()) + end = positions[i + 1].first; + + raw_value = in.substr(begin, end - begin); + + raw_value = trim(raw_value); + + if (raw_value.back() == ',') + raw_value.pop_back(); + + map[key] = get_value_from_map(raw_value); + } + + return map; +} + +/** + Parses the string representation of a Python boolean + */ +inline bool parse_bool(const std::string& in) { + if (in == "True") + return true; + if (in == "False") + return false; + + fprintf(stderr, "Invalid python boolan."); + return false; +} + +/** + Parses the string representation of a Python str + */ +inline std::string parse_str(const std::string& in) { + if ((in.front() == '\'') && (in.back() == '\'')) + return in.substr(1, in.length() - 2); + + fprintf(stderr, "Invalid python string."); + return ""; +} + +/** + Parses the string represenatation of a Python tuple into a vector of its items + */ +inline std::vector parse_tuple(std::string in) { + std::vector v; + const char seperator = ','; + + in = trim(in); + + if ((in.front() == '(') && (in.back() == ')')) + in = in.substr(1, in.length() - 2); + else { + fprintf(stderr, "Invalid Python tuple."); + } + + std::istringstream iss(in); + + for (std::string token; std::getline(iss, token, seperator);) { + v.push_back(token); + } + + return v; +} + +template +inline std::string write_tuple(const std::vector& v) { + if (v.size() == 0) + return ""; + + std::ostringstream ss; + + if (v.size() == 1) { + ss << "(" << v.front() << ",)"; + } else { + const std::string delimiter = ", "; + // v.size() > 1 + ss << "("; + std::copy(v.begin(), v.end() - 1, + std::ostream_iterator(ss, delimiter.c_str())); + ss << v.back(); + ss << ")"; + } + + return ss.str(); +} + +inline std::string write_boolean(bool b) { + if (b) + return "True"; + else + return "False"; +} + +} // namespace pyparse + +inline void parse_header(std::string header, std::string& descr) { + /* + The first 6 bytes are a magic string: exactly "x93NUMPY". + The next 1 byte is an unsigned byte: the major version number of the file + format, e.g. x01. The next 1 byte is an unsigned byte: the minor version + number of the file format, e.g. x00. Note: the version of the file format + is not tied to the version of the numpy package. The next 2 bytes form a + little-endian unsigned short int: the length of the header data + HEADER_LEN. The next HEADER_LEN bytes form the header data describing the + array's format. 
It is an ASCII string which contains a Python literal + expression of a dictionary. It is terminated by a newline ('n') and + padded with spaces + ('x20') to make the total length of the magic string + 4 + HEADER_LEN be + evenly divisible by 16 for alignment purposes. The dictionary contains + three keys: + + "descr" : dtype.descr + An object that can be passed as an argument to the numpy.dtype() + constructor to create the array's dtype. For repeatability and + readability, this dictionary is formatted using pprint.pformat() so the + keys are in alphabetic order. + */ + + // remove trailing newline + if (header.back() != '\n') + fprintf(stderr, "invalid header"); + header.pop_back(); + + // parse the dictionary + std::vector keys{"descr"}; + auto dict_map = npy::pyparse::parse_dict(header, keys); + + if (dict_map.size() == 0) + fprintf(stderr, "invalid dictionary in header"); + + std::string descr_s = dict_map["descr"]; + parse_typestring(descr_s); + // remove + descr = npy::pyparse::parse_str(descr_s); + return; +} + +inline void parse_header(std::string header, std::string& descr, + bool& fortran_order, + std::vector& shape) { + /* + The first 6 bytes are a magic string: exactly "x93NUMPY". + The next 1 byte is an unsigned byte: the major version number of the file + format, e.g. x01. The next 1 byte is an unsigned byte: the minor version + number of the file format, e.g. x00. Note: the version of the file format + is not tied to the version of the numpy package. The next 2 bytes form a + little-endian unsigned short int: the length of the header data + HEADER_LEN. The next HEADER_LEN bytes form the header data describing the + array's format. It is an ASCII string which contains a Python literal + expression of a dictionary. It is terminated by a newline ('n') and + padded with spaces + ('x20') to make the total length of the magic string + 4 + HEADER_LEN be + evenly divisible by 16 for alignment purposes. The dictionary contains + three keys: + + "descr" : dtype.descr + An object that can be passed as an argument to the numpy.dtype() + constructor to create the array's dtype. "fortran_order" : bool Whether + the array data is Fortran-contiguous or not. Since Fortran-contiguous + arrays are a common form of non-C-contiguity, we allow them to be written + directly to disk for efficiency. "shape" : tuple of int The shape of the + array. For repeatability and readability, this dictionary is formatted + using pprint.pformat() so the keys are in alphabetic order. 
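For example, a version 1.0 header for a C-contiguous float32 array of shape (1, 3, 224, 224) would carry the dictionary literal {'descr': '<f4', 'fortran_order': False, 'shape': (1, 3, 224, 224), } followed by space padding and the terminating newline (an illustrative value for this description, not taken from any file in this repository).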
+ */ + + // remove trailing newline + if (header.back() != '\n') + fprintf(stderr, "invalid header"); + header.pop_back(); + + // parse the dictionary + std::vector keys{"descr", "fortran_order", "shape"}; + auto dict_map = npy::pyparse::parse_dict(header, keys); + + if (dict_map.size() == 0) + fprintf(stderr, "invalid dictionary in header"); + + std::string descr_s = dict_map["descr"]; + std::string fortran_s = dict_map["fortran_order"]; + std::string shape_s = dict_map["shape"]; + + // TODO: extract info from typestring + parse_typestring(descr_s); + // remove + descr = npy::pyparse::parse_str(descr_s); + + // convert literal Python bool to C++ bool + fortran_order = npy::pyparse::parse_bool(fortran_s); + + // parse the shape tuple + auto shape_v = npy::pyparse::parse_tuple(shape_s); + if (shape_v.size() == 0) + fprintf(stderr, "invalid shape tuple in header"); + + for (auto item : shape_v) { + ndarray_len_t dim = static_cast(std::stoul(item)); + shape.push_back(dim); + } +} + +inline std::string write_header_dict(const std::string& descr, + bool fortran_order, + const std::vector& shape) { + std::string s_fortran_order = npy::pyparse::write_boolean(fortran_order); + std::string shape_s = npy::pyparse::write_tuple(shape); + + return "{'descr': '" + descr + "', 'fortran_order': " + s_fortran_order + + ", 'shape': " + shape_s + ", }"; +} + +inline void write_header(std::ostream& out, const std::string& descr, + bool fortran_order, + const std::vector& shape_v) { + std::string header_dict = write_header_dict(descr, fortran_order, shape_v); + + size_t length = magic_string_length + 2 + 2 + header_dict.length() + 1; + + unsigned char version[2] = {1, 0}; + if (length >= 255 * 255) { + length = magic_string_length + 2 + 4 + header_dict.length() + 1; + version[0] = 2; + version[1] = 0; + } + size_t padding_len = 16 - length % 16; + std::string padding(padding_len, ' '); + + // write magic + write_magic(out, version[0], version[1]); + + // write header length + if (version[0] == 1 && version[1] == 0) { + char header_len_le16[2]; + uint16_t header_len = static_cast(header_dict.length() + + padding.length() + 1); + + header_len_le16[0] = (header_len >> 0) & 0xff; + header_len_le16[1] = (header_len >> 8) & 0xff; + out.write(reinterpret_cast(header_len_le16), 2); + } else { + char header_len_le32[4]; + uint32_t header_len = static_cast(header_dict.length() + + padding.length() + 1); + + header_len_le32[0] = (header_len >> 0) & 0xff; + header_len_le32[1] = (header_len >> 8) & 0xff; + header_len_le32[2] = (header_len >> 16) & 0xff; + header_len_le32[3] = (header_len >> 24) & 0xff; + out.write(reinterpret_cast(header_len_le32), 4); + } + + out << header_dict << padding << '\n'; +} + +inline std::string read_header(std::istream& istream) { + // check magic bytes an version number + unsigned char v_major, v_minor; + read_magic(istream, v_major, v_minor); + + uint32_t header_length = 0; + if (v_major == 1 && v_minor == 0) { + char header_len_le16[2]; + istream.read(header_len_le16, 2); + header_length = (header_len_le16[0] << 0) | (header_len_le16[1] << 8); + + if ((magic_string_length + 2 + 2 + header_length) % 16 != 0) { + // TODO: display warning + } + } else if (v_major == 2 && v_minor == 0) { + char header_len_le32[4]; + istream.read(header_len_le32, 4); + + header_length = (header_len_le32[0] << 0) | (header_len_le32[1] << 8) | + (header_len_le32[2] << 16) | (header_len_le32[3] << 24); + + if ((magic_string_length + 2 + 4 + header_length) % 16 != 0) { + // TODO: display warning + } + } else { + 
fprintf(stderr, "unsupported file format version"); + } + + auto buf_v = std::vector(); + buf_v.reserve(header_length); + istream.read(buf_v.data(), header_length); + std::string header(buf_v.data(), header_length); + + return header; +} + +inline ndarray_len_t comp_size(const std::vector& shape) { + ndarray_len_t size = 1; + for (ndarray_len_t i : shape) + size *= i; + + return size; +} + +template +inline void SaveArrayAsNumpy(const std::string& filename, bool fortran_order, + unsigned int n_dims, const unsigned long shape[], + const std::vector& data) { + Typestring typestring_o(data); + std::string typestring = typestring_o.str(); + + std::ofstream stream(filename, std::ofstream::binary); + if (!stream) { + fprintf(stderr, "io error: failed to open a file."); + } + + std::vector shape_v(shape, shape + n_dims); + write_header(stream, typestring, fortran_order, shape_v); + + auto size = static_cast(comp_size(shape_v)); + + stream.write(reinterpret_cast(data.data()), + sizeof(Scalar) * size); +} + +template +inline void LoadArrayFromNumpy(const std::string& filename, + std::vector& shape, + std::vector& data) { + bool fortran_order; + LoadArrayFromNumpy(filename, shape, fortran_order, data); +} + +template +inline void LoadArrayFromNumpy(const std::string& filename, + std::vector& shape, + bool& fortran_order, std::vector& data) { + std::ifstream stream(filename, std::ifstream::binary); + if (!stream) { + fprintf(stderr, "io error: failed to open a file."); + } + + std::string header = read_header(stream); + + // parse header + std::string typestr; + + parse_header(header, typestr, fortran_order, shape); + + // check if the typestring matches the given one + Typestring typestring_o{data}; + std::string expect_typestr = typestring_o.str(); + if (typestr != expect_typestr) { + fprintf(stderr, "formatting error: typestrings not matching"); + } + + // compute the data size based on the shape + auto size = static_cast(comp_size(shape)); + data.resize(size); + + // read the data + stream.read(reinterpret_cast(data.data()), sizeof(Scalar) * size); +} + +inline void LoadArrayFromNumpy(const std::string& filename, + std::string& type_str, + std::vector& shape, + std::vector& data) { + std::ifstream stream(filename, std::ifstream::binary); + if (!stream) { + fprintf(stderr, "io error: failed to open a file."); + } + + std::string header = read_header(stream); + bool fortran_order; + // parse header + parse_header(header, type_str, fortran_order, shape); + + // check if the typestring matches the given one + std::string size_str = type_str.substr(type_str.size() - 1); + size_t elem_size = atoi(size_str.c_str()); + + // compute the data size based on the shape + auto byte_size = elem_size * static_cast(comp_size(shape)); + data.resize(byte_size); + + // read the data + stream.read(reinterpret_cast(data.data()), byte_size); +} + +} // namespace npy + +#endif // NPY_H diff --git a/lite/include/lite/common_enum_c.h b/lite/include/lite/common_enum_c.h new file mode 100644 index 0000000000000000000000000000000000000000..ed4db6c5ef6d658b98d82fa906de4859a59cc91a --- /dev/null +++ b/lite/include/lite/common_enum_c.h @@ -0,0 +1,97 @@ +/** + * \file inlude/lite/common_enum_c.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#ifndef LITE_COMMON_ENUM_C_H_ +#define LITE_COMMON_ENUM_C_H_ + +/*! + * \brief The log level. 
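Before the enum definitions continue, here is a minimal usage sketch for the npy.h helpers above; the file name and array contents are illustrative, and the element-type template parameters (elided in the flattened listing above) are assumed to be deduced from the vectors passed in.

```cpp
#include <vector>
#include "npy.h"  // the vendored header above

int main() {
    // save a 2x3 float32 array in C (row-major) order
    std::vector<float> data{0.f, 1.f, 2.f, 3.f, 4.f, 5.f};
    const unsigned long shape[] = {2, 3};
    npy::SaveArrayAsNumpy("example.npy", /*fortran_order=*/false, 2, shape, data);

    // load it back; the shape vector is filled from the parsed header
    std::vector<npy::ndarray_len_t> loaded_shape;
    std::vector<float> loaded;
    npy::LoadArrayFromNumpy("example.npy", loaded_shape, loaded);
    return loaded.size() == 6 ? 0 : 1;
}
```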
+ */ +typedef enum LiteLogLevel { + DEBUG = 0, /*!< The lowest level and most verbose */ + INFO = 1, /*!< The lowest level and most verbose */ + WARN = 2, /*!< Print only warning and errors */ + ERROR = 3, /*!< Print only errors */ +} LiteLogLevel; + +typedef enum LiteBackend { + LITE_DEFAULT = 0, //! default backend is mge +} LiteBackend; + +typedef enum LiteDeviceType { + LITE_CPU = 0, + LITE_CUDA = 1, + LITE_ATLAS = 3, + LITE_NPU = 4, + //! when the device information is set in model, so set LITE_DEVICE_DEFAULT + //! in lite + LITE_DEVICE_DEFAULT = 5, +} LiteDeviceType; + +typedef enum LiteDataType { + LITE_FLOAT = 0, + LITE_HALF = 1, + LITE_INT = 2, + LITE_INT16 = 3, + LITE_INT8 = 4, + LITE_UINT8 = 5, + LITE_UINT = 6, + LITE_UINT16 = 7, + LITE_INT64 = 8, +} LiteCDataType; + +typedef enum LiteTensorPhase { + //! Tensor maybe input or output + LITE_IO = 0, + //! Tensor is input + LITE_INPUT = 1, + //! Tensor is output + LITE_OUTPUT = 2, +} LiteTensorPhase; + +/*! + * \brief the input and output type, include SHAPE and VALUE + * sometimes user only need the shape of the output tensor + */ +typedef enum LiteIOType { + LITE_IO_VALUE = 0, + LITE_IO_SHAPE = 1, +} LiteIOType; + +/*! + * \brief operation algorithm seletion strategy type, some operations have + * multi algorithms, different algorithm has different attribute, according to + * the strategy, the best algorithm will be selected. + * + * Note: These strategies can be combined + * + * 1. LITE_ALGO_HEURISTIC | LITE_ALGO_PROFILE means: if profile cache not valid, + * use heuristic instead + * + * 2. LITE_ALGO_HEURISTIC | LITE_ALGO_REPRODUCIBLE means: heuristic choice the + * reproducible algo + * + * 3. LITE_ALGO_PROFILE | LITE_ALGO_REPRODUCIBLE means: profile the best + * algorithm from the reproducible algorithms set + * + * 4. LITE_ALGO_PROFILE | LITE_ALGO_OPTIMIZED means: profile the best + * algorithm form the optimzed algorithms, thus profile will process fast + * + * 5. LITE_ALGO_PROFILE | LITE_ALGO_OPTIMIZED | LITE_ALGO_REPRODUCIBLE means: + * profile the best algorithm form the optimzed and reproducible algorithms + */ +typedef enum LiteAlgoSelectStrategy { + LITE_ALGO_HEURISTIC = 1 << 0, + LITE_ALGO_PROFILE = 1 << 1, + LITE_ALGO_REPRODUCIBLE = 1 << 2, + LITE_ALGO_OPTIMIZED = 1 << 3, +} LiteAlgoSelectStrategy; + +#endif +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/include/lite/global.h b/lite/include/lite/global.h new file mode 100644 index 0000000000000000000000000000000000000000..2737f6a38f91f41b8766b14338b4a0b0af71186e --- /dev/null +++ b/lite/include/lite/global.h @@ -0,0 +1,157 @@ +/** + * \file inlude/lite/global.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "macro.h" +#include "network.h" + +#include +#include +#include + +namespace lite { + +/** + * \brief Model decryption function + * + * \param[in] const void* is the decrypted model memory pointer + * \param[in] size_t the size the decrypted model memory in byte + * \param[in] const std::vector& the decryption key vector + */ +using DecryptionFunc = std::function( + const void*, size_t, const std::vector&)>; + +/** + * \brief register a custom decryption method and key to lite. + * + * \param[in] decrypt_name the name of the decryption, which will act as the + * hash key to find the decryption method. 
+ * + * \param[in] func the decryption function, which will decrypt the model with + * the registered key, return a vector that contain the decrypted model. + * + * \param[in] key the decryption key of the method + */ +LITE_API bool register_decryption_and_key(std::string decrypt_name, + const DecryptionFunc& func, + const std::vector& key); + +/** + * \brief update decryption function or key of a custom decryption method. + * + * \param[in] decrypt_name the name of the decryption, which will act as the + * hash key to find the decryption method. + * + * \param[in] func the decryption function, which will decrypt the model with + * the registered key, return a vector that contain the decrypted model. if + * function is nullptr, it will not be updated. + * + * \param[in] key the decryption key of the method, if the size of key is zero, + * it will not be updated + */ +LITE_API bool update_decryption_or_key(std::string decrypt_name, + const DecryptionFunc& func, + const std::vector& key); + +/** + * \brief Model information parse function + * + * \param[in] const void* is the information memory + * \param[in] size_t the size the information memory + * \param[in] const std::string the model name used for check whether the + * infomation match the model + * \param[in] Config the model config, ParseInfoFunc can fill it with the + * information in json, the config will influence Network loading later + * \param[in] NetworkIO the model IO, ParseInfoFunc can fill it with the + * information in json, the networkio will influence Network forwarding later + * \param[in] std::unordered_map& isolated_config_map, the + * other config not inclue in config and networkIO, ParseInfoFunc can fill it + * with the information in json, now support: + * "device_id" : int, default 0 + * "number_threads" : size_t, default 1 + * "is_inplace_model" : bool, default false + * "use_tensorrt" : bool, default false + */ +using ParseInfoFunc = std::function& isolated_config_map, + std::string& extra_info)>; + +/** + * \brief register a custom parser function to lite. + * + * \param[in] info_type the name of the parser function, which will act as the + * hash key to find the parser method. + * + * \param[in] parse_func the parser function, which will parse the given + * information and modify the Network Config and IO. + * + */ +LITE_API bool register_parse_info_func(std::string info_type, + const ParseInfoFunc& parse_func); + +/*! \brief Get version + */ +LITE_API void get_version(int& major, int& minor, int& patch); + +/*! \brief Set the current log level. + * \param[in] level The new log level + */ +LITE_API void set_log_level(LiteLogLevel level); + +/*! \brief Get the current log level. + * \return The current log level + */ +LITE_API LiteLogLevel get_log_level(); + +/*! \brief Get device count + * \param[in] device_type device type + * \return the device count + */ +LITE_API size_t get_device_count(LiteDeviceType device_type); + +/*! \brief try to coalesce all free memory in megenine + */ +LITE_API void try_coalesce_all_free_memory(); + +/*! + * \brief Set the loader to the lite + * \param loader_path is the file path which store the cache + */ +LITE_API void set_loader_lib_path(const std::string& loader_path); + +/*! + * \brief Set the algo policy cache file for CPU/CUDA ... + * \param cache_path is the file path which store the cache + * \param always_sync sync the cache when model run + */ +LITE_API void set_persistent_cache(const std::string& cache_path, + bool always_sync = false); + +/*! 
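A minimal sketch of the process-wide helpers declared above; the cache path is a placeholder chosen for illustration, not a path used anywhere in this repository.

```cpp
#include "lite/global.h"

void configure_lite_runtime() {
    int major = 0, minor = 0, patch = 0;
    lite::get_version(major, minor, patch);   // query the linked lite version
    lite::set_log_level(LiteLogLevel::WARN);  // print only warnings and errors

    size_t cuda_devices = lite::get_device_count(LiteDeviceType::LITE_CUDA);
    (void)cuda_devices;

    // persist fast-run algo selections between runs (path is an assumption)
    lite::set_persistent_cache("./lite_algo_cache.bin", /*always_sync=*/false);
}
```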
+ * \brief dump the PersistentCache policy cache to file, if the network is set + * to profile when forward, though this the algo policy will dump to file + */ +LITE_API void dump_persistent_cache(const std::string& cache_path); + +/*! + * \brief Set the TensorRT engine cache path for serialized prebuilt ICudaEngine + */ +LITE_API void set_tensor_rt_cache(std::string tensorrt_cache_path); + +/*! + * \brief dump the TensorRT cache to the file set in set_tensor_rt_cache + */ +LITE_API void dump_tensor_rt_cache(); + +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/include/lite/macro.h b/lite/include/lite/macro.h new file mode 100644 index 0000000000000000000000000000000000000000..7f3dc91e34acc304f9c7414038c5d0baaa78dba8 --- /dev/null +++ b/lite/include/lite/macro.h @@ -0,0 +1,20 @@ +/** + * \file include/lite/macro.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#ifndef LITE_MACRO_H_ +#define LITE_MACRO_H_ + +#if defined(_WIN32) +#define LITE_API __declspec(dllexport) +#else +#define LITE_API __attribute__((visibility("default"))) +#endif +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/include/lite/network.h b/lite/include/lite/network.h new file mode 100644 index 0000000000000000000000000000000000000000..2082d81489ad8cb930e551efd0232fb54ac226c4 --- /dev/null +++ b/lite/include/lite/network.h @@ -0,0 +1,368 @@ +/** + * \file inlude/lite/network.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "macro.h" +#include "tensor.h" + +#include +#include +#include +#include +#include + +namespace lite { + +LITE_API inline LiteAlgoSelectStrategy operator|(LiteAlgoSelectStrategy x, + LiteAlgoSelectStrategy y) { + return static_cast(static_cast(x) | + static_cast(y)); +} + +/*! + * \brief the inference options which will be translated to megenine + * + * \param weight_preprocess is the option wich optimize the inferece performance + * with preprocess the const weights + * + * \param fuse_preprocess fuse preprocess patten, like astype + pad_channel + + * dimshuffle + * + * \param fake_next_exec whether only to perform non-computing tasks (like + * memory allocation and queue initialization) for next exec. This would be + * reset to false when the graph is executed. + * + * \param var_sanity_check_first_run Disable var sanity check on the first run. + * Var sanity check is enabled on the first-time execution by default, and can + * be used to find some potential memory access errors in the operator + * implementation. + * + * \param const_shape This can be used to reduce memory usage since some + * static inference data structures can be omitted. + * + * \param force_dynamic_alloc force dynamic memory alloc for all vars + * + * \param force_output_dynamic_alloc force dynamic memory alloc for output vars + * which are used as CallbackCaller input when call compile() function + * + * \param no_profiling_on_shape_change do not re-profile to select best impl + * algo when input shape changes (use previous algo) + * + * \param jit_level Execute supported operators with JIT (support MLIR, + * NVRTC). 
Can only be used on Nvidia GPUs, this value indicates JIT level: + * 1 for basic elemwise opr; + * 2 for including reduce operator + * + * \param record_level flag optimize the inference performace with record the + * kernel tasks in first run, hereafter the inference all need to execute the + * recorded tasks. + * level = 0 means the normal inference, + * level = 1 means use record inference, + * level = 2 means record inference with free the extra memory + * + * \param graph_opt_level optimization level: + * 0: disable + * 1: level-1: inplace arith transformations during graph + * construction + * 2: level-2: level-1, plus global optimization before graph + * compiling + * 3: also enable JIT + * <0: corresponding level, with result check for debug + * + * \param async_exec_level exec: dispatch on separate threads for different + * comp_node. + * 0: do not perform async dispatch + * 1: dispatch async if there are more than one comp node with limited queue + * mask 0b10: async if there are multiple comp nodes with + * mask 0b100: always async + */ +struct LITE_API Options { + bool weight_preprocess = false; + bool fuse_preprocess = false; + bool fake_next_exec = false; + bool var_sanity_check_first_run = true; + bool const_shape = false; + bool force_dynamic_alloc = false; + bool force_output_dynamic_alloc = false; + bool no_profiling_on_shape_change = false; + uint8_t jit_level = 0; + uint8_t comp_node_seq_record_level = 0; + uint8_t graph_opt_level = 2; + uint16_t async_exec_level = 1; + + //! layout transform options + bool enable_nchw44 = false; + bool enable_nchw44_dot = false; + bool enable_nchw88 = false; + bool enable_nhwcd4 = false; + bool enable_nchw4 = false; + bool enable_nchw32 = false; + bool enable_nchw64 = false; +}; + +/*! + * \brief Configuration when load and compile the graph + * + * \param bare_model_cryption_name is the bare model cryption method name, bare + *model is not pack json info inside + * + *\param has_compression flag whether the model is compressed, the compress + *method will read form the model + */ +struct LITE_API Config { + bool has_compression = false; + int device_id = 0; + LiteDeviceType device_type = LiteDeviceType::LITE_CPU; + LiteBackend backend = LiteBackend::LITE_DEFAULT; + std::string bare_model_cryption_name = {}; + Options options = {}; +}; + +/*! + * \brief config the network input and output item + * + */ +struct LITE_API IO { + //! the tensor name in the graph corresponding to the IO + std::string name; + + //! Used to mark where the input tensor comes from and the output where copy + //! to, if is_host is true, the input is from host and output copy to host, + //! otherwise device. Sometimes The input is from device and output no need + //! copy to host, default is true. + bool is_host = true; + + //! The IO type, it can be SHAPE or VALUE, when SHAPE is set, the input or + //! output tensor value is invaid, only shape will be set, default is VALUE + LiteIOType io_type = LiteIOType::LITE_IO_VALUE; + + //! The layout of the config from user, if other layout is set before + //! forward or get after forward by input tensor reset, this layout will by + //! pass. if no other layout is set before forward, this layout will work. + //! if this layout is no set, the model will forward with its origin layout. + //! if in output, it will used to check. + Layout config_layout = {}; +}; + +/*! 
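A minimal sketch of the Options knobs documented above, set through the Config that owns them; the chosen values are illustrative, not recommended defaults.

```cpp
#include "lite/network.h"

lite::Config make_config() {
    lite::Config config;                            // Options live inside Config
    config.options.weight_preprocess = true;        // preprocess const weights once
    config.options.comp_node_seq_record_level = 1;  // record kernel tasks on the first run
    config.options.graph_opt_level = 2;             // level-1 plus global optimization
    config.options.enable_nchw44 = true;            // request the NCHW44 layout transform
    return config;
}
```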
+ * \brief the input and output information when load the network + * the NetworkIO will remain in the network until the network is destroyed + */ +struct LITE_API NetworkIO { + std::vector inputs = {}; + std::vector outputs = {}; +}; + +/*! + * \brief A user-implemented allocator interface + */ +class LITE_API Allocator { +public: + virtual ~Allocator() = default; + + //! allocate memory of size in the given device with the given align + virtual void* allocate(LiteDeviceType device_type, int device_id, + size_t size, size_t align) = 0; + + //! free the memory pointed by ptr in the given device + virtual void free(LiteDeviceType device_type, int device_id, void* ptr) = 0; +}; + +/*! + * \brief the thread affinith callback type + * \param thread_id thread_id is the a number begin from 0 to (nr_threads - 1), + * thread_id of (nr_threads - 1) is the main worker thread. + */ +using ThreadAffinityCallback = std::function; + +using AsyncCallback = std::function; + +/*! + * \brief the start/finish callback function + * \param unordered_map map from the io tensor name to the pair of which is the + * corresponding IO of user config and the realy input or output tensor. + */ +using StartCallback = std::function>>&)>; +using FinishCallback = std::function>>&)>; + +/*! + * \brief The network is construct form a model, implement model load, init, + * forward, and display some model information + */ +class LITE_API Network { +public: + class NetworkImplBase; + + ~Network(); + + Network(const Config& config = {}, const NetworkIO& networkio = {}); + + Network(const NetworkIO& networkio, const Config& config = {}); + + //! load the model form memory + void load_model(void* model_mem, size_t size); + + //! load the model from a model path + void load_model(std::string model_path); + + //! only compute the output tensor in user configured + void compute_only_configured_output(); + + //! get the network input and output tensor, the layout of which is + //! sync from mge tensor, when the name of input and output tensor are the + //! same, use LiteTensorPhase to separate + std::shared_ptr get_io_tensor( + std::string io_name, + LiteTensorPhase phase = LiteTensorPhase::LITE_IO); + + //! get the network input by index + std::shared_ptr get_input_tensor(size_t index); + + //! get the network output tensor by index + std::shared_ptr get_output_tensor(size_t index); + + //! set the network forward in async mode and set the async callback + //! function + Network& set_async_callback(const AsyncCallback& async_callback); + + //! set the start forward callback function, which will be execute before + //! forward. this can be used to check network input or dump model inputs + //! for debug + Network& set_start_callback(const StartCallback& start_callback); + + //! set the finish forward callback function, which will be execute after + //! forward. this can be used to dump model outputs for debug + Network& set_finish_callback(const FinishCallback& finish_callback); + + //! forward the network with filled input data and fill the output data + //! to the output tensor + void forward(); + + //! waite until forward finish in sync model + void wait(); + + //! get the input tensor name in the order in load return + std::string get_input_name(size_t index) const; + + //! get the output tensor name in the order in load return + std::string get_output_name(size_t index) const; + + //! get all the input tensor name in the order in load return + std::vector get_all_input_name() const; + + //! 
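A sketch tying IO, NetworkIO and the Network interface declared above together; the model path and the output tensor name are illustrative assumptions.

```cpp
#include <memory>
#include <string>
#include "lite/network.h"

void run_once(const std::string& model_path) {
    lite::Config config;
    config.options.const_shape = true;

    lite::IO device_output;
    device_output.name = "feature_map";  // hypothetical output name in the graph
    device_output.is_host = false;       // leave the result on the device side

    lite::NetworkIO io;
    io.outputs.push_back(device_output);

    auto network = std::make_shared<lite::Network>(config, io);
    network->load_model(model_path);
    // fill the input tensors here, then run synchronously
    network->forward();
    network->wait();
}
```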
get all the output tensor name in the order in load return + std::vector get_all_output_name() const; + + //! set/get device id, default device id = 0 + Network& set_device_id(int device_id); + int get_device_id() const; + + //! set/get stream id, default stream id = 0 + Network& set_stream_id(int stream_id); + int get_stream_id() const; + + //! enable profile the network, a file will be generated + void enable_profile_performance(std::string profile_file_path); + + //! get model extra info + const std::string& get_model_extra_info(); + + //! get device type + LiteDeviceType get_device_type() const; + +public: + friend class NetworkHelper; + +private: + //! update member from implement + void update_from_implement(); + + //! decrypt and parse the model file + void prase_model(std::shared_ptr model_data, size_t size); + +private: + bool m_loaded = false; + Config m_config; + NetworkIO m_network_io; + std::unique_ptr m_impl; + std::string m_extra_info; +}; + +/*********************** MGE special network function ***************/ +class LITE_API Runtime { +public: + //! When device is CPU, this interface will set the to be loaded model + //! run in multi thread mode with the given thread number. + static void set_cpu_threads_number(std::shared_ptr dst_network, + size_t nr_threads); + static size_t get_cpu_threads_number(std::shared_ptr dst_network); + + //! set threads affinity callback; + static void set_runtime_thread_affinity( + std::shared_ptr network, + const ThreadAffinityCallback& thread_affinity_callback); + + //! Set cpu default mode when device is CPU, in some low computation + //! device or single core device, this mode will get good performace + static void set_cpu_inplace_mode(std::shared_ptr dst_network); + static bool is_cpu_inplace_mode(std::shared_ptr dst_network); + + //! Set use tensorrt forward + static void use_tensorrt(std::shared_ptr dst_network); + + //! set opr algorithm selection strategy in the network + //! shared_batch_size: the batch size used by fastrun, + //! Non-zero value means that fastrun use this batch size + //! regardless of the batch size of the model. Zero means + //! fastrun use batch size of the model + //! binary_equal_between_batch: if the content of each input batch is binary + //! equal,whether the content of each output + //! batch is promised to be equal + static void set_network_algo_policy( + std::shared_ptr dst_network, + LiteAlgoSelectStrategy strategy, uint32_t shared_batch_size = 0, + bool binary_equal_between_batch = false); + + //! set workspace_limit for oprs with multiple algorithms, set + //! workspace limitation can save memory but may influence the performance + static void set_network_algo_workspace_limit( + std::shared_ptr dst_network, size_t workspace_limit); + + //! set the network memroy allocator, the allocator is defined by user + static void set_memory_allocator(std::shared_ptr dst_network, + std::shared_ptr user_allocator); + + //! share the runtime memory with other network, the weights is not shared + static void share_runtime_memory_with(std::shared_ptr dst_network, + std::shared_ptr src_network); + + //! Dump input/output values of all internal variables to output + //! file, in txt format + static void enable_io_txt_dump(std::shared_ptr dst_network, + std::string io_txt_out_file); + + //! Dump input/output values of all internal variables to output + //! directory, in binary format + static void enable_io_bin_dump(std::shared_ptr dst_network, + std::string io_bin_out_dir); + + //! 
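A sketch of the Runtime helpers declared above, typically applied after the Network is created and before the model is loaded or run; the thread count, strategy combination and workspace limit are illustrative values.

```cpp
#include <memory>
#include "lite/network.h"

void tune_runtime(std::shared_ptr<lite::Network> network) {
    // run the to-be-loaded CPU model with 4 worker threads
    lite::Runtime::set_cpu_threads_number(network, 4);

    // profile algorithms, but only among the reproducible ones
    lite::Runtime::set_network_algo_policy(
            network, LiteAlgoSelectStrategy::LITE_ALGO_PROFILE |
                             LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE);

    // cap extra algorithm workspace at 64 MiB to trade speed for memory
    lite::Runtime::set_network_algo_workspace_limit(network, 64 * 1024 * 1024);
}
```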
load a new network which will share weights with src network + static void shared_weight_with_network( + std::shared_ptr dst_network, + const std::shared_ptr src_network); +}; + +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/include/lite/tensor.h b/lite/include/lite/tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..6e7a56526d7e8f8590aebf3b7053d51b2b0ea51b --- /dev/null +++ b/lite/include/lite/tensor.h @@ -0,0 +1,224 @@ +/** + * \file inlude/lite/tensor.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "common_enum_c.h" +#include "macro.h" + +#include +#include +#include + +namespace lite { + +/*! + * \brief the simple layout description + */ +struct LITE_API Layout { + static constexpr uint32_t MAXDIM = 7; + size_t shapes[MAXDIM]; + size_t ndim = 0; + LiteDataType data_type = LiteDataType::LITE_FLOAT; + + //! get the total byte of a layout + size_t get_elem_size() const; + + //! compare whether the two layout is equal + bool operator==(const Layout& other) const; +}; + +/*! + * \brief warpper of the MegEngine Tensor + * + * The memory is not alloc directly, when call get_memory_ptr() the memory + * will be allocated in tensor implement, which will be deleted automatically + * + * Note: if the tensor memory is set through reset() interface, the memory is + * managed by the user, it will not be freed by the tensor + * + * If the device or layout is not set, when copy form other source tensor, its + * device and layout will be copy form the source tensor + * + * if is_pinned_host is set, the storage memory of the tensor is pinned memory, + * this is used to Optimize the H2D or D2H memory copy, if the device or layout + * is not set, when copy form other device(CUDA) tensor, this tensor + * will be automatically set to pinned tensor + */ +class LITE_API Tensor { + class TensorImpl; + +public: + class TensorImplBase; + + Tensor(); + Tensor(LiteDeviceType device_type, bool is_pinned_host = false); + Tensor(LiteDeviceType device_type, const Layout& layout, + bool is_pinned_host = false); + Tensor(int device_id, LiteDeviceType device_type, const Layout& layout = {}, + bool is_pinned_host = false); + Tensor(int device_id, int stream_id, LiteDeviceType device_type, + bool is_pinned_host = false); + Tensor(LiteBackend backend, + LiteDeviceType device_type = LiteDeviceType::LITE_CPU, + int device_id = 0, const Layout& layout = {}, + bool is_pinned_host = false); + ~Tensor(); + + LiteDeviceType get_device_type() const { return m_device_type; }; + + int get_device_id() const { return m_device_id; }; + + Layout get_layout() const { return m_layout; }; + + bool is_pinned_host() const { return m_is_pinned_host; }; + + //! set layout will change the layout and reallocate memory of the tensor + void set_layout(const Layout& layout); + + //! which will trigger memory alloc in tensor implement + void* get_memory_ptr() const; + + //! get the memory with the offset describe in idx + void* get_memory_ptr(const std::vector& idx) const; + + //! get the tensor capacity in byte + size_t get_tensor_total_size_in_byte() const; + + //! use the user allocated data to reset the memory of the tensor, the + //! memory will not be managed by the lite, later, the user should delete + //! it. + void reset(void* prepared_data, size_t data_length_in_byte); + + //! 
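A minimal sketch of building a host tensor with an explicit Layout and of handing it a caller-owned buffer through reset(); the shape is an illustrative assumption.

```cpp
#include <vector>
#include "lite/tensor.h"

void tensor_basics() {
    lite::Layout layout;
    layout.ndim = 4;
    layout.shapes[0] = 1;
    layout.shapes[1] = 3;
    layout.shapes[2] = 224;
    layout.shapes[3] = 224;
    layout.data_type = LiteDataType::LITE_FLOAT;

    lite::Tensor tensor(LiteDeviceType::LITE_CPU, layout);
    size_t bytes = tensor.get_tensor_total_size_in_byte();
    void* ptr = tensor.get_memory_ptr();  // the allocation happens on this call

    // alternatively, back the tensor with memory the caller keeps ownership of
    std::vector<float> user_buffer(1 * 3 * 224 * 224);
    tensor.reset(user_buffer.data(), bytes);
    (void)ptr;
}
```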
use the user allocated data and corresponding layout to reset the data + //! and layout of the tensor, the memory will not be managed by lite, later, + //! the user should delete it. + void reset(void* prepared_data, const Layout& layout); + + //! reshape the tensor with new shape, keep the data_type the same + void reshape(const std::vector& shape); + + //! get a new tensor slice from the origin tensor + std::shared_ptr slice(const std::vector& start, + const std::vector& end, + const std::vector& step = {}); + + //! set the tensor memory with zero + void fill_zero(); + + //! copy tensor form other tensor + //! Note: the best way for tensor copy is just set the dst device, left + //! layout empty, when copying the dst layout will be set the same with + //! src + void copy_from(const Tensor& src); + + //! share memory with other tensor + void share_memory_with(const Tensor& src_tensor); + + //! whether the memory of tensor is continue + bool is_continue_memory() const; + + //! update the menbers from the implement + void update_from_implement(); + +public: + friend class TensorHelper; + +private: + std::shared_ptr m_tensor_impl; + + //! flag whether the storage of the tensor is pinned, this is only used + //! when the compnode is not in CPU + bool m_is_pinned_host = false; + int m_device_id = 0; + Layout m_layout; + //! the device of the tensor should not be changed after the tensor has + //! constructed + LiteDeviceType m_device_type = LiteDeviceType::LITE_CPU; +}; + +/** + * \brief a class can hold any type data, but not check whether the visit type + * is valid + */ +class LITE_API LiteAny { +public: + LiteAny() = default; + template + LiteAny(T value) : m_holder(new AnyHolder(value)) { + m_is_string = std::is_same(); + } + + LiteAny(const LiteAny& any) { + m_holder = any.m_holder->clone(); + m_is_string = any.is_string(); + } + LiteAny& operator=(const LiteAny& any) { + m_holder = any.m_holder->clone(); + m_is_string = any.is_string(); + return *this; + } + bool is_string() const { return m_is_string; } + + class HolderBase { + public: + virtual ~HolderBase() = default; + virtual std::shared_ptr clone() = 0; + virtual size_t type_length() const = 0; + }; + + template + class AnyHolder : public HolderBase { + public: + AnyHolder(const T value) : + m_value(value) { + } + virtual std::shared_ptr clone() override { + return std::make_shared(m_value); + } + virtual size_t type_length() const override { return sizeof(T); } + + public: + T m_value; + }; + //! if type is miss matching, it will throw + void type_missmatch(size_t expect, size_t get) const; + + //! only check the storage type and the visit type length, so it's not safe + template + T unsafe_cast() const { + if (sizeof(T) != m_holder->type_length()) { + type_missmatch(m_holder->type_length(), sizeof(T)); + } + return static_cast*>(m_holder.get())->m_value; + } + //! only check the storage type and the visit type length, so it's not safe + void* cast_void_ptr() const { + return &static_cast*>(m_holder.get())->m_value; + } + +private: + std::shared_ptr m_holder; + bool m_is_string = false; +}; + +/*********************** special tensor function ***************/ +class LITE_API TensorUtils { +public: + //! concat all the input tensor to one on the specified dim, the result + //! tensor reside in dst_device_id of dst_device, if dst_device is + //! 
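A sketch of the copy and share helpers declared above; leaving the destination layout empty lets copy_from adopt the source layout, as the comment above notes.

```cpp
#include "lite/tensor.h"

void copy_and_share(const lite::Tensor& device_result) {
    // destination device only; layout and contents are taken from the source
    lite::Tensor host_copy(LiteDeviceType::LITE_CPU);
    host_copy.copy_from(device_result);

    // a second tensor aliasing the same storage, so another consumer can read
    // the data without an extra copy
    lite::Tensor alias;
    alias.share_memory_with(host_copy);
}
```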
LITE_DEVICE_DEFAULT, the device will get from the first tensor + static std::shared_ptr concat( + const std::vector& tensors, int dim, + LiteDeviceType dst_device = LiteDeviceType::LITE_DEVICE_DEFAULT, + int dst_device_id = -1); +}; +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/lite-c/include/lite-c/global_c.h b/lite/lite-c/include/lite-c/global_c.h new file mode 100644 index 0000000000000000000000000000000000000000..2b19c4e4a957286a147068d10c3ff98761d7b5a0 --- /dev/null +++ b/lite/lite-c/include/lite-c/global_c.h @@ -0,0 +1,169 @@ +/** + * \file lite-c/include/lite-c/global-c.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#ifndef LITE_C_GLOBAL_H_ +#define LITE_C_GLOBAL_H_ + +#include "macro.h" +#include "network_c.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*! \brief Get version + */ +LITE_API int LITE_get_version(int* major, int* minor, int* patch); + +/*! \brief Get the last error message. + * \return the message pointer + */ +LITE_API const char* LITE_get_last_error(); + +/*! \brief Get device count + * \param[in] device_type device type + * \return the device count + */ +LITE_API int LITE_get_device_count(LiteDeviceType device_type, size_t* count); + +/*! \brief try to coalesce all free memory in megenine + */ +LITE_API int LITE_try_coalesce_all_free_memory(); + +/** + * \brief Model decryption function + * + * \param[in] input_data is the decrypted model memory pointer + * \param[in] input_size the size the decrypted model memory in byte + * \param[in] key_data decryption key data + * \param[in] key_size the size of decryption key data + * \param[out] output_data the data of decrypted data, if output_data is + * nullptr, just query the output memory length, else write the decryted data to + * the output_data + * \return size of decrypted data + */ +typedef size_t (*LiteDecryptionFunc)(const void* input_data, size_t input_size, + const uint8_t* key_data, size_t key_size, + const void* output_data); + +/** + * \brief Model information parse function + * + * \param[in] info_data is the information memory + * \param[in] info_size the size the information memory + * \param[in] model_name the model name used for check whether the + * infomation match the model + * \param[in] config the model config, ParseInfoFunc can fill it with the + * information in json, the config will influence Network loading later + * \param[in] network_io the model IO, ParseInfoFunc can fill it with the + * information in json, the networkio will influence Network forwarding later + * \param[in] device_id the address to store device_id, default 0 + * \param[in] nr_threads the address to store nr_threads, default 1 + * \param[in] is_inplace_model the address to store is_cpu_inplace_mode, default + * \param[in] use_tensorrt the address to store is_cpu_inplace_mode, default + * false + */ +typedef int (*LiteParseInfoFunc)(const void* info_data, size_t info_size, + const char* model_name, LiteConfig* config, + LiteNetworkIO* network_io, int* device_id, + size_t* nr_threads, int* is_cpu_inplace_mode, + int* use_tensorrt); + +/** + * \brief register a custom decryption method and key to lite. + * + * \param[in] decrypt_name the name of the decryption, which will act as the + * hash key to find the decryption method. 
+ * + * \param[in] func the decryption function, which will decrypt the model with + * the registered key, return a vector that contain the decrypted model. + * \param[in] key_data the decryption key of the method + * \param[in] key_size the size of decryption key + */ +LITE_API int LITE_register_decryption_and_key(const char* decrypt_name, + const LiteDecryptionFunc func, + const uint8_t* key_data, + size_t key_size); + +/** + * \brief update decryption function or key of a custom decryption method. + * + * \param[in] decrypt_name the name of the decryption, which will act as the + * hash key to find the decryption method. + * + * \param[in] func the decryption function, which will decrypt the model with + * the registered key, return a vector that contain the decrypted model. if + * function is nullptr, it will not be updated. + * + * \param[in] key the decryption key of the method, if the size of key is zero, + * it will not be updated + */ +LITE_API int LITE_update_decryption_or_key(const char* decrypt_name, + const LiteDecryptionFunc func, + const uint8_t* key_data, + size_t key_size); + +/** + * \brief register a custom parser function to lite. + * + * \param[in] info_type the name of the parser function, which will act as the + * hash key to find the parser method. + * + * \param[in] parse_func the parser function, which will parse the given + * information and modify the Network Config and IO. + * + */ +LITE_API int LITE_register_parse_info_func(const char* info_type, + const LiteParseInfoFunc parse_func); + +/*! + * \brief Set the loader to the lite + * \param[in] loader_path is the file path which store the cache + */ +LITE_API int LITE_set_loader_lib_path(const char* loader_path); + +/*! + * \brief Set the algo policy cache file for CPU/CUDA ... + * \param[in] cache_path is the file path which store the cache + * \param[in] always_sync sync the cache when cache updated + */ +LITE_API int LITE_set_persistent_cache(const char* cache_path, int always_sync); + +/*! + * \brief Set the tensor policy cache file for CPU/CUDA ... + * \param[in] cache_path is the file path which store the cache + */ +LITE_API int LITE_set_tensor_rt_cache(const char* cache_path); + +/*! \brief Set the current log level. + * \param[in] level The new log level + */ +LITE_API int LITE_set_log_level(LiteLogLevel level); + +/*! \brief Get the current log level. + * \param[in] level The pointer to log level + */ +LITE_API int LITE_get_log_level(LiteLogLevel* level); +/*! + * \brief dump the algo policy cache to file, if the network is set to profile + * when forward, though this the algo policy will dump to file + * \param[in] cache_path is the file path which store the cache + */ +LITE_API int LITE_dump_persistent_cache(const char* cache_path); + +/*! + * \brief dump the tensorrt policy cache to file + */ +LITE_API int LITE_dump_tensor_rt_cache(); +#endif +#ifdef __cplusplus +} +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/lite-c/include/lite-c/network_c.h b/lite/lite-c/include/lite-c/network_c.h new file mode 100644 index 0000000000000000000000000000000000000000..84b13502f0f24f0ce0859cedc106e10cff5efb5c --- /dev/null +++ b/lite/lite-c/include/lite-c/network_c.h @@ -0,0 +1,525 @@ +/** + * \file lite-c/include/lite-c/network_c.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
+ */ + +#ifndef LITE_C_NETWORK_H_ +#define LITE_C_NETWORK_H_ + +#include "tensor_c.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*! + * \brief the inference options which will be translated to megenine + * + * \param weight_preprocess is the option wich optimize the inferece performance + * with preprocess the const weights + * + * \param fuse_preprocess fuse preprocess patten, like astype + pad_channel + + * dimshuffle + * + * \param fake_next_exec whether only to perform non-computing tasks (like + * memory allocation and queue initialization) for next exec. This would be + * reset to false when the graph is executed. + * + * \param var_sanity_check_first_run Disable var sanity check on the first run. + * Var sanity check is enabled on the first-time execution by default, and can + * be used to find some potential memory access errors in the operator + * implementation. + * + * \param const_shape This can be used to reduce memory usage since some + * static inference data structures can be omitted. + * + * \param force_dynamic_alloc force dynamic memory alloc for all vars + * + * \param force_output_dynamic_alloc force dynamic memory alloc for output vars + * which are used as CallbackCaller input when call compile() function + * + * \param no_profiling_on_shape_change do not re-profile to select best impl + * algo when input shape changes (use previous algo) + * + * \param jit_level Execute supported operators with JIT (support MLIR, + * NVRTC). Can only be used on Nvidia GPUs, this value indicates JIT level: + * 1 for basic elemwise opr; + * 2 for including reduce operator + * + * \param record_level flag optimize the inference performace with record the + * kernel tasks in first run, hereafter the inference all need to execute the + * recorded tasks. + * level = 0 means the normal inference, + * level = 1 means use record inference, + * level = 2 means record inference with free the extra memory + * + * \param graph_opt_level optimization level: + * 0: disable + * 1: level-1: inplace arith transformations during graph + * construction + * 2: level-2: level-1, plus global optimization before graph + * compiling + * 3: also enable JIT + * <0: corresponding level, with result check for debug + * + * \param async_exec_level exec: dispatch on separate threads for different + * comp_node. + * 0: do not perform async dispatch + * 1: dispatch async if there are more than one comp node with limited queue + * mask 0b10: async if there are multiple comp nodes with + * mask 0b100: always async + */ +typedef struct Options { + int weight_preprocess; + int fuse_preprocess; + int fake_next_exec; + int var_sanity_check_first_run; + int const_shape; + int force_dynamic_alloc; + int force_output_dynamic_alloc; + int no_profiling_on_shape_change; + int jit_level; + int comp_node_seq_record_level; + int graph_opt_level; + int async_exec_level; + + //! layout transform options + int enable_nchw44; + int enable_nchw44_dot; + int enable_nchw88; + int enable_nhwcd4; + int enable_nchw4; + int enable_nchw32; + int enable_nchw64; +} LiteOptions; + +//! define a default Options +extern LITE_API const LiteOptions default_option; + +/*! 
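+ * An illustrative way to start from default_config() (declared below) and
+ * adjust a few fields before creating a network; this is only a sketch, and
+ * the LITE_CPU enumerator name is assumed from common_enum_c.h:
+ * \code
+ *     LiteConfig config = *default_config();
+ *     config.device_type = LITE_CPU;
+ *     config.options.weight_preprocess = 1;
+ *     config.options.comp_node_seq_record_level = 1;
+ * \endcode
+ *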
+ * \brief Configuration when load and compile the graph + * + * \param bare_model_cryption_name is the bare model cryption method name, bare + *model is not pack json info inside + * + *\param has_compression flag whether the model is compressed, the compress + *method will read form the model + */ +typedef struct LiteConfig { + int has_compression; + int device_id; + LiteDeviceType device_type; + LiteBackend backend; + const char* bare_model_cryption_name; + LiteOptions options; +} LiteConfig; + +//! get default config +LITE_API LiteConfig* default_config(); + +/*! + * \brief config the network input and output item + * + */ +typedef struct LiteIO { + //! the tensor name in the graph corresponding to the IO + const char* name; + + //! Used to mark where the input tensor comes from and the output where copy + //! to, if is_host is true, the input is from host and output copy to host, + //! otherwise device. Sometimes The input is from device and output no need + //! copy to host, default is true. + int is_host; + + //! The IO type, it can be SHAPE or VALUE, when SHAPE is set, the input or + //! output tensor value is invaid, only shape will be set, default is VALUE + LiteIOType io_type; + + //! The layout of the config from user, if other layout is set before + //! forward or get after forward, this layout will by pass. if no other + //! layout is set before forward, this layout will work. if this layout is + //! no set, the model will forward with its origin layout. if in output, it + //! will used to check. + LiteLayout config_layout; +} LiteIO; + +//! define a default IO +extern LITE_API const LiteIO default_io; + +/*! + * \brief the input and output information when load the network + * the NetworkIO will remain in the network until the network is destroyed + */ +typedef struct LiteNetworkIO { + LiteIO* inputs; + LiteIO* outputs; + size_t input_size; //! the number IO in inputs + size_t output_size; //! the number IO in outputs +} LiteNetworkIO; + +//! get default NetworkIO +LITE_API LiteNetworkIO* default_network_io(); + +/*! + * \brief A user-implemented allocator function + */ +//! allocate memory of size in the given device with the given align +typedef void* (*LiteAllocate)(LiteDeviceType device_type, int device_id, + size_t size, size_t align); +//! free the memory pointed by ptr in the given device +typedef void (*LiteFree)(LiteDeviceType device_type, int device_id, void* ptr); + +/*! + * \brief the thread affinith callback type + * \param thread_id thread_id is the a number begin from 0 to (nr_threads - 1), + * thread_id of (nr_threads - 1) is the main worker thread. + */ +typedef int (*LiteThreadAffinityCallback)(int thread_id); + +typedef int (*LiteAsyncCallback)(); + +/*! + * \brief the start/finish callback function + * \param unordered_map map from the io tensor name to the pair of which is the + * corresponding IO of user config and the realy input or output tensor. + */ + +typedef int (*LiteStartCallback)(const LiteIO* inputs, + const LiteTensor* input_tensors, size_t size); + +typedef int (*LiteFinishCallback)(const LiteIO* outputs, + const LiteTensor* output_tensors, + size_t size); + +/*! + * \brief The network is construct form a model, implement model load, init, + * forward, and display some model information + */ +typedef void* LiteNetwork; + +/** + * \brief Create a lite Network object with default config and networkIO. 
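+ *
+ * A minimal create/destroy round trip looks like the sketch below (return
+ * codes are ignored for brevity):
+ * \code
+ *     LiteNetwork net;
+ *     LITE_make_default_network(&net);
+ *     // ... load a model and run inference ...
+ *     LITE_destroy_network(net);
+ * \endcode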
+ * \param[out] network The netwrok pointer + * \return int if the return is not zero, error happened, the error message + * can get by LITE_get_last_error + */ +LITE_API int LITE_make_default_network(LiteNetwork* network); + +/** + * \brief Create a lite Network object from the given config and networkIO. + * \param[in] config The configration to create the network + * \param[in] network_io The configration io to create the network + * \param[out] network The network pointer + */ +LITE_API int LITE_make_network(LiteNetwork* network, const LiteConfig config, + const LiteNetworkIO network_io); + +/** + * \brief Create a lite Network object from the given config and networkIO. + * \param[in] config The configration to create the network + * \param[out] network The network pointer + */ +LITE_API int LITE_make_network_config(LiteNetwork* network, const LiteConfig config); + + +/** + * \brief load the model to network form memory + * \param[in] model_mem The model in memory + * \param[in] size The size of the model memory + * \param[out] network The network to be load model in + */ +LITE_API int LITE_load_model_from_mem(LiteNetwork network, void* model_mem, + size_t size); + +/** + * \brief load the model to network form given path + * \param[in] model_path The model path + * \param[out] network The network to be load model in + */ +LITE_API int LITE_load_model_from_path(LiteNetwork network, + const char* model_path); + +/** + * \brief load a new network which will share weights with src network + * \param[in] origin_network The origin network pointer + * \param[out] network The network pointer + */ +LITE_API int LITE_shared_weight_with_network(LiteNetwork dst_network, + const LiteNetwork src_network); + +/** + * \brief Destroy a lite network object. + * \param[in] network The network pointer + * \return int if the return is not zero, error happened, the error message + * can get by LITE_get_last_error + */ +LITE_API int LITE_destroy_network(LiteNetwork network); + +/** + * \brief forward the network with filled input data and fill the output data + * to the output tensor + * \param[in] network The loaded model + */ +LITE_API int LITE_forward(const LiteNetwork network); + +/** + * \brief waite until forward finish in sync model + * \param[in] network The loaded model + */ +LITE_API int LITE_wait(const LiteNetwork network); + +/** + * \brief get the network input and ouput tensor, the layout of which is + * get from model + * \param[in] network The loaded model + * \param[in] io_name The input or output name + * \param[in] phase The tensor phase + * \param[out] tensor The IO tensor get from the network + */ +LITE_API int LITE_get_io_tensor(LiteNetwork network, const char* io_name, + LiteTensorPhase phase, LiteTensor* tensor); + +/** + * \brief get the input tensor name in the order in loaded model + * \param[in] network The loaded model + * \param[in] index The index of input tensor + * \param[out] name The input tensor name + */ +LITE_API int LITE_get_input_name(const LiteNetwork network, size_t index, + const char** name); + +/** + * \brief get the output tensor name in the order in loaded model + * \param[in] network The loaded model + * \param[in] index The index of output tensor + * \param[out] name The output tensor name + */ +LITE_API int LITE_get_output_name(const LiteNetwork network, size_t index, + const char** name); + +/** + * \brief get all the input tensor name in the order in loaded model + * \param[in] network The loaded model + * \param[in] size The number of the input tensor + 
* \param[out] name The input tensor names + */ +LITE_API int LITE_get_all_input_name(const LiteNetwork network, size_t* size, + const char** name); + +/** + * \brief get all the output tensor name in the order in loaded model + * \param[in] network The loaded model + * \param[in] size The number of output tensor + * \param[out] name The output tensor name + */ +LITE_API int LITE_get_all_output_name(const LiteNetwork network, size_t* size, + const char** name); + +/** + * \brief get whether the model is running in cpu inplace mode + * \param[in] network The loaded model + * \param[out] is_cpu_inplace_mode whether is in cpu inplace mode + */ +LITE_API int LITE_is_cpu_inplace_mode(const LiteNetwork network, + int* is_cpu_inplace_mode); + +/** + * \brief get the number of thread the network will run with + * \param[in] network The loaded model + * \param[out] nr_threads the thread number when the network running + */ +LITE_API int LITE_get_cpu_threads_number(const LiteNetwork network, + size_t* nr_threads); + +/** + * \brief get the device id the network will run with + * \param[in] network The loaded model + * \param[out] device_id the device id of the network will run + */ +LITE_API int LITE_get_device_id(const LiteNetwork network, int* device_id); + +/** + * \brief get the stream id the network will run with + * \param[in] network The loaded model + * \param[out] stream_id the stream id of the network will run + */ +LITE_API int LITE_get_stream_id(const LiteNetwork network, int* stream_id); + +/** + * \brief get the device type the network will run with + * \param[in] network The loaded model + * \param[out] device_type the device type of the network will run + */ +LITE_API int LITE_get_device_type(const LiteNetwork network, + LiteDeviceType* device_type); + +/** + * \brief get the device type the network will run with + * \param[in] network The loaded model + * \param[out] info : the json format memory + * \param[out] info_size: the json format memory size + */ +LITE_API int LITE_get_model_extra_info(const LiteNetwork network, + const char** info, int* info_size); + +/** + * \brief Set cpu default mode when device is CPU, in some low computation + * device or single core device, this mode will get good performace + * \param[in] network The loaded model + */ +LITE_API int LITE_set_cpu_inplace_mode(LiteNetwork network); + +/** + * \brief When device is CPU, this interface will set the to be loaded model + * run in multi thread mode with the given thread number. 
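+ *
+ * The thread number applies to the model loaded afterwards, so it is
+ * typically configured between network creation and model loading
+ * (illustrative sketch with a placeholder model path; error codes ignored):
+ * \code
+ *     LiteNetwork net;
+ *     LITE_make_default_network(&net);
+ *     LITE_set_cpu_threads_number(net, 4);
+ *     LITE_load_model_from_path(net, "./model.mge");
+ * \endcode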
+ * \param[in] network The loaded model + * \param[in] nr_threads The threads number + */ +LITE_API int LITE_set_cpu_threads_number(LiteNetwork network, + size_t nr_threads); + +/** + * \brief set device id, default device id = 0 + * \param[in] network The loaded model + * \param[in] device_id The device id to be set + */ +LITE_API int LITE_set_device_id(LiteNetwork network, int device_id); + +/** + * \brief set stream id, default stream id = 0 + * \param[in] network The loaded model + * \param[in] stream_id The stream id to be set + */ +LITE_API int LITE_set_stream_id(LiteNetwork network, int stream_id); + +/** + * \brief enable tensorrt + * \param[in] network The loaded model + */ +LITE_API int LITE_use_tensorrt(LiteNetwork network); + +/** + * \brief set opr algorithm selection strategy in the network + * \param[in] network The loaded model + * \param[in] select_strategy The operator algorithm selection strategy + */ +LITE_API int LITE_set_network_algo_policy(LiteNetwork network, + LiteAlgoSelectStrategy strategy); + +/** + * \brief set opr algorithm selection strategy in the network + * \param[in] network The loaded model + * \param[in] shared_batch_size: the batch size used by fastrun, + * Non-zero value means that fastrun use this batch size + * regardless of the batch size of the model. Zero means + * fastrun use batch size of the model + * \param[in] binary_equal_between_batch: if the content of each input batch is + * binary equal,whether the content of each output batch is + * promised to be equal + */ +LITE_API int LITE_set_network_algo_fastrun_config( + LiteNetwork network, unsigned int shared_batch_size, + int binary_equal_between_batch); + +/** + * \brief set workspace_limit for oprs with multiple algorithms, set + * workspace limit can save memory but may influence the performance + * \param[in] network The loaded model + * \param[in] workspace_limit The operator algorithm workspace limit + */ +LITE_API int LITE_set_network_algo_workspace_limit(LiteNetwork network, + size_t workspace_limit); + +/** + * \brief set the network forward in async mode and set the async callback + * function + * \param[in] network The loaded model + * \param[in] async_callback when network finish forwarding, the callbak + * will be called + */ +LITE_API int LITE_set_async_callback(LiteNetwork network, + const LiteAsyncCallback async_callback); + +/** + * \brief set the start forward callback function, which will be execute beform + * forward, this can be used to check network input or dump model inputs + * for debug + * \param[in] network The loaded model + * \param[in] start_callback when network start forwarding, the callbak + * will be called + */ +LITE_API int LITE_set_start_callback(LiteNetwork network, + const LiteStartCallback start_callback); + +/** + * \brief set the finish forward callback function, which will be execute after + * forward, this can be used to dump model outputs for debug + * \param[in] network The loaded model + * \param[in] finish_callback when network finish forwarding, the callbak + * will be called + */ +LITE_API int LITE_set_finish_callback(LiteNetwork network, + const LiteFinishCallback finish_callback); + +/** + * \brief set threads affinity callback + * \param[in] network The loaded model + * \param[in] thread_affinity_callback + */ +LITE_API int LITE_set_runtime_thread_affinity( + LiteNetwork network, + const LiteThreadAffinityCallback thread_affinity_callback); + +/** + * \brief set the network memroy allocator, the allocator is defined by user + * \param[in] 
network The loaded model + * \param[in] allocate_fun The allocate function of the user defined allocator + * \param[in] free_fun The free function of the user defined allocator + */ +LITE_API int LITE_set_memory_allocator(LiteNetwork network, + const LiteAllocate allocate_fun, + const LiteFree free_fun); + +/** + * \brief the dst_network share the runtime memory with src_network + * \param[in] src_network The source network + * \param[in] dst_network The dst network to shared memory with src_network + */ +LITE_API int LITE_share_runtime_memroy(LiteNetwork src_network, + LiteNetwork dst_network); + +/** + * \brief enable profile the network, a JSON format file will be generated + * \param[in] network The loaded model + * \param[in] profile_json_file_path The profile result file path + */ +LITE_API int LITE_enable_profile_performance( + LiteNetwork network, const char* profile_json_file_path); + +/** + * \brief Dump input/output values of all internal variables to output file, + * in text format + * \param[in] network The loaded model + * \param[in] io_txt_out_file The dumped txt file name + */ +LITE_API int LITE_enable_io_txt_dump(LiteNetwork network, + const char* io_txt_out_file); + +/** + * \brief Dump input/output values of all internal variables to output + * directory, in binary format + * \param[in] network The loaded model + * \param[in] io_bin_out_dir The dumped bin file directory + */ +LITE_API int LITE_enable_io_bin_dump(LiteNetwork network, + const char* io_bin_out_dir); + +#ifdef __cplusplus +} +#endif +#endif +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/lite-c/include/lite-c/tensor_c.h b/lite/lite-c/include/lite-c/tensor_c.h new file mode 100644 index 0000000000000000000000000000000000000000..96316a9a9f526a1c138e4a7aea263ae197a74b83 --- /dev/null +++ b/lite/lite-c/include/lite-c/tensor_c.h @@ -0,0 +1,251 @@ +/** + * \file lite-c/include/lite-c/tensor_c.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#ifndef LITE_TENSOR_C_H_ +#define LITE_TENSOR_C_H_ + +#include "common_enum_c.h" +#include "macro.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#include "stddef.h" +#include "stdint.h" + +#define LAYOUT_MAX_DIM (7) + +/*! + * \brief the simple layout description + */ +typedef struct LiteLayout { + size_t shapes[LAYOUT_MAX_DIM]; + size_t ndim; + LiteDataType data_type; +} LiteLayout; + +//! define a default LiteLayout +extern LITE_API const LiteLayout default_layout; + +/*! + * \brief warpper of the MegEngine Tensor + * + * if is_pinned_host is set, the storage memory of the tensor is pinned memory, + * this is used to Optimize the H2D or D2H memory copy, if the device or layout + * is not set, when copy form other device(CUDA, OpenCL) tensor, this tensor + * will be automatically set to pinned tensor + */ +typedef struct LiteTensorDesc { + //! flag whether the storage of the tensor is pinned, this is only used when + //! the compnode is not in CPU + int is_pinned_host; + + //! the layout of the tensor + LiteLayout layout; + + //! the device of the tensor should not be changed after the tensor has + //! constructed + LiteDeviceType device_type; + + //! device id of the tensor + int device_id; +} LiteTensorDesc; + +//! define a default TensorDesc +extern LITE_API const LiteTensorDesc default_desc; + +/*! 
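+ * A tensor is usually described first and then created through
+ * LITE_make_tensor(); the sketch below assumes the LITE_FLOAT enumerator
+ * from common_enum_c.h:
+ * \code
+ *     LiteTensorDesc desc = default_desc;
+ *     desc.layout.ndim = 4;
+ *     desc.layout.shapes[0] = 1;
+ *     desc.layout.shapes[1] = 3;
+ *     desc.layout.shapes[2] = 224;
+ *     desc.layout.shapes[3] = 224;
+ *     desc.layout.data_type = LITE_FLOAT;
+ *     LiteTensor tensor;
+ *     LITE_make_tensor(desc, &tensor);
+ * \endcode
+ *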
+ * \brief The pointer to a Lite Tensor object + */ +typedef void* LiteTensor; + +/** + * \brief Create a lite tensor object from the given describe. + * \param[in] tensor_describe The description to create the Tensor + * \param[out] tensor The Tensor pointer + * \return int if the return is not zero, error happened, the error message + * can get by LITE_get_last_error + */ +LITE_API int LITE_make_tensor(const LiteTensorDesc tensor_describe, + LiteTensor* tensor); + +/** + * \brief Destroy a lite tensor object. + * \param[in] tensor The Tensor pointer + * \return int if the return is not zero, error happened, the error message + * can get by LITE_get_last_error + */ +LITE_API int LITE_destroy_tensor(LiteTensor tensor); + +/** + * \brief change the layout of a Tensor object. + * \param[in] tensor The Tensor + * \param[out] layout The Layout to be set to a tensor + */ +LITE_API int LITE_set_tensor_layout(LiteTensor tensor, const LiteLayout layout); + +/** + * \brief use the user allocated data to reset the memory of the tensor, the + * memory will not be managed by the lite, later, the user should delete + * it. + * \param[in] tensor The Tensor + * \param[in] prepared_data The allocated memory which satisfy the Tensor + * \param[in] data_length_in_byte The length of the allocated memory + * layout + */ +LITE_API int LITE_reset_tensor_memory(LiteTensor tensor, void* prepared_data, + size_t data_length_in_byte); + +/** + * \brief use the user allocated data and corresponding layout to reset the + * data and layout of the tensor, the memory will not be managed by lite, later, + * the user should delete it. + * \param[in] tensor The Tensor + * \param[in] layout The Layout to be set to the tensor + * \param[in] prepared_data The allocated memory which satisfy the layout to be + * set + */ +LITE_API int LITE_reset_tensor(LiteTensor tensor, const LiteLayout layout, + void* prepared_data); + +/** + * \brief reshape a tensor with the memroy not change, the total number of + * element in the reshaped tensor must equal to the origin tensor, the input + * shape must only contain one or zero -1 to flag it can be deduced + * automatically. 
+ * \param[in] tensor The Tensor to be reshape + * \param[in] shape the user input shape + * \param[in] size the number of data in shape, + */ +LITE_API int LITE_tensor_reshape(LiteTensor tensor, const int* shape, int size); + +/** + * \brief slice a tensor with input param + * \param[in] tensor The Tensor to be slice + * \param[in] start start index of every axis of to be sliced + * \param[in] end end index of every axis of to be sliced + * \param[in] step step of every axis of to be sliced, if nullptr, step will be + * 1 + * \param[in] size the number axis to be sliced + * \param[out] sliced_tensor the result tensor sliced from the origin tensor + */ +LITE_API int LITE_tensor_slice(const LiteTensor tensor, const size_t* start, + const size_t* end, const size_t* step, + size_t size, LiteTensor* slice_tensor); + +/** + * \brief fill zero to the tensor + * \param[in] tensor The Tensor to be memset + */ +LITE_API int LITE_tensor_fill_zero(LiteTensor tensor); + +/** + * \brief copy tensor form other tensor + * \param[out] dst_tensor The Tensor to copy into + * \param[in] src_tensor The Tensor to copy from + */ +LITE_API int LITE_tensor_copy(LiteTensor dst_tensor, + const LiteTensor src_tensor); + +/** + * \brief share memory form other tensor + * \param[out] dst_tensor The Tensor to share into + * \param[in] src_tensor The Tensor to be shared + */ +LITE_API int LITE_tensor_share_memory_with(LiteTensor dst_tensor, + const LiteTensor src_tensor); + +/** + * \brief get the memory pointer of a Tensor object. + * \param[in] tensor The input Tensor + * \param[out] data a pointer to void pointer + */ +LITE_API int LITE_get_tensor_memory(const LiteTensor tensor, void** data); + +/** + * \brief get the memory pointer of a Tensor object. + * \param[in] tensor The input Tensor + * \param[in] index The coordinate in the tensor + * \param[in] size The lenght of coordinate + * \param[out] data a pointer to void pointer + */ +LITE_API int LITE_get_tensor_memory_with_index(const LiteTensor tensor, + const size_t* index, size_t size, + void** data); + +/** + * \brief get the tensor capacity in byte of a Tensor object. + * \param[in] tensor The input Tensor + * \param[out] size_ptr a pointer to the return size + + */ +LITE_API int LITE_get_tensor_total_size_in_byte(const LiteTensor tensor, + size_t* size); + +/** + * \brief get the tensor layout of a Tensor object. + * \param[in] tensor The input Tensor + * \param[out] layout_ptr a pointer will be write with the layout of the tensor + */ +LITE_API int LITE_get_tensor_layout(const LiteTensor tensor, + LiteLayout* layout); + +/** + * \brief get the tensor device of a Tensor object. + * \param[in] tensor The input Tensor + * \param[out] device_ptr a pointer will be write with the device of the tensor + */ +LITE_API int LITE_get_tensor_device_type(const LiteTensor tensor, + LiteDeviceType* device_type); + +/** + * \brief get the tensor device id of a Tensor object. + * \param[in] tensor The input Tensor + * \param[out] device_id a pointer will be write with the device id of the + * tensor + */ +LITE_API int LITE_get_tensor_device_id(const LiteTensor tensor, int* device_id); + +/** + * \brief whether the tensor is is_pinned_host. + * \param[in] tensor The input Tensor + * \param[out] is_pinned_host_ptr a int pointer will be write with whether the + * tensor is pinned host + */ +LITE_API int LITE_is_pinned_host(const LiteTensor tensor, int* is_pinned_host); + +/** + * \brief whether the tensor memory is continue. 
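+ *
+ * Typical query pattern for an existing tensor (sketch):
+ * \code
+ *     int is_continue = 0;
+ *     LITE_is_memory_continue(tensor, &is_continue);
+ * \endcode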
+ * \param[in] tensor The input Tensor + * \param[out] is_continue a int pointer will be write with whether the + * tensor continue + */ +LITE_API int LITE_is_memory_continue(const LiteTensor tensor, int* is_continue); +/** + * \brief concat the inputs tensor to one big tensor + * \param[in] tensors ptr The input Tensors + * \param[in] nr_tensors number input Tensor + * \param[in] dim the dim concat act on + * \param[in] dst_device the device type of result tensor, when + * LITE_DEVICE_DEFAULT, the result tensor device type will get from the first + * tensor + * \param[in] device_id the device id of result tensor, when -1, the result + * tensor device id will get from the first tensor + * \param[out] result_tensor the result tensor after concat + */ +LITE_API int LITE_tensor_concat(LiteTensor* tensors, int nr_tensor, int dim, + LiteDeviceType dst_device, int device_id, + LiteTensor* result_tensor); + +#ifdef __cplusplus +} +#endif +#endif +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/lite-c/src/common.h b/lite/lite-c/src/common.h new file mode 100644 index 0000000000000000000000000000000000000000..47208c386fa12c08d55d6737fe62e92fc3f1d16f --- /dev/null +++ b/lite/lite-c/src/common.h @@ -0,0 +1,73 @@ +/** + * \file lite-c/src/common.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#ifndef LITE_C_COMMON_H_ +#define LITE_C_COMMON_H_ + +#include "../src/misc.h" +#include "lite-c/network_c.h" +#include "lite-c/tensor_c.h" +#include "lite/network.h" + +#include +#include + +//! convert c Layout to lite::Layout +lite::Layout convert_to_layout(const LiteLayout& layout); + +//! convert lite::Layout to C Layout +LiteLayout convert_to_clayout(const lite::Layout& layout); + +//! convert c config to lite::config +lite::Config convert_to_lite_config(const LiteConfig c_config); + +//! convert C NetworkIO io to lite::NetworkIO +lite::NetworkIO convert_to_lite_io(const LiteNetworkIO c_network_io); + +/*! + * \brief handle exception + * \param e the exception + * \return the return value of the error + */ +int LiteHandleException(const std::exception& e); +#if LITE_ENABLE_EXCEPTION +/*! \brief macro to guard a function */ +#define LITE_CAPI_BEGIN() try { +/*! \brief every function starts with LITE_CAPI_BEGIN(); + * ends with LITE_CAPI_END or LITE_CAPI_END_WITH_STMS + */ +#define LITE_CAPI_END() \ + } \ + catch (std::exception & _except_) { \ + return LiteHandleException(_except_); \ + } \ + return 0; +#else +/*! \brief macro to guard a function */ +#define LITE_CAPI_BEGIN() { +/*! \brief every function starts with LITE_CAPI_BEGIN(); + * ends with LITE_CAPI_END or LITE_CAPI_END_WITH_STMS + */ +#define LITE_CAPI_END() \ + } \ + return 0; +#endif +/*! + * \brief catch the exception with stms + */ +#define LITE_CAPI_END_WITH_STMS(_stms) \ + } \ + catch (std::exception & _except_) { \ + _stms; \ + return LiteHandleException(_except_); \ + } \ + return 0; + +#endif +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/lite-c/src/global.cpp b/lite/lite-c/src/global.cpp new file mode 100644 index 0000000000000000000000000000000000000000..51145b784a9d99663a5d5147bb18e2f89c010c2a --- /dev/null +++ b/lite/lite-c/src/global.cpp @@ -0,0 +1,192 @@ +/** + * \file lite-c/src/tensor.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. 
All rights reserved. + */ + +#include "lite/global.h" +#include "common.h" +#include "lite-c/global_c.h" + +#include +#include + +namespace { + +class ErrorMsg { +public: + std::string& get_error_msg() { return error_msg; } + void set_error_msg(const std::string& msg) { error_msg = msg; } + +private: + std::string error_msg; +}; +ErrorMsg& get_global_error() { + static thread_local ErrorMsg error_msg; + return error_msg; +} +} // namespace + +int LiteHandleException(const std::exception& e) { + get_global_error().set_error_msg(e.what()); + return -1; +} + +const char* LITE_get_last_error() { + return get_global_error().get_error_msg().c_str(); +} + +int LITE_get_version(int* major, int* minor, int* patch) { + LITE_ASSERT(major && minor && patch, "The ptr pass to LITE api is null"); + lite::get_version(*major, *minor, *patch); + return 0; +} + +int LITE_get_device_count(LiteDeviceType device_type, size_t* count) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(count, "The ptr pass to LITE api is null"); + *count = lite::get_device_count(device_type); + LITE_CAPI_END(); +} + +int LITE_try_coalesce_all_free_memory(){ + LITE_CAPI_BEGIN(); + lite::try_coalesce_all_free_memory(); + LITE_CAPI_END(); +} + +int LITE_register_decryption_and_key(const char* decrypt_name, + const LiteDecryptionFunc func, + const uint8_t* key_data, size_t key_size) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(decrypt_name && key_data && func, + "The ptr pass to LITE api is null"); + std::vector key; + for (size_t i = 0; i < key_size; i++) { + key.push_back(key_data[i]); + } + auto decrypt_func = [func](const void* input_data, size_t input_size, + const std::vector& key) { + auto size = + func(input_data, input_size, key.data(), key.size(), nullptr); + std::vector output(size, 0); + func(input_data, input_size, key.data(), key.size(), output.data()); + return output; + }; + lite::register_decryption_and_key(decrypt_name, decrypt_func, key); + LITE_CAPI_END(); +} + +int LITE_update_decryption_or_key(const char* decrypt_name, + const LiteDecryptionFunc func, + const uint8_t* key_data, size_t key_size) { + LITE_CAPI_BEGIN(); + std::vector key; + for (size_t i = 0; i < key_size; i++) { + key.push_back(key_data[i]); + } + lite::DecryptionFunc decrypt_func = nullptr; + if (func) { + decrypt_func = [func](const void* input_data, size_t input_size, + const std::vector& key) { + auto size = func(input_data, input_size, key.data(), key.size(), + nullptr); + std::vector output(size, 0); + func(input_data, input_size, key.data(), key.size(), output.data()); + return output; + }; + } + lite::update_decryption_or_key(decrypt_name, decrypt_func, key); + LITE_CAPI_END(); +} + +int LITE_register_parse_info_func(const char* info_type, + const LiteParseInfoFunc parse_func) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(info_type && parse_func, "The ptr pass to LITE api is null"); + auto lite_func = [parse_func]( + const void* info_data, size_t info_size, + const std::string model_name, lite::Config& config, + lite::NetworkIO& network_io, + std::unordered_map& + separate_config_map, + std::string& extra_info) { + LITE_MARK_USED_VAR(extra_info); + size_t nr_threads = 1; + int device_id = 0, is_cpu_inplace_mode = false, use_tensorrt = false; + LiteNetworkIO c_io; + LiteConfig c_config; + auto ret = parse_func(info_data, info_size, model_name.c_str(), + &c_config, &c_io, &device_id, &nr_threads, + &is_cpu_inplace_mode, &use_tensorrt); + config = convert_to_lite_config(c_config); + network_io = convert_to_lite_io(c_io); + if (device_id != 0) { + 
separate_config_map["device_id"] = device_id; + } + if (nr_threads != 1) { + separate_config_map["nr_threads"] = nr_threads; + } + if (is_cpu_inplace_mode != false) { + separate_config_map["is_inplace_mode"] = is_cpu_inplace_mode; + } + if (use_tensorrt != false) { + separate_config_map["use_tensorrt"] = use_tensorrt; + } + return ret; + }; + lite::register_parse_info_func(info_type, lite_func); + LITE_CAPI_END(); +} + +int LITE_set_loader_lib_path(const char* loader_path) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(loader_path, "The ptr pass to LITE api is null"); + lite::set_loader_lib_path(loader_path); + LITE_CAPI_END(); +} + +int LITE_set_persistent_cache(const char* cache_path, int always_sync) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(cache_path, "The ptr pass to LITE api is null"); + lite::set_persistent_cache(cache_path, always_sync); + LITE_CAPI_END(); +} + +int LITE_set_tensor_rt_cache(const char* cache_path) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(cache_path, "The ptr pass to LITE api is null"); + lite::set_tensor_rt_cache(cache_path); + LITE_CAPI_END(); +} + +int LITE_set_log_level(LiteLogLevel level) { + LITE_CAPI_BEGIN(); + lite::set_log_level(level); + LITE_CAPI_END(); +} + +int LITE_get_log_level(LiteLogLevel* level) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(level, "The ptr pass to LITE api is null"); + *level = lite::get_log_level(); + LITE_CAPI_END(); +} + +int LITE_dump_persistent_cache(const char* cache_path) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(cache_path, "The ptr pass to LITE api is null"); + lite::dump_persistent_cache(cache_path); + LITE_CAPI_END(); +} + +int LITE_dump_tensor_rt_cache() { + LITE_CAPI_BEGIN(); + lite::dump_tensor_rt_cache(); + LITE_CAPI_END(); +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/lite-c/src/network.cpp b/lite/lite-c/src/network.cpp new file mode 100644 index 0000000000000000000000000000000000000000..409924287909058d399701b360377a4ab3ea2e23 --- /dev/null +++ b/lite/lite-c/src/network.cpp @@ -0,0 +1,580 @@ +/** + * \file lite-c/src/network.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "lite/network.h" +#include "common.h" +#include "lite-c/network_c.h" + +#include "../../src/network_impl_base.h" + +#include +#include +#include +#include + +//! define a default Options +const LiteOptions default_option = { + .weight_preprocess = false, + .fuse_preprocess = false, + .fake_next_exec = false, + .var_sanity_check_first_run = true, + .const_shape = false, + .force_dynamic_alloc = false, + .force_output_dynamic_alloc = false, + .no_profiling_on_shape_change = false, + .jit_level = 0, + .comp_node_seq_record_level = 0, + .graph_opt_level = 2, + .async_exec_level = 1, + //! layout transform options + .enable_nchw44 = 0, + .enable_nchw44_dot = 0, + .enable_nchw88 = 0, + .enable_nhwcd4 = 0, + .enable_nchw4 = 0, + .enable_nchw32 = 0, + .enable_nchw64 = 0, + +}; + +//! define a default config +LiteConfig default_config_t = {.has_compression = false, + .device_id = -1, + .device_type = LiteDeviceType::LITE_CPU, + .backend = LiteBackend::LITE_DEFAULT, + .bare_model_cryption_name = nullptr, + .options = default_option}; +LiteConfig* default_config() { + return &default_config_t; +} + +//! define a default IO +const LiteIO default_io = {.name = nullptr, + .is_host = true, + .io_type = LiteIOType::LITE_IO_VALUE, + .config_layout = default_layout}; + +//! 
define a default NetworkIO +LiteNetworkIO default_network_io_t = {.inputs = nullptr, + .outputs = nullptr, + .input_size = 0, + .output_size = 0}; +LiteNetworkIO* default_network_io() { + return &default_network_io_t; +} + +namespace { +std::unordered_map>& +get_gloabl_network_holder() { + static thread_local std::unordered_map> + network_holder; + return network_holder; +} + +/*! + * \brief A user-implemented allocator interface + */ +class UserAllocator : public lite::Allocator { +public: + UserAllocator(LiteAllocate allocate_func, LiteFree free_func) + : m_allocator(allocate_func), m_free(free_func) { + LITE_ASSERT(m_allocator && m_free); + } + + //! allocate memory of size in the given device with the given align + void* allocate(LiteDeviceType device_type, int device_id, size_t size, + size_t align) override { + return m_allocator(device_type, device_id, size, align); + } + + //! free the memory pointed by ptr in the given device + void free(LiteDeviceType device_type, int device_id, void* ptr) override { + m_free(device_type, device_id, ptr); + } + +private: + LiteAllocate m_allocator; + LiteFree m_free; +}; +} // namespace + +//! convert c config to lite::config +lite::Config convert_to_lite_config(const LiteConfig c_config) { + lite::Config lite_config; + lite_config.device_type = c_config.device_type; + if (c_config.bare_model_cryption_name) { + lite_config.bare_model_cryption_name = + c_config.bare_model_cryption_name; + } + lite_config.backend = c_config.backend; + lite_config.has_compression = c_config.has_compression; + lite_config.device_id = c_config.device_id; + + lite_config.options.weight_preprocess = c_config.options.weight_preprocess; + lite_config.options.fuse_preprocess = c_config.options.fuse_preprocess; + lite_config.options.fake_next_exec = c_config.options.fake_next_exec; + lite_config.options.var_sanity_check_first_run = + c_config.options.var_sanity_check_first_run; + lite_config.options.const_shape = c_config.options.const_shape; + lite_config.options.force_dynamic_alloc = c_config.options.const_shape; + lite_config.options.force_output_dynamic_alloc = + c_config.options.force_output_dynamic_alloc; + lite_config.options.no_profiling_on_shape_change = + c_config.options.no_profiling_on_shape_change; + lite_config.options.jit_level = c_config.options.jit_level; + lite_config.options.comp_node_seq_record_level = + c_config.options.comp_node_seq_record_level; + lite_config.options.graph_opt_level = c_config.options.graph_opt_level; + lite_config.options.async_exec_level = c_config.options.async_exec_level; + + lite_config.options.enable_nchw44 = c_config.options.enable_nchw44; + lite_config.options.enable_nchw44_dot = c_config.options.enable_nchw44_dot; + lite_config.options.enable_nchw88 = c_config.options.enable_nchw88; + lite_config.options.enable_nchw4 = c_config.options.enable_nchw4; + lite_config.options.enable_nhwcd4 = c_config.options.enable_nhwcd4; + lite_config.options.enable_nchw32 = c_config.options.enable_nchw32; + lite_config.options.enable_nchw64 = c_config.options.enable_nchw64; + + return lite_config; +} + +//! 
convert C NetworkIO io to lite::NetworkIO +lite::NetworkIO convert_to_lite_io(const LiteNetworkIO c_network_io) { + lite::NetworkIO network_io; + for (size_t i = 0; i < c_network_io.input_size; i++) { + LiteIO* c_io = c_network_io.inputs + i; + LITE_ASSERT(c_io->name, "input name of io tensor must set."); + network_io.inputs.push_back( + {c_io->name, static_cast(c_io->is_host), c_io->io_type, + convert_to_layout(c_io->config_layout)}); + } + for (size_t i = 0; i < c_network_io.output_size; i++) { + LiteIO* c_io = c_network_io.outputs + i; + LITE_ASSERT(c_io->name, "output name of io tensor must set."); + network_io.outputs.push_back( + {c_io->name, static_cast(c_io->is_host), c_io->io_type, + convert_to_layout(c_io->config_layout)}); + } + return network_io; +} + +int LITE_make_default_network(LiteNetwork* network) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + auto lite_network = std::make_shared(); + get_gloabl_network_holder()[lite_network.get()] = lite_network; + *network = lite_network.get(); + LITE_CAPI_END(); +} + +int LITE_make_network(LiteNetwork* network, const LiteConfig config, + const LiteNetworkIO network_io) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + auto lite_network = std::make_shared( + convert_to_lite_config(config), convert_to_lite_io(network_io)); + get_gloabl_network_holder()[lite_network.get()] = lite_network; + *network = lite_network.get(); + LITE_CAPI_END(); +} + +int LITE_make_network_config(LiteNetwork* network, const LiteConfig config) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + auto lite_network = + std::make_shared(convert_to_lite_config(config)); + get_gloabl_network_holder()[lite_network.get()] = lite_network; + *network = lite_network.get(); + LITE_CAPI_END(); +} + +int LITE_load_model_from_mem(LiteNetwork network, void* model_mem, + size_t size) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + LITE_ASSERT(model_mem, "The model memory pass to LITE api is null"); + static_cast(network)->load_model(model_mem, size); + LITE_CAPI_END(); +} + +int LITE_load_model_from_path(LiteNetwork network, const char* model_path) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + LITE_ASSERT(model_path, "The model path pass to LITE api is null"); + static_cast(network)->load_model(model_path); + LITE_CAPI_END(); +} + +int LITE_destroy_network(LiteNetwork network) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + get_gloabl_network_holder().erase(network); + LITE_CAPI_END(); +} + +int LITE_forward(const LiteNetwork network) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + static_cast(network)->forward(); + LITE_CAPI_END(); +} + +int LITE_wait(const LiteNetwork network) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + static_cast(network)->wait(); + LITE_CAPI_END(); +} + +int LITE_get_io_tensor(LiteNetwork network, const char* io_name, + LiteTensorPhase phase, LiteTensor* tensor) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + auto io_tensor = + static_cast(network)->get_io_tensor(io_name, phase); + *tensor = io_tensor.get(); + LITE_CAPI_END(); +} + +int LITE_get_input_name(const LiteNetwork network, size_t index, + const char** name) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network && name, "The network pass to 
LITE api is null"); + *name = lite::NetworkHelper::implement(static_cast(network)) + ->get_input_name(index); + LITE_CAPI_END(); +} + +int LITE_get_output_name(const LiteNetwork network, size_t index, + const char** name) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + LITE_ASSERT(name, "The name ptr pass to LITE api is null"); + *name = lite::NetworkHelper::implement(static_cast(network)) + ->get_output_name(index); + LITE_CAPI_END(); +} + +int LITE_get_all_input_name(const LiteNetwork network, size_t* size, + const char** name) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + auto&& names = + lite::NetworkHelper::implement(static_cast(network)) + ->get_all_input_name(); + if (size) + *size = names.size(); + if (name) { + for (auto in_name : names) { + *name = in_name; + name++; + } + } + LITE_CAPI_END(); +} + +int LITE_get_all_output_name(const LiteNetwork network, size_t* size, + const char** name) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + auto&& names = + lite::NetworkHelper::implement(static_cast(network)) + ->get_all_output_name(); + if (size) + *size = names.size(); + if (name) { + for (auto in_name : names) { + *name = in_name; + name++; + } + } + LITE_CAPI_END(); +} + +int LITE_set_device_id(LiteNetwork network, int device_id) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + static_cast(network)->set_device_id(device_id); + LITE_CAPI_END(); +} + +int LITE_get_device_id(const LiteNetwork network, int* device_id) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + LITE_ASSERT(device_id, "The device_id pass to LITE api is null"); + *device_id = static_cast(network)->get_device_id(); + LITE_CAPI_END(); +} + +int LITE_set_stream_id(LiteNetwork network, int stream_id) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + static_cast(network)->set_stream_id(stream_id); + LITE_CAPI_END(); +} + +int LITE_get_stream_id(const LiteNetwork network, int* stream_id) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + LITE_ASSERT(stream_id, "The stream_id pass to LITE api is null"); + *stream_id = static_cast(network)->get_stream_id(); + LITE_CAPI_END(); +} + +int LITE_get_model_extra_info(const LiteNetwork network, const char** info, + int* info_size) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + LITE_ASSERT(info_size, "The info and info_size are all null"); + auto& extra_info = + static_cast(network)->get_model_extra_info(); + *info_size = extra_info.size(); + *info = extra_info.c_str(); + LITE_MARK_USED_VAR(info); + LITE_CAPI_END(); +} + +int LITE_get_device_type(const LiteNetwork network, + LiteDeviceType* device_type) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + LITE_ASSERT(device_type, "The device_type pass to LITE api is null"); + *device_type = static_cast(network)->get_device_type(); + LITE_CAPI_END(); +} + +int LITE_set_async_callback(LiteNetwork network, + const LiteAsyncCallback async_callback) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + LITE_ASSERT(async_callback, "The ptr pass to LITE api is null"); + static_cast(network)->set_async_callback( + std::move(async_callback)); + LITE_CAPI_END(); +} + +int LITE_set_start_callback(LiteNetwork network, + const LiteStartCallback 
start_callback) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + auto lite_start_callback = + [start_callback]( + const std::unordered_map< + std::string, + std::pair>>& + inputs_map) -> void { + std::vector ios; + std::vector io_tensors; + size_t nr_io = 0; + for (const auto& io : inputs_map) { + nr_io++; + auto&& lite_io = io.second.first; + ios.push_back({lite_io.name.c_str(), lite_io.is_host, + lite_io.io_type, + convert_to_clayout(lite_io.config_layout)}); + io_tensors.push_back(io.second.second.get()); + } + start_callback(ios.data(), io_tensors.data(), nr_io); + }; + static_cast(network)->set_start_callback( + lite_start_callback); + LITE_CAPI_END(); +} + +int LITE_set_finish_callback(LiteNetwork network, + const LiteFinishCallback finish_callback) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + auto lite_finish_callback = + [finish_callback]( + const std::unordered_map< + std::string, + std::pair>>& + outputs_map) -> void { + std::vector ios; + std::vector io_tensors; + size_t nr_io = 0; + for (const auto& io : outputs_map) { + nr_io++; + auto&& lite_io = io.second.first; + ios.push_back({lite_io.name.c_str(), lite_io.is_host, + lite_io.io_type, + convert_to_clayout(lite_io.config_layout)}); + io_tensors.push_back(io.second.second.get()); + } + finish_callback(ios.data(), io_tensors.data(), nr_io); + }; + static_cast(network)->set_finish_callback( + lite_finish_callback); + LITE_CAPI_END(); +} + +int LITE_enable_profile_performance(LiteNetwork network, + const char* profile_json_file_path) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + static_cast(network)->enable_profile_performance( + profile_json_file_path); + LITE_CAPI_END(); +} + +int LITE_is_cpu_inplace_mode(const LiteNetwork network, + int* is_cpu_inplace_mode) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network && is_cpu_inplace_mode, + "The network pass to LITE api is null"); + std::shared_ptr network_shared{ + static_cast(network), [](void*) {}}; + *is_cpu_inplace_mode = lite::Runtime::is_cpu_inplace_mode(network_shared); + LITE_CAPI_END(); +} + +int LITE_get_cpu_threads_number(const LiteNetwork network, size_t* nr_threads) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + LITE_ASSERT(nr_threads, "The ptr pass to LITE api is null"); + std::shared_ptr network_shared{ + static_cast(network), [](void*) {}}; + *nr_threads = lite::Runtime::get_cpu_threads_number(network_shared); + LITE_CAPI_END(); +} + +int LITE_set_cpu_inplace_mode(LiteNetwork network) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + std::shared_ptr network_shared{ + static_cast(network), [](void*) {}}; + lite::Runtime::set_cpu_inplace_mode(network_shared); + LITE_CAPI_END(); +} + +int LITE_use_tensorrt(LiteNetwork network){ + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + std::shared_ptr network_shared{ + static_cast(network), [](void*) {}}; + lite::Runtime::use_tensorrt(network_shared); + LITE_CAPI_END(); +} + +int LITE_set_cpu_threads_number(LiteNetwork network, size_t nr_threads) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + std::shared_ptr network_shared{ + static_cast(network), [](void*) {}}; + lite::Runtime::set_cpu_threads_number(network_shared, nr_threads); + LITE_CAPI_END(); +} + +int LITE_set_network_algo_policy(LiteNetwork network, + LiteAlgoSelectStrategy strategy) { + 
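+    // Guarded C wrapper: the raw handle is wrapped in a non-owning
+    // std::shared_ptr (empty deleter) so it can be forwarded to the C++
+    // helper lite::Runtime::set_network_algo_policy below.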
LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + std::shared_ptr network_shared{ + static_cast(network), [](void*) {}}; + lite::Runtime::set_network_algo_policy(network_shared, strategy); + LITE_CAPI_END(); +} + +int LITE_set_network_algo_fastrun_config(LiteNetwork network, + unsigned int shared_batch_size, + int binary_equal_between_batch) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + std::shared_ptr network_shared{ + static_cast(network), [](void*) {}}; + lite::Runtime::set_network_algo_policy( + network_shared, LiteAlgoSelectStrategy(0), shared_batch_size, + binary_equal_between_batch); + LITE_CAPI_END(); +} + +int LITE_set_network_algo_workspace_limit(LiteNetwork network, + size_t workspace_limit) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + std::shared_ptr network_shared{ + static_cast(network), [](void*) {}}; + lite::Runtime::set_network_algo_workspace_limit(network_shared, + workspace_limit); + LITE_CAPI_END(); +} + +int LITE_set_runtime_thread_affinity( + LiteNetwork network, + const LiteThreadAffinityCallback thread_affinity_callback) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + std::shared_ptr network_shared{ + static_cast(network), [](void*) {}}; + lite::Runtime::set_runtime_thread_affinity( + network_shared, std::move(thread_affinity_callback)); + LITE_CAPI_END(); +} + +int LITE_set_memory_allocator(LiteNetwork network, + const LiteAllocate allocate_fun, + const LiteFree free_fun) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network && allocate_fun && free_fun, + "The ptr pass to LITE api is null"); + std::shared_ptr network_shared{ + static_cast(network), [](void*) {}}; + lite::Runtime::set_memory_allocator( + network_shared, + std::make_shared(allocate_fun, free_fun)); + LITE_CAPI_END(); +} + +int LITE_enable_io_txt_dump(LiteNetwork network, const char* io_txt_out_file) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + std::shared_ptr network_shared{ + static_cast(network), [](void*) {}}; + lite::Runtime::enable_io_txt_dump(network_shared, io_txt_out_file); + LITE_CAPI_END(); +} + +int LITE_enable_io_bin_dump(LiteNetwork network, const char* io_bin_out_dir) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + std::shared_ptr network_shared{ + static_cast(network), [](void*) {}}; + lite::Runtime::enable_io_bin_dump(network_shared, io_bin_out_dir); + LITE_CAPI_END(); +} + +int LITE_shared_weight_with_network(LiteNetwork dst_network, + const LiteNetwork src_network) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(dst_network && src_network, + "The network pass to LITE api is null"); + const std::shared_ptr src_shared_net{ + static_cast(src_network), [](void*) {}}; + std::shared_ptr dst_shared_net{ + static_cast(dst_network), [](void*) {}}; + lite::Runtime::shared_weight_with_network(dst_shared_net, src_shared_net); + LITE_CAPI_END(); +} + +int LITE_share_runtime_memroy(LiteNetwork dst_network, + LiteNetwork src_network) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(src_network && dst_network, + "The network pass to LITE api is null"); + std::shared_ptr src_shared{ + static_cast(src_network), [](void*) {}}; + std::shared_ptr dst_shared{ + static_cast(dst_network), [](void*) {}}; + lite::Runtime::share_runtime_memory_with(dst_shared, src_shared); + LITE_CAPI_END(); +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git 
a/lite/lite-c/src/tensor.cpp b/lite/lite-c/src/tensor.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6ed8499398b8f1cbdebe252d33690401402201e2 --- /dev/null +++ b/lite/lite-c/src/tensor.cpp @@ -0,0 +1,257 @@ +/** + * \file lite-c/src/tensor.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "lite/tensor.h" +#include "../../src/tensor_impl_base.h" +#include "common.h" +#include "lite-c/tensor_c.h" +#include +#include +#include + +const LiteLayout default_layout = {.shapes = {0, 0, 0, 0, 0}, + .ndim = 0, + .data_type = LiteDataType::LITE_FLOAT}; + +const LiteTensorDesc default_desc = {.is_pinned_host = false, + .layout = default_layout, + .device_type = LiteDeviceType::LITE_CPU, + .device_id = 0}; +namespace { +std::unordered_map>& +get_global_tensor_holder() { + static thread_local std::unordered_map> + global_holder; + return global_holder; +} +std::unordered_map& +get_global_tensor_attr_holder() { + static thread_local std::unordered_map + global_holder; + return global_holder; +} +} // namespace + +//! convert the lite::Layout to Layout +LiteLayout convert_to_clayout(const lite::Layout& layout) { + LiteLayout clayout; + clayout.ndim = layout.ndim; + LITE_ASSERT(layout.ndim < LAYOUT_MAX_DIM, "layout ndim is to large"); + for (size_t i = 0; i < layout.ndim; i++) { + clayout.shapes[i] = layout.shapes[i]; + } + clayout.data_type = layout.data_type; + return clayout; +} + +//! convert the C Layout to lite::Layout +lite::Layout convert_to_layout(const LiteLayout& clayout) { + lite::Layout layout; + layout.ndim = clayout.ndim; + LITE_ASSERT(layout.ndim < LAYOUT_MAX_DIM, "clayout ndim is to large"); + for (size_t i = 0; i < layout.ndim; i++) { + layout.shapes[i] = clayout.shapes[i]; + } + layout.data_type = clayout.data_type; + return layout; +} + +int LITE_make_tensor(const LiteTensorDesc tensor_describe, LiteTensor* tensor) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor, "The tensor pass to LITE_make_tensor is null"); + lite::Layout layout = convert_to_layout(tensor_describe.layout); + auto lite_tensor = std::make_shared( + tensor_describe.device_id, tensor_describe.device_type, layout, + tensor_describe.is_pinned_host); + get_global_tensor_holder()[lite_tensor.get()] = lite_tensor; + *tensor = lite_tensor.get(); + LITE_CAPI_END(); +} + +int LITE_destroy_tensor(LiteTensor tensor) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); + get_global_tensor_holder().erase(tensor); + LITE_CAPI_END(); +} + +int LITE_set_tensor_layout(LiteTensor tensor, const LiteLayout layout) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); + auto tensor_ptr = static_cast(tensor); + tensor_ptr->set_layout(convert_to_layout(layout)); + LITE_CAPI_END(); +} + +int LITE_reset_tensor_memory(LiteTensor tensor, void* prepared_data, + size_t data_length_in_byte) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); + LITE_ASSERT(prepared_data, "The prepared_data pass to LITE c_api is null"); + static_cast(tensor)->reset(prepared_data, + data_length_in_byte); + LITE_CAPI_END(); +} + +int LITE_reset_tensor(LiteTensor tensor, const LiteLayout layout, + void* prepared_data) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); + LITE_ASSERT(prepared_data, "The prepared_data pass to LITE c_api is null"); + 
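+    // Hand the user buffer and its layout to lite::Tensor::reset(); lite does
+    // not take ownership, the caller keeps managing this memory.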
static_cast(tensor)->reset(prepared_data, + convert_to_layout(layout)); + LITE_CAPI_END(); +} + +int LITE_tensor_reshape(LiteTensor tensor, const int* shape, int size) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor && shape, "The tensor pass to LITE c_api is null"); + std::vector shapes; + for (int i = 0; i < size; i++) { + shapes.push_back(shape[i]); + } + static_cast(tensor)->reshape(shapes); + LITE_CAPI_END(); +} + +int LITE_tensor_slice(const LiteTensor tensor, const size_t* start, + const size_t* end, const size_t* step, size_t size, + LiteTensor* slice_tensor) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor && start && end && slice_tensor, + "The tensor pass to LITE c_api is null"); + std::vector starts, ends, steps; + for (size_t i = 0; i < size; i++) { + starts.push_back(start[i]); + ends.push_back(end[i]); + if (step) { + steps.push_back(step[i]); + } + } + auto ret_tensor = + static_cast(tensor)->slice(starts, ends, steps); + get_global_tensor_holder()[ret_tensor.get()] = ret_tensor; + *slice_tensor = ret_tensor.get(); + LITE_CAPI_END(); +} + +int LITE_tensor_fill_zero(LiteTensor tensor) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); + static_cast(tensor)->fill_zero(); + LITE_CAPI_END(); +} + +int LITE_tensor_copy(LiteTensor dst_tensor, const LiteTensor src_tensor) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(dst_tensor && src_tensor, + "The tensor pass to LITE c_api is null"); + static_cast(dst_tensor) + ->copy_from(*static_cast(src_tensor)); + LITE_CAPI_END(); +} + +int LITE_tensor_share_memory_with(LiteTensor dst_tensor, + const LiteTensor src_tensor) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(dst_tensor && src_tensor, + "The tensor pass to LITE c_api is null"); + static_cast(dst_tensor) + ->share_memory_with(*static_cast(src_tensor)); + LITE_CAPI_END(); +} + +int LITE_get_tensor_memory(const LiteTensor tensor, void** data) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); + LITE_ASSERT(data, "The data ptr pass to LITE c_api is null"); + *data = static_cast(tensor)->get_memory_ptr(); + LITE_CAPI_END(); +} + +int LITE_get_tensor_memory_with_index(const LiteTensor tensor, + const size_t* index, size_t size, + void** data) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor && index && data, + "The tensor pass to LITE c_api is null"); + std::vector index_v; + for (size_t i = 0; i < size; i++) { + index_v.push_back(index[i]); + } + *data = static_cast(tensor)->get_memory_ptr(index_v); + LITE_CAPI_END(); +} + +int LITE_get_tensor_total_size_in_byte(const LiteTensor tensor, size_t* size) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); + LITE_ASSERT(size, "The size ptr pass to LITE c_api is null"); + *size = static_cast(tensor)->get_tensor_total_size_in_byte(); + LITE_CAPI_END(); +} + +int LITE_get_tensor_layout(const LiteTensor tensor, LiteLayout* layout) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); + LITE_ASSERT(layout, "The layout ptr pass to LITE c_api is null"); + *layout = convert_to_clayout( + static_cast(tensor)->get_layout()); + LITE_CAPI_END(); +} + +int LITE_get_tensor_device_type(const LiteTensor tensor, + LiteDeviceType* device_type) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); + LITE_ASSERT(device_type, "The device ptr pass to LITE c_api is null"); + *device_type = static_cast(tensor)->get_device_type(); + LITE_CAPI_END(); +} + +int LITE_get_tensor_device_id(const LiteTensor tensor, int* device_id) { + 
LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor && device_id, "The tensor pass to LITE c_api is null"); + *device_id = static_cast(tensor)->get_device_id(); + LITE_CAPI_END(); +} + +int LITE_is_pinned_host(const LiteTensor tensor, int* is_pinned_host) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); + LITE_ASSERT(is_pinned_host, + "The is_pinned_host ptr pass to LITE c_api is null"); + *is_pinned_host = static_cast(tensor)->is_pinned_host(); + LITE_CAPI_END(); +} + +int LITE_is_memory_continue(const LiteTensor tensor, int* is_continue) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); + LITE_ASSERT(is_continue, "The is_continue ptr pass to LITE c_api is null"); + *is_continue = static_cast(tensor)->is_continue_memory(); + LITE_CAPI_END(); +} + +int LITE_tensor_concat(LiteTensor* tensors, int nr_tensor, int dim, + LiteDeviceType dst_device, int device_id, + LiteTensor* result_tensor) { + LITE_CAPI_BEGIN(); + std::vector v_tensors; + for (int i = 0; i < nr_tensor; i++) { + v_tensors.push_back(*static_cast(tensors[i])); + } + auto tensor = + lite::TensorUtils::concat(v_tensors, dim, dst_device, device_id); + get_global_tensor_holder()[tensor.get()] = tensor; + *result_tensor = tensor.get(); + LITE_CAPI_END() +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/pylite/megenginelite/__init__.py b/lite/pylite/megenginelite/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c9e52af7b0737de25971d23de8b5e7c8a46c18c1 --- /dev/null +++ b/lite/pylite/megenginelite/__init__.py @@ -0,0 +1,12 @@ +# -*- coding: utf-8 -*- +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + +from .base import * +from .global_setting import * +from .network import * +from .struct import * +from .tensor import * +from .utils import * diff --git a/lite/pylite/megenginelite/base.py b/lite/pylite/megenginelite/base.py new file mode 100644 index 0000000000000000000000000000000000000000..f29718b464e38d11acb52a9359967b6ceea1e094 --- /dev/null +++ b/lite/pylite/megenginelite/base.py @@ -0,0 +1,152 @@ +# -*- coding: utf-8 -*- +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
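+# This module locates and loads the lite shared library through ctypes and
+# registers the C API signatures used by the other megenginelite modules.
+# Typical use (a sketch only; the packaged libs/liblite* library, or a path
+# given via the LITE_LIB_PATH environment variable, is assumed to exist):
+#   from megenginelite import version, set_log_level
+#   print(version)       # e.g. "1.0.0", queried through LITE_get_version
+#   set_log_level(2)     # 2 maps to LiteLogLevel.WARN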
+ +import ctypes +import glob +import logging +import os +import sys +from ctypes import * + +if sys.platform == "win32": + lib_path = os.path.join(os.path.dirname(__file__), "libs") + dll_paths = list(filter(os.path.exists, [lib_path,])) + assert len(dll_paths) > 0 + + kernel32 = ctypes.WinDLL("kernel32.dll", use_last_error=True) + has_load_library_attr = hasattr(kernel32, "AddDllDirectory") + old_error_mode = kernel32.SetErrorMode(0x0001) + + kernel32.LoadLibraryW.restype = ctypes.c_void_p + if has_load_library_attr: + kernel32.AddDllDirectory.restype = ctypes.c_void_p + kernel32.LoadLibraryExW.restype = ctypes.c_void_p + + for dll_path in dll_paths: + if sys.version_info >= (3, 8): + os.add_dll_directory(dll_path) + elif has_load_library_attr: + res = kernel32.AddDllDirectory(dll_path) + if res is None: + err = ctypes.WinError(ctypes.get_last_error()) + err.strerror += ' Error adding "{}" to the DLL search PATH.'.format( + dll_path + ) + raise err + else: + print("WARN: python or OS env have some issue, may load DLL failed!!!") + + import glob + + dlls = glob.glob(os.path.join(lib_path, "*.dll")) + path_patched = False + for dll in dlls: + is_loaded = False + if has_load_library_attr: + res = kernel32.LoadLibraryExW(dll, None, 0x00001100) + last_error = ctypes.get_last_error() + if res is None and last_error != 126: + err = ctypes.WinError(last_error) + err.strerror += ' Error loading "{}" or one of its dependencies.'.format( + dll + ) + raise err + elif res is not None: + is_loaded = True + if not is_loaded: + if not path_patched: + os.environ["PATH"] = ";".join(dll_paths + [os.environ["PATH"]]) + path_patched = True + res = kernel32.LoadLibraryW(dll) + if res is None: + err = ctypes.WinError(ctypes.get_last_error()) + err.strerror += ' Error loading "{}" or one of its dependencies.'.format( + dll + ) + raise err + + kernel32.SetErrorMode(old_error_mode) + + +class _LiteCLib: + def __init__(self): + cwd = os.getcwd() + package_dir = os.path.dirname(os.path.realpath(__file__)) + debug_path = os.getenv("LITE_LIB_PATH") + os.chdir(package_dir) + lite_libs = glob.glob("libs/liblite*") + os.chdir(cwd) + + if debug_path is None: + assert len(lite_libs) == 1 + self._lib = CDLL(os.path.join(package_dir, lite_libs[0])) + else: + self._lib = CDLL(debug_path) + self._register_api( + "LITE_get_version", [POINTER(c_int), POINTER(c_int), POINTER(c_int)] + ) + self.lib.LITE_get_version.restype = None + self._register_api("LITE_set_log_level", [c_int]) + self._register_api("LITE_get_log_level", []) + self._register_api("LITE_get_last_error", [], False) + self.lib.LITE_get_last_error.restype = c_char_p + + def _errcheck(self, result, func, args): + if result: + error = self.lib.LITE_get_last_error() + msg = error.decode("utf-8") + logging.error("{}".format(msg)) + raise RuntimeError("{}".format(msg)) + return result + + def _register_api(self, api_name, arg_types, error_check=True): + func = getattr(self.lib, api_name) + func.argtypes = arg_types + func.restype = c_int + if error_check: + func.errcheck = self._errcheck + + @property + def lib(self): + return self._lib + + @property + def version(self): + major = c_int() + minor = c_int() + patch = c_int() + self.lib.LITE_get_version(byref(major), byref(minor), byref(patch)) + return "{}.{}.{}".format(major.value, minor.value, patch.value) + + def set_log_level(self, level): + self.lib.LITE_set_log_level(level) + + def get_log_level(self): + return self.lib.LITE_get_log_level() + + +_lib = _LiteCLib() +version = _lib.version +set_log_level = 
_lib.set_log_level +get_log_level = _lib.get_log_level + +_Cnetwork = c_void_p +_Ctensor = c_void_p + + +class _LiteCObjMetaClass(type): + """metaclass for lite object""" + + def __new__(cls, name, bases, attrs): + for api in attrs["_api_"]: + _lib._register_api(*api) + del attrs["_api_"] + attrs["_lib"] = _lib.lib + return super().__new__(cls, name, bases, attrs) + + +class _LiteCObjBase(metaclass=_LiteCObjMetaClass): + _api_ = [] diff --git a/lite/pylite/megenginelite/global_setting.py b/lite/pylite/megenginelite/global_setting.py new file mode 100644 index 0000000000000000000000000000000000000000..fe04b3014f53c4af758cd761afc6685d10e4c4b6 --- /dev/null +++ b/lite/pylite/megenginelite/global_setting.py @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + +from ctypes import * + +import numpy as np + +from .base import _Ctensor, _lib, _LiteCObjBase +from .network import * +from .struct import LiteDataType, LiteDeviceType, LiteIOType, Structure +from .tensor import * + +LiteDecryptionFunc = CFUNCTYPE( + c_size_t, c_void_p, c_size_t, POINTER(c_uint8), c_size_t, c_void_p +) + + +class _GlobalAPI(_LiteCObjBase): + """ + get the api from the lib + """ + + _api_ = [ + ("LITE_get_device_count", [c_int, POINTER(c_size_t)]), + ("LITE_try_coalesce_all_free_memory", []), + ( + "LITE_register_decryption_and_key", + [c_char_p, LiteDecryptionFunc, POINTER(c_uint8), c_size_t], + ), + ( + "LITE_update_decryption_or_key", + [c_char_p, c_void_p, POINTER(c_uint8), c_size_t], + ), + ("LITE_set_loader_lib_path", [c_char_p]), + ("LITE_set_persistent_cache", [c_char_p, c_int]), + # ('LITE_set_tensor_rt_cache', [c_char_p]), + ("LITE_dump_persistent_cache", [c_char_p]), + ("LITE_dump_tensor_rt_cache", [c_char_p]), + ] + + +def decryption_func(func): + """the decryption function decorator + :type func: a function accept three array, in_arr, key_arr and out_arr, if out_arr is None, just query the out array lenght in byte + """ + + @CFUNCTYPE(c_size_t, c_void_p, c_size_t, POINTER(c_uint8), c_size_t, c_void_p) + def wrapper(c_in_data, in_length, c_key_data, key_length, c_out_data): + in_arr = np.frombuffer(c_in_data, dtype=np.uint8, count=in_length) + key_arr = np.frombuffer(c_key_data, dtype=np.uint8, count=key_length) + if c_out_data: + out_length = func(in_arr, None) + out_arr = np.frombuffer(c_out_data, dtype=np.uint8, count=out_length) + return func(in_arr, key_arr, out_arr) + # just query the output length + else: + return func(in_arr, key_arr, None) + + return wrapper + + +class LiteGlobal(object): + """ + some global config in lite + """ + + _api = _GlobalAPI()._lib + + @staticmethod + def register_decryption_and_key(decryption_name, decryption_func, key): + c_name = c_char_p(decryption_name.encode("utf-8")) + key_length = len(key) + c_key = (c_uint8 * key_length)(*key) + LiteGlobal._api.LITE_register_decryption_and_key( + c_name, decryption_func, c_key, key_length + ) + + @staticmethod + def update_decryption_key(decryption_name, key): + c_name = c_char_p(decryption_name.encode("utf-8")) + key_length = len(key) + c_key = (c_uint8 * key_length)(*key) + LiteGlobal._api.LITE_update_decryption_or_key(c_name, None, c_key, key_length) + + @staticmethod + def set_loader_lib_path(path): + c_path = c_char_p(path.encode("utf-8")) + LiteGlobal._api.LITE_set_loader_lib_path(c_path) + + @staticmethod + def set_persistent_cache(path, always_sync=False): + c_path = 
c_char_p(path.encode("utf-8")) + LiteGlobal._api.LITE_set_persistent_cache(c_path, always_sync) + + @staticmethod + def set_tensorrt_cache(path): + c_path = c_char_p(path.encode("utf-8")) + LiteGlobal._api.LITE_set_tensorrt_cache(c_path) + + @staticmethod + def dump_persistent_cache(path): + c_path = c_char_p(path.encode("utf-8")) + LiteGlobal._api.LITE_dump_persistent_cache(c_path) + + @staticmethod + def dump_tensorrt_cache(): + LiteGlobal._api.LITE_dump_tensorrt_cache() + + @staticmethod + def get_device_count(device_type): + count = c_size_t() + LiteGlobal._api.LITE_get_device_count(device_type, byref(count)) + return count.value + + @staticmethod + def try_coalesce_all_free_memory(): + LiteGlobal._api.LITE_try_coalesce_all_free_memory() diff --git a/lite/pylite/megenginelite/network.py b/lite/pylite/megenginelite/network.py new file mode 100644 index 0000000000000000000000000000000000000000..856dc757c714117647fdb5476c6b24177789ae53 --- /dev/null +++ b/lite/pylite/megenginelite/network.py @@ -0,0 +1,531 @@ +# -*- coding: utf-8 -*- +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + +from ctypes import * + +import numpy as np + +from .base import _Cnetwork, _Ctensor, _lib, _LiteCObjBase +from .struct import * +from .tensor import * + + +class LiteOptions(Structure): + """ + the inference options will be used to config a network + """ + + _fields_ = [ + ("weight_preprocess", c_int), + ("fuse_preprocess", c_int), + ("fake_next_exec", c_int), + ("var_sanity_check_first_run", c_int), + ("const_shape", c_int), + ("force_dynamic_alloc", c_int), + ("force_output_dynamic_alloc", c_int), + ("no_profiling_on_shape_change", c_int), + ("jit_level", c_int), + ("comp_node_seq_record_level", c_int), + ("graph_opt_level", c_int), + ("async_exec_level", c_int), + # layout transform options + ("enable_nchw44", c_int), + ("enable_nchw44_dot", c_int), + ("enable_nchw88", c_int), + ("enable_nhwcd4", c_int), + ("enable_nchw4", c_int), + ("enable_nchw32", c_int), + ("enable_nchw64", c_int), + ] + + def __init__(self): + self.weight_preprocess = False + self.fuse_preprocess = False + self.fake_next_exec = False + self.var_sanity_check_first_run = True + self.const_shape = False + self.force_dynamic_alloc = False + self.force_output_dynamic_alloc = False + self.no_profiling_on_shape_change = False + self.jit_level = 0 + self.comp_node_seq_record_level = 0 + self.graph_opt_level = 2 + self.async_exec_level = 1 + + def __repr__(self): + data = { + "weight_preprocess": bool(self.weight_preprocess), + "fuse_preprocess": bool(self.fuse_preprocess), + "fake_next_exec": bool(self.fake_next_exec), + "var_sanity_check_first_run": bool(self.var_sanity_check_first_run), + "const_shape": bool(self.const_shape), + "force_dynamic_alloc": bool(self.force_dynamic_alloc), + "force_output_dynamic_alloc": bool(self.force_output_dynamic_alloc), + "no_profiling_on_shape_change": bool(self.no_profiling_on_shape_change), + "jit_level": self.jit_level, + "comp_node_seq_record_level": self.comp_node_seq_record_level, + "graph_opt_level": self.graph_opt_level, + "async_exec_level": self.async_exec_level, + } + return data.__repr__() + + +class LiteConfig(Structure): + """ + Configuration when load and compile the graph + + bare_model_cryption_name: is the bare model cryption method name, bare + model is not pack model info inside + + use_loader_dynamic_param: when model forward with device loader of npu, + 
+ this flag marks whether the loader uses device input or output: set it to
+ a non-zero value when device input or output is used, otherwise set it to
+ zero
+
+ has_compression: flag marking whether the model is compressed; the matching
+ decompression method will be used to read the model
+ """
+
+ _fields_ = [
+ ("has_compression", c_int),
+ ("device_id", c_int),
+ ("device_type", c_int),
+ ("backend", c_int),
+ ("bare_model_cryption_name", c_char_p),
+ ("options", LiteOptions),
+ ]
+
+ def __init__(self, device_type=LiteDeviceType.LITE_CPU, option=None):
+ self.device_type = device_type
+ if option:
+ self.options = option
+ else:
+ self.options = LiteOptions()
+
+ self.bare_model_cryption_name = c_char_p(b"")
+ self.use_loader_dynamic_param = 0
+ self.has_compression = 0
+ self.backend = LiteBackend.LITE_DEFAULT
+
+ def __repr__(self):
+ data = {
+ "has_compression": bool(self.has_compression),
+ "device_id": self.device_id,
+ "device_type": LiteDeviceType(self.device_type),
+ "backend": LiteBackend(self.backend),
+ "bare_model_cryption_name": self.bare_model_cryption_name.decode("utf-8"),
+ "options": self.options,
+ }
+ return data.__repr__()
+
+
+class LiteIO(Structure):
+ """
+ config one network input or output item
+
+ name: the tensor name in the graph corresponding to the IO
+
+ is_host: marks where the input tensor comes from and where the output is
+ copied to. If is_host is true, the input comes from the host and the
+ output is copied back to the host; otherwise the device is used.
+ Sometimes the input comes from the device and the output does not need
+ to be copied to the host. Default is true.
+
+ io_type: the IO type, either SHAPE or VALUE. When SHAPE is set, the tensor
+ value is invalid and only the shape will be set. Default is VALUE.
+
+ config_layout: the layout configured by the user. If another layout is set
+ before forward or fetched after forward, this layout is bypassed; if no
+ other layout is set before forward, this layout takes effect; if it is
+ not set, the model forwards with its original layout. For outputs it is
+ used as a check.
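+
+ A minimal construction sketch (the tensor name "data" is only illustrative):
+
+ input_io = LiteIO("data", is_host=False, io_type=LiteIOType.LITE_IO_VALUE)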
+ """ + + _fields_ = [ + ("name", c_char_p), + ("is_host", c_int), + ("io_type", c_int), + ("config_layout", LiteLayout), + ] + + def __init__( + self, name, is_host=True, io_type=LiteIOType.LITE_IO_VALUE, layout=None + ): + if type(name) == str: + self.name = c_char_p(name.encode("utf-8")) + else: + self.name = c_char_p(name) + + if layout: + self.config_layout = layout + else: + self.config_layout = LiteLayout() + + self.is_host = is_host + self.io_type = io_type + + def __repr__(self): + data = { + "name": self.name, + "is_host": bool(self.is_host), + "io_type": LiteIOType(self.io_type), + "config_layout": self.config_layout, + } + return data.__repr__() + + def __hash__(self): + return hash(self.name) + + +class _LiteNetworkIO(Structure): + """ + the input and output information when load the network + """ + + _fields_ = [ + ("inputs", POINTER(LiteIO)), + ("outputs", POINTER(LiteIO)), + ("input_size", c_size_t), + ("output_size", c_size_t), + ] + + def __init__(self): + self.inputs = POINTER(LiteIO)() + self.outputs = POINTER(LiteIO)() + self.input_size = 0 + self.output_size = 0 + + +class LiteNetworkIO(object): + """ + the input and output information for user to construct _LiteNetWorkIO + """ + + def __init__(self): + self.inputs = [] + self.outputs = [] + + def add_input(self, input_io): + assert isinstance(input_io, LiteIO) + self.inputs.append(input_io) + + def add_output(self, output_io): + assert isinstance(output_io, LiteIO) + self.outputs.append(output_io) + + def _create_network_io(self): + network_io = _LiteNetworkIO() + length = 1 if len(self.inputs) == 0 else len(self.inputs) + self.c_inputs = (LiteIO * length)(*self.inputs) + length = 1 if len(self.outputs) == 0 else len(self.outputs) + self.c_outputs = (LiteIO * length)(*self.outputs) + network_io.inputs = pointer(self.c_inputs[0]) + network_io.outputs = pointer(self.c_outputs[0]) + network_io.input_size = len(self.inputs) + network_io.output_size = len(self.outputs) + return network_io + + def __repr__(self): + data = {"inputs": list(self.inputs), "outputs": list(self.outputs)} + return data.__repr__() + + +LiteAsyncCallback = CFUNCTYPE(c_int) + + +def start_finish_callback(func): + @CFUNCTYPE(c_int, POINTER(LiteIO), POINTER(_Ctensor), c_size_t) + def wrapper(c_ios, c_tensors, size): + ios = {} + for i in range(size): + tensor = LiteTensor() + tensor._tensor = c_tensors[i] + tensor.update() + io = c_ios[i] + ios[io] = tensor + return func(ios) + + return wrapper + + +class _NetworkAPI(_LiteCObjBase): + """ + get the network api from the lib + """ + + _api_ = [ + ("LITE_make_default_network", [POINTER(_Cnetwork)]), + ("LITE_make_network", [POINTER(_Cnetwork), LiteConfig, _LiteNetworkIO]), + ("LITE_load_model_from_mem", [_Cnetwork, c_void_p, c_size_t]), + ("LITE_load_model_from_path", [_Cnetwork, c_char_p]), + ("LITE_shared_weight_with_network", [_Cnetwork, _Ctensor]), + ("LITE_destroy_network", [_Cnetwork]), + ("LITE_forward", [_Cnetwork]), + ("LITE_wait", [_Cnetwork]), + ("LITE_get_io_tensor", [_Cnetwork, c_char_p, c_int, POINTER(_Ctensor)]), + ("LITE_get_input_name", [_Cnetwork, c_size_t, POINTER(c_char_p)]), + ("LITE_get_output_name", [_Cnetwork, c_size_t, POINTER(c_char_p)]), + ("LITE_get_all_input_name", [_Cnetwork, POINTER(c_size_t), POINTER(c_char_p)]), + ("LITE_get_all_output_name", [_Cnetwork, POINTER(c_size_t), POINTER(c_char_p)]), + ("LITE_is_cpu_inplace_mode", [_Cnetwork, POINTER(c_int)]), + ("LITE_get_cpu_threads_number", [_Cnetwork, POINTER(c_size_t)]), + ("LITE_get_device_id", [_Cnetwork, 
POINTER(c_int)]), + ("LITE_set_device_id", [_Cnetwork, c_int]), + ("LITE_set_cpu_inplace_mode", [_Cnetwork]), + ("LITE_use_tensorrt", [_Cnetwork]), + ("LITE_set_cpu_threads_number", [_Cnetwork, c_size_t]), + ("LITE_set_stream_id", [_Cnetwork, c_int]), + ("LITE_get_stream_id", [_Cnetwork, POINTER(c_int)]), + ("LITE_set_network_algo_policy", [_Cnetwork, c_int]), + ("LITE_set_network_algo_fastrun_config", [_Cnetwork, c_int, c_int]), + ("LITE_set_network_algo_workspace_limit", [_Cnetwork, c_size_t]), + ("LITE_share_runtime_memroy", [_Cnetwork, _Cnetwork]), + ("LITE_enable_profile_performance", [_Cnetwork, c_char_p]), + ("LITE_enable_io_txt_dump", [_Cnetwork, c_char_p]), + ("LITE_enable_io_bin_dump", [_Cnetwork, c_char_p]), + ("LITE_set_async_callback", [_Cnetwork, LiteAsyncCallback]), + ("LITE_set_start_callback", [_Cnetwork]), + ("LITE_set_finish_callback", [_Cnetwork]), + ] + + +class LiteNetwork(object): + """ + the network to load a model and forward + """ + + _api = _NetworkAPI()._lib + + def __init__(self, config=None, io=None): + """ + create a network with config and networkio + """ + self._network = _Cnetwork() + + if config: + self.config = config + else: + self.config = LiteConfig() + + if io: + self.network_io = io + else: + self.network_io = LiteNetworkIO() + + c_network_io = self.network_io._create_network_io() + self._api.LITE_make_network(byref(self._network), self.config, c_network_io) + + def __repr__(self): + data = {"config": self.config, "IOs": self.network_io} + return data.__repr__() + + def __del__(self): + self._api.LITE_destroy_network(self._network) + + def load(self, path): + c_path = c_char_p(path.encode("utf-8")) + self._api.LITE_load_model_from_path(self._network, c_path) + + def forward(self): + self._api.LITE_forward(self._network) + + def wait(self): + self._api.LITE_wait(self._network) + + def is_cpu_inplace_mode(self): + """ + whether the network run in cpu inpalce mode + """ + inplace = c_int() + self._api.LITE_is_cpu_inplace_mode(self._network, byref(inplace)) + return bool(inplace.value) + + def enable_cpu_inplace_mode(self): + """ + set cpu forward in inplace mode with which cpu forward only create one + thread + Note: this must be set before the network loaded + """ + self._api.LITE_set_cpu_inplace_mode(self._network) + + def use_tensorrt(self): + """ + Note: this must be set before the network loaded + """ + self._api.LITE_use_tensorrt(self._network) + + @property + def device_id(self): + """ + get the device id + """ + device_id = c_int() + self._api.LITE_get_device_id(self._network, byref(device_id)) + return device_id.value + + @device_id.setter + def device_id(self, device_id): + """ + set the device id + Note: this must be set before the network loaded + """ + self._api.LITE_set_device_id(self._network, device_id) + + @property + def stream_id(self): + """ + get the stream id + """ + stream_id = c_int() + self._api.LITE_get_stream_id(self._network, byref(stream_id)) + return stream_id.value + + @stream_id.setter + def stream_id(self, stream_id): + """ + set the stream id + Note: this must be set before the network loaded + """ + self._api.LITE_set_stream_id(self._network, stream_id) + + @property + def threads_number(self): + """ + get the thread number of the netwrok + """ + nr_thread = c_size_t() + self._api.LITE_get_cpu_threads_number(self._network, byref(nr_thread)) + return nr_thread.value + + @threads_number.setter + def threads_number(self, nr_threads): + """ + set the network forward in multithread mode, and the thread number + Note: this 
must be set before the network loaded + """ + self._api.LITE_set_cpu_threads_number(self._network, nr_threads) + + def get_io_tensor(self, name, phase=LiteTensorPhase.LITE_IO): + """ + get input or output tensor by its name + """ + if type(name) == str: + c_name = c_char_p(name.encode("utf-8")) + else: + c_name = c_char_p(name) + tensor = LiteTensor() + self._api.LITE_get_io_tensor( + self._network, c_name, phase, byref(tensor._tensor) + ) + tensor.update() + return tensor + + def get_input_name(self, index): + """ + get the input name by the index in the network + """ + c_name = c_char_p() + self._api.LITE_get_input_name(self._network, index, byref(c_name)) + return c_name.value.decode("utf-8") + + def get_output_name(self, index): + """ + get the output name by the index in the network + """ + c_name = c_char_p() + self._api.LITE_get_output_name(self._network, index, byref(c_name)) + return c_name.value.decode("utf-8") + + def get_all_input_name(self): + """ + get all the input tensor name in the network + """ + nr_input = c_size_t() + self._api.LITE_get_all_input_name(self._network, byref(nr_input), None) + + if nr_input.value > 0: + names = (c_char_p * nr_input.value)() + self._api.LITE_get_all_input_name(self._network, None, names) + ret_name = [names[i].decode("utf-8") for i in range(nr_input.value)] + return ret_name + + def get_all_output_name(self): + """ + get all the output tensor name in the network + """ + nr_output = c_size_t() + self._api.LITE_get_all_output_name(self._network, byref(nr_output), None) + + if nr_output.value > 0: + names = (c_char_p * nr_output.value)() + self._api.LITE_get_all_output_name(self._network, None, names) + ret_name = [names[i].decode("utf-8") for i in range(nr_output.value)] + return ret_name + + def share_weights_with(self, src_network): + """ + share weights with the loaded network + """ + assert isinstance(src_network, LiteNetwork) + self._api.LITE_shared_weight_with_network(self._network, src_network._network) + + def share_runtime_memroy(self, src_network): + """ + share runtime memory with the srouce network + """ + assert isinstance(src_network, LiteNetwork) + self._api.LITE_share_runtime_memroy(self._network, src_network._network) + + def async_with_callback(self, async_callback): + async_callback = LiteAsyncCallback(async_callback) + self._api.LITE_set_async_callback(self._network, async_callback) + + def set_start_callback(self, start_callback): + """ + when the network start forward, the callback will be called, + the start_callback with param mapping from LiteIO to the corresponding + LiteTensor + """ + self._api.LITE_set_start_callback(self._network, start_callback) + + def set_finish_callback(self, finish_callback): + """ + when the network finish forward, the callback will be called, + the finish_callback with param mapping from LiteIO to the corresponding + LiteTensor + """ + self._api.LITE_set_finish_callback(self._network, finish_callback) + + def enable_profile_performance(self, profile_file): + c_file = profile_file.encode("utf-8") + self._api.LITE_enable_profile_performance(self._network, c_file) + + def set_network_algo_workspace_limit(self, size_limit): + self._api.LITE_set_network_algo_workspace_limit(self._network, size_limit) + + def set_network_algo_policy( + self, policy, shared_batch_size=0, binary_equal_between_batch=False + ): + """ + shared_batch_size: the batch size used by fastrun, + Non-zero value means that fastrun use this batch size + regardless of the batch size of the model. 
Zero means + fastrun use batch size of the model + binary_equal_between_batch: if the content of each input batch is + binary equal,whether the content of each output batch is + promised to be equal + + """ + self._api.LITE_set_network_algo_policy(self._network, policy) + self._api.LITE_set_network_algo_fastrun_config( + self._network, shared_batch_size, binary_equal_between_batch + ) + + def io_txt_dump(self, txt_file): + c_file = txt_file.encode("utf-8") + self._api.LITE_enable_io_txt_dump(self._network, c_file) + + def io_bin_dump(self, bin_dir): + c_dir = bin_dir.encode("utf-8") + self._api.LITE_enable_io_bin_dump(self._network, c_dir) diff --git a/lite/pylite/megenginelite/struct.py b/lite/pylite/megenginelite/struct.py new file mode 100644 index 0000000000000000000000000000000000000000..3ae99fe3f0106a796391b440cb5b760912470ff1 --- /dev/null +++ b/lite/pylite/megenginelite/struct.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + +import logging +from ctypes import * +from enum import Enum, IntEnum + + +class LiteBackend(IntEnum): + LITE_DEFAULT = 0 + + +class LiteDeviceType(IntEnum): + LITE_CPU = 0 + LITE_CUDA = 1 + LITE_ATLAS = 3 + LITE_NPU = 4 + LITE_DEVICE_DEFAULT = 5 + + +class LiteDataType(IntEnum): + LITE_FLOAT = 0 + LITE_HALF = 1 + LITE_INT = 2 + LITE_INT16 = 3 + LITE_INT8 = 4 + LITE_UINT8 = 5 + + +class LiteTensorPhase(IntEnum): + LITE_IO = 0 + LITE_INPUT = 1 + LITE_OUTPUT = 2 + + +class LiteIOType(IntEnum): + """ + the input and output type, include SHAPE and VALUE + sometimes user only need the shape of the output tensor + """ + + LITE_IO_VALUE = 0 + LITE_IO_SHAPE = 1 + + +class LiteAlgoSelectStrategy(IntEnum): + """ + operation algorithm seletion strategy type, some operations have + multi algorithms, different algorithm has different attribute, according to + the strategy, the best algorithm will be selected. + + Note: These strategies can be combined + + LITE_ALGO_HEURISTIC | LITE_ALGO_PROFILE means: if profile cache not valid, + use heuristic instead + + LITE_ALGO_HEURISTIC | LITE_ALGO_REPRODUCIBLE means: heuristic choice the + reproducible algo + + LITE_ALGO_PROFILE | LITE_ALGO_REPRODUCIBLE means: profile the best + algorithm from the reproducible algorithms set + + LITE_ALGO_PROFILE | LITE_ALGO_OPTIMIZED means: profile the best + algorithm form the optimzed algorithms, thus profile will process fast + + LITE_ALGO_PROFILE | LITE_ALGO_OPTIMIZED | LITE_ALGO_REPRODUCIBLE means: + profile the best algorithm form the optimzed and reproducible algorithms + """ + + LITE_ALGO_HEURISTIC = 1 + LITE_ALGO_PROFILE = 2 + LITE_ALGO_REPRODUCIBLE = 4 + LITE_ALGO_OPTIMIZED = 8 + + +class LiteLogLevel(IntEnum): + """ + DEBUG: The most verbose level, printing debugging info + INFO: The default level + WARN: Printing warnings + ERROR: The least verbose level, printing errors only + """ + + DEBUG = 0 + INFO = 1 + WARN = 2 + ERROR = 3 diff --git a/lite/pylite/megenginelite/tensor.py b/lite/pylite/megenginelite/tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..0f2539ce508d5c95c75fad8ca1fa4a03d24bb9cf --- /dev/null +++ b/lite/pylite/megenginelite/tensor.py @@ -0,0 +1,471 @@ +# -*- coding: utf-8 -*- +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
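+# ctypes wrapper of the lite C tensor API: LiteLayout and LiteTensor, plus the
+# dtype mapping tables between numpy and LiteDataType.
+# Typical use (a sketch only; shape and dtype are illustrative):
+#   layout = LiteLayout([1, 3, 224, 224], "float32")
+#   tensor = LiteTensor(layout)
+#   tensor.set_data_by_copy(np.zeros((1, 3, 224, 224), "float32"))
+#   arr = tensor.to_numpy()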
+ +from ctypes import * + +import numpy as np + +from .base import _Ctensor, _lib, _LiteCObjBase +from .struct import LiteDataType, LiteDeviceType, LiteIOType, Structure + +MAX_DIM = 7 + +_lite_type_to_nptypes = { + LiteDataType.LITE_INT: np.int32, + LiteDataType.LITE_FLOAT: np.float32, + LiteDataType.LITE_UINT8: np.uint8, + LiteDataType.LITE_INT8: np.int8, + LiteDataType.LITE_INT16: np.int16, + LiteDataType.LITE_HALF: np.float16, +} + +_nptype_to_lite_type = {val: key for key, val in _lite_type_to_nptypes.items()} + +_str_nptypes_to_lite_nptypes = { + np.dtype("int32"): LiteDataType.LITE_INT, + np.dtype("float32"): LiteDataType.LITE_FLOAT, + np.dtype("uint8"): LiteDataType.LITE_UINT8, + np.dtype("int8"): LiteDataType.LITE_INT8, + np.dtype("int16"): LiteDataType.LITE_INT16, + np.dtype("float16"): LiteDataType.LITE_HALF, +} + +ctype_to_lite_dtypes = { + c_int: LiteDataType.LITE_INT, + c_uint: LiteDataType.LITE_INT, + c_float: LiteDataType.LITE_FLOAT, + c_ubyte: LiteDataType.LITE_UINT8, + c_byte: LiteDataType.LITE_INT8, + c_short: LiteDataType.LITE_INT16, + c_ushort: LiteDataType.LITE_INT16, +} + + +class LiteLayout(Structure): + """ + the simple layout description + """ + + _fields_ = [ + ("shapes", c_size_t * MAX_DIM), + ("ndim", c_size_t), + ("data_type", c_int), + ] + + def __init__(self, shape=None, dtype=None): + if shape: + shape = list(shape) + assert len(shape) <= MAX_DIM, "Layout max dim is 7." + self.shapes = (c_size_t * MAX_DIM)(*shape) + self.ndim = len(shape) + else: + self.shapes = (c_size_t * MAX_DIM)() + self.ndim = 0 + if not dtype: + self.data_type = LiteDataType.LITE_FLOAT + elif isinstance(dtype, LiteDataType): + self.data_type = dtype + elif type(dtype) == str: + self.data_type = _str_nptypes_to_lite_nptypes[np.dtype(dtype)] + elif isinstance(dtype, np.dtype): + ctype = np.ctypeslib.as_ctypes_type(dtype) + self.data_type = ctype_to_lite_dtypes[ctype] + elif isinstance(dtype, type): + self.data_type = _nptype_to_lite_type[dtype] + else: + raise RuntimeError("unkonw data type") + + def __repr__(self): + data = { + "shapes": list(self.shapes), + "ndim": self.ndim, + "data_type": _lite_type_to_nptypes[LiteDataType(self.data_type)], + } + return data.__repr__() + + +class _LiteTensorDesc(Structure): + """ + warpper of the MegEngine Tensor + + :is_pinned_host: when set, the storage memory of the tensor is pinned memory, + this is used to Optimize the H2D or D2H memory copy, if the device or layout + is not set, when copy form other device(CUDA) tensor, this tensor + will be automatically set to pinned tensor + """ + + _fields_ = [ + ("is_pinned_host", c_int), + ("layout", LiteLayout), + ("device_type", c_int), + ("device_id", c_int), + ] + + def __init__(self): + self.layout = LiteLayout() + self.device_type = LiteDeviceType.LITE_CPU + self.is_pinned_host = False + self.device_id = 0 + + def __repr__(self): + data = { + "is_pinned_host": self.is_pinned_host, + "layout": LiteLayout(self.layout), + "device_type": LiteDeviceType(self.device_type.value), + "device_id": self.device_id, + } + return data.__repr__() + + +class _TensorAPI(_LiteCObjBase): + """ + get the api from the lib + """ + + _api_ = [ + ("LITE_make_tensor", [_LiteTensorDesc, POINTER(_Ctensor)]), + ("LITE_set_tensor_layout", [_Ctensor, LiteLayout]), + ("LITE_reset_tensor_memory", [_Ctensor, c_void_p, c_size_t]), + ("LITE_reset_tensor", [_Ctensor, LiteLayout, c_void_p]), + ("LITE_tensor_reshape", [_Ctensor, POINTER(c_int), c_int]), + ( + "LITE_tensor_slice", + [ + _Ctensor, + POINTER(c_size_t), + 
POINTER(c_size_t), + POINTER(c_size_t), + c_size_t, + POINTER(_Ctensor), + ], + ), + ( + "LITE_tensor_concat", + [POINTER(_Ctensor), c_int, c_int, c_int, c_int, POINTER(_Ctensor),], + ), + ("LITE_tensor_fill_zero", [_Ctensor]), + ("LITE_tensor_copy", [_Ctensor, _Ctensor]), + ("LITE_tensor_share_memory_with", [_Ctensor, _Ctensor]), + ("LITE_get_tensor_memory", [_Ctensor, POINTER(c_void_p)]), + ("LITE_get_tensor_total_size_in_byte", [_Ctensor, POINTER(c_size_t)]), + ("LITE_get_tensor_layout", [_Ctensor, POINTER(LiteLayout)]), + ("LITE_get_tensor_device_type", [_Ctensor, POINTER(c_int)]), + ("LITE_get_tensor_device_id", [_Ctensor, POINTER(c_int)]), + ("LITE_destroy_tensor", [_Ctensor]), + ("LITE_is_pinned_host", [_Ctensor, POINTER(c_int)]), + ] + + +class LiteTensor(object): + """ + the tensor to hold a block of data + """ + + _api = _TensorAPI()._lib + + def __init__( + self, + layout=None, + device_type=LiteDeviceType.LITE_CPU, + device_id=0, + is_pinned_host=False, + ): + """ + create a Tensor with layout, device, is_pinned_host param + """ + self._tensor = _Ctensor() + if layout: + self._layout = layout + else: + self._layout = LiteLayout() + self._device_type = device_type + self._device_id = device_id + self._is_pinned_host = is_pinned_host + + tensor_desc = _LiteTensorDesc() + tensor_desc.layout = self._layout + tensor_desc.device_type = device_type + tensor_desc.device_id = device_id + tensor_desc.is_pinned_host = is_pinned_host + self._api.LITE_make_tensor(tensor_desc, byref(self._tensor)) + + def __del__(self): + self._api.LITE_destroy_tensor(self._tensor) + + def fill_zero(self): + """ + fill the buffer memory with zero + """ + self._api.LITE_tensor_fill_zero(self._tensor) + self.update() + + def share_memory_with(self, src_tensor): + """ + share the same memory with the src_tensor, the self memory will be freed + """ + assert isinstance(src_tensor, LiteTensor) + self._api.LITE_tensor_share_memory_with(self._tensor, src_tensor._tensor) + self.update() + + @property + def layout(self): + self._api.LITE_get_tensor_layout(self._tensor, byref(self._layout)) + return self._layout + + @layout.setter + def layout(self, layout): + assert isinstance(layout, LiteLayout) + self._layout = layout + self._api.LITE_set_tensor_layout(self._tensor, layout) + + @property + def is_pinned_host(self): + """ + whether the tensor is pinned tensor + """ + pinned = c_int() + self._api.LITE_is_pinned_host(self._tensor, byref(pinned)) + self._is_pinned_host = pinned + return bool(self._is_pinned_host) + + @property + def device_type(self): + """ + get device of the tensor + """ + device_type = c_int() + self._api.LITE_get_tensor_device_type(self._tensor, byref(device_type)) + self._device_type = device_type + return LiteDeviceType(device_type.value) + + @property + def device_id(self): + """ + get device id of the tensor + """ + device_id = c_int() + self._api.LITE_get_tensor_device_id(self._tensor, byref(device_id)) + self._device_id = device_id.value + return device_id.value + + @property + def is_continue(self): + """ + whether the tensor memory is continue + """ + is_continue = c_int() + self._api.LITE_is_memory_continue(self._tensor, byref(is_continue)) + return bool(is_continue.value) + + @property + def nbytes(self): + """ + get the length of the meomry in byte + """ + self.update() + length = c_size_t() + self._api.LITE_get_tensor_total_size_in_byte(self._tensor, byref(length)) + return length.value + + def update(self): + """ + update the member from C, this will auto used after slice, share + """ + 
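+ # Re-query the pinned-host flag, device type and layout from the C API so
+ # the cached Python-side members stay in sync with the underlying tensor.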
pinned = c_int() + self._api.LITE_is_pinned_host(self._tensor, byref(pinned)) + self._is_pinned_host = pinned + device_type = c_int() + self._api.LITE_get_tensor_device_type(self._tensor, byref(device_type)) + self._device_type = device_type + self._api.LITE_get_tensor_layout(self._tensor, byref(self._layout)) + + def copy_from(self, src_tensor): + """ + copy memory form the src_tensor + """ + assert isinstance(src_tensor, LiteTensor) + self._api.LITE_tensor_copy(self._tensor, src_tensor._tensor) + self.update() + + def reshape(self, shape): + """ + reshape the tensor with data not change, only change the shape + :param shape: int arrary of dst_shape + """ + shape = list(shape) + length = len(shape) + c_shape = (c_int * length)(*shape) + self._api.LITE_tensor_reshape(self._tensor, c_shape, length) + self.update() + + def slice(self, start, end, step=None): + """ + slice the tensor with gaven start, end, step + :param start: silce begin index of each dim + :param end: silce end index of each dim + :param step: silce step of each dim + """ + start = list(start) + end = list(end) + length = len(start) + assert length == len(end), "slice with different length of start and end." + if step: + assert length == len(step), "slice with different length of start and step." + step = list(step) + else: + step = [1 for i in range(length)] + c_start = (c_size_t * length)(*start) + c_end = (c_size_t * length)(*end) + c_step = (c_size_t * length)(*step) + slice_tensor = LiteTensor() + self._api.LITE_tensor_slice( + self._tensor, c_start, c_end, c_step, length, byref(slice_tensor._tensor) + ) + slice_tensor.update() + return slice_tensor + + def get_ctypes_memory(self): + """ + get the memory of the tensor, return c_void_p of the tensor memory + """ + self.update() + mem = c_void_p() + self._api.LITE_get_tensor_memory(self._tensor, byref(mem)) + return mem + + def set_data_by_share(self, data, length=0, layout=None): + """ + share the data to the tensor + param data: the data will shared to the tensor, it should be a + numpy.ndarray or ctypes data + """ + self.update() + if isinstance(data, np.ndarray): + assert ( + self.is_continue + ), "set_data_by_share can only apply in continue tensor." + assert ( + self.is_pinned_host or self.device_type == LiteDeviceType.LITE_CPU + ), "set_data_by_share can only apply in cpu tensor or pinned tensor." + + np_type = _lite_type_to_nptypes[LiteDataType(self._layout.data_type)] + c_type = np.ctypeslib.as_ctypes_type(np_type) + + if self.nbytes != data.nbytes: + self.layout = LiteLayout(data.shape, ctype_to_lite_dtypes[c_type]) + + self._shared_data = data + data = data.ctypes.data_as(POINTER(c_type)) + + if layout is not None: + self.layout = layout + else: + assert length == 0 or length == self.nbytes, "the data length is not match." + self._api.LITE_reset_tensor_memory(self._tensor, data, self.nbytes) + + def set_data_by_copy(self, data, data_length=0, layout=None): + """ + copy the data to the tensor + param data: the data to copy to tensor, it should be list, + numpy.ndarraya or ctypes with length + """ + self.update() + if layout is not None: + self.layout = layout + + assert self.is_continue, "set_data_by_copy can only apply in continue tensor." + assert ( + self.is_pinned_host or self.device_type == LiteDeviceType.LITE_CPU + ), "set_data_by_copy can only apply in cpu tensor or pinned tensor." 
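+ # The code below accepts three kinds of input: a plain Python list, a
+ # numpy ndarray (the layout is re-derived when the byte size differs), or
+ # a raw ctypes buffer copied with the given data_length.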
+ + np_type = _lite_type_to_nptypes[LiteDataType(self._layout.data_type)] + c_type = np.ctypeslib.as_ctypes_type(np_type) + + tensor_memory = c_void_p() + + if type(data) == list: + length = len(data) + self._api.LITE_get_tensor_memory(self._tensor, byref(tensor_memory)) + tensor_length = self.nbytes + assert ( + length * sizeof(c_type) <= tensor_length + ), "the length of input data to set to the tensor is too large." + arr = (c_type * length)(*data) + memmove(tensor_memory, arr, sizeof(c_type) * length) + + elif type(data) == np.ndarray: + if self.nbytes != data.nbytes: + self.layout = LiteLayout(data.shape, data.dtype) + arr = data.ctypes.data_as(POINTER(c_type)) + self._api.LITE_get_tensor_memory(self._tensor, byref(tensor_memory)) + assert self.nbytes == data.nbytes + memmove(tensor_memory, arr, self.nbytes) + else: + assert ( + data_length == self.nbytes or layout is not None + ), "when input data is ctypes, the length of input data or layout must set" + self._api.LITE_get_tensor_memory(self._tensor, byref(tensor_memory)) + memmove(tensor_memory, data, data_length) + + def to_numpy(self): + """ + get the buffer of the tensor + """ + self.update() + if self.nbytes <= 0: + return np.array([]) + if self.is_continue and ( + self.is_pinned_host or self.device_type == LiteDeviceType.LITE_CPU + ): + ptr = c_void_p() + self._api.LITE_get_tensor_memory(self._tensor, byref(ptr)) + + np_type = _lite_type_to_nptypes[LiteDataType(self._layout.data_type)] + shape = [self._layout.shapes[i] for i in range(self._layout.ndim)] + np_arr = np.zeros(shape, np_type) + if np_arr.nbytes: + memmove(np_arr.ctypes.data_as(c_void_p), ptr, np_arr.nbytes) + return np_arr + else: + tmp_tensor = LiteTensor(self.layout) + tmp_tensor.copy_from(self) + return tmp_tensor.to_numpy() + + def __repr__(self): + self.update() + data = { + "layout": self._layout, + "device_type": LiteDeviceType(self._device_type.value), + "device_id": int(self.device_id), + "is_pinned_host": bool(self._is_pinned_host), + } + return data.__repr__() + + +def LiteTensorConcat( + tensors, dim, device_type=LiteDeviceType.LITE_DEVICE_DEFAULT, device_id=-1 +): + """ + concat tensor in input dim to one tensor + dim : the dim to act concat + device_type: the result tensor device type + device_id: the result tensor device id + """ + api = _TensorAPI()._lib + length = len(tensors) + c_tensors = [t._tensor for t in tensors] + c_tensors = (_Ctensor * length)(*c_tensors) + result_tensor = LiteTensor() + api.LITE_tensor_concat( + cast(byref(c_tensors), POINTER(c_void_p)), + length, + dim, + device_type, + device_id, + byref(result_tensor._tensor), + ) + result_tensor.update() + return result_tensor diff --git a/lite/pylite/megenginelite/utils.py b/lite/pylite/megenginelite/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..aec8188a569d8afb274f5e65b4a8313467d4239f --- /dev/null +++ b/lite/pylite/megenginelite/utils.py @@ -0,0 +1,122 @@ +# -*- coding: utf-8 -*- +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
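+# TensorBatchCollector gathers per-sample sub-tensors into one contiguous
+# batched tensor. A usage sketch (shape, dtype and device are illustrative;
+# the default device is LITE_CUDA):
+#   collector = TensorBatchCollector(
+#       [4, 3, 224, 224],
+#       dtype=LiteDataType.LITE_FLOAT,
+#       device_type=LiteDeviceType.LITE_CPU,
+#   )
+#   idx = collector.collect(np.ones([3, 224, 224], "float32"))
+#   batch = collector.to_numpy()
+#   collector.free([idx])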
+ +import threading + +import numpy as np + +from .base import * +from .struct import * +from .tensor import * + + +class TensorBatchCollector: + """ + this is a tensor utils to collect subtensor in batch continuous + """ + + def __init__( + self, + shape, + dtype=LiteDataType.LITE_INT8, + device_type=LiteDeviceType.LITE_CUDA, + device_id=0, + is_pinned_host=False, + tensor=None, + ): + self._mutex = threading.Lock() + self.dev_type = device_type + self.is_pinned_host = is_pinned_host + self.dev_id = 0 + self.shape = shape + self.dtype = LiteLayout(dtype=dtype).data_type + self._free_list = list(range(self.shape[0])) + + if tensor is not None: + assert ( + tensor.layout.shapes[0 : tensor.layout.ndim] == shape + ), "The tensor set to TensorBatchCollector is not right." + self._tensor = tensor + self.dtype = tensor.layout.data_type + self.device_type = tensor.device_type + self.device_id = tensor.device_type + else: + self._tensor = LiteTensor( + LiteLayout(shape, dtype), device_type, device_id, is_pinned_host + ) + + def collect_id(self, array, batch_id): + if isinstance(array, np.ndarray): + shape = array.shape + assert list(shape) == self.shape[1:] + in_dtype = ctype_to_lite_dtypes[np.ctypeslib.as_ctypes_type(array.dtype)] + assert in_dtype == self.dtype + # get the batch index + with self._mutex: + if batch_id in self._free_list: + self._free_list.remove(batch_id) + # get the subtensor + subtensor = self._tensor.slice([batch_id], [batch_id + 1]) + if subtensor.device_type == LiteDeviceType.LITE_CPU: + subtensor.set_data_by_copy(array) + else: + pinned_tensor = LiteTensor( + subtensor.layout, self.dev_type, self.dev_id, True + ) + pinned_tensor.set_data_by_share(array) + subtensor.copy_from(pinned_tensor) + else: + assert isinstance(array, LiteTensor) + ndim = array.layout.ndim + shape = list(array.layout.shapes)[0:ndim] + assert list(shape) == self.shape[1:] + in_dtype = array.layout.data_type + assert in_dtype == self.dtype + # get the batch index + with self._mutex: + if batch_id in self._free_list: + self._free_list.remove(batch_id) + # get the subtensor + subtensor = self._tensor.slice([batch_id], [batch_id + 1]) + subtensor.copy_from(array) + + return batch_id + + def collect(self, array): + with self._mutex: + if len(self._free_list) == 0: + return -1 + idx = self._free_list.pop(0) + return self.collect_id(array, idx) + + def collect_by_ctypes(self, data, length): + """ + collect with ctypes data input + """ + with self._mutex: + if len(self._free_list) == 0: + return -1 + idx = self._free_list.pop(0) + # get the subtensor + subtensor = self._tensor.slice([idx], [idx + 1]) + if subtensor.device_type == LiteDeviceType.LITE_CPU: + subtensor.set_data_by_copy(data, length) + else: + pinned_tensor = LiteTensor( + subtensor.layout, self.dev_type, self.dev_id, True + ) + pinned_tensor.set_data_by_share(data, length) + subtensor.copy_from(pinned_tensor) + + def free(self, indexes): + with self._mutex: + self._free_list.extend(indexes) + + def get(self): + return self._tensor + + def to_numpy(self): + return self._tensor.to_numpy() diff --git a/lite/pylite/pylite.md b/lite/pylite/pylite.md new file mode 100644 index 0000000000000000000000000000000000000000..183875cc08dd7c19521ec1f74d9145eb8b0a1cef --- /dev/null +++ b/lite/pylite/pylite.md @@ -0,0 +1,199 @@ +# PyLite + +Lite的python接口提供更加方便灵活的使用Lite进行模型Inference,支持各种平台上运行,X86-CUDA,X86-CPU,Arm-CPU,Arm-CUDA平台。 + +## 安装 +### whl包安装 +Lite python接口的whl包会随着megbrain的发版发布,版本号和megbrain保持一致,目前发布的Lite的whl包,包括Linux、windows、macos平台,这些平台可以直接通过pip3安装。 
+```shell + python3 -m pip install --upgrade pip + python3 -m pip install megenginelite -i https://pypi.megvii-inc.com/simple +``` +### develop 安装 +开发模式下,可以使用Cmake编译出lite动态库liblite.so/liblite.dll/liblite_shared.dylib,并使用这个动态库进行开发和debug。该方式安装的pylite只能在本地机器上使用,不能copy到其他机器上使用。 +* 编译liblite.so。使用cmake编译出liblite.so + * clone megbrain工程到本地 + ```shell + git clone git@git-core.megvii-inc.com:brain-sdk/MegBrain.git + ``` + * 进行Cmake编译,这里的cmake编译同megbrain的cmake编译,使用参数和宏也完全一样 + * 编译准备 + ```shell + cd MegBrain + sh ./third_party/prepare.sh + mkdir build + cd build + ``` + * 编译X86-CUDA版本 + ```shell + cmake .. -DMGE_WITH_CUDA=ON -DMGE_WITH_TEST=ON -DCMAKE_BUILD_TYPE=Release && make -j$(nproc) + ``` + * 编译X86 CPU Only版本 + ```shell + cmake .. -DMGE_WITH_CUDA=OFF -DMGE_WITH_TEST=ON -DCMAKE_BUILD_TYPE=Release && make -j$(nproc) + ``` + * 编译完成之后,liblite.so 保存在build目录中的lite文件下 + * 将liblite.so copy到megenginelite的python源文件目录下,就可以使用megenginelite了。 + ```shell + MegBrain的工程目录为 ${mgb_hone} + cp ${mgb_hone}/build/lite/liblite.so ${mgb_home}/lite/pylite/megenginelite/ + cd ${mgb_home}/lite/pylite + python3 -m "import megenginelite" + ``` + 这样就可以在${mgb_home}/lite/pylite 目录下面开发和debug lite的python接口了 + +## python3中使用megenginelite +Lite的python接口是对其C/C++接口的一层封装,他们使用的模型都是相同的模型格式。megenginelite提供两种数据接口,分别是LiteTensor和LiteNetwork。 + +### LiteTensor +LiteTensor提供了用户对数据的操作接口,提供了接口包括: +* fill_zero: 将tensor的内存设置为全0 +* share_memory_with: 可以和其他LiteTensor的共享内存 +* copy_from: 从其他LiteTensor中copy数据到自身内存中 +* reshape: 改变该LiteTensor的shape,内存数据保持不变 +* slice: 对该LiteTensor中的数据进行切片,需要分别指定每一维切片的start,end,和step。 +* set_data_by_share: 调用之后使得该LiteTensor中的内存共享自输入的array的内存,输入的array必须是numpy的ndarray,并且tensor在CPU上 +* set_data_by_copy: 该LiteTensor将会从输入的data中copy数据,data可以是list和numpy的ndarray,需要保证data的数据量不超过tensor的容量,tensor在CPU上 +* to_numpy: 将该LiteTensor中数据copy到numpy的array中,返回给用户,如果是非连续的LiteTensor,如slice出来的,将copy到连续的numpy array中,该接口主要数为了debug,有性能问题。 + +#### 使用example +* LiteTensor 设置数据example +``` +def test_tensor_set_data(): + layout = LiteLayout([2, 16], "int8") + tensor = LiteTensor(layout) + assert tensor.nbytes == 2 * 16 + + data = [i for i in range(32)] + tensor.set_data_by_copy(data) + real_data = tensor.to_numpy() + for i in range(32): + assert real_data[i // 16][i % 16] == i + + arr = np.ones([2, 16], "int8") + tensor.set_data_by_copy(arr) + real_data = tensor.to_numpy() + for i in range(32): + assert real_data[i // 16][i % 16] == 1 + + for i in range(32): + arr[i // 16][i % 16] = i + tensor.set_data_by_share(arr) + real_data = tensor.to_numpy() + for i in range(32): + assert real_data[i // 16][i % 16] == i + + arr[0][8] = 100 + arr[1][3] = 20 + real_data = tensor.to_numpy() + assert real_data[0][8] == 100 + assert real_data[1][3] == 20 +``` +* tensor 共享内存example +```python +def test_tensor_share_memory_with(): + layout = LiteLayout([4, 32], "int16") + tensor = LiteTensor(layout) + assert tensor.nbytes == 4 * 32 * 2 + + arr = np.ones([4, 32], "int16") + for i in range(128): + arr[i // 32][i % 32] = i + tensor.set_data_by_share(arr) + real_data = tensor.to_numpy() + for i in range(128): + assert real_data[i // 32][i % 32] == i + + tensor2 = LiteTensor(layout) + tensor2.share_memory_with(tensor) + real_data = tensor.to_numpy() + real_data2 = tensor2.to_numpy() + for i in range(128): + assert real_data[i // 32][i % 32] == i + assert real_data2[i // 32][i % 32] == i + + arr[1][18] = 5 + arr[3][7] = 345 + real_data = tensor2.to_numpy() + assert real_data[1][18] == 5 + assert real_data[3][7] == 345 +``` +更多的使用可以参考pylite中test/test_tensor.py中的使用 +### 
LiteNetwork +LiteNetwork主要为用户提供模型载入,运行等功能。使用的模型见lite的readme中关于模型的部分 +* CPU基本模型载入运行的example +``` +def test_network_basic(): + source_dir = os.getenv("LITE_TEST_RESOUCE") + input_data_path = os.path.join(source_dir, "input_data.npy") + # read input to input_data + input_data = np.load(input_data_path) + model_path = os.path.join(source_dir, "shufflenet.mge") + + network = LiteNetwork() + network.load(model_path) + + input_name = network.get_input_name(0) + input_tensor = network.get_io_tensor(input_name) + output_name = network.get_output_name(0) + output_tensor = network.get_io_tensor(output_name) + + assert input_tensor.layout.shapes[0] == 1 + assert input_tensor.layout.shapes[1] == 3 + assert input_tensor.layout.shapes[2] == 224 + assert input_tensor.layout.shapes[3] == 224 + assert input_tensor.layout.data_type == LiteDataType.LITE_FLOAT + assert input_tensor.layout.ndim == 4 + + # copy input data to input_tensor of the network + input_tensor.set_data_by_copy(input_data) + for i in range(3): + network.forward() + network.wait() + + output_data = output_tensor.to_numpy() + print('shufflenet output max={}, sum={}'.format(output_data.max(), output_data.sum())) +``` +* CUDA上使用device内存作为模型输入,需要在构造network候配置config和IO信息 +``` +def test_network_device_IO(): + source_dir = os.getenv("LITE_TEST_RESOUCE") + input_data_path = os.path.join(source_dir, "input_data.npy") + model_path = os.path.join(source_dir, "shufflenet.mge") + # read input to input_data + input_data = np.load(input_data_path) + input_layout = LiteLayout([1, 3, 224, 224]) + host_input_data = LiteTensor(layout=input_layout) + host_input_data.set_data_by_share(input_data) + dev_input_data = LiteTensor(layout=input_layout, device_type=LiteDeviceType.LITE_CUDA) + dev_input_data.copy_from(host_input_data) + + # construct LiteOption + options = LiteOptions() + options.weight_preprocess = 1 + options.var_sanity_check_first_run = 0 + net_config = LiteConfig(device_type=LiteDeviceType.LITE_CUDA, option=options) + + # constuct LiteIO, is_host=False means the input tensor will use device memory + input_io = LiteIO("data", is_host=False) + ios = LiteNetworkIO() + ios.add_input(input_io) + + network = LiteNetwork(config=net_config, io=ios) + network.load(model_path) + + input_name = network.get_input_name(0) + dev_input_tensor = network.get_io_tensor(input_name) + output_name = network.get_output_name(0) + output_tensor = network.get_io_tensor(output_name) + + # copy input data to input_tensor of the network + dev_input_tensor.share_memory_with(dev_input_data) + for i in range(3): + network.forward() + network.wait() + + output_data = output_tensor.to_numpy() + print('shufflenet output max={}, sum={}'.format(output_data.max(), output_data.sum())) +``` +更多的使用可以参考pylite中test/test_network.py和test/test_network_cuda.py中的使用 diff --git a/lite/pylite/requires.txt b/lite/pylite/requires.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e7287299d95cb849599afdea389994405451369 --- /dev/null +++ b/lite/pylite/requires.txt @@ -0,0 +1 @@ +numpy>=1.18 diff --git a/lite/pylite/scripts/format.sh b/lite/pylite/scripts/format.sh new file mode 100755 index 0000000000000000000000000000000000000000..3b93c50e2b15725706190eb0bca07842367cb0b5 --- /dev/null +++ b/lite/pylite/scripts/format.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +set -e +cd $(dirname $0)/.. + +ISORT_ARG="" +BLACK_ARG="" + +while getopts 'd' OPT; do + case $OPT in + d) + ISORT_ARG="--diff --check-only" + BLACK_ARG="--diff --check" + ;; + ?) 
+More usage can be found in test/test_network.py and test/test_network_cuda.py in pylite.
diff --git a/lite/pylite/requires.txt b/lite/pylite/requires.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0e7287299d95cb849599afdea389994405451369
--- /dev/null
+++ b/lite/pylite/requires.txt
@@ -0,0 +1 @@
+numpy>=1.18
diff --git a/lite/pylite/scripts/format.sh b/lite/pylite/scripts/format.sh
new file mode 100755
index 0000000000000000000000000000000000000000..3b93c50e2b15725706190eb0bca07842367cb0b5
--- /dev/null
+++ b/lite/pylite/scripts/format.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+set -e
+cd $(dirname $0)/..
+
+ISORT_ARG=""
+BLACK_ARG=""
+
+while getopts 'd' OPT; do
+  case $OPT in
+    d)
+      ISORT_ARG="--diff --check-only"
+      BLACK_ARG="--diff --check"
+      ;;
+    ?)
+      echo "Usage: `basename $0` [-d]"
+  esac
+done
+
+isort $ISORT_ARG -j $(nproc) -rc megenginelite test
+black $BLACK_ARG --target-version=py35 -- megenginelite test
diff --git a/lite/pylite/setup.py b/lite/pylite/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..436c81dadc4a221f630eb029e15a9cab7682fd8f
--- /dev/null
+++ b/lite/pylite/setup.py
@@ -0,0 +1,127 @@
+# -*- coding: utf-8 -*-
+# This file is part of MegEngine, a deep learning framework developed by
+# Megvii.
+#
+# Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
+
+import os
+import re
+import pathlib
+import platform
+from distutils.file_util import copy_file
+from setuptools import setup, find_packages, Extension
+from setuptools.command.build_ext import build_ext as _build_ext
+
+class PrecompiledExtension(Extension):
+    def __init__(self, name):
+        super().__init__(name, sources=[])
+
+class build_ext(_build_ext):
+
+    def build_extension(self, ext):
+        if not isinstance(ext, PrecompiledExtension):
+            return super().build_extension(ext)
+
+        if not self.inplace:
+            fullpath = self.get_ext_fullpath(ext.name)
+            extdir = pathlib.Path(fullpath)
+            extdir.parent.mkdir(parents=True, exist_ok=True)
+
+            modpath = self.get_ext_fullname(ext.name).split('.')
+            if platform.system() == 'Windows':
+                modpath[-1] += '.dll'
+            elif platform.system() == 'Darwin':
+                modpath[-1] += '.dylib'
+            else:
+                modpath[-1] += '.so'
+            modpath = str(pathlib.Path(*modpath).resolve())
+
+            copy_file(modpath, fullpath, verbose=self.verbose, dry_run=self.dry_run)
+
+v = {}
+with open("megenginelite/version.py") as fp:
+    exec(fp.read(), v)
+__version__ = v['__version__']
+
+email = 'megengine@megvii.com'
+# https://www.python.org/dev/peps/pep-0440
+# Public version identifiers: [N!]N(.N)*[{a|b|rc}N][.postN][.devN]
+# Local version identifiers: <public version identifier>[+<local version label>]
+# PUBLIC_VERSION_POSTFIX is used to handle rc or dev info
+public_version_postfix = os.environ.get('PUBLIC_VERSION_POSTFIX')
+if public_version_postfix:
+    __version__ = '{}{}'.format(__version__, public_version_postfix)
+
+local_version = []
+strip_sdk_info = os.environ.get('STRIP_SDK_INFO', 'False').lower()
+sdk_name = os.environ.get('SDK_NAME', 'cpu')
+if 'true' == strip_sdk_info:
+    print('wheel version strip sdk info')
+else:
+    local_version.append(sdk_name)
+local_postfix = os.environ.get('LOCAL_VERSION')
+if local_postfix:
+    local_version.append(local_postfix)
+if len(local_version):
+    __version__ = '{}+{}'.format(__version__, '.'.join(local_version))
+
+packages = find_packages()
+megenginelite_data = [
+    str(f.relative_to('megenginelite'))
+    for f in pathlib.Path('megenginelite').glob('**/*')
+]
+
+if platform.system() == 'Windows':
+    megenginelite_data.remove('libs\\liblite_shared.dll')
+elif platform.system() == 'Darwin':
+    megenginelite_data.remove('libs/liblite_shared.dylib')
+else:
+    megenginelite_data.remove('libs/liblite_shared.so')
+
+with open('requires.txt') as f:
+    requires = f.read().splitlines()
+
+prebuild_modules = [PrecompiledExtension('megenginelite.libs.liblite_shared')]
+setup_kwargs = dict(
+    name='megenginelite',
+    version=__version__,
+    description='Inference Framework for MegEngine',
+    author='Megvii Engine Team',
+    author_email=email,
+    packages=packages,
+    package_data={
+        'megenginelite': megenginelite_data,
+    },
+    ext_modules=prebuild_modules,
+    install_requires=requires,
+    cmdclass={'build_ext': build_ext},
+)
+setup_kwargs.update(dict(
+    classifiers=[
+        'Development Status :: 3 - Alpha',
+        'Intended Audience :: Developers',
+        'Intended Audience :: 
Education', + 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: Apache Software License', + 'Programming Language :: C++', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + 'Topic :: Scientific/Engineering', + 'Topic :: Scientific/Engineering :: Mathematics', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + 'Topic :: Software Development', + 'Topic :: Software Development :: Libraries', + 'Topic :: Software Development :: Libraries :: Python Modules', + ], + license='Apache 2.0', + keywords='megengine deep learning', + data_files = [("megengine", [ + "../LICENSE", + "../ACKNOWLEDGMENTS", + ])] +)) + +setup(**setup_kwargs) diff --git a/lite/pylite/test/test_global.py b/lite/pylite/test/test_global.py new file mode 100644 index 0000000000000000000000000000000000000000..0cd4c85a08c6b768e23e08fd3f2236abfb8daa17 --- /dev/null +++ b/lite/pylite/test/test_global.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + +import os +import unittest + +import numpy as np + +from megenginelite import * + +set_log_level(2) + + +class TestShuffleNet(unittest.TestCase): + source_dir = os.getenv("LITE_TEST_RESOUCE") + input_data_path = os.path.join(source_dir, "input_data.npy") + correct_data_path = os.path.join(source_dir, "output_data.npy") + correct_data = np.load(correct_data_path).flatten() + input_data = np.load(input_data_path) + + def check_correct(self, out_data, error=1e-4): + out_data = out_data.flatten() + assert np.isfinite(out_data.sum()) + assert self.correct_data.size == out_data.size + for i in range(out_data.size): + assert abs(out_data[i] - self.correct_data[i]) < error + + def do_forward(self, network, times=3): + input_name = network.get_input_name(0) + input_tensor = network.get_io_tensor(input_name) + output_name = network.get_output_name(0) + output_tensor = network.get_io_tensor(output_name) + + input_tensor.set_data_by_copy(self.input_data) + for i in range(times): + network.forward() + network.wait() + + output_data = output_tensor.to_numpy() + self.check_correct(output_data) + + +class TestGlobal(TestShuffleNet): + def test_device_count(self): + LiteGlobal.try_coalesce_all_free_memory() + count = LiteGlobal.get_device_count(LiteDeviceType.LITE_CPU) + assert count > 0 + + def test_register_decryption_method(self): + @decryption_func + def function(in_arr, key_arr, out_arr): + if not out_arr: + return in_arr.size + else: + for i in range(in_arr.size): + out_arr[i] = in_arr[i] ^ key_arr[0] ^ key_arr[0] + return out_arr.size + + LiteGlobal.register_decryption_and_key("just_for_test", function, [15]) + config = LiteConfig() + config.bare_model_cryption_name = "just_for_test".encode("utf-8") + + network = LiteNetwork() + model_path = os.path.join(self.source_dir, "shufflenet.mge") + network.load(model_path) + + self.do_forward(network) + + def test_update_decryption_key(self): + wrong_key = [0] * 32 + LiteGlobal.update_decryption_key("AES_default", wrong_key) + + with self.assertRaises(RuntimeError): + config = LiteConfig() + config.bare_model_cryption_name = "AES_default".encode("utf-8") + network = LiteNetwork(config) + model_path = os.path.join(self.source_dir, "shufflenet_crypt_aes.mge") + network.load(model_path) + + right_key = 
[i for i in range(32)] + LiteGlobal.update_decryption_key("AES_default", right_key) + + config = LiteConfig() + config.bare_model_cryption_name = "AES_default".encode("utf-8") + network = LiteNetwork(config) + model_path = os.path.join(self.source_dir, "shufflenet_crypt_aes.mge") + network.load(model_path) + + self.do_forward(network) diff --git a/lite/pylite/test/test_network.py b/lite/pylite/test/test_network.py new file mode 100644 index 0000000000000000000000000000000000000000..7e3a20283cb92bae56825a4ca0ae01fcd0355bdd --- /dev/null +++ b/lite/pylite/test/test_network.py @@ -0,0 +1,405 @@ +# -*- coding: utf-8 -*- +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + +import os +import unittest + +import numpy as np + +from megenginelite import * + +set_log_level(2) + + +def test_version(): + print("Lite verson: {}".format(version)) + + +def test_network_io(): + input_io1 = LiteIO("data1", is_host=False, io_type=LiteIOType.LITE_IO_VALUE) + input_io2 = LiteIO( + "data2", + is_host=True, + io_type=LiteIOType.LITE_IO_SHAPE, + layout=LiteLayout([2, 4, 4]), + ) + io = LiteNetworkIO() + io.add_input(input_io1) + io.add_input(input_io2) + + output_io1 = LiteIO("out1", is_host=False) + output_io2 = LiteIO("out2", is_host=True, layout=LiteLayout([1, 1000])) + + io.add_output(output_io1) + io.add_output(output_io2) + + assert len(io.inputs) == 2 + assert len(io.outputs) == 2 + + assert io.inputs[0] == input_io1 + assert io.outputs[0] == output_io1 + + c_io = io._create_network_io() + + assert c_io.input_size == 2 + assert c_io.output_size == 2 + + +class TestShuffleNet(unittest.TestCase): + source_dir = os.getenv("LITE_TEST_RESOUCE") + input_data_path = os.path.join(source_dir, "input_data.npy") + correct_data_path = os.path.join(source_dir, "output_data.npy") + model_path = os.path.join(source_dir, "shufflenet.mge") + correct_data = np.load(correct_data_path).flatten() + input_data = np.load(input_data_path) + + def check_correct(self, out_data, error=1e-4): + out_data = out_data.flatten() + assert np.isfinite(out_data.sum()) + assert self.correct_data.size == out_data.size + for i in range(out_data.size): + assert abs(out_data[i] - self.correct_data[i]) < error + + def do_forward(self, network, times=3): + input_name = network.get_input_name(0) + input_tensor = network.get_io_tensor(input_name) + output_name = network.get_output_name(0) + output_tensor = network.get_io_tensor(output_name) + + input_tensor.set_data_by_copy(self.input_data) + for i in range(times): + network.forward() + network.wait() + + output_data = output_tensor.to_numpy() + self.check_correct(output_data) + + +class TestNetwork(TestShuffleNet): + def test_decryption(self): + model_path = os.path.join(self.source_dir, "shufflenet_crypt_aes.mge") + config = LiteConfig() + config.bare_model_cryption_name = "AES_default".encode("utf-8") + network = LiteNetwork(config) + network.load(model_path) + self.do_forward(network) + + def test_pack_model(self): + model_path = os.path.join(self.source_dir, "test_packed_model_rc4.lite") + network = LiteNetwork() + network.load(model_path) + self.do_forward(network) + + def test_network_basic(self): + network = LiteNetwork() + network.load(self.model_path) + + input_name = network.get_input_name(0) + input_tensor = network.get_io_tensor(input_name) + output_name = network.get_output_name(0) + output_tensor = network.get_io_tensor(output_name) + + assert input_tensor.layout.shapes[0] 
== 1 + assert input_tensor.layout.shapes[1] == 3 + assert input_tensor.layout.shapes[2] == 224 + assert input_tensor.layout.shapes[3] == 224 + assert input_tensor.layout.data_type == LiteDataType.LITE_FLOAT + assert input_tensor.layout.ndim == 4 + + self.do_forward(network) + + def test_network_shared_data(self): + network = LiteNetwork() + network.load(self.model_path) + + input_name = network.get_input_name(0) + input_tensor = network.get_io_tensor(input_name) + output_name = network.get_output_name(0) + output_tensor = network.get_io_tensor(output_name) + + input_tensor.set_data_by_share(self.input_data) + for i in range(3): + network.forward() + network.wait() + + output_data = output_tensor.to_numpy() + self.check_correct(output_data) + + def test_network_get_name(self): + network = LiteNetwork() + network.load(self.model_path) + + input_names = network.get_all_input_name() + assert input_names[0] == "data" + output_names = network.get_all_output_name() + assert output_names[0] == network.get_output_name(0) + + self.do_forward(network) + + def test_network_set_device_id(self): + network = LiteNetwork() + assert network.device_id == 0 + + network.device_id = 1 + network.load(self.model_path) + assert network.device_id == 1 + + with self.assertRaises(RuntimeError): + network.device_id = 1 + + self.do_forward(network) + + def test_network_set_stream_id(self): + network = LiteNetwork() + assert network.stream_id == 0 + + network.stream_id = 1 + network.load(self.model_path) + assert network.stream_id == 1 + + with self.assertRaises(RuntimeError): + network.stream_id = 1 + + self.do_forward(network) + + def test_network_set_thread_number(self): + network = LiteNetwork() + assert network.threads_number == 1 + + network.threads_number = 2 + network.load(self.model_path) + assert network.threads_number == 2 + + with self.assertRaises(RuntimeError): + network.threads_number = 2 + + self.do_forward(network) + + def test_network_cpu_inplace(self): + network = LiteNetwork() + assert network.is_cpu_inplace_mode() == False + + network.enable_cpu_inplace_mode() + network.load(self.model_path) + assert network.is_cpu_inplace_mode() == True + + with self.assertRaises(RuntimeError): + network.enable_cpu_inplace_mode() + + self.do_forward(network) + + def test_network_option(self): + option = LiteOptions() + option.weight_preprocess = 1 + option.var_sanity_check_first_run = 0 + + config = LiteConfig(option=option) + network = LiteNetwork(config=config) + network.load(self.model_path) + + self.do_forward(network) + + def test_network_reset_io(self): + option = LiteOptions() + option.var_sanity_check_first_run = 0 + config = LiteConfig(option=option) + + input_io = LiteIO("data") + ios = LiteNetworkIO() + ios.add_input(input_io) + network = LiteNetwork(config=config, io=ios) + network.load(self.model_path) + + input_tensor = network.get_io_tensor("data") + assert input_tensor.device_type == LiteDeviceType.LITE_CPU + + self.do_forward(network) + + def test_network_by_share(self): + network = LiteNetwork() + network.load(self.model_path) + + input_name = network.get_input_name(0) + input_tensor = network.get_io_tensor(input_name) + output_name = network.get_output_name(0) + output_tensor = network.get_io_tensor(output_name) + + assert input_tensor.device_type == LiteDeviceType.LITE_CPU + layout = LiteLayout(self.input_data.shape, self.input_data.dtype) + tensor_tmp = LiteTensor(layout=layout) + tensor_tmp.set_data_by_share(self.input_data) + input_tensor.share_memory_with(tensor_tmp) + + for i in range(3): 
+ network.forward() + network.wait() + + output_data = output_tensor.to_numpy() + self.check_correct(output_data) + + def test_network_share_weights(self): + option = LiteOptions() + option.var_sanity_check_first_run = 0 + config = LiteConfig(option=option) + + src_network = LiteNetwork(config=config) + src_network.load(self.model_path) + + new_network = LiteNetwork() + new_network.enable_cpu_inplace_mode() + new_network.share_weights_with(src_network) + + self.do_forward(src_network) + self.do_forward(new_network) + + def test_network_share_runtime_memory(self): + option = LiteOptions() + option.var_sanity_check_first_run = 0 + config = LiteConfig(option=option) + + src_network = LiteNetwork(config=config) + src_network.load(self.model_path) + + new_network = LiteNetwork() + new_network.enable_cpu_inplace_mode() + new_network.share_runtime_memroy(src_network) + new_network.load(self.model_path) + + self.do_forward(src_network) + self.do_forward(new_network) + + # def test_network_async(self): + # count = 0 + # finished = False + # + # def async_callback(): + # nonlocal finished + # finished = True + # return 0 + # + # option = LiteOptions() + # option.var_sanity_check_first_run = 0 + # config = LiteConfig(option=option) + # + # network = LiteNetwork(config=config) + # network.load(self.model_path) + # + # network.async_with_callback(async_callback) + # + # input_tensor = network.get_io_tensor(network.get_input_name(0)) + # output_tensor = network.get_io_tensor(network.get_output_name(0)) + # + # input_tensor.set_data_by_share(self.input_data) + # network.forward() + # + # while not finished: + # count += 1 + # + # assert count > 0 + # output_data = output_tensor.to_numpy() + # self.check_correct(output_data) + # + # def test_network_start_callback(self): + # network = LiteNetwork() + # network.load(self.model_path) + # start_checked = False + # + # @start_finish_callback + # def start_callback(ios): + # nonlocal start_checked + # start_checked = True + # assert len(ios) == 1 + # for key in ios: + # io = key + # data = ios[key].to_numpy().flatten() + # input_data = self.input_data.flatten() + # assert data.size == input_data.size + # assert io.name.decode("utf-8") == "data" + # for i in range(data.size): + # assert data[i] == input_data[i] + # return 0 + # + # network.set_start_callback(start_callback) + # self.do_forward(network, 1) + # assert start_checked == True + # + # def test_network_finish_callback(self): + # network = LiteNetwork() + # network.load(self.model_path) + # finish_checked = False + # + # @start_finish_callback + # def finish_callback(ios): + # nonlocal finish_checked + # finish_checked = True + # assert len(ios) == 1 + # for key in ios: + # io = key + # data = ios[key].to_numpy().flatten() + # output_data = self.correct_data.flatten() + # assert data.size == output_data.size + # for i in range(data.size): + # assert data[i] == output_data[i] + # return 0 + # + # network.set_finish_callback(finish_callback) + # self.do_forward(network, 1) + # assert finish_checked == True + + def test_enable_profile(self): + network = LiteNetwork() + network.load(self.model_path) + network.enable_profile_performance("./profile.json") + + self.do_forward(network) + + fi = open("./profile.json", "r") + fi.close() + os.remove("./profile.json") + + def test_io_txt_dump(self): + network = LiteNetwork() + network.load(self.model_path) + network.io_txt_dump("./io_txt.txt") + self.do_forward(network) + + def test_io_bin_dump(self): + import shutil + + folder = "./out" + network = LiteNetwork() 
+ network.load(self.model_path) + if not os.path.exists(folder): + os.mkdir(folder) + network.io_bin_dump(folder) + self.do_forward(network) + shutil.rmtree(folder) + + def test_algo_workspace_limit(self): + network = LiteNetwork() + network.load(self.model_path) + print("modify the workspace limit.") + network.set_network_algo_workspace_limit(10000) + self.do_forward(network) + + def test_network_algo_policy(self): + network = LiteNetwork() + network.load(self.model_path) + network.set_network_algo_policy( + LiteAlgoSelectStrategy.LITE_ALGO_PROFILE + | LiteAlgoSelectStrategy.LITE_ALGO_REPRODUCIBLE + ) + self.do_forward(network) + + def test_network_algo_policy_ignore_batch(self): + network = LiteNetwork() + network.load(self.model_path) + network.set_network_algo_policy( + LiteAlgoSelectStrategy.LITE_ALGO_PROFILE, + shared_batch_size=1, + binary_equal_between_batch=True, + ) + self.do_forward(network) diff --git a/lite/pylite/test/test_network_cuda.py b/lite/pylite/test/test_network_cuda.py new file mode 100644 index 0000000000000000000000000000000000000000..f7fe9e10129fb835bc03ab91cc395ff69ab41e9e --- /dev/null +++ b/lite/pylite/test/test_network_cuda.py @@ -0,0 +1,220 @@ +# -*- coding: utf-8 -*- +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + +import functools +import os +import unittest + +import numpy as np + +from megenginelite import * + +set_log_level(2) + + +def require_cuda(ngpu=1): + """a decorator that disables a testcase if cuda is not enabled""" + + def dector(func): + @functools.wraps(func) + def wrapped(*args, **kwargs): + if LiteGlobal.get_device_count(LiteDeviceType.LITE_CUDA) >= ngpu: + return func(*args, **kwargs) + + return wrapped + + return dector + + +class TestShuffleNetCuda(unittest.TestCase): + source_dir = os.getenv("LITE_TEST_RESOUCE") + input_data_path = os.path.join(source_dir, "input_data.npy") + correct_data_path = os.path.join(source_dir, "output_data.npy") + model_path = os.path.join(source_dir, "shufflenet.mge") + correct_data = np.load(correct_data_path).flatten() + input_data = np.load(input_data_path) + + def check_correct(self, out_data, error=1e-4): + out_data = out_data.flatten() + assert np.isfinite(out_data.sum()) + assert self.correct_data.size == out_data.size + for i in range(out_data.size): + assert abs(out_data[i] - self.correct_data[i]) < error + + def do_forward(self, network, times=3): + input_name = network.get_input_name(0) + input_tensor = network.get_io_tensor(input_name) + output_name = network.get_output_name(0) + output_tensor = network.get_io_tensor(output_name) + + input_tensor.set_data_by_copy(self.input_data) + for i in range(times): + network.forward() + network.wait() + + output_data = output_tensor.to_numpy() + self.check_correct(output_data) + + +class TestNetwork(TestShuffleNetCuda): + @require_cuda() + def test_network_basic(self): + config = LiteConfig() + config.device_type = LiteDeviceType.LITE_CUDA + network = LiteNetwork(config) + network.load(self.model_path) + + input_name = network.get_input_name(0) + input_tensor = network.get_io_tensor(input_name) + output_name = network.get_output_name(0) + output_tensor = network.get_io_tensor(output_name) + + assert input_tensor.layout.shapes[0] == 1 + assert input_tensor.layout.shapes[1] == 3 + assert input_tensor.layout.shapes[2] == 224 + assert input_tensor.layout.shapes[3] == 224 + assert input_tensor.layout.data_type == LiteDataType.LITE_FLOAT + assert 
input_tensor.layout.ndim == 4 + + self.do_forward(network) + + @require_cuda() + def test_network_shared_data(self): + config = LiteConfig() + config.device_type = LiteDeviceType.LITE_CUDA + network = LiteNetwork(config) + network.load(self.model_path) + + input_name = network.get_input_name(0) + input_tensor = network.get_io_tensor(input_name) + output_name = network.get_output_name(0) + output_tensor = network.get_io_tensor(output_name) + + input_tensor.set_data_by_share(self.input_data) + for i in range(3): + network.forward() + network.wait() + + output_data = output_tensor.to_numpy() + self.check_correct(output_data) + + @require_cuda(2) + def test_network_set_device_id(self): + config = LiteConfig() + config.device_type = LiteDeviceType.LITE_CUDA + network = LiteNetwork(config) + assert network.device_id == 0 + + network.device_id = 1 + network.load(self.model_path) + assert network.device_id == 1 + + with self.assertRaises(RuntimeError): + network.device_id = 1 + + self.do_forward(network) + + @require_cuda() + def test_network_option(self): + option = LiteOptions() + option.weight_preprocess = 1 + option.var_sanity_check_first_run = 0 + + config = LiteConfig(option=option) + config.device_type = LiteDeviceType.LITE_CUDA + network = LiteNetwork(config=config) + network.load(self.model_path) + + self.do_forward(network) + + @require_cuda() + def test_network_reset_io(self): + option = LiteOptions() + option.var_sanity_check_first_run = 0 + config = LiteConfig(option=option) + + config.device_type = LiteDeviceType.LITE_CUDA + input_io = LiteIO("data") + ios = LiteNetworkIO() + ios.add_input(input_io) + network = LiteNetwork(config=config, io=ios) + network.load(self.model_path) + + input_tensor = network.get_io_tensor("data") + assert input_tensor.device_type == LiteDeviceType.LITE_CPU + + self.do_forward(network) + + @require_cuda() + def test_network_share_weights(self): + option = LiteOptions() + option.var_sanity_check_first_run = 0 + config = LiteConfig(option=option) + config.device_type = LiteDeviceType.LITE_CUDA + + src_network = LiteNetwork(config=config) + src_network.load(self.model_path) + + new_network = LiteNetwork() + new_network.enable_cpu_inplace_mode() + new_network.share_weights_with(src_network) + + self.do_forward(src_network) + self.do_forward(new_network) + + @require_cuda() + def test_network_share_runtime_memory(self): + option = LiteOptions() + option.var_sanity_check_first_run = 0 + config = LiteConfig(option=option) + config.device_type = LiteDeviceType.LITE_CUDA + + src_network = LiteNetwork(config=config) + src_network.load(self.model_path) + + new_network = LiteNetwork() + new_network.enable_cpu_inplace_mode() + new_network.share_runtime_memroy(src_network) + new_network.load(self.model_path) + + self.do_forward(src_network) + self.do_forward(new_network) + + @require_cuda() + def test_enable_profile(self): + config = LiteConfig() + config.device_type = LiteDeviceType.LITE_CUDA + network = LiteNetwork(config) + network.load(self.model_path) + network.enable_profile_performance("./profile.json") + + self.do_forward(network) + + fi = open("./profile.json", "r") + fi.close() + os.remove("./profile.json") + + @require_cuda() + def test_algo_workspace_limit(self): + config = LiteConfig() + config.device_type = LiteDeviceType.LITE_CUDA + network = LiteNetwork(config) + network.load(self.model_path) + print("modify the workspace limit.") + network.set_network_algo_workspace_limit(10000) + self.do_forward(network) + + @require_cuda() + def 
test_network_algo_policy(self): + config = LiteConfig() + config.device_type = LiteDeviceType.LITE_CUDA + network = LiteNetwork(config) + network.load(self.model_path) + network.set_network_algo_policy( + LiteAlgoSelectStrategy.LITE_ALGO_PROFILE + | LiteAlgoSelectStrategy.LITE_ALGO_REPRODUCIBLE + ) + self.do_forward(network) diff --git a/lite/pylite/test/test_tensor.py b/lite/pylite/test/test_tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..61d6575b074004a7fbff8e53b43715d1e5dc1b49 --- /dev/null +++ b/lite/pylite/test/test_tensor.py @@ -0,0 +1,291 @@ +# -*- coding: utf-8 -*- +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + +import functools + +import numpy as np + +from megenginelite import * + + +def require_cuda(func): + """a decorator that disables a testcase if cuda is not enabled""" + + @functools.wraps(func) + def wrapped(*args, **kwargs): + if LiteGlobal.get_device_count(LiteDeviceType.LITE_CUDA): + return func(*args, **kwargs) + + return wrapped + + +def test_tensor_make(): + empty_layout = LiteLayout() + assert empty_layout.ndim == 0 + assert empty_layout.data_type == int(LiteDataType.LITE_FLOAT) + + empty_tensor = LiteTensor() + assert empty_tensor.layout.ndim == empty_layout.ndim + assert empty_tensor.layout.data_type == empty_layout.data_type + + layout = LiteLayout([4, 16]) + layout = LiteLayout(dtype="float32") + layout = LiteLayout([4, 16], "float32") + layout = LiteLayout([4, 16], "float16") + layout = LiteLayout([4, 16], np.float32) + layout = LiteLayout([4, 16], np.int8) + layout = LiteLayout([4, 16], LiteDataType.LITE_FLOAT) + + tensor = LiteTensor(layout) + tensor = LiteTensor(layout, LiteDeviceType.LITE_CPU) + assert tensor.layout == layout + assert tensor.device_type == LiteDeviceType.LITE_CPU + assert tensor.is_continue == True + assert tensor.is_pinned_host == False + assert tensor.nbytes == 4 * 16 * 4 + assert tensor.device_id == 0 + + tensor = LiteTensor(layout, device_id=1) + assert tensor.device_id == 1 + + +def test_tensor_set_data(): + layout = LiteLayout([2, 16], "int8") + tensor = LiteTensor(layout) + assert tensor.nbytes == 2 * 16 + + data = [i for i in range(32)] + tensor.set_data_by_copy(data) + real_data = tensor.to_numpy() + for i in range(32): + assert real_data[i // 16][i % 16] == i + + arr = np.ones([2, 16], "int8") + tensor.set_data_by_copy(arr) + real_data = tensor.to_numpy() + for i in range(32): + assert real_data[i // 16][i % 16] == 1 + + for i in range(32): + arr[i // 16][i % 16] = i + tensor.set_data_by_share(arr) + real_data = tensor.to_numpy() + for i in range(32): + assert real_data[i // 16][i % 16] == i + + arr[0][8] = 100 + arr[1][3] = 20 + real_data = tensor.to_numpy() + assert real_data[0][8] == 100 + assert real_data[1][3] == 20 + + +def test_fill_zero(): + layout = LiteLayout([4, 8], "int16") + tensor1 = LiteTensor(layout) + assert tensor1.nbytes == 4 * 8 * 2 + + tensor1.set_data_by_copy([i for i in range(32)]) + real_data = tensor1.to_numpy() + for i in range(32): + assert real_data[i // 8][i % 8] == i + + tensor1.fill_zero() + real_data = tensor1.to_numpy() + for i in range(32): + assert real_data[i // 8][i % 8] == 0 + + +def test_copy_from(): + layout = LiteLayout([4, 8], "int16") + tensor1 = LiteTensor(layout) + tensor2 = LiteTensor(layout) + assert tensor1.nbytes == 4 * 8 * 2 + assert tensor2.nbytes == 4 * 8 * 2 + + tensor1.set_data_by_copy([i for i in range(32)]) + 
tensor2.copy_from(tensor1) + real_data = tensor2.to_numpy() + for i in range(32): + assert real_data[i // 8][i % 8] == i + + tensor1.set_data_by_copy([i + 5 for i in range(32)]) + tensor2.copy_from(tensor1) + real_data = tensor2.to_numpy() + for i in range(32): + assert real_data[i // 8][i % 8] == i + 5 + + +def test_reshape(): + layout = LiteLayout([4, 8], "int16") + tensor1 = LiteTensor(layout) + assert tensor1.nbytes == 4 * 8 * 2 + + tensor1.set_data_by_copy([i for i in range(32)]) + real_data = tensor1.to_numpy() + for i in range(32): + assert real_data[i // 8][i % 8] == i + + tensor1.reshape([8, 4]) + real_data = tensor1.to_numpy() + for i in range(32): + assert real_data[i // 4][i % 4] == i + + +def test_slice(): + layout = LiteLayout([4, 8], "int32") + tensor1 = LiteTensor(layout) + assert tensor1.nbytes == 4 * 8 * 4 + + tensor1.set_data_by_copy([i for i in range(32)]) + real_data_org = tensor1.to_numpy() + for i in range(32): + assert real_data_org[i // 8][i % 8] == i + + tensor2 = tensor1.slice([1, 4], [3, 8]) + assert tensor2.layout.shapes[0] == 2 + assert tensor2.layout.shapes[1] == 4 + assert tensor2.is_continue == False + + real_data = tensor2.to_numpy() + for i in range(8): + row = i // 4 + col = i % 4 + assert real_data[row][col] == real_data_org[row + 1][col + 4] + + +def test_tensor_share_memory(): + layout = LiteLayout([4, 8], "int16") + tensor1 = LiteTensor(layout) + tensor2 = LiteTensor(layout) + assert tensor1.nbytes == 4 * 8 * 2 + assert tensor2.nbytes == 4 * 8 * 2 + + tensor1.set_data_by_copy([i for i in range(32)]) + tensor2.share_memory_with(tensor1) + real_data = tensor2.to_numpy() + for i in range(32): + assert real_data[i // 8][i % 8] == i + + tensor1.set_data_by_copy([i + 5 for i in range(32)]) + real_data = tensor2.to_numpy() + for i in range(32): + assert real_data[i // 8][i % 8] == i + 5 + + +def test_tensor_share_ctype_memory(): + layout = LiteLayout([4, 8], "int16") + tensor1 = LiteTensor(layout) + assert tensor1.nbytes == 4 * 8 * 2 + + arr = np.ones([4, 8], "int16") + for i in range(32): + arr[i // 8][i % 8] = i + tensor1.set_data_by_share(arr.ctypes.data, 4 * 8 * 2) + real_data = tensor1.to_numpy() + for i in range(32): + assert real_data[i // 8][i % 8] == i + + +@require_cuda +def test_tensor_share_ctype_memory_device(): + layout = LiteLayout([4, 8], "int16") + tensor_cpu = LiteTensor( + layout=layout, device_type=LiteDeviceType.LITE_CUDA, is_pinned_host=True + ) + tensor_cuda1 = LiteTensor(layout=layout, device_type=LiteDeviceType.LITE_CUDA) + tensor_cuda2 = LiteTensor(layout=layout, device_type=LiteDeviceType.LITE_CUDA) + assert tensor_cpu.nbytes == 4 * 8 * 2 + assert tensor_cuda1.nbytes == 4 * 8 * 2 + assert tensor_cuda2.nbytes == 4 * 8 * 2 + + arr = np.ones([4, 8], "int16") + for i in range(32): + arr[i // 8][i % 8] = i + tensor_cpu.set_data_by_share(arr.ctypes.data, 4 * 8 * 2) + tensor_cuda1.copy_from(tensor_cpu) + device_mem = tensor_cuda1.get_ctypes_memory() + tensor_cuda2.set_data_by_share(device_mem, tensor_cuda1.nbytes) + real_data1 = tensor_cuda1.to_numpy() + real_data2 = tensor_cuda2.to_numpy() + for i in range(32): + assert real_data1[i // 8][i % 8] == i + assert real_data2[i // 8][i % 8] == i + + +def test_tensor_share_memory_with(): + layout = LiteLayout([4, 32], "int16") + tensor = LiteTensor(layout) + assert tensor.nbytes == 4 * 32 * 2 + + arr = np.ones([4, 32], "int16") + for i in range(128): + arr[i // 32][i % 32] = i + tensor.set_data_by_share(arr) + real_data = tensor.to_numpy() + for i in range(128): + assert real_data[i // 32][i % 
32] == i + + tensor2 = LiteTensor(layout) + tensor2.share_memory_with(tensor) + real_data = tensor.to_numpy() + real_data2 = tensor2.to_numpy() + for i in range(128): + assert real_data[i // 32][i % 32] == i + assert real_data2[i // 32][i % 32] == i + + arr[1][18] = 5 + arr[3][7] = 345 + real_data = tensor2.to_numpy() + assert real_data[1][18] == 5 + assert real_data[3][7] == 345 + + +def test_empty_tensor(): + empty_tensor = LiteTensor() + assert empty_tensor.layout.ndim == 0 + assert empty_tensor.layout.data_type == int(LiteDataType.LITE_FLOAT) + # check empty tensor to numpy + data = empty_tensor.to_numpy() + + +def test_tensor_by_set_copy_with_new_layout(): + layout = LiteLayout([4, 32], "int16") + tensor = LiteTensor(layout) + assert tensor.nbytes == 4 * 32 * 2 + + arr = np.ones([8, 64], "int32") + tensor.set_data_by_copy(arr) + new_layout = tensor.layout + assert new_layout.ndim == 2 + assert new_layout.shapes[0] == 8 + assert new_layout.shapes[1] == 64 + + tensor = LiteTensor(layout) + tensor.set_data_by_share(arr) + new_layout = tensor.layout + assert new_layout.ndim == 2 + assert new_layout.shapes[0] == 8 + assert new_layout.shapes[1] == 64 + + +def test_tensor_concat(): + layout = LiteLayout([4, 32], "int16") + tensors = [] + arr = np.ones([4, 32], "int16") + for j in range(4): + for i in range(128): + arr[i // 32][i % 32] = j + tensor = LiteTensor(layout) + tensor.set_data_by_copy(arr) + tensors.append(tensor) + new_tensor = LiteTensorConcat(tensors, 0) + + real_data = new_tensor.to_numpy() + for j in range(4): + for i in range(128): + index = j * 128 + i + assert real_data[index // 32][index % 32] == j diff --git a/lite/pylite/test/test_utils.py b/lite/pylite/test/test_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7435825207c5ec723c7179b9b26f095bd215fe7b --- /dev/null +++ b/lite/pylite/test/test_utils.py @@ -0,0 +1,199 @@ +# -*- coding: utf-8 -*- +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
+ +import functools + +import numpy as np + +from megenginelite import * + + +def require_cuda(func): + """a decorator that disables a testcase if cuda is not enabled""" + + @functools.wraps(func) + def wrapped(*args, **kwargs): + if LiteGlobal.get_device_count(LiteDeviceType.LITE_CUDA): + return func(*args, **kwargs) + + return wrapped + + +@require_cuda +def test_tensor_collect_batch(): + batch_tensor = TensorBatchCollector( + [4, 8, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CUDA + ) + arr = np.ones([8, 8], "int32") + for i in range(4): + batch_tensor.collect(arr) + arr += 1 + data = batch_tensor.to_numpy() + assert data.shape[0] == 4 + assert data.shape[1] == 8 + assert data.shape[2] == 8 + for i in range(4): + for j in range(64): + assert data[i][j // 8][j % 8] == i + 1 + + +def test_tensor_collect_batch_cpu(): + batch_tensor = TensorBatchCollector( + [4, 8, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CPU + ) + arr = np.ones([8, 8], "int32") + for i in range(4): + batch_tensor.collect(arr) + arr += 1 + data = batch_tensor.to_numpy() + assert data.shape[0] == 4 + assert data.shape[1] == 8 + assert data.shape[2] == 8 + for i in range(4): + for j in range(64): + assert data[i][j // 8][j % 8] == i + 1 + + +@require_cuda +def test_tensor_collect_batch_by_index(): + batch_tensor = TensorBatchCollector( + [4, 8, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CUDA + ) + arr = np.ones([8, 8], "int32") + arr += 1 # ==2 + batch_tensor.collect_id(arr, 1) + arr -= 1 # ==1 + batch_tensor.collect_id(arr, 0) + arr += 2 # ==3 + batch_tensor.collect_id(arr, 2) + arr += 1 # ==4 + batch_tensor.collect_id(arr, 3) + + data = batch_tensor.to_numpy() + assert data.shape[0] == 4 + assert data.shape[1] == 8 + assert data.shape[2] == 8 + for i in range(4): + for j in range(64): + assert data[i][j // 8][j % 8] == i + 1 + + +@require_cuda +def test_tensor_collect_batch_tensor(): + batch_tensor = TensorBatchCollector( + [4, 6, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CUDA + ) + nparr = np.ones([6, 8], "int32") + tensor = LiteTensor(LiteLayout([6, 8], LiteDataType.LITE_INT)) + for i in range(4): + tensor.set_data_by_share(nparr) + batch_tensor.collect(tensor) + nparr += 1 + data = batch_tensor.to_numpy() + assert data.shape[0] == 4 + assert data.shape[1] == 6 + assert data.shape[2] == 8 + for i in range(4): + for j in range(48): + assert data[i][j // 8][j % 8] == i + 1 + + +def test_tensor_collect_batch_tensor_cpu(): + batch_tensor = TensorBatchCollector( + [4, 6, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CPU + ) + nparr = np.ones([6, 8], "int32") + tensor = LiteTensor(LiteLayout([6, 8], LiteDataType.LITE_INT)) + for i in range(4): + tensor.set_data_by_share(nparr) + batch_tensor.collect(tensor) + nparr += 1 + data = batch_tensor.to_numpy() + assert data.shape[0] == 4 + assert data.shape[1] == 6 + assert data.shape[2] == 8 + for i in range(4): + for j in range(48): + assert data[i][j // 8][j % 8] == i + 1 + + +@require_cuda +def test_tensor_collect_batch_ctypes(): + batch_tensor = TensorBatchCollector( + [4, 6, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CUDA + ) + nparr = np.ones([6, 8], "int32") + for i in range(4): + in_data = nparr.ctypes.data + batch_tensor.collect_by_ctypes(in_data, nparr.nbytes) + nparr += 1 + data = batch_tensor.to_numpy() + assert data.shape[0] == 4 + assert data.shape[1] == 6 + assert data.shape[2] == 8 + for i in range(4): + for j in range(48): + assert 
data[i][j // 8][j % 8] == i + 1 + + +def test_tensor_collect_batch_ctypes_cpu(): + batch_tensor = TensorBatchCollector( + [4, 6, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CPU + ) + nparr = np.ones([6, 8], "int32") + for i in range(4): + in_data = nparr.ctypes.data + batch_tensor.collect_by_ctypes(in_data, nparr.nbytes) + nparr += 1 + data = batch_tensor.to_numpy() + assert data.shape[0] == 4 + assert data.shape[1] == 6 + assert data.shape[2] == 8 + for i in range(4): + for j in range(48): + assert data[i][j // 8][j % 8] == i + 1 + + +@require_cuda +def test_tensor_collect_batch_device_tensor(): + all_tensor = LiteTensor( + LiteLayout([4, 6, 8], dtype=LiteDataType.LITE_INT), + device_type=LiteDeviceType.LITE_CUDA, + ) + batch_tensor = TensorBatchCollector([4, 6, 8], tensor=all_tensor) + nparr = np.ones([6, 8], "int32") + tensor = LiteTensor(LiteLayout([6, 8], LiteDataType.LITE_INT)) + for i in range(4): + tensor.set_data_by_share(nparr) + batch_tensor.collect(tensor) + nparr += 1 + data = batch_tensor.to_numpy() + assert data.shape[0] == 4 + assert data.shape[1] == 6 + assert data.shape[2] == 8 + for i in range(4): + for j in range(48): + assert data[i][j // 8][j % 8] == i + 1 + + +@require_cuda +def test_tensor_collect_batch_device_numpy(): + all_tensor = LiteTensor( + LiteLayout([4, 6, 8], dtype=LiteDataType.LITE_INT), + device_type=LiteDeviceType.LITE_CUDA, + ) + batch_tensor = TensorBatchCollector([4, 6, 8], tensor=all_tensor) + nparr = np.ones([6, 8], "int32") + for i in range(4): + batch_tensor.collect(nparr) + nparr += 1 + data = batch_tensor.to_numpy() + assert data.shape[0] == 4 + assert data.shape[1] == 6 + assert data.shape[2] == 8 + for i in range(4): + for j in range(48): + assert data[i][j // 8][j % 8] == i + 1 diff --git a/lite/src/decryption/aes_decrypt.h b/lite/src/decryption/aes_decrypt.h new file mode 100644 index 0000000000000000000000000000000000000000..5f9b134bead76580cd18f6841025df6759b8fa65 --- /dev/null +++ b/lite/src/decryption/aes_decrypt.h @@ -0,0 +1,53 @@ +/** + * \file src/decryption/aes_decrypt.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "./mbedtls/aes.h" +#include "decrypt_base.h" + +namespace lite { + +class AESDcryption { +public: + static std::vector decrypt_model(const void* model_mem, + size_t size, + const std::vector& key) { + mbedtls_aes_context ctx; + mbedtls_aes_init(&ctx); + mbedtls_aes_setkey_dec(&ctx, key.data(), 256); + + auto data = static_cast(model_mem); + //! first 16 bytes is IV + uint8_t iv[16]; + //! 
last 8 bytes is file size(length) + auto length_ptr = data + size - 8; + size_t length = 0; + for (int i = 0; i < 8; i++) { + length |= length_ptr[i] << (8 * (7 - i)); + } + std::copy(data, data + 16, iv); + auto output = std::vector(size - 24); + mbedtls_aes_crypt_cbc(&ctx, MBEDTLS_AES_DECRYPT, size - 24, iv, + data + 16, output.data()); + mbedtls_aes_free(&ctx); + output.erase(output.begin() + length, output.end()); + return output; + } + + static std::vector get_decrypt_key() { + std::vector key = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, + 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, + 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, + 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F}; + return key; + } +}; +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/decryption/decrypt_base.h b/lite/src/decryption/decrypt_base.h new file mode 100644 index 0000000000000000000000000000000000000000..d3df19f900e9d8d49b876df88f6c3253160fcc0f --- /dev/null +++ b/lite/src/decryption/decrypt_base.h @@ -0,0 +1,49 @@ +/** + * \file src/decryption/decrypt_base.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once +#include "lite/global.h" +#include "misc.h" + +namespace lite { + +struct DecryptionStaticData { + std::unordered_map< + std::string, + std::pair>>> + decryption_methods; + LITE_MUTEX map_mutex; +}; + +DecryptionStaticData& decryption_static_data(); + +template +struct DecryptionRegister; + +} // namespace lite + +#define CONCAT_IMPL(a, b) a##b +#define MACRO_CONCAT(a, b) CONCAT_IMPL(a, b) + +#define REGIST_DECRYPTION_METHOD(name_, func_, key_) \ + REGIST_DECRYPTION_METHOD_WITH_NUM(__COUNTER__, name_, func_, key_) + +#define REGIST_DECRYPTION_METHOD_WITH_NUM(number_, name_, func_, key_) \ + template <> \ + struct DecryptionRegister { \ + DecryptionRegister() { \ + register_decryption_and_key(name_, func_, key_); \ + } \ + }; \ + namespace { \ + DecryptionRegister MACRO_CONCAT(decryption_, number_); \ + } + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/decryption/mbedtls/aes.cc b/lite/src/decryption/mbedtls/aes.cc new file mode 100644 index 0000000000000000000000000000000000000000..83dece97f2a43bf31cdaf90c8648aff338c0b273 --- /dev/null +++ b/lite/src/decryption/mbedtls/aes.cc @@ -0,0 +1,1363 @@ +/* + * FIPS-197 compliant AES implementation + * + * Copyright (C) 2006-2015, ARM Limited, All Rights Reserved + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is part of mbed TLS (https://tls.mbed.org) + */ +/* + * The AES block cipher was designed by Vincent Rijmen and Joan Daemen. 
+ * + * http://csrc.nist.gov/encryption/aes/rijndael/Rijndael.pdf + * http://csrc.nist.gov/publications/fips/fips197/fips-197.pdf + */ + +/** + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#if !defined(MBEDTLS_CONFIG_FILE) +#include "config.h" +#else +#include MBEDTLS_CONFIG_FILE +#endif + +#if defined(MBEDTLS_AES_C) + +#include + +#include "aes.h" +#if defined(MBEDTLS_PADLOCK_C) +#include "mbedtls/padlock.h" +#endif +#if defined(MBEDTLS_AESNI_C) +#include "mbedtls/aesni.h" +#endif + +#if defined(MBEDTLS_SELF_TEST) +#if defined(MBEDTLS_PLATFORM_C) +#include "mbedtls/platform.h" +#else +#include +#define mbedtls_printf printf +#endif /* MBEDTLS_PLATFORM_C */ +#endif /* MBEDTLS_SELF_TEST */ + +#if !defined(MBEDTLS_AES_ALT) + +/* Implementation that should never be optimized out by the compiler */ +static void mbedtls_zeroize(void *v, size_t n) { + volatile unsigned char *p = (unsigned char *)v; + while (n--) *p++ = 0; +} + +/* + * 32-bit integer manipulation macros (little endian) + */ +#ifndef GET_UINT32_LE +#define GET_UINT32_LE(n, b, i) \ + { \ + (n) = ((uint32_t)(b)[(i)]) | ((uint32_t)(b)[(i) + 1] << 8) | \ + ((uint32_t)(b)[(i) + 2] << 16) | ((uint32_t)(b)[(i) + 3] << 24); \ + } +#endif + +#ifndef PUT_UINT32_LE +#define PUT_UINT32_LE(n, b, i) \ + { \ + (b)[(i)] = (unsigned char)(((n)) & 0xFF); \ + (b)[(i) + 1] = (unsigned char)(((n) >> 8) & 0xFF); \ + (b)[(i) + 2] = (unsigned char)(((n) >> 16) & 0xFF); \ + (b)[(i) + 3] = (unsigned char)(((n) >> 24) & 0xFF); \ + } +#endif + +#if defined(MBEDTLS_PADLOCK_C) && \ + (defined(MBEDTLS_HAVE_X86) || defined(MBEDTLS_PADLOCK_ALIGN16)) +static int aes_padlock_ace = -1; +#endif + +#if defined(MBEDTLS_AES_ROM_TABLES) +/* + * Forward S-box + */ +static const unsigned char FSb[256] = { + 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, + 0xFE, 0xD7, 0xAB, 0x76, 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, + 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, 0xB7, 0xFD, 0x93, 0x26, + 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, + 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, + 0xEB, 0x27, 0xB2, 0x75, 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, + 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, 0x53, 0xD1, 0x00, 0xED, + 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, + 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, + 0x50, 0x3C, 0x9F, 0xA8, 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, + 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, 0xCD, 0x0C, 0x13, 0xEC, + 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, + 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, + 0xDE, 0x5E, 0x0B, 0xDB, 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, + 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, 0xE7, 0xC8, 0x37, 0x6D, + 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, + 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, + 0x4B, 0xBD, 0x8B, 0x8A, 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, + 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, 0xE1, 0xF8, 0x98, 0x11, + 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 
0x28, 0xDF, + 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, + 0xB0, 0x54, 0xBB, 0x16}; + +/* + * Forward tables + */ +#define FT \ + \ + V(A5, 63, 63, C6), V(84, 7C, 7C, F8), V(99, 77, 77, EE), \ + V(8D, 7B, 7B, F6), V(0D, F2, F2, FF), V(BD, 6B, 6B, D6), \ + V(B1, 6F, 6F, DE), V(54, C5, C5, 91), V(50, 30, 30, 60), \ + V(03, 01, 01, 02), V(A9, 67, 67, CE), V(7D, 2B, 2B, 56), \ + V(19, FE, FE, E7), V(62, D7, D7, B5), V(E6, AB, AB, 4D), \ + V(9A, 76, 76, EC), V(45, CA, CA, 8F), V(9D, 82, 82, 1F), \ + V(40, C9, C9, 89), V(87, 7D, 7D, FA), V(15, FA, FA, EF), \ + V(EB, 59, 59, B2), V(C9, 47, 47, 8E), V(0B, F0, F0, FB), \ + V(EC, AD, AD, 41), V(67, D4, D4, B3), V(FD, A2, A2, 5F), \ + V(EA, AF, AF, 45), V(BF, 9C, 9C, 23), V(F7, A4, A4, 53), \ + V(96, 72, 72, E4), V(5B, C0, C0, 9B), V(C2, B7, B7, 75), \ + V(1C, FD, FD, E1), V(AE, 93, 93, 3D), V(6A, 26, 26, 4C), \ + V(5A, 36, 36, 6C), V(41, 3F, 3F, 7E), V(02, F7, F7, F5), \ + V(4F, CC, CC, 83), V(5C, 34, 34, 68), V(F4, A5, A5, 51), \ + V(34, E5, E5, D1), V(08, F1, F1, F9), V(93, 71, 71, E2), \ + V(73, D8, D8, AB), V(53, 31, 31, 62), V(3F, 15, 15, 2A), \ + V(0C, 04, 04, 08), V(52, C7, C7, 95), V(65, 23, 23, 46), \ + V(5E, C3, C3, 9D), V(28, 18, 18, 30), V(A1, 96, 96, 37), \ + V(0F, 05, 05, 0A), V(B5, 9A, 9A, 2F), V(09, 07, 07, 0E), \ + V(36, 12, 12, 24), V(9B, 80, 80, 1B), V(3D, E2, E2, DF), \ + V(26, EB, EB, CD), V(69, 27, 27, 4E), V(CD, B2, B2, 7F), \ + V(9F, 75, 75, EA), V(1B, 09, 09, 12), V(9E, 83, 83, 1D), \ + V(74, 2C, 2C, 58), V(2E, 1A, 1A, 34), V(2D, 1B, 1B, 36), \ + V(B2, 6E, 6E, DC), V(EE, 5A, 5A, B4), V(FB, A0, A0, 5B), \ + V(F6, 52, 52, A4), V(4D, 3B, 3B, 76), V(61, D6, D6, B7), \ + V(CE, B3, B3, 7D), V(7B, 29, 29, 52), V(3E, E3, E3, DD), \ + V(71, 2F, 2F, 5E), V(97, 84, 84, 13), V(F5, 53, 53, A6), \ + V(68, D1, D1, B9), V(00, 00, 00, 00), V(2C, ED, ED, C1), \ + V(60, 20, 20, 40), V(1F, FC, FC, E3), V(C8, B1, B1, 79), \ + V(ED, 5B, 5B, B6), V(BE, 6A, 6A, D4), V(46, CB, CB, 8D), \ + V(D9, BE, BE, 67), V(4B, 39, 39, 72), V(DE, 4A, 4A, 94), \ + V(D4, 4C, 4C, 98), V(E8, 58, 58, B0), V(4A, CF, CF, 85), \ + V(6B, D0, D0, BB), V(2A, EF, EF, C5), V(E5, AA, AA, 4F), \ + V(16, FB, FB, ED), V(C5, 43, 43, 86), V(D7, 4D, 4D, 9A), \ + V(55, 33, 33, 66), V(94, 85, 85, 11), V(CF, 45, 45, 8A), \ + V(10, F9, F9, E9), V(06, 02, 02, 04), V(81, 7F, 7F, FE), \ + V(F0, 50, 50, A0), V(44, 3C, 3C, 78), V(BA, 9F, 9F, 25), \ + V(E3, A8, A8, 4B), V(F3, 51, 51, A2), V(FE, A3, A3, 5D), \ + V(C0, 40, 40, 80), V(8A, 8F, 8F, 05), V(AD, 92, 92, 3F), \ + V(BC, 9D, 9D, 21), V(48, 38, 38, 70), V(04, F5, F5, F1), \ + V(DF, BC, BC, 63), V(C1, B6, B6, 77), V(75, DA, DA, AF), \ + V(63, 21, 21, 42), V(30, 10, 10, 20), V(1A, FF, FF, E5), \ + V(0E, F3, F3, FD), V(6D, D2, D2, BF), V(4C, CD, CD, 81), \ + V(14, 0C, 0C, 18), V(35, 13, 13, 26), V(2F, EC, EC, C3), \ + V(E1, 5F, 5F, BE), V(A2, 97, 97, 35), V(CC, 44, 44, 88), \ + V(39, 17, 17, 2E), V(57, C4, C4, 93), V(F2, A7, A7, 55), \ + V(82, 7E, 7E, FC), V(47, 3D, 3D, 7A), V(AC, 64, 64, C8), \ + V(E7, 5D, 5D, BA), V(2B, 19, 19, 32), V(95, 73, 73, E6), \ + V(A0, 60, 60, C0), V(98, 81, 81, 19), V(D1, 4F, 4F, 9E), \ + V(7F, DC, DC, A3), V(66, 22, 22, 44), V(7E, 2A, 2A, 54), \ + V(AB, 90, 90, 3B), V(83, 88, 88, 0B), V(CA, 46, 46, 8C), \ + V(29, EE, EE, C7), V(D3, B8, B8, 6B), V(3C, 14, 14, 28), \ + V(79, DE, DE, A7), V(E2, 5E, 5E, BC), V(1D, 0B, 0B, 16), \ + V(76, DB, DB, AD), V(3B, E0, E0, DB), V(56, 32, 32, 64), \ + V(4E, 3A, 3A, 74), V(1E, 0A, 0A, 14), V(DB, 49, 49, 92), \ + V(0A, 06, 06, 0C), V(6C, 24, 24, 
48), V(E4, 5C, 5C, B8), \ + V(5D, C2, C2, 9F), V(6E, D3, D3, BD), V(EF, AC, AC, 43), \ + V(A6, 62, 62, C4), V(A8, 91, 91, 39), V(A4, 95, 95, 31), \ + V(37, E4, E4, D3), V(8B, 79, 79, F2), V(32, E7, E7, D5), \ + V(43, C8, C8, 8B), V(59, 37, 37, 6E), V(B7, 6D, 6D, DA), \ + V(8C, 8D, 8D, 01), V(64, D5, D5, B1), V(D2, 4E, 4E, 9C), \ + V(E0, A9, A9, 49), V(B4, 6C, 6C, D8), V(FA, 56, 56, AC), \ + V(07, F4, F4, F3), V(25, EA, EA, CF), V(AF, 65, 65, CA), \ + V(8E, 7A, 7A, F4), V(E9, AE, AE, 47), V(18, 08, 08, 10), \ + V(D5, BA, BA, 6F), V(88, 78, 78, F0), V(6F, 25, 25, 4A), \ + V(72, 2E, 2E, 5C), V(24, 1C, 1C, 38), V(F1, A6, A6, 57), \ + V(C7, B4, B4, 73), V(51, C6, C6, 97), V(23, E8, E8, CB), \ + V(7C, DD, DD, A1), V(9C, 74, 74, E8), V(21, 1F, 1F, 3E), \ + V(DD, 4B, 4B, 96), V(DC, BD, BD, 61), V(86, 8B, 8B, 0D), \ + V(85, 8A, 8A, 0F), V(90, 70, 70, E0), V(42, 3E, 3E, 7C), \ + V(C4, B5, B5, 71), V(AA, 66, 66, CC), V(D8, 48, 48, 90), \ + V(05, 03, 03, 06), V(01, F6, F6, F7), V(12, 0E, 0E, 1C), \ + V(A3, 61, 61, C2), V(5F, 35, 35, 6A), V(F9, 57, 57, AE), \ + V(D0, B9, B9, 69), V(91, 86, 86, 17), V(58, C1, C1, 99), \ + V(27, 1D, 1D, 3A), V(B9, 9E, 9E, 27), V(38, E1, E1, D9), \ + V(13, F8, F8, EB), V(B3, 98, 98, 2B), V(33, 11, 11, 22), \ + V(BB, 69, 69, D2), V(70, D9, D9, A9), V(89, 8E, 8E, 07), \ + V(A7, 94, 94, 33), V(B6, 9B, 9B, 2D), V(22, 1E, 1E, 3C), \ + V(92, 87, 87, 15), V(20, E9, E9, C9), V(49, CE, CE, 87), \ + V(FF, 55, 55, AA), V(78, 28, 28, 50), V(7A, DF, DF, A5), \ + V(8F, 8C, 8C, 03), V(F8, A1, A1, 59), V(80, 89, 89, 09), \ + V(17, 0D, 0D, 1A), V(DA, BF, BF, 65), V(31, E6, E6, D7), \ + V(C6, 42, 42, 84), V(B8, 68, 68, D0), V(C3, 41, 41, 82), \ + V(B0, 99, 99, 29), V(77, 2D, 2D, 5A), V(11, 0F, 0F, 1E), \ + V(CB, B0, B0, 7B), V(FC, 54, 54, A8), V(D6, BB, BB, 6D), \ + V(3A, 16, 16, 2C) + +#define V(a, b, c, d) 0x##a##b##c##d +static const uint32_t FT0[256] = {FT}; +#undef V + +#define V(a, b, c, d) 0x##b##c##d##a +static const uint32_t FT1[256] = {FT}; +#undef V + +#define V(a, b, c, d) 0x##c##d##a##b +static const uint32_t FT2[256] = {FT}; +#undef V + +#define V(a, b, c, d) 0x##d##a##b##c +static const uint32_t FT3[256] = {FT}; +#undef V + +#undef FT + +/* + * Reverse S-box + */ +static const unsigned char RSb[256] = { + 0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38, 0xBF, 0x40, 0xA3, 0x9E, + 0x81, 0xF3, 0xD7, 0xFB, 0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87, + 0x34, 0x8E, 0x43, 0x44, 0xC4, 0xDE, 0xE9, 0xCB, 0x54, 0x7B, 0x94, 0x32, + 0xA6, 0xC2, 0x23, 0x3D, 0xEE, 0x4C, 0x95, 0x0B, 0x42, 0xFA, 0xC3, 0x4E, + 0x08, 0x2E, 0xA1, 0x66, 0x28, 0xD9, 0x24, 0xB2, 0x76, 0x5B, 0xA2, 0x49, + 0x6D, 0x8B, 0xD1, 0x25, 0x72, 0xF8, 0xF6, 0x64, 0x86, 0x68, 0x98, 0x16, + 0xD4, 0xA4, 0x5C, 0xCC, 0x5D, 0x65, 0xB6, 0x92, 0x6C, 0x70, 0x48, 0x50, + 0xFD, 0xED, 0xB9, 0xDA, 0x5E, 0x15, 0x46, 0x57, 0xA7, 0x8D, 0x9D, 0x84, + 0x90, 0xD8, 0xAB, 0x00, 0x8C, 0xBC, 0xD3, 0x0A, 0xF7, 0xE4, 0x58, 0x05, + 0xB8, 0xB3, 0x45, 0x06, 0xD0, 0x2C, 0x1E, 0x8F, 0xCA, 0x3F, 0x0F, 0x02, + 0xC1, 0xAF, 0xBD, 0x03, 0x01, 0x13, 0x8A, 0x6B, 0x3A, 0x91, 0x11, 0x41, + 0x4F, 0x67, 0xDC, 0xEA, 0x97, 0xF2, 0xCF, 0xCE, 0xF0, 0xB4, 0xE6, 0x73, + 0x96, 0xAC, 0x74, 0x22, 0xE7, 0xAD, 0x35, 0x85, 0xE2, 0xF9, 0x37, 0xE8, + 0x1C, 0x75, 0xDF, 0x6E, 0x47, 0xF1, 0x1A, 0x71, 0x1D, 0x29, 0xC5, 0x89, + 0x6F, 0xB7, 0x62, 0x0E, 0xAA, 0x18, 0xBE, 0x1B, 0xFC, 0x56, 0x3E, 0x4B, + 0xC6, 0xD2, 0x79, 0x20, 0x9A, 0xDB, 0xC0, 0xFE, 0x78, 0xCD, 0x5A, 0xF4, + 0x1F, 0xDD, 0xA8, 0x33, 0x88, 0x07, 0xC7, 0x31, 0xB1, 0x12, 0x10, 0x59, + 0x27, 0x80, 0xEC, 0x5F, 0x60, 0x51, 
0x7F, 0xA9, 0x19, 0xB5, 0x4A, 0x0D, + 0x2D, 0xE5, 0x7A, 0x9F, 0x93, 0xC9, 0x9C, 0xEF, 0xA0, 0xE0, 0x3B, 0x4D, + 0xAE, 0x2A, 0xF5, 0xB0, 0xC8, 0xEB, 0xBB, 0x3C, 0x83, 0x53, 0x99, 0x61, + 0x17, 0x2B, 0x04, 0x7E, 0xBA, 0x77, 0xD6, 0x26, 0xE1, 0x69, 0x14, 0x63, + 0x55, 0x21, 0x0C, 0x7D}; + +/* + * Reverse tables + */ +#define RT \ + \ + V(50, A7, F4, 51), V(53, 65, 41, 7E), V(C3, A4, 17, 1A), \ + V(96, 5E, 27, 3A), V(CB, 6B, AB, 3B), V(F1, 45, 9D, 1F), \ + V(AB, 58, FA, AC), V(93, 03, E3, 4B), V(55, FA, 30, 20), \ + V(F6, 6D, 76, AD), V(91, 76, CC, 88), V(25, 4C, 02, F5), \ + V(FC, D7, E5, 4F), V(D7, CB, 2A, C5), V(80, 44, 35, 26), \ + V(8F, A3, 62, B5), V(49, 5A, B1, DE), V(67, 1B, BA, 25), \ + V(98, 0E, EA, 45), V(E1, C0, FE, 5D), V(02, 75, 2F, C3), \ + V(12, F0, 4C, 81), V(A3, 97, 46, 8D), V(C6, F9, D3, 6B), \ + V(E7, 5F, 8F, 03), V(95, 9C, 92, 15), V(EB, 7A, 6D, BF), \ + V(DA, 59, 52, 95), V(2D, 83, BE, D4), V(D3, 21, 74, 58), \ + V(29, 69, E0, 49), V(44, C8, C9, 8E), V(6A, 89, C2, 75), \ + V(78, 79, 8E, F4), V(6B, 3E, 58, 99), V(DD, 71, B9, 27), \ + V(B6, 4F, E1, BE), V(17, AD, 88, F0), V(66, AC, 20, C9), \ + V(B4, 3A, CE, 7D), V(18, 4A, DF, 63), V(82, 31, 1A, E5), \ + V(60, 33, 51, 97), V(45, 7F, 53, 62), V(E0, 77, 64, B1), \ + V(84, AE, 6B, BB), V(1C, A0, 81, FE), V(94, 2B, 08, F9), \ + V(58, 68, 48, 70), V(19, FD, 45, 8F), V(87, 6C, DE, 94), \ + V(B7, F8, 7B, 52), V(23, D3, 73, AB), V(E2, 02, 4B, 72), \ + V(57, 8F, 1F, E3), V(2A, AB, 55, 66), V(07, 28, EB, B2), \ + V(03, C2, B5, 2F), V(9A, 7B, C5, 86), V(A5, 08, 37, D3), \ + V(F2, 87, 28, 30), V(B2, A5, BF, 23), V(BA, 6A, 03, 02), \ + V(5C, 82, 16, ED), V(2B, 1C, CF, 8A), V(92, B4, 79, A7), \ + V(F0, F2, 07, F3), V(A1, E2, 69, 4E), V(CD, F4, DA, 65), \ + V(D5, BE, 05, 06), V(1F, 62, 34, D1), V(8A, FE, A6, C4), \ + V(9D, 53, 2E, 34), V(A0, 55, F3, A2), V(32, E1, 8A, 05), \ + V(75, EB, F6, A4), V(39, EC, 83, 0B), V(AA, EF, 60, 40), \ + V(06, 9F, 71, 5E), V(51, 10, 6E, BD), V(F9, 8A, 21, 3E), \ + V(3D, 06, DD, 96), V(AE, 05, 3E, DD), V(46, BD, E6, 4D), \ + V(B5, 8D, 54, 91), V(05, 5D, C4, 71), V(6F, D4, 06, 04), \ + V(FF, 15, 50, 60), V(24, FB, 98, 19), V(97, E9, BD, D6), \ + V(CC, 43, 40, 89), V(77, 9E, D9, 67), V(BD, 42, E8, B0), \ + V(88, 8B, 89, 07), V(38, 5B, 19, E7), V(DB, EE, C8, 79), \ + V(47, 0A, 7C, A1), V(E9, 0F, 42, 7C), V(C9, 1E, 84, F8), \ + V(00, 00, 00, 00), V(83, 86, 80, 09), V(48, ED, 2B, 32), \ + V(AC, 70, 11, 1E), V(4E, 72, 5A, 6C), V(FB, FF, 0E, FD), \ + V(56, 38, 85, 0F), V(1E, D5, AE, 3D), V(27, 39, 2D, 36), \ + V(64, D9, 0F, 0A), V(21, A6, 5C, 68), V(D1, 54, 5B, 9B), \ + V(3A, 2E, 36, 24), V(B1, 67, 0A, 0C), V(0F, E7, 57, 93), \ + V(D2, 96, EE, B4), V(9E, 91, 9B, 1B), V(4F, C5, C0, 80), \ + V(A2, 20, DC, 61), V(69, 4B, 77, 5A), V(16, 1A, 12, 1C), \ + V(0A, BA, 93, E2), V(E5, 2A, A0, C0), V(43, E0, 22, 3C), \ + V(1D, 17, 1B, 12), V(0B, 0D, 09, 0E), V(AD, C7, 8B, F2), \ + V(B9, A8, B6, 2D), V(C8, A9, 1E, 14), V(85, 19, F1, 57), \ + V(4C, 07, 75, AF), V(BB, DD, 99, EE), V(FD, 60, 7F, A3), \ + V(9F, 26, 01, F7), V(BC, F5, 72, 5C), V(C5, 3B, 66, 44), \ + V(34, 7E, FB, 5B), V(76, 29, 43, 8B), V(DC, C6, 23, CB), \ + V(68, FC, ED, B6), V(63, F1, E4, B8), V(CA, DC, 31, D7), \ + V(10, 85, 63, 42), V(40, 22, 97, 13), V(20, 11, C6, 84), \ + V(7D, 24, 4A, 85), V(F8, 3D, BB, D2), V(11, 32, F9, AE), \ + V(6D, A1, 29, C7), V(4B, 2F, 9E, 1D), V(F3, 30, B2, DC), \ + V(EC, 52, 86, 0D), V(D0, E3, C1, 77), V(6C, 16, B3, 2B), \ + V(99, B9, 70, A9), V(FA, 48, 94, 11), V(22, 64, E9, 47), \ + V(C4, 8C, FC, A8), V(1A, 3F, F0, A0), V(D8, 2C, 
7D, 56), \ + V(EF, 90, 33, 22), V(C7, 4E, 49, 87), V(C1, D1, 38, D9), \ + V(FE, A2, CA, 8C), V(36, 0B, D4, 98), V(CF, 81, F5, A6), \ + V(28, DE, 7A, A5), V(26, 8E, B7, DA), V(A4, BF, AD, 3F), \ + V(E4, 9D, 3A, 2C), V(0D, 92, 78, 50), V(9B, CC, 5F, 6A), \ + V(62, 46, 7E, 54), V(C2, 13, 8D, F6), V(E8, B8, D8, 90), \ + V(5E, F7, 39, 2E), V(F5, AF, C3, 82), V(BE, 80, 5D, 9F), \ + V(7C, 93, D0, 69), V(A9, 2D, D5, 6F), V(B3, 12, 25, CF), \ + V(3B, 99, AC, C8), V(A7, 7D, 18, 10), V(6E, 63, 9C, E8), \ + V(7B, BB, 3B, DB), V(09, 78, 26, CD), V(F4, 18, 59, 6E), \ + V(01, B7, 9A, EC), V(A8, 9A, 4F, 83), V(65, 6E, 95, E6), \ + V(7E, E6, FF, AA), V(08, CF, BC, 21), V(E6, E8, 15, EF), \ + V(D9, 9B, E7, BA), V(CE, 36, 6F, 4A), V(D4, 09, 9F, EA), \ + V(D6, 7C, B0, 29), V(AF, B2, A4, 31), V(31, 23, 3F, 2A), \ + V(30, 94, A5, C6), V(C0, 66, A2, 35), V(37, BC, 4E, 74), \ + V(A6, CA, 82, FC), V(B0, D0, 90, E0), V(15, D8, A7, 33), \ + V(4A, 98, 04, F1), V(F7, DA, EC, 41), V(0E, 50, CD, 7F), \ + V(2F, F6, 91, 17), V(8D, D6, 4D, 76), V(4D, B0, EF, 43), \ + V(54, 4D, AA, CC), V(DF, 04, 96, E4), V(E3, B5, D1, 9E), \ + V(1B, 88, 6A, 4C), V(B8, 1F, 2C, C1), V(7F, 51, 65, 46), \ + V(04, EA, 5E, 9D), V(5D, 35, 8C, 01), V(73, 74, 87, FA), \ + V(2E, 41, 0B, FB), V(5A, 1D, 67, B3), V(52, D2, DB, 92), \ + V(33, 56, 10, E9), V(13, 47, D6, 6D), V(8C, 61, D7, 9A), \ + V(7A, 0C, A1, 37), V(8E, 14, F8, 59), V(89, 3C, 13, EB), \ + V(EE, 27, A9, CE), V(35, C9, 61, B7), V(ED, E5, 1C, E1), \ + V(3C, B1, 47, 7A), V(59, DF, D2, 9C), V(3F, 73, F2, 55), \ + V(79, CE, 14, 18), V(BF, 37, C7, 73), V(EA, CD, F7, 53), \ + V(5B, AA, FD, 5F), V(14, 6F, 3D, DF), V(86, DB, 44, 78), \ + V(81, F3, AF, CA), V(3E, C4, 68, B9), V(2C, 34, 24, 38), \ + V(5F, 40, A3, C2), V(72, C3, 1D, 16), V(0C, 25, E2, BC), \ + V(8B, 49, 3C, 28), V(41, 95, 0D, FF), V(71, 01, A8, 39), \ + V(DE, B3, 0C, 08), V(9C, E4, B4, D8), V(90, C1, 56, 64), \ + V(61, 84, CB, 7B), V(70, B6, 32, D5), V(74, 5C, 6C, 48), \ + V(42, 57, B8, D0) + +#define V(a, b, c, d) 0x##a##b##c##d +static const uint32_t RT0[256] = {RT}; +#undef V + +#define V(a, b, c, d) 0x##b##c##d##a +static const uint32_t RT1[256] = {RT}; +#undef V + +#define V(a, b, c, d) 0x##c##d##a##b +static const uint32_t RT2[256] = {RT}; +#undef V + +#define V(a, b, c, d) 0x##d##a##b##c +static const uint32_t RT3[256] = {RT}; +#undef V + +#undef RT + +/* + * Round constants + */ +static const uint32_t RCON[10] = { + 0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010, + 0x00000020, 0x00000040, 0x00000080, 0x0000001B, 0x00000036}; + +#else /* MBEDTLS_AES_ROM_TABLES */ + +/* + * Forward S-box & tables + */ +static unsigned char FSb[256]; +static uint32_t FT0[256]; +static uint32_t FT1[256]; +static uint32_t FT2[256]; +static uint32_t FT3[256]; + +/* + * Reverse S-box & tables + */ +static unsigned char RSb[256]; +static uint32_t RT0[256]; +static uint32_t RT1[256]; +static uint32_t RT2[256]; +static uint32_t RT3[256]; + +/* + * Round constants + */ +static uint32_t RCON[10]; + +/* + * Tables generation code + */ +#define ROTL8(x) ((x << 8) & 0xFFFFFFFF) | (x >> 24) +#define XTIME(x) ((x << 1) ^ ((x & 0x80) ? 0x1B : 0x00)) +#define MUL(x, y) ((x && y) ? 
pow[(log[x] + log[y]) % 255] : 0) + +static int aes_init_done = 0; + +static void aes_gen_tables(void) { + int i, x, y, z; + int pow[256]; + int log[256]; + + /* + * compute pow and log tables over GF(2^8) + */ + for (i = 0, x = 1; i < 256; i++) { + pow[i] = x; + log[x] = i; + x = (x ^ XTIME(x)) & 0xFF; + } + + /* + * calculate the round constants + */ + for (i = 0, x = 1; i < 10; i++) { + RCON[i] = (uint32_t)x; + x = XTIME(x) & 0xFF; + } + + /* + * generate the forward and reverse S-boxes + */ + FSb[0x00] = 0x63; + RSb[0x63] = 0x00; + + for (i = 1; i < 256; i++) { + x = pow[255 - log[i]]; + + y = x; + y = ((y << 1) | (y >> 7)) & 0xFF; + x ^= y; + y = ((y << 1) | (y >> 7)) & 0xFF; + x ^= y; + y = ((y << 1) | (y >> 7)) & 0xFF; + x ^= y; + y = ((y << 1) | (y >> 7)) & 0xFF; + x ^= y ^ 0x63; + + FSb[i] = (unsigned char)x; + RSb[x] = (unsigned char)i; + } + + /* + * generate the forward and reverse tables + */ + for (i = 0; i < 256; i++) { + x = FSb[i]; + y = XTIME(x) & 0xFF; + z = (y ^ x) & 0xFF; + + FT0[i] = ((uint32_t)y) ^ ((uint32_t)x << 8) ^ ((uint32_t)x << 16) ^ + ((uint32_t)z << 24); + + FT1[i] = ROTL8(FT0[i]); + FT2[i] = ROTL8(FT1[i]); + FT3[i] = ROTL8(FT2[i]); + + x = RSb[i]; + + RT0[i] = ((uint32_t)MUL(0x0E, x)) ^ ((uint32_t)MUL(0x09, x) << 8) ^ + ((uint32_t)MUL(0x0D, x) << 16) ^ + ((uint32_t)MUL(0x0B, x) << 24); + + RT1[i] = ROTL8(RT0[i]); + RT2[i] = ROTL8(RT1[i]); + RT3[i] = ROTL8(RT2[i]); + } +} + +#endif /* MBEDTLS_AES_ROM_TABLES */ + +void mbedtls_aes_init(mbedtls_aes_context *ctx) { + memset(ctx, 0, sizeof(mbedtls_aes_context)); +} + +void mbedtls_aes_free(mbedtls_aes_context *ctx) { + if (ctx == NULL) return; + + mbedtls_zeroize(ctx, sizeof(mbedtls_aes_context)); +} + +/* + * AES key schedule (encryption) + */ +#if !defined(MBEDTLS_AES_SETKEY_ENC_ALT) +int mbedtls_aes_setkey_enc(mbedtls_aes_context *ctx, const unsigned char *key, + unsigned int keybits) { + unsigned int i; + uint32_t *RK; + +#if !defined(MBEDTLS_AES_ROM_TABLES) + if (aes_init_done == 0) { + aes_gen_tables(); + aes_init_done = 1; + } +#endif + + switch (keybits) { + case 128: + ctx->nr = 10; + break; + case 192: + ctx->nr = 12; + break; + case 256: + ctx->nr = 14; + break; + default: + return (MBEDTLS_ERR_AES_INVALID_KEY_LENGTH); + } + +#if defined(MBEDTLS_PADLOCK_C) && defined(MBEDTLS_PADLOCK_ALIGN16) + if (aes_padlock_ace == -1) + aes_padlock_ace = mbedtls_padlock_has_support(MBEDTLS_PADLOCK_ACE); + + if (aes_padlock_ace) + ctx->rk = RK = MBEDTLS_PADLOCK_ALIGN16(ctx->buf); + else +#endif + ctx->rk = RK = ctx->buf; + +#if defined(MBEDTLS_AESNI_C) && defined(MBEDTLS_HAVE_X86_64) + if (mbedtls_aesni_has_support(MBEDTLS_AESNI_AES)) + return ( + mbedtls_aesni_setkey_enc((unsigned char *)ctx->rk, key, keybits)); +#endif + + for (i = 0; i < (keybits >> 5); i++) { + GET_UINT32_LE(RK[i], key, i << 2); + } + + switch (ctx->nr) { + case 10: + + for (i = 0; i < 10; i++, RK += 4) { + RK[4] = RK[0] ^ RCON[i] ^ ((uint32_t)FSb[(RK[3] >> 8) & 0xFF]) ^ + ((uint32_t)FSb[(RK[3] >> 16) & 0xFF] << 8) ^ + ((uint32_t)FSb[(RK[3] >> 24) & 0xFF] << 16) ^ + ((uint32_t)FSb[(RK[3]) & 0xFF] << 24); + + RK[5] = RK[1] ^ RK[4]; + RK[6] = RK[2] ^ RK[5]; + RK[7] = RK[3] ^ RK[6]; + } + break; + + case 12: + + for (i = 0; i < 8; i++, RK += 6) { + RK[6] = RK[0] ^ RCON[i] ^ ((uint32_t)FSb[(RK[5] >> 8) & 0xFF]) ^ + ((uint32_t)FSb[(RK[5] >> 16) & 0xFF] << 8) ^ + ((uint32_t)FSb[(RK[5] >> 24) & 0xFF] << 16) ^ + ((uint32_t)FSb[(RK[5]) & 0xFF] << 24); + + RK[7] = RK[1] ^ RK[6]; + RK[8] = RK[2] ^ RK[7]; + RK[9] = RK[3] ^ RK[8]; + RK[10] = RK[4] ^ RK[9]; + 
RK[11] = RK[5] ^ RK[10]; + } + break; + + case 14: + + for (i = 0; i < 7; i++, RK += 8) { + RK[8] = RK[0] ^ RCON[i] ^ ((uint32_t)FSb[(RK[7] >> 8) & 0xFF]) ^ + ((uint32_t)FSb[(RK[7] >> 16) & 0xFF] << 8) ^ + ((uint32_t)FSb[(RK[7] >> 24) & 0xFF] << 16) ^ + ((uint32_t)FSb[(RK[7]) & 0xFF] << 24); + + RK[9] = RK[1] ^ RK[8]; + RK[10] = RK[2] ^ RK[9]; + RK[11] = RK[3] ^ RK[10]; + + RK[12] = RK[4] ^ ((uint32_t)FSb[(RK[11]) & 0xFF]) ^ + ((uint32_t)FSb[(RK[11] >> 8) & 0xFF] << 8) ^ + ((uint32_t)FSb[(RK[11] >> 16) & 0xFF] << 16) ^ + ((uint32_t)FSb[(RK[11] >> 24) & 0xFF] << 24); + + RK[13] = RK[5] ^ RK[12]; + RK[14] = RK[6] ^ RK[13]; + RK[15] = RK[7] ^ RK[14]; + } + break; + } + + return (0); +} +#endif /* !MBEDTLS_AES_SETKEY_ENC_ALT */ + +/* + * AES key schedule (decryption) + */ +#if !defined(MBEDTLS_AES_SETKEY_DEC_ALT) +int mbedtls_aes_setkey_dec(mbedtls_aes_context *ctx, const unsigned char *key, + unsigned int keybits) { + int i, j, ret; + mbedtls_aes_context cty; + uint32_t *RK; + uint32_t *SK; + + mbedtls_aes_init(&cty); + +#if defined(MBEDTLS_PADLOCK_C) && defined(MBEDTLS_PADLOCK_ALIGN16) + if (aes_padlock_ace == -1) + aes_padlock_ace = mbedtls_padlock_has_support(MBEDTLS_PADLOCK_ACE); + + if (aes_padlock_ace) + ctx->rk = RK = MBEDTLS_PADLOCK_ALIGN16(ctx->buf); + else +#endif + ctx->rk = RK = ctx->buf; + + /* Also checks keybits */ + if ((ret = mbedtls_aes_setkey_enc(&cty, key, keybits)) != 0) goto exit; + + ctx->nr = cty.nr; + +#if defined(MBEDTLS_AESNI_C) && defined(MBEDTLS_HAVE_X86_64) + if (mbedtls_aesni_has_support(MBEDTLS_AESNI_AES)) { + mbedtls_aesni_inverse_key((unsigned char *)ctx->rk, + (const unsigned char *)cty.rk, ctx->nr); + goto exit; + } +#endif + + SK = cty.rk + cty.nr * 4; + + *RK++ = *SK++; + *RK++ = *SK++; + *RK++ = *SK++; + *RK++ = *SK++; + + for (i = ctx->nr - 1, SK -= 8; i > 0; i--, SK -= 8) { + for (j = 0; j < 4; j++, SK++) { + *RK++ = RT0[FSb[(*SK) & 0xFF]] ^ RT1[FSb[(*SK >> 8) & 0xFF]] ^ + RT2[FSb[(*SK >> 16) & 0xFF]] ^ RT3[FSb[(*SK >> 24) & 0xFF]]; + } + } + + *RK++ = *SK++; + *RK++ = *SK++; + *RK++ = *SK++; + *RK++ = *SK++; + +exit: + mbedtls_aes_free(&cty); + + return (ret); +} +#endif /* !MBEDTLS_AES_SETKEY_DEC_ALT */ + +#define AES_FROUND(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \ + { \ + X0 = *RK++ ^ FT0[(Y0)&0xFF] ^ FT1[(Y1 >> 8) & 0xFF] ^ \ + FT2[(Y2 >> 16) & 0xFF] ^ FT3[(Y3 >> 24) & 0xFF]; \ + \ + X1 = *RK++ ^ FT0[(Y1)&0xFF] ^ FT1[(Y2 >> 8) & 0xFF] ^ \ + FT2[(Y3 >> 16) & 0xFF] ^ FT3[(Y0 >> 24) & 0xFF]; \ + \ + X2 = *RK++ ^ FT0[(Y2)&0xFF] ^ FT1[(Y3 >> 8) & 0xFF] ^ \ + FT2[(Y0 >> 16) & 0xFF] ^ FT3[(Y1 >> 24) & 0xFF]; \ + \ + X3 = *RK++ ^ FT0[(Y3)&0xFF] ^ FT1[(Y0 >> 8) & 0xFF] ^ \ + FT2[(Y1 >> 16) & 0xFF] ^ FT3[(Y2 >> 24) & 0xFF]; \ + } + +#define AES_RROUND(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \ + { \ + X0 = *RK++ ^ RT0[(Y0)&0xFF] ^ RT1[(Y3 >> 8) & 0xFF] ^ \ + RT2[(Y2 >> 16) & 0xFF] ^ RT3[(Y1 >> 24) & 0xFF]; \ + \ + X1 = *RK++ ^ RT0[(Y1)&0xFF] ^ RT1[(Y0 >> 8) & 0xFF] ^ \ + RT2[(Y3 >> 16) & 0xFF] ^ RT3[(Y2 >> 24) & 0xFF]; \ + \ + X2 = *RK++ ^ RT0[(Y2)&0xFF] ^ RT1[(Y1 >> 8) & 0xFF] ^ \ + RT2[(Y0 >> 16) & 0xFF] ^ RT3[(Y3 >> 24) & 0xFF]; \ + \ + X3 = *RK++ ^ RT0[(Y3)&0xFF] ^ RT1[(Y2 >> 8) & 0xFF] ^ \ + RT2[(Y1 >> 16) & 0xFF] ^ RT3[(Y0 >> 24) & 0xFF]; \ + } + +/* + * AES-ECB block encryption + */ +#if !defined(MBEDTLS_AES_ENCRYPT_ALT) +int mbedtls_internal_aes_encrypt(mbedtls_aes_context *ctx, + const unsigned char input[16], + unsigned char output[16]) { + int i; + uint32_t *RK, X0, X1, X2, X3, Y0, Y1, Y2, Y3; + + RK = ctx->rk; + + GET_UINT32_LE(X0, input, 0); + X0 ^= *RK++; + 
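+    /* The other three input words below are loaded and whitened the same
+     * way (the initial AddRoundKey). The loop that follows then runs the
+     * middle rounds two at a time with AES_FROUND, and the final round
+     * applies the forward S-box FSb directly instead of the combined FT
+     * tables. */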
GET_UINT32_LE(X1, input, 4); + X1 ^= *RK++; + GET_UINT32_LE(X2, input, 8); + X2 ^= *RK++; + GET_UINT32_LE(X3, input, 12); + X3 ^= *RK++; + + for (i = (ctx->nr >> 1) - 1; i > 0; i--) { + AES_FROUND(Y0, Y1, Y2, Y3, X0, X1, X2, X3); + AES_FROUND(X0, X1, X2, X3, Y0, Y1, Y2, Y3); + } + + AES_FROUND(Y0, Y1, Y2, Y3, X0, X1, X2, X3); + + X0 = *RK++ ^ ((uint32_t)FSb[(Y0)&0xFF]) ^ + ((uint32_t)FSb[(Y1 >> 8) & 0xFF] << 8) ^ + ((uint32_t)FSb[(Y2 >> 16) & 0xFF] << 16) ^ + ((uint32_t)FSb[(Y3 >> 24) & 0xFF] << 24); + + X1 = *RK++ ^ ((uint32_t)FSb[(Y1)&0xFF]) ^ + ((uint32_t)FSb[(Y2 >> 8) & 0xFF] << 8) ^ + ((uint32_t)FSb[(Y3 >> 16) & 0xFF] << 16) ^ + ((uint32_t)FSb[(Y0 >> 24) & 0xFF] << 24); + + X2 = *RK++ ^ ((uint32_t)FSb[(Y2)&0xFF]) ^ + ((uint32_t)FSb[(Y3 >> 8) & 0xFF] << 8) ^ + ((uint32_t)FSb[(Y0 >> 16) & 0xFF] << 16) ^ + ((uint32_t)FSb[(Y1 >> 24) & 0xFF] << 24); + + X3 = *RK++ ^ ((uint32_t)FSb[(Y3)&0xFF]) ^ + ((uint32_t)FSb[(Y0 >> 8) & 0xFF] << 8) ^ + ((uint32_t)FSb[(Y1 >> 16) & 0xFF] << 16) ^ + ((uint32_t)FSb[(Y2 >> 24) & 0xFF] << 24); + + PUT_UINT32_LE(X0, output, 0); + PUT_UINT32_LE(X1, output, 4); + PUT_UINT32_LE(X2, output, 8); + PUT_UINT32_LE(X3, output, 12); + + return (0); +} +#endif /* !MBEDTLS_AES_ENCRYPT_ALT */ + +/* + * AES-ECB block decryption + */ +#if !defined(MBEDTLS_AES_DECRYPT_ALT) +int mbedtls_internal_aes_decrypt(mbedtls_aes_context *ctx, + const unsigned char input[16], + unsigned char output[16]) { + int i; + uint32_t *RK, X0, X1, X2, X3, Y0, Y1, Y2, Y3; + + RK = ctx->rk; + + GET_UINT32_LE(X0, input, 0); + X0 ^= *RK++; + GET_UINT32_LE(X1, input, 4); + X1 ^= *RK++; + GET_UINT32_LE(X2, input, 8); + X2 ^= *RK++; + GET_UINT32_LE(X3, input, 12); + X3 ^= *RK++; + + for (i = (ctx->nr >> 1) - 1; i > 0; i--) { + AES_RROUND(Y0, Y1, Y2, Y3, X0, X1, X2, X3); + AES_RROUND(X0, X1, X2, X3, Y0, Y1, Y2, Y3); + } + + AES_RROUND(Y0, Y1, Y2, Y3, X0, X1, X2, X3); + + X0 = *RK++ ^ ((uint32_t)RSb[(Y0)&0xFF]) ^ + ((uint32_t)RSb[(Y3 >> 8) & 0xFF] << 8) ^ + ((uint32_t)RSb[(Y2 >> 16) & 0xFF] << 16) ^ + ((uint32_t)RSb[(Y1 >> 24) & 0xFF] << 24); + + X1 = *RK++ ^ ((uint32_t)RSb[(Y1)&0xFF]) ^ + ((uint32_t)RSb[(Y0 >> 8) & 0xFF] << 8) ^ + ((uint32_t)RSb[(Y3 >> 16) & 0xFF] << 16) ^ + ((uint32_t)RSb[(Y2 >> 24) & 0xFF] << 24); + + X2 = *RK++ ^ ((uint32_t)RSb[(Y2)&0xFF]) ^ + ((uint32_t)RSb[(Y1 >> 8) & 0xFF] << 8) ^ + ((uint32_t)RSb[(Y0 >> 16) & 0xFF] << 16) ^ + ((uint32_t)RSb[(Y3 >> 24) & 0xFF] << 24); + + X3 = *RK++ ^ ((uint32_t)RSb[(Y3)&0xFF]) ^ + ((uint32_t)RSb[(Y2 >> 8) & 0xFF] << 8) ^ + ((uint32_t)RSb[(Y1 >> 16) & 0xFF] << 16) ^ + ((uint32_t)RSb[(Y0 >> 24) & 0xFF] << 24); + + PUT_UINT32_LE(X0, output, 0); + PUT_UINT32_LE(X1, output, 4); + PUT_UINT32_LE(X2, output, 8); + PUT_UINT32_LE(X3, output, 12); + + return (0); +} +#endif /* !MBEDTLS_AES_DECRYPT_ALT */ + +/* + * AES-ECB block encryption/decryption + */ +int mbedtls_aes_crypt_ecb(mbedtls_aes_context *ctx, int mode, + const unsigned char input[16], + unsigned char output[16]) { +#if defined(MBEDTLS_AESNI_C) && defined(MBEDTLS_HAVE_X86_64) + if (mbedtls_aesni_has_support(MBEDTLS_AESNI_AES)) + return (mbedtls_aesni_crypt_ecb(ctx, mode, input, output)); +#endif + +#if defined(MBEDTLS_PADLOCK_C) && defined(MBEDTLS_HAVE_X86) + if (aes_padlock_ace) { + if (mbedtls_padlock_xcryptecb(ctx, mode, input, output) == 0) + return (0); + + // If padlock data misaligned, we just fall back to + // unaccelerated mode + // + } +#endif + + if (mode == MBEDTLS_AES_ENCRYPT) + return (mbedtls_internal_aes_encrypt(ctx, input, output)); + else + return 
(mbedtls_internal_aes_decrypt(ctx, input, output)); +} + +#if defined(MBEDTLS_CIPHER_MODE_CBC) +/* + * AES-CBC buffer encryption/decryption + */ +int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx, int mode, size_t length, + unsigned char iv[16], const unsigned char *input, + unsigned char *output) { + int i; + unsigned char temp[16]; + + if (length % 16) return (MBEDTLS_ERR_AES_INVALID_INPUT_LENGTH); + +#if defined(MBEDTLS_PADLOCK_C) && defined(MBEDTLS_HAVE_X86) + if (aes_padlock_ace) { + if (mbedtls_padlock_xcryptcbc(ctx, mode, length, iv, input, output) == + 0) + return (0); + + // If padlock data misaligned, we just fall back to + // unaccelerated mode + // + } +#endif + + if (mode == MBEDTLS_AES_DECRYPT) { + while (length > 0) { + memcpy(temp, input, 16); + mbedtls_aes_crypt_ecb(ctx, mode, input, output); + + for (i = 0; i < 16; i++) + output[i] = (unsigned char)(output[i] ^ iv[i]); + + memcpy(iv, temp, 16); + + input += 16; + output += 16; + length -= 16; + } + } else { + while (length > 0) { + for (i = 0; i < 16; i++) + output[i] = (unsigned char)(input[i] ^ iv[i]); + + mbedtls_aes_crypt_ecb(ctx, mode, output, output); + memcpy(iv, output, 16); + + input += 16; + output += 16; + length -= 16; + } + } + + return (0); +} +#endif /* MBEDTLS_CIPHER_MODE_CBC */ + +#if defined(MBEDTLS_CIPHER_MODE_CFB) +/* + * AES-CFB128 buffer encryption/decryption + */ +int mbedtls_aes_crypt_cfb128(mbedtls_aes_context *ctx, int mode, size_t length, + size_t *iv_off, unsigned char iv[16], + const unsigned char *input, + unsigned char *output) { + int c; + size_t n = *iv_off; + + if (mode == MBEDTLS_AES_DECRYPT) { + while (length--) { + if (n == 0) mbedtls_aes_crypt_ecb(ctx, MBEDTLS_AES_ENCRYPT, iv, iv); + + c = *input++; + *output++ = (unsigned char)(c ^ iv[n]); + iv[n] = (unsigned char)c; + + n = (n + 1) & 0x0F; + } + } else { + while (length--) { + if (n == 0) mbedtls_aes_crypt_ecb(ctx, MBEDTLS_AES_ENCRYPT, iv, iv); + + iv[n] = *output++ = (unsigned char)(iv[n] ^ *input++); + + n = (n + 1) & 0x0F; + } + } + + *iv_off = n; + + return (0); +} + +/* + * AES-CFB8 buffer encryption/decryption + */ +int mbedtls_aes_crypt_cfb8(mbedtls_aes_context *ctx, int mode, size_t length, + unsigned char iv[16], const unsigned char *input, + unsigned char *output) { + unsigned char c; + unsigned char ov[17]; + + while (length--) { + memcpy(ov, iv, 16); + mbedtls_aes_crypt_ecb(ctx, MBEDTLS_AES_ENCRYPT, iv, iv); + + if (mode == MBEDTLS_AES_DECRYPT) ov[16] = *input; + + c = *output++ = (unsigned char)(iv[0] ^ *input++); + + if (mode == MBEDTLS_AES_ENCRYPT) ov[16] = c; + + memcpy(iv, ov + 1, 16); + } + + return (0); +} +#endif /*MBEDTLS_CIPHER_MODE_CFB */ + +#if defined(MBEDTLS_CIPHER_MODE_CTR) +/* + * AES-CTR buffer encryption/decryption + */ +int mbedtls_aes_crypt_ctr(mbedtls_aes_context *ctx, size_t length, + size_t *nc_off, unsigned char nonce_counter[16], + unsigned char stream_block[16], + const unsigned char *input, unsigned char *output) { + int c, i; + size_t n = *nc_off; + + while (length--) { + if (n == 0) { + mbedtls_aes_crypt_ecb(ctx, MBEDTLS_AES_ENCRYPT, nonce_counter, + stream_block); + + for (i = 16; i > 0; i--) + if (++nonce_counter[i - 1] != 0) break; + } + c = *input++; + *output++ = (unsigned char)(c ^ stream_block[n]); + + n = (n + 1) & 0x0F; + } + + *nc_off = n; + + return (0); +} +#endif /* MBEDTLS_CIPHER_MODE_CTR */ + +#endif /* !MBEDTLS_AES_ALT */ + +#if defined(MBEDTLS_SELF_TEST) +/* + * AES test vectors from: + * + * http://csrc.nist.gov/archive/aes/rijndael/rijndael-vals.zip + */ +static const 
unsigned char aes_test_ecb_dec[3][16] = { + {0x44, 0x41, 0x6A, 0xC2, 0xD1, 0xF5, 0x3C, 0x58, 0x33, 0x03, 0x91, 0x7E, + 0x6B, 0xE9, 0xEB, 0xE0}, + {0x48, 0xE3, 0x1E, 0x9E, 0x25, 0x67, 0x18, 0xF2, 0x92, 0x29, 0x31, 0x9C, + 0x19, 0xF1, 0x5B, 0xA4}, + {0x05, 0x8C, 0xCF, 0xFD, 0xBB, 0xCB, 0x38, 0x2D, 0x1F, 0x6F, 0x56, 0x58, + 0x5D, 0x8A, 0x4A, 0xDE}}; + +static const unsigned char aes_test_ecb_enc[3][16] = { + {0xC3, 0x4C, 0x05, 0x2C, 0xC0, 0xDA, 0x8D, 0x73, 0x45, 0x1A, 0xFE, 0x5F, + 0x03, 0xBE, 0x29, 0x7F}, + {0xF3, 0xF6, 0x75, 0x2A, 0xE8, 0xD7, 0x83, 0x11, 0x38, 0xF0, 0x41, 0x56, + 0x06, 0x31, 0xB1, 0x14}, + {0x8B, 0x79, 0xEE, 0xCC, 0x93, 0xA0, 0xEE, 0x5D, 0xFF, 0x30, 0xB4, 0xEA, + 0x21, 0x63, 0x6D, 0xA4}}; + +#if defined(MBEDTLS_CIPHER_MODE_CBC) +static const unsigned char aes_test_cbc_dec[3][16] = { + {0xFA, 0xCA, 0x37, 0xE0, 0xB0, 0xC8, 0x53, 0x73, 0xDF, 0x70, 0x6E, 0x73, + 0xF7, 0xC9, 0xAF, 0x86}, + {0x5D, 0xF6, 0x78, 0xDD, 0x17, 0xBA, 0x4E, 0x75, 0xB6, 0x17, 0x68, 0xC6, + 0xAD, 0xEF, 0x7C, 0x7B}, + {0x48, 0x04, 0xE1, 0x81, 0x8F, 0xE6, 0x29, 0x75, 0x19, 0xA3, 0xE8, 0x8C, + 0x57, 0x31, 0x04, 0x13}}; + +static const unsigned char aes_test_cbc_enc[3][16] = { + {0x8A, 0x05, 0xFC, 0x5E, 0x09, 0x5A, 0xF4, 0x84, 0x8A, 0x08, 0xD3, 0x28, + 0xD3, 0x68, 0x8E, 0x3D}, + {0x7B, 0xD9, 0x66, 0xD5, 0x3A, 0xD8, 0xC1, 0xBB, 0x85, 0xD2, 0xAD, 0xFA, + 0xE8, 0x7B, 0xB1, 0x04}, + {0xFE, 0x3C, 0x53, 0x65, 0x3E, 0x2F, 0x45, 0xB5, 0x6F, 0xCD, 0x88, 0xB2, + 0xCC, 0x89, 0x8F, 0xF0}}; +#endif /* MBEDTLS_CIPHER_MODE_CBC */ + +#if defined(MBEDTLS_CIPHER_MODE_CFB) +/* + * AES-CFB128 test vectors from: + * + * http://csrc.nist.gov/publications/nistpubs/800-38a/sp800-38a.pdf + */ +static const unsigned char aes_test_cfb128_key[3][32] = { + {0x2B, 0x7E, 0x15, 0x16, 0x28, 0xAE, 0xD2, 0xA6, 0xAB, 0xF7, 0x15, 0x88, + 0x09, 0xCF, 0x4F, 0x3C}, + {0x8E, 0x73, 0xB0, 0xF7, 0xDA, 0x0E, 0x64, 0x52, 0xC8, 0x10, 0xF3, 0x2B, + 0x80, 0x90, 0x79, 0xE5, 0x62, 0xF8, 0xEA, 0xD2, 0x52, 0x2C, 0x6B, 0x7B}, + {0x60, 0x3D, 0xEB, 0x10, 0x15, 0xCA, 0x71, 0xBE, 0x2B, 0x73, 0xAE, + 0xF0, 0x85, 0x7D, 0x77, 0x81, 0x1F, 0x35, 0x2C, 0x07, 0x3B, 0x61, + 0x08, 0xD7, 0x2D, 0x98, 0x10, 0xA3, 0x09, 0x14, 0xDF, 0xF4}}; + +static const unsigned char aes_test_cfb128_iv[16] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F}; + +static const unsigned char aes_test_cfb128_pt[64] = { + 0x6B, 0xC1, 0xBE, 0xE2, 0x2E, 0x40, 0x9F, 0x96, 0xE9, 0x3D, 0x7E, + 0x11, 0x73, 0x93, 0x17, 0x2A, 0xAE, 0x2D, 0x8A, 0x57, 0x1E, 0x03, + 0xAC, 0x9C, 0x9E, 0xB7, 0x6F, 0xAC, 0x45, 0xAF, 0x8E, 0x51, 0x30, + 0xC8, 0x1C, 0x46, 0xA3, 0x5C, 0xE4, 0x11, 0xE5, 0xFB, 0xC1, 0x19, + 0x1A, 0x0A, 0x52, 0xEF, 0xF6, 0x9F, 0x24, 0x45, 0xDF, 0x4F, 0x9B, + 0x17, 0xAD, 0x2B, 0x41, 0x7B, 0xE6, 0x6C, 0x37, 0x10}; + +static const unsigned char aes_test_cfb128_ct[3][64] = { + {0x3B, 0x3F, 0xD9, 0x2E, 0xB7, 0x2D, 0xAD, 0x20, 0x33, 0x34, 0x49, + 0xF8, 0xE8, 0x3C, 0xFB, 0x4A, 0xC8, 0xA6, 0x45, 0x37, 0xA0, 0xB3, + 0xA9, 0x3F, 0xCD, 0xE3, 0xCD, 0xAD, 0x9F, 0x1C, 0xE5, 0x8B, 0x26, + 0x75, 0x1F, 0x67, 0xA3, 0xCB, 0xB1, 0x40, 0xB1, 0x80, 0x8C, 0xF1, + 0x87, 0xA4, 0xF4, 0xDF, 0xC0, 0x4B, 0x05, 0x35, 0x7C, 0x5D, 0x1C, + 0x0E, 0xEA, 0xC4, 0xC6, 0x6F, 0x9F, 0xF7, 0xF2, 0xE6}, + {0xCD, 0xC8, 0x0D, 0x6F, 0xDD, 0xF1, 0x8C, 0xAB, 0x34, 0xC2, 0x59, + 0x09, 0xC9, 0x9A, 0x41, 0x74, 0x67, 0xCE, 0x7F, 0x7F, 0x81, 0x17, + 0x36, 0x21, 0x96, 0x1A, 0x2B, 0x70, 0x17, 0x1D, 0x3D, 0x7A, 0x2E, + 0x1E, 0x8A, 0x1D, 0xD5, 0x9B, 0x88, 0xB1, 0xC8, 0xE6, 0x0F, 0xED, + 0x1E, 0xFA, 0xC4, 
0xC9, 0xC0, 0x5F, 0x9F, 0x9C, 0xA9, 0x83, 0x4F, + 0xA0, 0x42, 0xAE, 0x8F, 0xBA, 0x58, 0x4B, 0x09, 0xFF}, + {0xDC, 0x7E, 0x84, 0xBF, 0xDA, 0x79, 0x16, 0x4B, 0x7E, 0xCD, 0x84, + 0x86, 0x98, 0x5D, 0x38, 0x60, 0x39, 0xFF, 0xED, 0x14, 0x3B, 0x28, + 0xB1, 0xC8, 0x32, 0x11, 0x3C, 0x63, 0x31, 0xE5, 0x40, 0x7B, 0xDF, + 0x10, 0x13, 0x24, 0x15, 0xE5, 0x4B, 0x92, 0xA1, 0x3E, 0xD0, 0xA8, + 0x26, 0x7A, 0xE2, 0xF9, 0x75, 0xA3, 0x85, 0x74, 0x1A, 0xB9, 0xCE, + 0xF8, 0x20, 0x31, 0x62, 0x3D, 0x55, 0xB1, 0xE4, 0x71}}; +#endif /* MBEDTLS_CIPHER_MODE_CFB */ + +#if defined(MBEDTLS_CIPHER_MODE_CTR) +/* + * AES-CTR test vectors from: + * + * http://www.faqs.org/rfcs/rfc3686.html + */ + +static const unsigned char aes_test_ctr_key[3][16] = { + {0xAE, 0x68, 0x52, 0xF8, 0x12, 0x10, 0x67, 0xCC, 0x4B, 0xF7, 0xA5, 0x76, + 0x55, 0x77, 0xF3, 0x9E}, + {0x7E, 0x24, 0x06, 0x78, 0x17, 0xFA, 0xE0, 0xD7, 0x43, 0xD6, 0xCE, 0x1F, + 0x32, 0x53, 0x91, 0x63}, + {0x76, 0x91, 0xBE, 0x03, 0x5E, 0x50, 0x20, 0xA8, 0xAC, 0x6E, 0x61, 0x85, + 0x29, 0xF9, 0xA0, 0xDC}}; + +static const unsigned char aes_test_ctr_nonce_counter[3][16] = { + {0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01}, + {0x00, 0x6C, 0xB6, 0xDB, 0xC0, 0x54, 0x3B, 0x59, 0xDA, 0x48, 0xD9, 0x0B, + 0x00, 0x00, 0x00, 0x01}, + {0x00, 0xE0, 0x01, 0x7B, 0x27, 0x77, 0x7F, 0x3F, 0x4A, 0x17, 0x86, 0xF0, + 0x00, 0x00, 0x00, 0x01}}; + +static const unsigned char aes_test_ctr_pt[3][48] = { + {0x53, 0x69, 0x6E, 0x67, 0x6C, 0x65, 0x20, 0x62, 0x6C, 0x6F, 0x63, 0x6B, + 0x20, 0x6D, 0x73, 0x67}, + + {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, + 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, + 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F}, + + {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, + 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23}}; + +static const unsigned char aes_test_ctr_ct[3][48] = { + {0xE4, 0x09, 0x5D, 0x4F, 0xB7, 0xA7, 0xB3, 0x79, 0x2D, 0x61, 0x75, 0xA3, + 0x26, 0x13, 0x11, 0xB8}, + {0x51, 0x04, 0xA1, 0x06, 0x16, 0x8A, 0x72, 0xD9, 0x79, 0x0D, 0x41, + 0xEE, 0x8E, 0xDA, 0xD3, 0x88, 0xEB, 0x2E, 0x1E, 0xFC, 0x46, 0xDA, + 0x57, 0xC8, 0xFC, 0xE6, 0x30, 0xDF, 0x91, 0x41, 0xBE, 0x28}, + {0xC1, 0xCF, 0x48, 0xA8, 0x9F, 0x2F, 0xFD, 0xD9, 0xCF, 0x46, 0x52, 0xE9, + 0xEF, 0xDB, 0x72, 0xD7, 0x45, 0x40, 0xA4, 0x2B, 0xDE, 0x6D, 0x78, 0x36, + 0xD5, 0x9A, 0x5C, 0xEA, 0xAE, 0xF3, 0x10, 0x53, 0x25, 0xB2, 0x07, 0x2F}}; + +static const int aes_test_ctr_len[3] = {16, 32, 36}; +#endif /* MBEDTLS_CIPHER_MODE_CTR */ + +/* + * Checkup routine + */ +int mbedtls_aes_self_test(int verbose) { + int ret = 0, i, j, u, v; + unsigned char key[32]; + unsigned char buf[64]; +#if defined(MBEDTLS_CIPHER_MODE_CBC) || defined(MBEDTLS_CIPHER_MODE_CFB) + unsigned char iv[16]; +#endif +#if defined(MBEDTLS_CIPHER_MODE_CBC) + unsigned char prv[16]; +#endif +#if defined(MBEDTLS_CIPHER_MODE_CTR) || defined(MBEDTLS_CIPHER_MODE_CFB) + size_t offset; +#endif +#if defined(MBEDTLS_CIPHER_MODE_CTR) + int len; + unsigned char nonce_counter[16]; + unsigned char stream_block[16]; +#endif + mbedtls_aes_context ctx; + + memset(key, 0, 32); + mbedtls_aes_init(&ctx); + + /* + * ECB mode + */ + for (i = 0; i < 6; i++) { + u = i >> 1; + v = i & 1; + + if (verbose != 0) + mbedtls_printf(" AES-ECB-%3d (%s): ", 128 + u * 64, + (v == MBEDTLS_AES_DECRYPT) ? 
"dec" : "enc"); + + memset(buf, 0, 16); + + if (v == MBEDTLS_AES_DECRYPT) { + mbedtls_aes_setkey_dec(&ctx, key, 128 + u * 64); + + for (j = 0; j < 10000; j++) + mbedtls_aes_crypt_ecb(&ctx, v, buf, buf); + + if (memcmp(buf, aes_test_ecb_dec[u], 16) != 0) { + if (verbose != 0) mbedtls_printf("failed\n"); + + ret = 1; + goto exit; + } + } else { + mbedtls_aes_setkey_enc(&ctx, key, 128 + u * 64); + + for (j = 0; j < 10000; j++) + mbedtls_aes_crypt_ecb(&ctx, v, buf, buf); + + if (memcmp(buf, aes_test_ecb_enc[u], 16) != 0) { + if (verbose != 0) mbedtls_printf("failed\n"); + + ret = 1; + goto exit; + } + } + + if (verbose != 0) mbedtls_printf("passed\n"); + } + + if (verbose != 0) mbedtls_printf("\n"); + +#if defined(MBEDTLS_CIPHER_MODE_CBC) + /* + * CBC mode + */ + for (i = 0; i < 6; i++) { + u = i >> 1; + v = i & 1; + + if (verbose != 0) + mbedtls_printf(" AES-CBC-%3d (%s): ", 128 + u * 64, + (v == MBEDTLS_AES_DECRYPT) ? "dec" : "enc"); + + memset(iv, 0, 16); + memset(prv, 0, 16); + memset(buf, 0, 16); + + if (v == MBEDTLS_AES_DECRYPT) { + mbedtls_aes_setkey_dec(&ctx, key, 128 + u * 64); + + for (j = 0; j < 10000; j++) + mbedtls_aes_crypt_cbc(&ctx, v, 16, iv, buf, buf); + + if (memcmp(buf, aes_test_cbc_dec[u], 16) != 0) { + if (verbose != 0) mbedtls_printf("failed\n"); + + ret = 1; + goto exit; + } + } else { + mbedtls_aes_setkey_enc(&ctx, key, 128 + u * 64); + + for (j = 0; j < 10000; j++) { + unsigned char tmp[16]; + + mbedtls_aes_crypt_cbc(&ctx, v, 16, iv, buf, buf); + + memcpy(tmp, prv, 16); + memcpy(prv, buf, 16); + memcpy(buf, tmp, 16); + } + + if (memcmp(prv, aes_test_cbc_enc[u], 16) != 0) { + if (verbose != 0) mbedtls_printf("failed\n"); + + ret = 1; + goto exit; + } + } + + if (verbose != 0) mbedtls_printf("passed\n"); + } + + if (verbose != 0) mbedtls_printf("\n"); +#endif /* MBEDTLS_CIPHER_MODE_CBC */ + +#if defined(MBEDTLS_CIPHER_MODE_CFB) + /* + * CFB128 mode + */ + for (i = 0; i < 6; i++) { + u = i >> 1; + v = i & 1; + + if (verbose != 0) + mbedtls_printf(" AES-CFB128-%3d (%s): ", 128 + u * 64, + (v == MBEDTLS_AES_DECRYPT) ? "dec" : "enc"); + + memcpy(iv, aes_test_cfb128_iv, 16); + memcpy(key, aes_test_cfb128_key[u], 16 + u * 8); + + offset = 0; + mbedtls_aes_setkey_enc(&ctx, key, 128 + u * 64); + + if (v == MBEDTLS_AES_DECRYPT) { + memcpy(buf, aes_test_cfb128_ct[u], 64); + mbedtls_aes_crypt_cfb128(&ctx, v, 64, &offset, iv, buf, buf); + + if (memcmp(buf, aes_test_cfb128_pt, 64) != 0) { + if (verbose != 0) mbedtls_printf("failed\n"); + + ret = 1; + goto exit; + } + } else { + memcpy(buf, aes_test_cfb128_pt, 64); + mbedtls_aes_crypt_cfb128(&ctx, v, 64, &offset, iv, buf, buf); + + if (memcmp(buf, aes_test_cfb128_ct[u], 64) != 0) { + if (verbose != 0) mbedtls_printf("failed\n"); + + ret = 1; + goto exit; + } + } + + if (verbose != 0) mbedtls_printf("passed\n"); + } + + if (verbose != 0) mbedtls_printf("\n"); +#endif /* MBEDTLS_CIPHER_MODE_CFB */ + +#if defined(MBEDTLS_CIPHER_MODE_CTR) + /* + * CTR mode + */ + for (i = 0; i < 6; i++) { + u = i >> 1; + v = i & 1; + + if (verbose != 0) + mbedtls_printf(" AES-CTR-128 (%s): ", + (v == MBEDTLS_AES_DECRYPT) ? 
"dec" : "enc"); + + memcpy(nonce_counter, aes_test_ctr_nonce_counter[u], 16); + memcpy(key, aes_test_ctr_key[u], 16); + + offset = 0; + mbedtls_aes_setkey_enc(&ctx, key, 128); + + if (v == MBEDTLS_AES_DECRYPT) { + len = aes_test_ctr_len[u]; + memcpy(buf, aes_test_ctr_ct[u], len); + + mbedtls_aes_crypt_ctr(&ctx, len, &offset, nonce_counter, + stream_block, buf, buf); + + if (memcmp(buf, aes_test_ctr_pt[u], len) != 0) { + if (verbose != 0) mbedtls_printf("failed\n"); + + ret = 1; + goto exit; + } + } else { + len = aes_test_ctr_len[u]; + memcpy(buf, aes_test_ctr_pt[u], len); + + mbedtls_aes_crypt_ctr(&ctx, len, &offset, nonce_counter, + stream_block, buf, buf); + + if (memcmp(buf, aes_test_ctr_ct[u], len) != 0) { + if (verbose != 0) mbedtls_printf("failed\n"); + + ret = 1; + goto exit; + } + } + + if (verbose != 0) mbedtls_printf("passed\n"); + } + + if (verbose != 0) mbedtls_printf("\n"); +#endif /* MBEDTLS_CIPHER_MODE_CTR */ + + ret = 0; + +exit: + mbedtls_aes_free(&ctx); + + return (ret); +} + +#endif /* MBEDTLS_SELF_TEST */ + +#endif /* MBEDTLS_AES_C */ diff --git a/lite/src/decryption/mbedtls/aes.h b/lite/src/decryption/mbedtls/aes.h new file mode 100644 index 0000000000000000000000000000000000000000..1e47c48cb2192de1078247cab19d17d7d7032a6a --- /dev/null +++ b/lite/src/decryption/mbedtls/aes.h @@ -0,0 +1,349 @@ +/** + * \file aes.h + * + * \brief AES block cipher + * + * Copyright (C) 2006-2015, ARM Limited, All Rights Reserved + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is part of mbed TLS (https://tls.mbed.org) + */ + +/** + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#ifndef MBEDTLS_AES_H +#define MBEDTLS_AES_H + +#if !defined(MBEDTLS_CONFIG_FILE) +#include "config.h" +#else +#include MBEDTLS_CONFIG_FILE +#endif + +#include +#include + +/* padlock.c and aesni.c rely on these values! */ +#define MBEDTLS_AES_ENCRYPT 1 +#define MBEDTLS_AES_DECRYPT 0 + +#define MBEDTLS_ERR_AES_INVALID_KEY_LENGTH -0x0020 /**< Invalid key length. */ +#define MBEDTLS_ERR_AES_INVALID_INPUT_LENGTH \ + -0x0022 /**< Invalid data input length. 
*/ + +#if (defined(__ARMCC_VERSION) || defined(_MSC_VER)) && !defined(inline) && \ + !defined(__cplusplus) +#define inline __inline +#endif + +#if !defined(MBEDTLS_AES_ALT) +// Regular implementation +// + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * \brief AES context structure + * + * \note buf is able to hold 32 extra bytes, which can be used: + * - for alignment purposes if VIA padlock is used, and/or + * - to simplify key expansion in the 256-bit case by + * generating an extra round key + */ +typedef struct { + int nr; /*!< number of rounds */ + uint32_t* rk; /*!< AES round keys */ + uint32_t buf[68]; /*!< unaligned data */ +} mbedtls_aes_context; + +/** + * \brief Initialize AES context + * + * \param ctx AES context to be initialized + */ +void mbedtls_aes_init(mbedtls_aes_context* ctx); + +/** + * \brief Clear AES context + * + * \param ctx AES context to be cleared + */ +void mbedtls_aes_free(mbedtls_aes_context* ctx); + +/** + * \brief AES key schedule (encryption) + * + * \param ctx AES context to be initialized + * \param key encryption key + * \param keybits must be 128, 192 or 256 + * + * \return 0 if successful, or MBEDTLS_ERR_AES_INVALID_KEY_LENGTH + */ +int mbedtls_aes_setkey_enc(mbedtls_aes_context* ctx, const unsigned char* key, + unsigned int keybits); + +/** + * \brief AES key schedule (decryption) + * + * \param ctx AES context to be initialized + * \param key decryption key + * \param keybits must be 128, 192 or 256 + * + * \return 0 if successful, or MBEDTLS_ERR_AES_INVALID_KEY_LENGTH + */ +int mbedtls_aes_setkey_dec(mbedtls_aes_context* ctx, const unsigned char* key, + unsigned int keybits); + +/** + * \brief AES-ECB block encryption/decryption + * + * \param ctx AES context + * \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT + * \param input 16-byte input block + * \param output 16-byte output block + * + * \return 0 if successful + */ +int mbedtls_aes_crypt_ecb(mbedtls_aes_context* ctx, int mode, + const unsigned char input[16], + unsigned char output[16]); + +#if defined(MBEDTLS_CIPHER_MODE_CBC) +/** + * \brief AES-CBC buffer encryption/decryption + * Length should be a multiple of the block + * size (16 bytes) + * + * \note Upon exit, the content of the IV is updated so that you can + * call the function same function again on the following + * block(s) of data and get the same result as if it was + * encrypted in one call. This allows a "streaming" usage. + * If on the other hand you need to retain the contents of the + * IV, you should either save it manually or use the cipher + * module instead. + * + * \param ctx AES context + * \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT + * \param length length of the input data + * \param iv initialization vector (updated after use) + * \param input buffer holding the input data + * \param output buffer holding the output data + * + * \return 0 if successful, or MBEDTLS_ERR_AES_INVALID_INPUT_LENGTH + */ +int mbedtls_aes_crypt_cbc(mbedtls_aes_context* ctx, int mode, size_t length, + unsigned char iv[16], const unsigned char* input, + unsigned char* output); +#endif /* MBEDTLS_CIPHER_MODE_CBC */ + +#if defined(MBEDTLS_CIPHER_MODE_CFB) +/** + * \brief AES-CFB128 buffer encryption/decryption. + * + * Note: Due to the nature of CFB you should use the same key schedule for + * both encryption and decryption. So a context initialized with + * mbedtls_aes_setkey_enc() for both MBEDTLS_AES_ENCRYPT and + * MBEDTLS_AES_DECRYPT. 
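+ * (That is, do not call mbedtls_aes_setkey_dec() for CFB; the inverse key
+ * schedule it builds is only used by the ECB and CBC decryption paths.)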
+ * + * \note Upon exit, the content of the IV is updated so that you can + * call the function same function again on the following + * block(s) of data and get the same result as if it was + * encrypted in one call. This allows a "streaming" usage. + * If on the other hand you need to retain the contents of the + * IV, you should either save it manually or use the cipher + * module instead. + * + * \param ctx AES context + * \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT + * \param length length of the input data + * \param iv_off offset in IV (updated after use) + * \param iv initialization vector (updated after use) + * \param input buffer holding the input data + * \param output buffer holding the output data + * + * \return 0 if successful + */ +int mbedtls_aes_crypt_cfb128(mbedtls_aes_context* ctx, int mode, size_t length, + size_t* iv_off, unsigned char iv[16], + const unsigned char* input, unsigned char* output); + +/** + * \brief AES-CFB8 buffer encryption/decryption. + * + * Note: Due to the nature of CFB you should use the same key schedule for + * both encryption and decryption. So a context initialized with + * mbedtls_aes_setkey_enc() for both MBEDTLS_AES_ENCRYPT and + * MBEDTLS_AES_DECRYPT. + * + * \note Upon exit, the content of the IV is updated so that you can + * call the function same function again on the following + * block(s) of data and get the same result as if it was + * encrypted in one call. This allows a "streaming" usage. + * If on the other hand you need to retain the contents of the + * IV, you should either save it manually or use the cipher + * module instead. + * + * \param ctx AES context + * \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT + * \param length length of the input data + * \param iv initialization vector (updated after use) + * \param input buffer holding the input data + * \param output buffer holding the output data + * + * \return 0 if successful + */ +int mbedtls_aes_crypt_cfb8(mbedtls_aes_context* ctx, int mode, size_t length, + unsigned char iv[16], const unsigned char* input, + unsigned char* output); +#endif /*MBEDTLS_CIPHER_MODE_CFB */ + +#if defined(MBEDTLS_CIPHER_MODE_CTR) +/** + * \brief AES-CTR buffer encryption/decryption + * + * Warning: You have to keep the maximum use of your counter in mind! + * + * Note: Due to the nature of CTR you should use the same key schedule for + * both encryption and decryption. So a context initialized with + * mbedtls_aes_setkey_enc() for both MBEDTLS_AES_ENCRYPT and + * MBEDTLS_AES_DECRYPT. + * + * \param ctx AES context + * \param length The length of the data + * \param nc_off The offset in the current stream_block (for resuming + * within current cipher stream). The offset pointer to + * should be 0 at the start of a stream. + * \param nonce_counter The 128-bit nonce and counter. + * \param stream_block The saved stream-block for resuming. Is overwritten + * by the function. 
+ * \param input The input data stream + * \param output The output data stream + * + * \return 0 if successful + */ +int mbedtls_aes_crypt_ctr(mbedtls_aes_context* ctx, size_t length, + size_t* nc_off, unsigned char nonce_counter[16], + unsigned char stream_block[16], + const unsigned char* input, unsigned char* output); +#endif /* MBEDTLS_CIPHER_MODE_CTR */ + +/** + * \brief Internal AES block encryption function + * (Only exposed to allow overriding it, + * see MBEDTLS_AES_ENCRYPT_ALT) + * + * \param ctx AES context + * \param input Plaintext block + * \param output Output (ciphertext) block + * + * \return 0 if successful + */ +int mbedtls_internal_aes_encrypt(mbedtls_aes_context* ctx, + const unsigned char input[16], + unsigned char output[16]); + +/** + * \brief Internal AES block decryption function + * (Only exposed to allow overriding it, + * see MBEDTLS_AES_DECRYPT_ALT) + * + * \param ctx AES context + * \param input Ciphertext block + * \param output Output (plaintext) block + * + * \return 0 if successful + */ +int mbedtls_internal_aes_decrypt(mbedtls_aes_context* ctx, + const unsigned char input[16], + unsigned char output[16]); + +#if !defined(MBEDTLS_DEPRECATED_REMOVED) +#if defined(MBEDTLS_DEPRECATED_WARNING) +#define MBEDTLS_DEPRECATED __attribute__((deprecated)) +#else +#define MBEDTLS_DEPRECATED +#endif +/** + * \brief Internal AES block encryption function + * (Only exposed to allow overriding it, + * see MBEDTLS_AES_ENCRYPT_ALT) + * + * \deprecated Superseded by mbedtls_aes_encrypt_ext() in 2.5.0 + * + * \param ctx AES context + * \param input Plaintext block + * \param output Output (ciphertext) block + */ +MBEDTLS_DEPRECATED static inline void mbedtls_aes_encrypt( + mbedtls_aes_context* ctx, const unsigned char input[16], + unsigned char output[16]) { + mbedtls_internal_aes_encrypt(ctx, input, output); +} + +/** + * \brief Internal AES block decryption function + * (Only exposed to allow overriding it, + * see MBEDTLS_AES_DECRYPT_ALT) + * + * \deprecated Superseded by mbedtls_aes_decrypt_ext() in 2.5.0 + * + * \param ctx AES context + * \param input Ciphertext block + * \param output Output (plaintext) block + */ +MBEDTLS_DEPRECATED static inline void mbedtls_aes_decrypt( + mbedtls_aes_context* ctx, const unsigned char input[16], + unsigned char output[16]) { + mbedtls_internal_aes_decrypt(ctx, input, output); +} + +#undef MBEDTLS_DEPRECATED +#endif /* !MBEDTLS_DEPRECATED_REMOVED */ + +#ifdef __cplusplus +} +#endif + +#else /* MBEDTLS_AES_ALT */ +#include "aes_alt.h" +#endif /* MBEDTLS_AES_ALT */ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * \brief Checkup routine + * + * \return 0 if successful, or 1 if the test failed + */ +int mbedtls_aes_self_test(int verbose); + +#ifdef __cplusplus +} +#endif + +#endif /* aes.h */ diff --git a/lite/src/decryption/mbedtls/config.h b/lite/src/decryption/mbedtls/config.h new file mode 100644 index 0000000000000000000000000000000000000000..57f12656c5ecd7ffed32315a1661dcce1a0cd87a --- /dev/null +++ b/lite/src/decryption/mbedtls/config.h @@ -0,0 +1,5 @@ +#pragma once + +#define MBEDTLS_AES_C +#define MBEDTLS_AES_ROM_TABLES +#define MBEDTLS_CIPHER_MODE_CBC diff --git a/lite/src/decryption/rc4/rc4_cryption_base.h b/lite/src/decryption/rc4/rc4_cryption_base.h new file mode 100644 index 0000000000000000000000000000000000000000..e811acba59a8bebd58dd9f492c714ad858375692 --- /dev/null +++ b/lite/src/decryption/rc4/rc4_cryption_base.h @@ -0,0 +1,156 @@ +/** + * \file src/decryption/rc4/rc4_cryption_base.h + * + * This file is part of 
MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ +#pragma once + +#include +#include + +namespace lite { +namespace rc4 { + +#define m256(x) static_cast(x) + +/*! \brief Pseudo-random byte stream for RC4. + */ +class RC4RandStream { +public: + RC4RandStream() = default; + + RC4RandStream(uint64_t key) { reset(key); } + + void reset(uint64_t init_key) { + i_ = j_ = 0; + for (int i = 0; i < 256; i++) + s_[i] = i; + uint8_t j = 0; + for (int i = 0; i < 256; i++) { + j = j + s_[i] + m256(init_key >> ((i % 8) * 8)); + std::swap(s_[i], s_[j]); + } + // drop + for (int i = 0; i < 768; i++) { + next8(); + } + for (int i = 0, t = next8(); i < t; i++) { + next8(); + } + } + + uint8_t next8() { + i_++; + uint8_t a = s_[i_]; + j_ += a; + uint8_t b = s_[j_]; + s_[i_] = b; + s_[j_] = a; + uint8_t c = s_[m256((i_ << 5) ^ (j_ >> 3))] + + s_[m256((j_ << 5) ^ (i_ >> 3))]; + return (s_[m256(a + b)] + s_[c ^ 0xAA]) ^ s_[m256(j_ + b)]; + } + + uint64_t next64() { + uint64_t rst; + uint8_t* buf = reinterpret_cast(&rst); + for (int i = 0; i < 8; i++) { + buf[i] = next8(); + } + return rst; + } + +private: + uint8_t s_[256], i_ = 0, j_ = 0; +}; +#undef m256 + +/*! + * \brief fast and secure 64-bit hash + * see https://code.google.com/p/fast-hash/ + */ +class FastHash64 { +public: + FastHash64(uint64_t seed) + : hash_{seed}, + mul0_{key_gen_hash_mul0()}, + mul1_{key_gen_hash_mul1()} {} + + void feed(uint64_t val) { + val ^= val >> 23; + val *= mul0_; + val ^= val >> 47; + hash_ ^= val; + hash_ *= mul1_; + } + + uint64_t get() { return hash_; } + +private: + uint64_t hash_; + const uint64_t mul0_, mul1_; + + static uint64_t key_gen_hash_mul0() { + uint64_t rst; + uint8_t volatile* buf = reinterpret_cast(&rst); + buf[2] = 50; + buf[3] = 244; + buf[6] = 39; + buf[1] = 92; + buf[5] = 89; + buf[4] = 155; + buf[0] = 55; + buf[7] = 33; + return rst; + } + + static uint64_t key_gen_hash_mul1() { + uint64_t rst; + uint8_t volatile* buf = reinterpret_cast(&rst); + buf[6] = 3; + buf[2] = 109; + buf[7] = 136; + buf[1] = 25; + buf[5] = 85; + buf[0] = 101; + buf[4] = 242; + buf[3] = 30; + return rst; + } +}; + +// The encryption keys are always inlined. +static inline uint64_t key_gen_enc_key() { + uint64_t rst; + uint8_t volatile* buf = reinterpret_cast(&rst); + buf[4] = 120; + buf[3] = 121; + buf[7] = 122; + buf[6] = 123; + buf[0] = 124; + buf[5] = 125; + buf[2] = 126; + buf[1] = 127; + return rst; +} + +static inline uint64_t key_gen_hash_key() { + uint64_t rst; + uint8_t volatile* buf = reinterpret_cast(&rst); + buf[2] = 101; + buf[5] = 102; + buf[4] = 103; + buf[7] = 104; + buf[1] = 105; + buf[3] = 106; + buf[6] = 107; + buf[0] = 108; + return rst; +} +} // namespace rc4 +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/decryption/rc4/rc4_cryption_impl.cpp b/lite/src/decryption/rc4/rc4_cryption_impl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ca477bcfbce0e922788a44820d6e7460c239a8a8 --- /dev/null +++ b/lite/src/decryption/rc4/rc4_cryption_impl.cpp @@ -0,0 +1,219 @@ +/** + * \file src/decryption/rc4/rc4_cryption_impl.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "rc4_cryption_impl.h" +#include "../../misc.h" + +#include + +using namespace lite; + +/*! 
+ * \brief Read the input stream once in order to initialize the decryption
+ * state.
+ */
+void RC4Impl::init_rc4_state() {
+    rc4::RC4RandStream enc_stream(m_enc_key);
+    rc4::FastHash64 dechash(m_hash_key);
+
+    size_t offset = 0;
+
+    std::vector<uint64_t> buffer(128);
+    size_t remaining = m_model_length - sizeof(uint64_t);
+    while (remaining > 0) {
+        size_t toread = std::min(remaining, buffer.size() * sizeof(uint64_t));
+        memcpy(buffer.data(), static_cast<const uint8_t*>(m_model_mem) + offset,
+               toread);
+        offset += toread;
+        remaining -= toread;
+
+        for (size_t i = 0; i < toread / sizeof(uint64_t); ++i) {
+            uint64_t value = buffer[i];
+            value ^= enc_stream.next64();
+            dechash.feed(value);
+        }
+    }
+
+    uint64_t hashvalue;
+    memcpy(&hashvalue, static_cast<const uint8_t*>(m_model_mem) + offset,
+           sizeof(hashvalue));
+    offset += sizeof(hashvalue);
+
+    hashvalue ^= dechash.get() ^ enc_stream.next64();
+    m_state.hash_stream.reset(hashvalue);
+    m_state.enc_stream.reset(m_enc_key);
+}
+
+std::vector<uint8_t> RC4Impl::decrypt_model() {
+    std::vector<uint8_t> result(m_model_length, 0);
+
+    uint8_t* ptr = result.data();
+    for (size_t i = 0; i < m_model_length; ++i) {
+        ptr[i] = static_cast<const uint8_t*>(m_model_mem)[i];
+        ptr[i] ^= m_state.hash_stream.next8() ^ m_state.enc_stream.next8();
+    }
+    return result;
+}
+
+/*! \brief Encrypt the data in m_buffer.
+ *
+ * The basic idea is to calculate a 64-bit hash from the buffer and append
+ * it to the end of the buffer. The requirement is that changing any byte,
+ * including the appended hash value, corrupts every byte of the recovered
+ * model.
+ *
+ * Encryption:
+ *
+ * 1. First calculate a 64-bit hash, called the plain hash value, from the
+ *    buffer.
+ * 2. Initialize an RC4 stream with the plain hash value.
+ * 3. Obfuscate the model body with the RC4 stream defined in step 2.
+ * 4. Calculate the hash value of the obfuscated model, called the hash
+ *    value after hashing.
+ * 5. Encrypt the model body with an RC4 stream made from the encryption key.
+ * 6. Bit-xor the hash value after hashing with the plain hash value, called
+ *    the mixed hash.
+ * 7. Encrypt the mixed hash with the RC4 stream defined in step 5, called
+ *    the protected hash.
+ * 8. Append the protected hash to the buffer.
+ *
+ * Decryption:
+ * 1. Decrypt the model body with an RC4 stream made from the encryption key,
+ *    which reverses steps 5 and 7 of encryption and yields the mixed hash.
+ * 2. Calculate the hash value of the decrypted model, which equals the
+ *    hash value after hashing in step 4 of encryption.
+ * 3. Bit-xor the hash value after hashing and the mixed hash to get the
+ *    plain hash value, which reverses step 6 of encryption.
+ * 4. Un-obfuscate the model body with the plain hash value, which reverses
+ *    step 3 of encryption.
+ *
+ * Think:
+ * 1. If any byte in the model body is broken, the hash value after hashing
+ *    will be broken in step 2, hence the plain hash value in step 3 will
+ *    also be broken, and finally the model body will be broken in step 4.
+ * 2. If the protected hash is broken, the plain hash value in step 3 will
+ *    be broken, and finally the model body will be broken.
+ */
+std::vector<uint8_t> RC4Impl::encrypt_model() {
+    size_t total_length = (m_model_length + (sizeof(size_t) - 1)) /
+                          sizeof(size_t) * sizeof(size_t);
+    std::vector<uint8_t> pad_model(total_length, 0);
+    memcpy(pad_model.data(), m_model_mem, m_model_length);
+
+    // Calculate the hash of the model.
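+    // Step 1 of the scheme documented above: hash the zero-padded model
+    // 64 bits at a time to obtain the plain hash value. It seeds the
+    // obfuscation stream in step 2 and is folded into the protected hash
+    // in step 6.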
+    rc4::FastHash64 plainhash(m_hash_key);
+    uint64_t* ptr = reinterpret_cast<uint64_t*>(pad_model.data());
+    size_t len = pad_model.size() / sizeof(uint64_t);
+
+    for (size_t i = 0; i < len; ++i)
+        plainhash.feed(ptr[i]);
+    uint64_t plainhash_value = plainhash.get();
+
+    // Encrypt the model.
+    rc4::RC4RandStream hash_enc(plainhash_value);
+    rc4::RC4RandStream outmost_enc(m_enc_key);
+    rc4::FastHash64 afterhashenc_hash(m_hash_key);
+
+    for (size_t i = 0; i < len; ++i) {
+        uint64_t value = ptr[i] ^ hash_enc.next64();
+        afterhashenc_hash.feed(value);
+        ptr[i] = value ^ outmost_enc.next64();
+    }
+
+    uint64_t protected_hash =
+            plainhash_value ^ afterhashenc_hash.get() ^ outmost_enc.next64();
+
+    size_t end = pad_model.size();
+    pad_model.resize(pad_model.size() + sizeof(uint64_t));
+    ptr = reinterpret_cast<uint64_t*>(&pad_model[end]);
+    *ptr = protected_hash;
+    return pad_model;
+}
+
+/*!
+ * \brief Read the input stream once in order to initialize the decryption
+ * state.
+ */
+void SimpleFastRC4Impl::init_sfrc4_state() {
+    rc4::RC4RandStream enc_stream(m_enc_key);
+    rc4::FastHash64 dechash(m_hash_key);
+
+    size_t offset = 0;
+    std::vector<uint64_t> buffer(128);
+    size_t remaining = m_model_length - sizeof(uint64_t);
+    while (remaining > 0) {
+        size_t toread = std::min(remaining, buffer.size() * sizeof(uint64_t));
+        memcpy(buffer.data(), static_cast<const uint8_t*>(m_model_mem) + offset,
+               toread);
+        offset += toread;
+        remaining -= toread;
+
+        for (size_t i = 0; i < toread / sizeof(uint64_t); ++i) {
+            uint64_t value = buffer[i];
+            dechash.feed(value);
+        }
+    }
+
+    uint64_t hashvalue;
+    memcpy(&hashvalue, static_cast<const uint8_t*>(m_model_mem) + offset,
+           sizeof(hashvalue));
+
+    offset += sizeof(hashvalue);
+
+    /*! \brief Verify the stored hash value. */
+    if (hashvalue != dechash.get())
+        LITE_THROW(
+                "The checksum of the file cannot be verified. The file may "
+                "have been encrypted with a different algorithm or key.");
+
+    m_state.hash_stream.reset(m_hash_key);
+    m_state.enc_stream.reset(m_enc_key);
+}
+
+std::vector<uint8_t> SimpleFastRC4Impl::decrypt_model() {
+    std::vector<uint8_t> result(m_model_length, 0);
+    uint8_t* ptr = result.data();
+    for (size_t i = 0; i < m_model_length; ++i) {
+        ptr[i] = static_cast<const uint8_t*>(m_model_mem)[i];
+        ptr[i] ^= m_state.enc_stream.next8();
+    }
+    return result;
+}
+
+std::vector<uint8_t> SimpleFastRC4Impl::encrypt_model() {
+    size_t total_length = (m_model_length + (sizeof(size_t) - 1)) /
+                          sizeof(size_t) * sizeof(size_t);
+    std::vector<uint8_t> pad_model(total_length, 0);
+    memcpy(pad_model.data(), m_model_mem, m_model_length);
+
+    // Calculate the hash of the model.
+    rc4::FastHash64 enchash(m_hash_key);
+    uint64_t* ptr = reinterpret_cast<uint64_t*>(pad_model.data());
+    size_t len = pad_model.size() / sizeof(uint64_t);
+
+    // Encrypt the model.
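+    // SimpleFastRC4 is a lighter variant of the scheme documented above for
+    // RC4Impl: the body is only XOR-ed with a single RC4 stream derived from
+    // the encryption key, and a FastHash64 checksum of the encrypted words
+    // is appended so that init_sfrc4_state() can verify integrity before
+    // decryption.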
+ rc4::RC4RandStream out_enc(m_enc_key); + for (size_t i = 0; i < len; ++i) { + ptr[i] = ptr[i] ^ out_enc.next64(); + enchash.feed(ptr[i]); + } + + uint64_t hash_value = enchash.get(); + + size_t end = pad_model.size(); + pad_model.resize(pad_model.size() + sizeof(uint64_t)); + ptr = reinterpret_cast(&pad_model[end]); + *ptr = hash_value; + + return pad_model; +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/decryption/rc4/rc4_cryption_impl.h b/lite/src/decryption/rc4/rc4_cryption_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..b5c88e836729a5e27662a0209c9d0163310bfede --- /dev/null +++ b/lite/src/decryption/rc4/rc4_cryption_impl.h @@ -0,0 +1,79 @@ +/** + * \file src/decryption/rc4/rc4_cryption_impl.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ +#pragma once +#include "rc4_cryption_base.h" + +#include +#include + +namespace lite { + +class RC4Impl { + struct RC4State { + rc4::RC4RandStream enc_stream; + rc4::RC4RandStream hash_stream; + } m_state; + +public: + RC4Impl(const void* model_mem, size_t size, const std::vector& key) + : m_model_mem(model_mem), m_model_length(size) { + const uint8_t* data = key.data(); + m_hash_key = *reinterpret_cast(data); + m_enc_key = *reinterpret_cast(data + 8); + } + + std::vector encrypt_model(); + std::vector decrypt_model(); + + /*! \brief Read the input stream once in order to initialize the decryption + * state. + */ + void init_rc4_state(); + +private: + const void* m_model_mem; + size_t m_model_length; + + uint64_t m_hash_key; + uint64_t m_enc_key; +}; + +class SimpleFastRC4Impl { + struct SFRC4State { + rc4::RC4RandStream enc_stream; + rc4::RC4RandStream hash_stream; + } m_state; + +public: + SimpleFastRC4Impl(const void* model_mem, size_t size, + const std::vector& key) + : m_model_mem(model_mem), m_model_length(size) { + const uint8_t* data = key.data(); + m_hash_key = *reinterpret_cast(data); + m_enc_key = *reinterpret_cast(data + 8); + } + std::vector encrypt_model(); + std::vector decrypt_model(); + + /*! \brief Read the input stream once in order to initialize the decryption + * state. + */ + void init_sfrc4_state(); + +private: + const void* m_model_mem; + size_t m_model_length; + + uint64_t m_hash_key; + uint64_t m_enc_key; +}; + +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/decryption/rc4_cryption.cpp b/lite/src/decryption/rc4_cryption.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d8105deec04b40350f5b9b356fb996a8bdcfaf53 --- /dev/null +++ b/lite/src/decryption/rc4_cryption.cpp @@ -0,0 +1,58 @@ +/** + * \file src/decryption/rc4_cryption.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
+ */ + +#include "rc4_cryption.h" +#include "rc4/rc4_cryption_impl.h" + +#include + +using namespace lite; + +std::vector RC4::decrypt_model(const void* model_mem, size_t size, + const std::vector& key) { + RC4Impl rc4_impl(model_mem, size, key); + rc4_impl.init_rc4_state(); + return rc4_impl.decrypt_model(); +} + +std::vector RC4::encrypt_model(const void* model_mem, size_t size, + const std::vector& key) { + RC4Impl rc4_impl(model_mem, size, key); + return rc4_impl.encrypt_model(); +} + +std::vector RC4::get_decrypt_key() { + std::vector keys(128, 0); + uint64_t* data = reinterpret_cast(keys.data()); + data[0] = rc4::key_gen_hash_key(); + data[1] = rc4::key_gen_enc_key(); + return keys; +}; + +std::vector SimpleFastRC4::decrypt_model( + const void* model_mem, size_t size, const std::vector& key) { + SimpleFastRC4Impl simple_fast_rc4_impl(model_mem, size, key); + simple_fast_rc4_impl.init_sfrc4_state(); + return simple_fast_rc4_impl.decrypt_model(); +} +std::vector SimpleFastRC4::encrypt_model( + const void* model_mem, size_t size, const std::vector& key) { + SimpleFastRC4Impl simple_fast_rc4_impl(model_mem, size, key); + return simple_fast_rc4_impl.encrypt_model(); +} + +std::vector SimpleFastRC4::get_decrypt_key() { + std::vector keys(128, 0); + uint64_t* data = reinterpret_cast(keys.data()); + data[0] = rc4::key_gen_hash_key(); + data[1] = rc4::key_gen_enc_key(); + return keys; +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/decryption/rc4_cryption.h b/lite/src/decryption/rc4_cryption.h new file mode 100644 index 0000000000000000000000000000000000000000..1c5c9f89aded18559ecef610cf4cb31650ca604c --- /dev/null +++ b/lite/src/decryption/rc4_cryption.h @@ -0,0 +1,44 @@ +/** + * \file src/decryption/rc4_cryption.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ +#pragma once + +#include "rc4/rc4_cryption_base.h" + +#include + +namespace lite { + +class RC4 { +public: + static std::vector decrypt_model(const void* model_mem, + size_t size, + const std::vector& key); + + static std::vector encrypt_model(const void* model_mem, + size_t size, + const std::vector& key); + + static std::vector get_decrypt_key(); +}; + +class SimpleFastRC4 { +public: + static std::vector decrypt_model(const void* model_mem, + size_t size, + const std::vector& key); + static std::vector encrypt_model(const void* model_mem, + size_t size, + const std::vector& key); + + static std::vector get_decrypt_key(); +}; + +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/function_base.h b/lite/src/function_base.h new file mode 100644 index 0000000000000000000000000000000000000000..42c1abe8a0e5a11dd9330be3141e839cda4768ba --- /dev/null +++ b/lite/src/function_base.h @@ -0,0 +1,53 @@ +/** + * \file src/function_base.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
+ */ + +#pragma once +#include +#include "misc.h" +#include "type_info.h" +// template +namespace lite { +class TensorImplDft; +class NetworkImplDft; +namespace { + +template +struct class_type_name { + std::string operator()() { return ""; } +}; +#define ADD_STATEMENT(class_name, backend_name) \ + template <> \ + struct class_type_name { \ + std::string operator()() { return #backend_name; } \ + } +ADD_STATEMENT(TensorImplDft, Dft); +ADD_STATEMENT(NetworkImplDft, Dft); +#undef ADD_STATEMENT +} // namespace + +// if it can't find the function, ignore +template +ret_type try_call_func(std::string func_name, Args... args) { + mark_used_variable(func_name); + mark_used_variable(args...); + return nullptr; +} + +// if it can't find the function, throw error +template +ret_type call_func(std::string func_name, Args... args) { + mark_used_variable(args...); + auto backend_name = class_type_name()(); + auto msg_info = + func_name + " is not aviliable in " + backend_name + " backend."; + LITE_THROW(msg_info.c_str()); +} +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/global.cpp b/lite/src/global.cpp new file mode 100644 index 0000000000000000000000000000000000000000..378127e2fdb4e0e3f72376b6b5aa40eb0bba6e4b --- /dev/null +++ b/lite/src/global.cpp @@ -0,0 +1,256 @@ +/** + * \file src/global.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include + +#include "lite/global.h" +#include "decryption/aes_decrypt.h" +#include "decryption/decrypt_base.h" +#include "decryption/rc4_cryption.h" +#include "misc.h" +#include "parse_info/parse_info_base.h" +#include "parse_info/default_parse.h" + +#if LITE_BUILD_WITH_MGE +#include "megbrain/common.h" +#include "megbrain/comp_node.h" +#include "megbrain/serialization/extern_c_opr.h" +#include "megbrain/version.h" +#include "megcore_opencl.h" +#include "mge/algo_cache/file_cache.h" +#include "mge/common.h" +#if MGB_ENABLE_TENSOR_RT +#include "megbrain/tensorrt/tensorrt_engine_cache.h" +#endif +#if LITE_WITH_CUDA +#include "mge/algo_cache/redis_cache.h" +#endif +#endif + +#include +#include + +using namespace lite; + +lite::DecryptionStaticData& lite::decryption_static_data() { + static lite::DecryptionStaticData global_map; + return global_map; +} + +void lite::get_version(int& major, int& minor, int& patch) { +#if LITE_BUILD_WITH_MGE + auto version = mgb::get_version(); + major = version.major; + minor = version.minor; + patch = version.patch; +#else + //! 
without mge, the version set the max version + major = 8; + minor = 9999; + patch = 0; +#endif +} + +size_t lite::get_device_count(LiteDeviceType device_type) { +#if LITE_BUILD_WITH_MGE + auto mgb_device_type = to_compnode_locator(device_type).type; + return mgb::CompNode::get_device_count(mgb_device_type); +#else + LITE_MARK_USED_VAR(device_type); + LITE_THROW("no lite backend avialible, please check build macro."); +#endif +} + +bool lite::register_decryption_and_key(std::string decrypt_name, + const DecryptionFunc& func, + const std::vector& key) { + LITE_LOCK_GUARD(decryption_static_data().map_mutex); + auto& global_map = decryption_static_data().decryption_methods; + if (global_map.find(decrypt_name) != global_map.end()) { + LITE_THROW(ssprintf("The decryption method %s is already registered.", + decrypt_name.c_str())); + return false; + } else { + auto key_pointer = std::make_shared>(key); + global_map[decrypt_name] = {func, key_pointer}; + LITE_LOG("Registered ecryption method %s.", decrypt_name.c_str()); + return true; + } +} + +bool lite::update_decryption_or_key(std::string decrypt_name, + const DecryptionFunc& func, + const std::vector& key) { + LITE_LOCK_GUARD(decryption_static_data().map_mutex); + auto& global_map = decryption_static_data().decryption_methods; + if (global_map.find(decrypt_name) != global_map.end()) { + std::shared_ptr> key_pointer; + DecryptionFunc new_func; + if (func) { + new_func = func; + LITE_LOG("%s decryption function is updated.", + decrypt_name.c_str()); + } else { + new_func = global_map[decrypt_name].first; + } + if (key.size()) { + key_pointer = std::make_shared>(key); + LITE_LOG("%s decryption key is updated.", decrypt_name.c_str()); + } else { + key_pointer = global_map[decrypt_name].second; + } + global_map[decrypt_name] = {new_func, key_pointer}; + return true; + } else { + LITE_THROW(ssprintf("The decryption method %s is not registered.", + decrypt_name.c_str())); + return false; + } +} + +lite::ParseInfoStaticData& lite::parse_info_static_data() { + static lite::ParseInfoStaticData global_map; + return global_map; +} + +bool lite::register_parse_info_func(std::string info_type, + const ParseInfoFunc& parse_func) { + LITE_LOCK_GUARD(parse_info_static_data().map_mutex); + auto& global_map = parse_info_static_data().parse_info_methods; + if (global_map.find(info_type) != global_map.end()) { + LITE_THROW(ssprintf("The parse info method %s is already registered.", + info_type.c_str())); + return false; + } else { + global_map[info_type] = parse_func; + LITE_LOG("Registered infomation parser method %s.", info_type.c_str()); + return true; + } +} + +#if LITE_BUILD_WITH_MGE + +namespace { +struct CacheControl { + LITE_MUTEX cache_mutex; + std::string cache_type = "file"; + std::atomic_size_t config_algo_times{0}; + std::atomic_size_t config_trt_times{0}; +}; +CacheControl cache_control; +} // namespace + + +void lite::try_coalesce_all_free_memory() { + mgb::CompNode::try_coalesce_all_free_memory(); +} + +void lite::set_loader_lib_path(const std::string& loader_path) { + const char* lib_path = loader_path.c_str(); + LITE_LOG("load a device loader of path %s.", lib_path); + auto handle = dlopen(lib_path, RTLD_LAZY); + LITE_ASSERT(handle, "failed to open c opr lib %s: %s", lib_path, dlerror()); + const char* entry = MGB_C_OPR_INIT_FUNC_STR; + auto func = dlsym(handle, entry); + LITE_ASSERT(func, "can not resolve %s: %s", entry, dlerror()); + typedef void (*entry_f_t)(void*); + reinterpret_cast(func)( + 
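`register_decryption_and_key` stores a method under its name together with a shared copy of the key, and `update_decryption_or_key` lets either part be swapped later (an empty function or key keeps the existing one). A sketch of registering a user-defined method at runtime; the toy XOR cipher and the name `"XOR_demo"` are made up for illustration, and `DecryptionFunc` is assumed to have the same `(const void*, size_t, const std::vector<uint8_t>&) -> std::vector<uint8_t>` shape as the built-in `decrypt_model` functions registered at the end of this file:

```cpp
#include <cstdint>
#include <vector>

#include "lite/global.h"  // register_decryption_and_key / update_decryption_or_key

// Hypothetical toy cipher: XOR every byte with a rolling key byte.  It only
// illustrates the registration flow, it is not a real encryption scheme.
static std::vector<uint8_t> xor_decrypt(const void* model_mem, size_t size,
                                        const std::vector<uint8_t>& key) {
    auto* src = static_cast<const uint8_t*>(model_mem);
    std::vector<uint8_t> out(size);
    for (size_t i = 0; i < size; ++i) {
        out[i] = src[i] ^ key[i % key.size()];
    }
    return out;
}

static void register_xor_method() {
    std::vector<uint8_t> key = {0x12, 0x34, 0x56, 0x78};
    // Throws if "XOR_demo" is already registered.
    lite::register_decryption_and_key("XOR_demo", xor_decrypt, key);
    // Later, the key (or the function) can be replaced without re-registering;
    // an empty function argument keeps the registered one.
    lite::update_decryption_or_key("XOR_demo", {}, {0x9a, 0xbc, 0xde, 0xf0});
}
```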
reinterpret_cast(&mgb_get_extern_c_opr_api_versioned)); +} + +void lite::set_persistent_cache(const std::string& cache_path, + bool always_sync) { + LITE_LOCK_GUARD(cache_control.cache_mutex); + cache_control.cache_type = "file"; + if (cache_control.config_algo_times >= 1) { + LITE_WARN( + "The cache has been set,maybe some model is using now, change " + "it now may cause unknow error!!"); + } + cache_control.config_algo_times++; + mgb::PersistentCache::set_impl(std::make_shared( + cache_path.c_str(), always_sync)); +} + +void lite::dump_persistent_cache(const std::string& cache_path) { + LITE_LOCK_GUARD(cache_control.cache_mutex); + LITE_ASSERT(cache_control.cache_type == "file", + "now cache type is redis, it can't be dumped."); + static_cast(mgb::PersistentCache::inst()) + .dump_cache(cache_path.c_str()); +} + +//! Set the TensorRT engine cache path for serialized prebuilt ICudaEngine +void lite::set_tensor_rt_cache(std::string tensorrt_cache_path) { +#if MGB_ENABLE_TENSOR_RT + LITE_LOCK_GUARD(cache_control.cache_mutex); + if (cache_control.config_trt_times >= 1) { + LITE_WARN( + "The trt cache has been set,maybe some model is using now, " + "change it now may cause unknow error!!"); + } + cache_control.config_trt_times++; + mgb::TensorRTEngineCache::enable_engine_cache(true); + mgb::TensorRTEngineCache::set_impl( + std::make_shared(tensorrt_cache_path)); +#else + LITE_MARK_USED_VAR(tensorrt_cache_path); + LITE_THROW("TensorRT is disable at compile time."); +#endif +} + +void lite::dump_tensor_rt_cache() { +#if MGB_ENABLE_TENSOR_RT + if (mgb::TensorRTEngineCache::enable_engine_cache()) { + mgb::TensorRTEngineCache::inst().dump_cache(); + } +#else + LITE_THROW("TensorRT is disable at compile time."); +#endif +} + +#else //LITE_BUILD_WITH_MGE +void lite::try_coalesce_all_free_memory() {} + +void lite::set_loader_lib_path(const std::string& ) { + LITE_THROW("mge is disbale at build time, please build with mge"); +} + +void lite::set_persistent_cache(const std::string&, bool) { + LITE_THROW("mge is disbale at build time, please build with mge"); +} + +void lite::dump_persistent_cache(const std::string& ) { + LITE_THROW("mge is disbale at build time, please build with mge"); +} + +//! Set the TensorRT engine cache path for serialized prebuilt ICudaEngine +void lite::set_tensor_rt_cache(std::string ) { + LITE_THROW("mge is disbale at build time, please build with mge"); +} + +void lite::dump_tensor_rt_cache() { + LITE_THROW("mge is disbale at build time, please build with mge"); +} +#endif +namespace lite { +REGIST_DECRYPTION_METHOD("AES_default", lite::AESDcryption::decrypt_model, + lite::AESDcryption::get_decrypt_key()); + +REGIST_DECRYPTION_METHOD("RC4_default", lite::RC4::decrypt_model, + lite::RC4::get_decrypt_key()); + +REGIST_DECRYPTION_METHOD("SIMPLE_FAST_RC4_default", + lite::SimpleFastRC4::decrypt_model, + lite::SimpleFastRC4::get_decrypt_key()); + +REGIST_PARSE_INFO_FUNCTION("LITE_default", lite::default_parse_info); +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/lite_build_config.h.in b/lite/src/lite_build_config.h.in new file mode 100644 index 0000000000000000000000000000000000000000..e1607948da80840360da040e0de8d68b203dfa98 --- /dev/null +++ b/lite/src/lite_build_config.h.in @@ -0,0 +1,37 @@ +/** + * \file lite/src/lite_build_config.h.in + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
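`set_persistent_cache` installs a process-wide `InFilePersistentCache` for fast-run algorithm choices and `dump_persistent_cache` writes it back to disk; the TensorRT pair works the same way for serialized engines. A typical flow, sketched with placeholder paths:

```cpp
#include <string>

#include "lite/global.h"

// Sketch: enable the fast-run algo cache before networks are loaded, run
// inference in between, then persist the collected algorithm choices.
void run_with_algo_cache() {
    const std::string algo_cache = "/tmp/megengine_algo.cache";  // placeholder path

    // Install a file-backed PersistentCache.  With always_sync=false the cache
    // stays in memory until dump_persistent_cache() is called.
    lite::set_persistent_cache(algo_cache, /*always_sync=*/false);

    // ... load networks and run inference here ...

    // Write the accumulated entries back to disk for the next run.
    lite::dump_persistent_cache(algo_cache);
}
```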
+ */ +#ifndef _HEADER_LITE_BUILD_CONFIG +#define _HEADER_LITE_BUILD_CONFIG + +#cmakedefine01 LITE_ENABLE_LOGGING +#cmakedefine01 LITE_ENABLE_EXCEPTION +#cmakedefine01 LITE_WITH_CUDA +#cmakedefine01 LITE_ASSERT_LOC + +#ifndef LITE_ENABLE_LOGGING +#define LITE_ENABLE_LOGGING 1 +#endif + +#ifndef LITE_ENABLE_EXCEPTION +#if __cpp_exceptions || __EXCEPTIONS || \ + (defined(_MSC_VER) && defined(_CPPUNWIND)) +#define LITE_ENABLE_EXCEPTION 1 +#else +#define LITE_ENABLE_EXCEPTION 0 +#endif +#endif + +#ifndef LITE_WITH_CUDA +#define LITE_WITH_CUDA 0 +#endif + +#ifndef LITE_ASSERT_LOC +#define LITE_ASSERT_LOC 0 +#endif +#endif // _HEADER_LITE_BUILD_CONFIG diff --git a/lite/src/mge/algo_cache/file_cache.cpp b/lite/src/mge/algo_cache/file_cache.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7d96d4b0bcd91f042d8e699227150c95b33da0d4 --- /dev/null +++ b/lite/src/mge/algo_cache/file_cache.cpp @@ -0,0 +1,254 @@ +/** + * \file lite/src/mge/algo_cache/file_cache.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2020 Megvii Inc. All rights reserved. + */ + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "../common.h" +#include "file_cache.h" + +using namespace lite; + +//////////////////////// InFilePersistentCache::InputMemory /////////////// +class InFilePersistentCache::InputMemory { + const uint8_t* m_ptr; + size_t m_offset = 0; + size_t m_size; + +public: + InputMemory(const uint8_t* bin, size_t size) : m_ptr{bin}, m_size{size} {} + + template + void read(T& val) { + static_assert(std::is_trivially_copyable::value, + "only support trivially copyable type"); + LITE_ASSERT(m_offset + sizeof(T) <= m_size); + memcpy(&val, m_ptr, sizeof(T)); + m_offset += sizeof(T); + m_ptr += sizeof(T); + } + + template + void read(T* buf, size_t size) { + static_assert(std::is_trivially_copyable::value && sizeof(T) == 1, + "only support read bytes"); + LITE_ASSERT(m_offset + size <= m_size); + memcpy(buf, m_ptr, size); + m_offset += size; + m_ptr += size; + } +}; + +//////////////////////// InFilePersistentCache::InputFile /////////////// +class InFilePersistentCache::InputFile { + FILE* m_fp; + +public: + InputFile(const char* path) : m_fp{fopen(path, "rb")} { + LITE_ASSERT(m_fp, "failed to open %s: %s", path, strerror(errno)); + } + ~InputFile() { + if (m_fp) { + fclose(m_fp); + } + } + + template + void read(T& val) { + static_assert(std::is_trivially_copyable::value, + "only support trivially copyable type"); + auto ret = fread(&val, sizeof(T), 1, m_fp); + LITE_ASSERT(ret == 1); + } + + template + void read(T* buf, size_t size) { + static_assert(std::is_trivially_copyable::value && sizeof(T) == 1, + "only support read bytes"); + auto ret = fread(buf, size, 1, m_fp); + LITE_ASSERT(ret == 1); + } +}; + +//////////////////////// InFilePersistentCache::OutputFile /////////////// +class InFilePersistentCache::OutputFile { + FILE* m_fp; + +public: + OutputFile(const char* path) : m_fp{fopen(path, "wb")} { + LITE_ASSERT(m_fp, "failed to open %s: %s", path, strerror(errno)); + } + ~OutputFile() { + if (m_fp) { + fclose(m_fp); + } + } + + template + void write(T val) { + auto ret = fwrite(&val, sizeof(T), 1, m_fp); + LITE_ASSERT(ret == 1); + } + + template + void write(const T* buf, size_t size) { + static_assert(sizeof(T) == 1, "only support write bytes"); + auto ret = fwrite(buf, size, 1, m_fp); + LITE_ASSERT(ret == 1); + } + + void flush() { fflush(m_fp); } + + void set_head() { fseek(m_fp, 0, 
SEEK_SET); } +}; + +//////////////////////// InFilePersistentCache::BlobStorage /////////////// + +template +InFilePersistentCache::BlobStorage& +InFilePersistentCache::BlobStorage::init_from_input(Input& inp) { + uint32_t data_size; + inp.read(data_size); + size = data_size; + data_refhold = std::make_unique(size); + inp.read(data_refhold.get(), size); + ptr = data_refhold.get(); + return *this; +} + +void InFilePersistentCache::BlobStorage::write_to_file( + OutputFile& out_file) const { + uint32_t u_size = size; + out_file.write(u_size); + out_file.write(data_refhold.get(), u_size); +} + +InFilePersistentCache::BlobStorage& +InFilePersistentCache::BlobStorage::init_data_ref(const Blob& b) { + data_refhold = std::make_unique(b.size + 1); + memcpy(data_refhold.get(), b.ptr, b.size); + data_refhold.get()[b.size] = 0; // for C-string safety + ptr = data_refhold.get(); + size = b.size; + return *this; +} + +//////////////////////// InFilePersistentCache ////////////////////// + +template +void InFilePersistentCache::read_cache(Input& inp) { + uint32_t nr_category; + inp.read(nr_category); + char category_buf[256]; + for (uint32_t i = 0; i < nr_category; i++) { + uint32_t category_size; + inp.read(category_size); + inp.read(category_buf, category_size); + category_buf[category_size] = '\0'; + + std::string category(category_buf); + mgb_log_debug("load new category: %s", category_buf); + + // read bobs + uint32_t nr_bobs; + inp.read(nr_bobs); + for (uint32_t j = 0; j < nr_bobs; j++) { + BlobStorage key_storage; + key_storage.init_from_input(inp).init_hash(); + mgb_log_debug("read key: %zu", key_storage.hash); + m_cache[category][std::move(key_storage)].init_from_input(inp); + } + } +} + +InFilePersistentCache::InFilePersistentCache(const char* path, + bool always_open) { + if (!access(path, F_OK)) { + mgb_log_debug("use fastrun cache: %s", path); + InputFile inp(path); + read_cache(inp); + } + if (always_open) { + m_always_open_file = std::make_shared(path); + } +} + +InFilePersistentCache::InFilePersistentCache(const uint8_t* bin, size_t size) { + LITE_ASSERT(bin); + InputMemory inp(bin, size); + read_cache(inp); +} + +void InFilePersistentCache::dump_cache(const char* path) { + OutputFile out_file(path); + dump_cache(&out_file); +} + +void InFilePersistentCache::dump_cache(OutputFile* out_file) { + uint32_t nr_category = m_cache.size(); + out_file->write(nr_category); + + for (const auto& cached_category : m_cache) { + uint32_t category_size = cached_category.first.size(); + out_file->write(category_size); + out_file->write(cached_category.first.data(), category_size); + mgb_log_debug("write new category: %s", cached_category.first.c_str()); + + uint32_t nr_bobs = cached_category.second.size(); + out_file->write(nr_bobs); + for (const auto& item : cached_category.second) { + mgb_log_debug("dump key: %zu", item.first.hash); + item.first.write_to_file(*out_file); + item.second.write_to_file(*out_file); + } + } +} + +mgb::Maybe InFilePersistentCache::get( + const std::string& category, const Blob& key) { + decltype(m_cache.begin()) iter0; + { + MGB_LOCK_GUARD(m_mtx); + iter0 = m_cache.find(category); + if (iter0 == m_cache.end()) + return mgb::None; + } + + BlobStorage key_storage; + key_storage.Blob::operator=(key); + key_storage.init_hash(); + + MGB_LOCK_GUARD(m_mtx); + + auto iter1 = iter0->second.find(key_storage); + if (iter1 == iter0->second.end()) + return mgb::None; + return iter1->second; +} + +void InFilePersistentCache::put(const std::string& category, const Blob& key, + const Blob& 
value) { + BlobStorage key_storage; + key_storage.init_data_ref(key).init_hash(); + + MGB_LOCK_GUARD(m_mtx); + auto size0 = m_cache.size(); + m_cache[category][std::move(key_storage)].init_data_ref(value); + if (m_cache.size() > size0) { + mgb_log_debug("new cache category: %s", category.c_str()); + } + if (m_always_open_file) { + m_always_open_file->set_head(); + dump_cache(m_always_open_file.get()); + m_always_open_file->flush(); + } +} +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/mge/algo_cache/file_cache.h b/lite/src/mge/algo_cache/file_cache.h new file mode 100644 index 0000000000000000000000000000000000000000..340780ec8009cc595abee184296fe2a4cef21716 --- /dev/null +++ b/lite/src/mge/algo_cache/file_cache.h @@ -0,0 +1,85 @@ +/** + * \file lite/src/mge/algo_cache/file_cache.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2020 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "lite_build_config.h" +#if LITE_BUILD_WITH_MGE + +#include "megbrain/utils/persistent_cache.h" + +namespace lite { + +/** + * dump format: + * + * all integers in local endian (effectively little endian as I can see) + * + * dump format: + * + * []* + */ +//! TODO: fix one thread set cache when other threads is using old cache +class InFilePersistentCache final : public mgb::PersistentCache { + class InputFile; + class InputMemory; + class OutputFile; + struct BlobStorage : public Blob { + std::unique_ptr data_refhold; + size_t hash = 0; + + template + BlobStorage& init_from_input(Input& inp); + void write_to_file(OutputFile& out_file) const; + BlobStorage& init_data_ref(const Blob& b); + + BlobStorage& init_hash() { + hash = mgb::XXHash{}.update(ptr, size).digest(); + return *this; + } + + bool operator==(const BlobStorage& rhs) const { + return size == rhs.size && !memcmp(ptr, rhs.ptr, size); + } + + struct Hash { + size_t operator()(const BlobStorage& b) const { return b.hash; } + }; + }; + std::unordered_map> + m_cache; + LITE_MUTEX m_mtx; + std::shared_ptr m_always_open_file; + + template + void read_cache(Input& inp); + +public: + InFilePersistentCache() = default; + InFilePersistentCache(const char* path, bool always_open = false); + InFilePersistentCache(const uint8_t* bin, size_t size); + + /** + * \warning You should invoke \c dump_cache mannually to save the cache + * file. + */ + void dump_cache(const char* path); + void dump_cache(OutputFile* out_file); + + mgb::Maybe get(const std::string& category, const Blob& key) override; + void put(const std::string& category, const Blob& key, + const Blob& value) override; +}; + +} // namespace lite +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/mge/algo_cache/redis_cache.cpp b/lite/src/mge/algo_cache/redis_cache.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ecf0ccbcb1604f98e61854cd4d827cf3037a34e0 --- /dev/null +++ b/lite/src/mge/algo_cache/redis_cache.cpp @@ -0,0 +1,241 @@ +/** + * \file lite/src/mge/algo_cache/redis_cache.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2020 Megvii Inc. All rights reserved. 
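For reference, the on-disk layout implied by `read_cache()` and `dump_cache()` above is a simple length-prefixed stream; category names are limited to 255 bytes by the reader's stack buffer:

```cpp
// Reconstructed layout of an InFilePersistentCache file, as produced by
// dump_cache() and consumed by read_cache() above:
//
//   uint32_t nr_category
//   repeated nr_category times:
//     uint32_t category_size
//     uint8_t  category[category_size]   // category name, no terminator on disk
//     uint32_t nr_blobs
//     repeated nr_blobs times:
//       uint32_t key_size
//       uint8_t  key[key_size]           // BlobStorage written by write_to_file()
//       uint32_t value_size
//       uint8_t  value[value_size]
//
// All integers are written in the host's native (little-endian) byte order.
```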
+ */ + +#include "lite_build_config.h" + +#if !defined(WIN32) && LITE_BUILD_WITH_MGE && LITE_WITH_CUDA +#include "../../misc.h" +#include "redis_cache.h" + +#include +#include + +namespace { + +/* +** Translation Table as described in RFC1113 +*/ +static const char cb64[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + +/* +** Translation Table to decode: +*https://github.com/dgiardini/imgcalkap/blob/master/base64.c +*/ +static const char cd64[] = + "|$$$}rstuvwxyz{$$$$$$$>?@ABCDEFGHIJKLMNOPQRSTUVW$$$$$$XYZ[\\]^_`" + "abcdefghijklmnopq"; + +/* +** encodeblock +** +** encode 3 8-bit binary bytes as 4 '6-bit' characters +*/ +void encodeblock(unsigned char in[3], unsigned char out[4], int len) { + out[0] = cb64[in[0] >> 2]; + out[1] = cb64[((in[0] & 0x03) << 4) | ((in[1] & 0xf0) >> 4)]; + out[2] = (unsigned char)(len > 1 ? cb64[((in[1] & 0x0f) << 2) | + ((in[2] & 0xc0) >> 6)] + : '='); + out[3] = (unsigned char)(len > 2 ? cb64[in[2] & 0x3f] : '='); +} + +/* +** decodeblock +** +** decode 4 '6-bit' characters into 3 8-bit binary bytes +*/ +void decodeblock(unsigned char in[4], unsigned char out[3]) { + out[0] = (unsigned char)(in[0] << 2 | in[1] >> 4); + out[1] = (unsigned char)(in[1] << 4 | in[2] >> 2); + out[2] = (unsigned char)(((in[2] << 6) & 0xc0) | in[3]); +} + +/** + * Encode string to base64 string + * @param input - source string + * @param outdata - target base64 string + * @param linesize - max size of line + */ +void encode(const std::vector& input, + std::vector& outdata, int linesize = 76) { + outdata.clear(); + + unsigned char in[3], out[4]; + int i, len, blocksout = 0; + size_t j = 0; + + auto* indata = reinterpret_cast(input.data()); + unsigned int insize = input.size(); + + while (j <= insize) { + len = 0; + for (i = 0; i < 3; i++) { + in[i] = (unsigned char)indata[j]; + j++; + if (j <= insize) { + len++; + } else { + in[i] = 0; + } + } + if (len) { + encodeblock(in, out, len); + for (i = 0; i < 4; i++) { + outdata.push_back(out[i]); + } + blocksout++; + } + if (blocksout >= (linesize / 4) || (j == insize)) { + if (blocksout) { + outdata.push_back('\r'); + outdata.push_back('\n'); + } + blocksout = 0; + } + } +} + +/** + * Decode base64 string ot source + * @param input - base64 string + * @param outdata - source string + */ +void decode(const std::vector& input, + std::vector& outdata) { + outdata.clear(); + + unsigned char in[4], out[3], v; + int i, len; + size_t j = 0; + + auto* indata = reinterpret_cast(input.data()); + unsigned int insize = input.size(); + + while (j <= insize) { + for (len = 0, i = 0; i < 4 && (j <= insize); i++) { + v = 0; + while ((j <= insize) && v == 0) { + v = (unsigned char)indata[j++]; + v = (unsigned char)((v < 43 || v > 122) ? 0 : cd64[v - 43]); + if (v) { + v = (unsigned char)((v == '$') ? 
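`encodeblock` follows the RFC 1113 translation table directly: three input bytes become four characters from `cb64`, with `=` padding when fewer than three bytes remain. A one-group sanity check, placed in the same translation unit as the helpers above:

```cpp
#include <cassert>

// Sketch: the classic example, the three bytes "Man" encode to "TWFu".
// encodeblock() is the anonymous-namespace helper defined above.
static void encodeblock_demo() {
    unsigned char in[3] = {'M', 'a', 'n'};
    unsigned char out[4];
    encodeblock(in, out, /*len=*/3);
    assert(out[0] == 'T' && out[1] == 'W' && out[2] == 'F' && out[3] == 'u');
}
```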
0 : v - 61); + } + } + if (j <= insize) { + len++; + if (v) { + in[i] = (unsigned char)(v - 1); + } + } else { + in[i] = 0; + } + } + if (len) { + decodeblock(in, out); + for (i = 0; i < len - 1; i++) { + outdata.push_back(out[i]); + } + } + } +} + +/** + * Encode binary data to base64 buffer + * @param input - source data + * @param outdata - target base64 buffer + * @param linesize + */ +void encode(const std::string& input, std::string& outdata, int linesize = 76) { + std::vector out; + std::vector in(input.begin(), input.end()); + encode(in, out, linesize); + outdata = std::string(out.begin(), out.end()); +} + +/** + * Decode base64 buffer to source binary data + * @param input - base64 buffer + * @param outdata - source binary data + */ +void decode(const std::string& input, std::string& outdata) { + std::vector in(input.begin(), input.end()); + std::vector out; + decode(in, out); + outdata = std::string(out.begin(), out.end()); +} + +} // namespace + +using namespace lite; + +RedisCache::RedisCache(std::string redis_ip, size_t port, std::string password) + : m_ip(redis_ip), m_port(port), m_password(password) { + m_client.auth(password); + m_client.connect( + m_ip, m_port, + [](const std::string& host, std::size_t port, + cpp_redis::connect_state status) { + if (status == cpp_redis::connect_state::dropped) { + LITE_LOG("client disconnected from %s.", host.c_str()); + LITE_LOG("Redis server connect to %s :%zu failed.", + host.c_str(), port); + } + }, + std::uint32_t(200)); +} + +mgb::Maybe RedisCache::get( + const std::string& category, const mgb::PersistentCache::Blob& key) { + LITE_LOCK_GUARD(m_mtx); + if (m_old == nullptr) { + return mgb::None; + } + auto mem_result = m_old->get(category, key); + if (mem_result.valid()) + return mem_result; + + std::string key_str(static_cast(key.ptr), key.size); + std::string redis_key_str; + encode(category + '@' + key_str, redis_key_str, 24); + auto result = m_client.get(redis_key_str); + m_client.sync_commit(std::chrono::milliseconds(100)); + LITE_ASSERT(is_valid()); + auto content = result.get(); + if (content.is_null()) + return mgb::None; + std::string decode_content; + decode(content.as_string(), decode_content); + m_old->put(category, key, {decode_content.data(), decode_content.length()}); + + return m_old->get(category, key); +} + +void RedisCache::put(const std::string& category, const Blob& key, + const mgb::PersistentCache::Blob& value) { + // ScopedTimer t1(std::string("put") + category); + LITE_LOCK_GUARD(m_mtx); + std::string key_str(static_cast(key.ptr), key.size); + std::string redis_key_str; + encode(category + '@' + key_str, redis_key_str); + std::string value_str(static_cast(value.ptr), value.size); + std::string redis_value_str; + encode(value_str, redis_value_str); + + auto result = m_client.set(redis_key_str, redis_value_str); + if (m_old == nullptr) { + return; + } + m_old->put(category, key, value); + m_client.sync_commit(std::chrono::milliseconds(100)); + LITE_ASSERT(is_valid()); +} +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/mge/algo_cache/redis_cache.h b/lite/src/mge/algo_cache/redis_cache.h new file mode 100644 index 0000000000000000000000000000000000000000..d0bd0032f629c8ad4872dd3fec71b5f61124391b --- /dev/null +++ b/lite/src/mge/algo_cache/redis_cache.h @@ -0,0 +1,47 @@ +/** + * \file lite/src/mge/algo_cache/redis_cache.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. 
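`RedisCache` treats the cache handed to `init()` as both a read-through fallback and a local mirror for entries fetched from Redis (the class declaration follows in `redis_cache.h` just below). A sketch of installing it process-wide with a file cache as the local tier; the endpoint and password are placeholders, and the guard mirrors the one used by `redis_cache.cpp`:

```cpp
#include <memory>

#include "lite_build_config.h"

#if !defined(WIN32) && LITE_BUILD_WITH_MGE && LITE_WITH_CUDA
#include "file_cache.h"
#include "redis_cache.h"

// Sketch: Redis-backed PersistentCache with an in-memory local tier.
void install_redis_algo_cache() {
    auto redis = std::make_shared<lite::RedisCache>(
            "127.0.0.1", 6379, "password");  // placeholder endpoint
    if (!redis->is_valid())
        return;  // connection dropped, keep the default cache

    // Entries fetched from Redis are mirrored into this local cache by get().
    redis->init(std::make_shared<lite::InFilePersistentCache>());
    mgb::PersistentCache::set_impl(redis);
}
#endif
```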
+ * + * \copyright Copyright (c) 2020-2020 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "lite_build_config.h" + +#if !defined(WIN32) && LITE_BUILD_WITH_MGE && LITE_WITH_CUDA +#include +#include +#include +#include "megbrain/utils/persistent_cache.h" + +namespace lite { + +//! TODO: fix one thread set cache when other threads is using old cache +class RedisCache final : public mgb::PersistentCache { +public: + RedisCache(std::string redis_ip, size_t port, std::string password); + + bool is_valid() { return m_client.is_connected(); } + ~RedisCache() {} + void init(std::shared_ptr old) { m_old = old; } + + mgb::Maybe get(const std::string& category, const Blob& key) override; + + void put(const std::string& category, const Blob& key, + const Blob& value) override; + +private: + std::shared_ptr m_old; + LITE_MUTEX m_mtx; + cpp_redis::client m_client; + const std::string m_ip; + const size_t m_port; + const std::string m_password; +}; + +} // namespace lite +#endif +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/mge/common.cpp b/lite/src/mge/common.cpp new file mode 100644 index 0000000000000000000000000000000000000000..08fdcaa756be423811ef77237c19d6c78517b980 --- /dev/null +++ b/lite/src/mge/common.cpp @@ -0,0 +1,191 @@ +/** + * \file src/mge/common.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "common.h" +#include "megdnn/dtype.h" + +using namespace lite; +using namespace mgb; + +enum class CompressionMethod { + NO_COMPRESSION = 0, + FLOAT32_STRIDE_FLOAT32_BASE_UINT8_WEIGHTS = 1, + FLOAT32_STRIDE_FLOAT32_BASE_UINT16_WEIGHTS = 2, +}; + +void lite::decompressed_tensor_value_loader( + void* ptr_, const mgb::TensorLayout& layout, + mgb::serialization::InputFile& fin) { + uint8_t compress_flag; + fin.read(&compress_flag, sizeof(compress_flag)); + size_t num_weights = layout.total_nr_elems(); + switch (CompressionMethod(compress_flag)) { + case CompressionMethod::NO_COMPRESSION: { + mgb::serialization::GraphLoadConfig::default_tensor_value_loader( + ptr_, layout, fin); + break; + } + case CompressionMethod::FLOAT32_STRIDE_FLOAT32_BASE_UINT8_WEIGHTS: { + if (ptr_) { + float stride, base; + std::vector weights(num_weights); + fin.read(&stride, sizeof(stride)); + fin.read(&base, sizeof(base)); + fin.read(weights.data(), num_weights * sizeof(uint8_t)); + auto* ptr = static_cast(ptr_); + for (size_t i = 0; i < num_weights; ++i) + ptr[i] = stride * weights[i] + base; + } else { + fin.skip(sizeof(float) * 2 + num_weights * sizeof(uint8_t)); + } + break; + } + case CompressionMethod::FLOAT32_STRIDE_FLOAT32_BASE_UINT16_WEIGHTS: { + if (ptr_) { + float stride, base; + std::vector weights(num_weights); + fin.read(&stride, sizeof(stride)); + fin.read(&base, sizeof(base)); + fin.read(weights.data(), num_weights * sizeof(uint16_t)); + auto* ptr = static_cast(ptr_); + for (size_t i = 0; i < num_weights; ++i) + ptr[i] = stride * weights[i] + base; + } else { + fin.skip(sizeof(float) * 2 + num_weights * sizeof(uint16_t)); + } + break; + } + default: + LITE_THROW("Unexpected compression method"); + } +} + +LTensorLayout lite::to_impl_layout(const Layout& layout) { + mgb::TensorLayout mge_layout; + mge_layout.ndim = layout.ndim; + LITE_ASSERT(layout.ndim < TensorShape::MAX_NDIM, + "lite layout ndim is to large"); + for (size_t i = 0; i < layout.ndim; i++) { + 
mge_layout.shape[i] = layout.shapes[i]; + } + mge_layout.init_contiguous_stride(); + switch (layout.data_type) { + case LiteDataType::LITE_FLOAT: + mge_layout.dtype = mgb::dtype::Float32(); + break; + case LiteDataType::LITE_HALF: + mge_layout.dtype = mgb::dtype::Float16(); + break; + case LiteDataType::LITE_INT: + mge_layout.dtype = mgb::dtype::Int32(); + break; + case LiteDataType::LITE_INT8: + mge_layout.dtype = mgb::dtype::Int8(); + break; + case LiteDataType::LITE_UINT8: + mge_layout.dtype = mgb::dtype::Uint8(); + break; + case LiteDataType::LITE_INT16: + mge_layout.dtype = mgb::dtype::Int16(); + break; + default: + LITE_THROW(mgb::ssprintf("unsupport dtype in lite enum id is %d.", + static_cast(layout.data_type))); + } + return mge_layout; +} + +Layout lite::to_lite_layout(const LTensorLayout& mge_layout) { + Layout layout; + if (!mge_layout.dtype.valid()) { + return layout; + } + layout.ndim = mge_layout.ndim; + LITE_ASSERT(layout.ndim < layout.MAXDIM, "tensor layout ndim is to large"); + for (size_t i = 0; i < layout.ndim; i++) { + layout.shapes[i] = mge_layout.shape[i]; + } + switch (mge_layout.dtype.enumv()) { + case mgb::DTypeEnum::Float32: + layout.data_type = LiteDataType::LITE_FLOAT; + break; + case mgb::DTypeEnum::Float16: + layout.data_type = LiteDataType::LITE_HALF; + break; + case mgb::DTypeEnum::Int32: + layout.data_type = LiteDataType::LITE_INT; + break; + case mgb::DTypeEnum::Int16: + layout.data_type = LiteDataType::LITE_INT16; + break; + case mgb::DTypeEnum::Int8: + layout.data_type = LiteDataType::LITE_INT8; + break; + case mgb::DTypeEnum::Uint8: + layout.data_type = LiteDataType::LITE_UINT8; + break; + default: + LITE_THROW(mgb::ssprintf("unsupport dtype in lite : %s.", + mge_layout.to_string().c_str())); + } + return layout; +} + +mgb::CompNode::Locator lite::to_compnode_locator(const LiteDeviceType& device) { + mgb::CompNode::Locator loc; + switch (device) { + case LiteDeviceType::LITE_CPU: + loc.type = mgb::CompNode::DeviceType::CPU; + break; + case LiteDeviceType::LITE_CUDA: + loc.type = mgb::CompNode::DeviceType::CUDA; + break; + case LiteDeviceType::LITE_ATLAS: + loc.type = mgb::CompNode::DeviceType::ATLAS; + break; + case LiteDeviceType::LITE_OPENCL: + loc.type = mgb::CompNode::DeviceType::OPENCL; + break; + case LiteDeviceType::LITE_DEVICE_DEFAULT: + loc.type = mgb::CompNode::DeviceType::UNSPEC; + break; + default: + LITE_THROW( + ssprintf("lite unsupported compnode type: enum value: %d.", + (int)(device))); + } + return loc; +} + +LiteDeviceType lite::get_device_from_locator( + const mgb::CompNode::Locator& locator) { + switch (locator.type) { + case mgb::CompNode::DeviceType::CPU: + case mgb::CompNode::DeviceType::MULTITHREAD: + return LiteDeviceType::LITE_CPU; + case mgb::CompNode::DeviceType::CUDA: + return LiteDeviceType::LITE_CUDA; + case mgb::CompNode::DeviceType::ATLAS: + return LiteDeviceType::LITE_ATLAS; + case mgb::CompNode::DeviceType::OPENCL: + return LiteDeviceType::LITE_OPENCL; + case mgb::CompNode::DeviceType::UNSPEC: + return LiteDeviceType::LITE_DEVICE_DEFAULT; + default: + LITE_THROW( + ssprintf("lite unsupported compnode type: enum value: %d.", + (int)(locator.type))); + } +} +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/mge/common.h b/lite/src/mge/common.h new file mode 100644 index 0000000000000000000000000000000000000000..4d4066d973908093c0acfbe1d7f1a73749c59406 --- /dev/null +++ b/lite/src/mge/common.h @@ -0,0 +1,66 @@ +/** + * \file src/mge/common.h + * + * This file is part of 
MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "../misc.h" +#include "lite/network.h" +#include "lite/tensor.h" +#include "megbrain/comp_node.h" +#include "megbrain/serialization/serializer.h" +#include "megbrain/tensor.h" + +//! rename mge name L* +namespace lite { +using LTensorLayout = mgb::TensorLayout; +using LComputingGraph = mgb::ComputingGraph; +using LDeviceTensorStorage = mgb::DeviceTensorStorage; +} // namespace lite + +namespace lite { +/*! + * \brief transform mgelite Layout to mgb TensorLayout + */ +LTensorLayout to_impl_layout(const Layout& layout); + +/*! + * \brief transform mgb TensorLayout to mgelite Layout + */ +Layout to_lite_layout(const mgb::TensorLayout& mge_layout); + +/*! + * \brief transform mgelite device to mgb CompNode Locator + */ +mgb::CompNode::Locator to_compnode_locator(const LiteDeviceType& device); + +/*! + * \brief transform mgb CompNode Locator to lite Device + */ +LiteDeviceType get_device_from_locator(const mgb::CompNode::Locator& locator); + +/*! \brief A megbrain tensor loader with weight decompression. + * + * The weight to be compressed must start with a byte of compression flag (CF). + * + * 1. CF = 0: no compression. + * 2. CF = 1: float32 stride + float32 base + uint8 weight (return s*w+b) + * 3. CF = 2: float32 stride + float32 base + uint16 weight (return s*w+b) + * + */ +void decompressed_tensor_value_loader(void* ptr_, + const mgb::TensorLayout& layout, + mgb::serialization::InputFile& fin); + +} // namespace lite +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/mge/function_dft.h b/lite/src/mge/function_dft.h new file mode 100644 index 0000000000000000000000000000000000000000..a997a3f357e6a330cb2be7b584cd651811ef925f --- /dev/null +++ b/lite/src/mge/function_dft.h @@ -0,0 +1,212 @@ + +/** + * \file src/mge/function_dft.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
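As the comment above documents, a compressed weight blob starts with a one-byte compression flag, followed for CF = 1 or CF = 2 by a `float32` stride, a `float32` base, and the quantized weights, which `decompressed_tensor_value_loader` expands as `stride * w + base`. A sketch of the matching CF = 1 packer for preparing such a blob offline; this helper is hypothetical and not part of lite:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical packer for CF = 1 (float32 stride + float32 base + uint8
// weights), laid out exactly as decompressed_tensor_value_loader() reads it.
std::vector<uint8_t> pack_weights_cf1(const std::vector<float>& w) {
    assert(!w.empty());
    auto mm = std::minmax_element(w.begin(), w.end());
    float base = *mm.first;
    float stride =
            (*mm.second > *mm.first) ? (*mm.second - *mm.first) / 255.f : 1.f;

    std::vector<uint8_t> out;
    out.push_back(1);  // compression flag: CF = 1
    auto append = [&](const void* p, size_t n) {
        auto* b = static_cast<const uint8_t*>(p);
        out.insert(out.end(), b, b + n);
    };
    append(&stride, sizeof(stride));
    append(&base, sizeof(base));
    for (float v : w)  // quantize so that v ~ stride * q + base
        out.push_back(static_cast<uint8_t>((v - base) / stride + 0.5f));
    return out;
}
```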
+ */ + +#pragma once +#if LITE_BUILD_WITH_MGE +#include "function_base.h" +#include "network_impl.h" +#include "network_impl_base.h" +#include "tensor_impl.h" +namespace lite { + +#define THROW_FUNC_ERROR(func_name) \ + auto msg_info = func_name + " is not aviliable in Dft backend."; \ + LITE_THROW(msg_info.c_str()) + +// the functions used for dft's tensor.cpp are as followed: + +template <> +inline std::shared_ptr +call_func>( + std::string func_name) { + if (func_name == "create_tensor") { + return std::make_shared(); + } + THROW_FUNC_ERROR(func_name); +} + +template <> +inline std::shared_ptr +call_func>( + std::string func_name, LiteDeviceType device_type, + bool is_pinned_host) { + if (func_name == "create_tensor") { + return std::make_shared(device_type, is_pinned_host); + } + THROW_FUNC_ERROR(func_name); +} + +template <> +inline std::shared_ptr +call_func>( + std::string func_name, int device_id, LiteDeviceType device_type, + const Layout layout, bool is_pinned_host) { + if (func_name == "create_tensor") { + return std::make_shared(device_id, device_type, layout, + is_pinned_host); + } + THROW_FUNC_ERROR(func_name); +} + +template <> +inline std::shared_ptr +call_func>( + std::string func_name, LiteDeviceType device_type, const Layout layout, + bool is_pinned_host) { + if (func_name == "create_tensor") { + return std::make_shared(device_type, layout, + is_pinned_host); + } + THROW_FUNC_ERROR(func_name); +} + +template <> +inline std::shared_ptr +call_func>( + std::string func_name, int device_id, int stream_id, + LiteDeviceType device_type, bool is_pinned_host) { + if (func_name == "create_tensor") { + return std::make_shared(device_id, stream_id, + device_type, is_pinned_host); + } + THROW_FUNC_ERROR(func_name); +} + +// the functions used for dft's network.cpp are as followed: + +template <> +inline std::unique_ptr +call_func>( + std::string func_name) { + if (func_name == "create_network") { + return std::make_unique(); + } + THROW_FUNC_ERROR(func_name); +} + +template <> +inline Network::NetworkImplBase* +try_call_func( + std::string func_name) { + if (func_name == "parse_model") { + return new NetworkImplDft(); + } + THROW_FUNC_ERROR(func_name); +} + +#define CALL_FUNC(func_name, ...) 
\ + network_impl->cast_final_safe().func_name(__VA_ARGS__) + +template <> +inline void call_func( + std::string func_name, Network::NetworkImplBase* network_impl, + size_t num) { + if (func_name == "set_cpu_threads_number") { + CALL_FUNC(set_cpu_threads_number, num); + } else if (func_name == "set_network_algo_workspace_limit") { + CALL_FUNC(set_network_algo_workspace_limit, num); + } else { + THROW_FUNC_ERROR(func_name); + } +} + +template <> +inline void call_func( + std::string func_name, Network::NetworkImplBase* network_impl) { + if (func_name == "use_tensorrt") { + CALL_FUNC(use_tensorrt); + } else if (func_name == "set_cpu_inplace_mode") { + CALL_FUNC(set_cpu_inplace_mode); + } else { + THROW_FUNC_ERROR(func_name); + } +} + +template <> +inline size_t call_func( + std::string func_name, Network::NetworkImplBase* network_impl) { + if (func_name == "get_cpu_threads_number") { + return CALL_FUNC(get_cpu_threads_number); + } + THROW_FUNC_ERROR(func_name); +} + +template <> +inline bool call_func( + std::string func_name, Network::NetworkImplBase* network_impl) { + if (func_name == "is_cpu_inplace_mode") { + return CALL_FUNC(is_cpu_inplace_mode); + } + THROW_FUNC_ERROR(func_name); +} + +template <> +inline void call_func( + std::string func_name, Network::NetworkImplBase* network_impl, + ThreadAffinityCallback thread_affinity_callback) { + if (func_name == "set_runtime_thread_affinity") { + return CALL_FUNC(set_runtime_thread_affinity, + std::move(thread_affinity_callback)); + } + THROW_FUNC_ERROR(func_name); +} + +template <> +inline void call_func( + std::string func_name, Network::NetworkImplBase* network_impl, + LiteAlgoSelectStrategy strategy, uint32_t shared_batch_size, + bool binary_equal_between_batch) { + if (func_name == "set_network_algo_policy") { + return CALL_FUNC(set_network_algo_policy, strategy, shared_batch_size, + binary_equal_between_batch); + } + THROW_FUNC_ERROR(func_name); +} + +template <> +inline void call_func( + std::string func_name, Network::NetworkImplBase* network_impl, + std::shared_ptr user_allocator) { + if (func_name == "set_memory_allocator") { + return CALL_FUNC(set_memory_allocator, user_allocator); + } + THROW_FUNC_ERROR(func_name); +} + +template <> +inline void call_func( + std::string func_name, Network::NetworkImplBase* network_impl, + std::string file_name) { + if (func_name == "enable_io_txt_dump") { + return CALL_FUNC(enable_io_txt_dump, file_name); + } else if (func_name == "enable_io_bin_dump") { + return CALL_FUNC(enable_io_bin_dump, file_name); + } + THROW_FUNC_ERROR(func_name); +} + +template <> +inline void call_func( + std::string func_name, Network::NetworkImplBase* network_impl, + Network::NetworkImplBase* src_network_impl) { + if (func_name == "share_runtime_memory_with") { + CALL_FUNC(share_runtime_memory_with, src_network_impl); + } else if (func_name == "shared_weight_with") { + CALL_FUNC(shared_weight_with, src_network_impl); + } else { + THROW_FUNC_ERROR(func_name); + } +} +#undef THROW_FUNC_ERROR + +} // namespace lite +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/mge/memory_allocator.h b/lite/src/mge/memory_allocator.h new file mode 100644 index 0000000000000000000000000000000000000000..d321fd9fbeeab482403a40c04b5f8e2101159232 --- /dev/null +++ b/lite/src/mge/memory_allocator.h @@ -0,0 +1,69 @@ +/** + * \file src/mge/memory_alloctor.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. 
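Each specialization above routes a setting by name to `NetworkImplDft` through `CALL_FUNC` and throws for anything the backend does not implement, so the device-agnostic frontend never has to include backend headers. A usage sketch, assuming `impl` points at a `NetworkImplDft` and that this header is compiled with `LITE_BUILD_WITH_MGE`:

```cpp
#include "function_dft.h"

// Sketch: frontend-side calls that land in NetworkImplDft via the
// specializations above.
void configure_dft_backend(lite::Network::NetworkImplBase* impl) {
    using namespace lite;
    // Routed to NetworkImplDft::set_cpu_threads_number(4).
    call_func<NetworkImplDft, void>("set_cpu_threads_number", impl, size_t(4));
    // Routed to NetworkImplDft::set_cpu_inplace_mode().
    call_func<NetworkImplDft, void>("set_cpu_inplace_mode", impl);
    // Any other name throws a "not available in Dft backend" error.
}
```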
+ * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "common.h" +#include "megbrain/dtype.h" +#include "network_impl.h" + +#include "megbrain/graph/cg.h" + +namespace lite { + +class UserStaticMemAlloc final : public mgb::cg::DeviceMemoryAllocator { + std::shared_ptr m_allocator = nullptr; + +public: + UserStaticMemAlloc(std::shared_ptr allocator) + : m_allocator(allocator) {} + + void alloc_static(LComputingGraph*, LDeviceTensorStorage& dest, + size_t size) override { + if (size < dest.size()) { + return; + } + auto cn = dest.comp_node_allow_invalid(); + LITE_ASSERT(cn.valid(), "The compnode is invalid when alloc memory."); + LiteDeviceType device_type = + get_device_from_locator(cn.locator_logical()); + int device_id = cn.locator_logical().device; + auto ptr_alloc = static_cast(m_allocator->allocate( + device_type, device_id, size, cn.get_mem_addr_alignment())); + auto storage = std::shared_ptr( + ptr_alloc, + [allocator = m_allocator, device_type, device_id](void* ptr) { + allocator->free(device_type, device_id, ptr); + }); + dest.reset(cn, size, storage); + } + void alloc_dynamic(mgb::VarNode*, mgb::DeviceTensorStorage& dest, + size_t size) override { + alloc_static(nullptr, dest, size); + } + + void defrag_prealloc_contig(mgb::ComputingGraph*, mgb::CompNode comp_node, + size_t size) override { + LiteDeviceType device_type = + get_device_from_locator(comp_node.locator_logical()); + int device_id = comp_node.locator_logical().device; + auto ptr_tmp = + m_allocator->allocate(device_type, device_id, size, + comp_node.get_mem_addr_alignment()); + m_allocator->free(device_type, device_id, ptr_tmp); + } +}; + +} // namespace lite +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/mge/network_impl.cpp b/lite/src/mge/network_impl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ae176632edcccb48e566f219d62efcb165703e05 --- /dev/null +++ b/lite/src/mge/network_impl.cpp @@ -0,0 +1,781 @@ +/** + * \file src/mge/network_impl.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
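`UserStaticMemAlloc` adapts a user-supplied `lite::Allocator` to MegBrain's `DeviceMemoryAllocator`, so static graph memory can be served by the caller. A sketch of a CPU-only allocator that satisfies the two calls made above, `allocate(device_type, device_id, size, align)` and `free(device_type, device_id, ptr)`; the exact virtual signatures and the header that declares `lite::Allocator` are assumptions inferred from those call sites:

```cpp
#include <stdlib.h>  // posix_memalign (POSIX)

#include "lite/tensor.h"  // assumed home of the public lite::Allocator interface

// Sketch: aligned malloc/free allocator that could be passed to
// set_memory_allocator() and reach UserStaticMemAlloc above.
class AlignedCpuAllocator : public lite::Allocator {
public:
    void* allocate(LiteDeviceType device_type, int /*device_id*/, size_t size,
                   size_t align) override {
        if (device_type != LiteDeviceType::LITE_CPU)
            return nullptr;  // only the CPU path is handled in this sketch
        void* ptr = nullptr;
        // align comes from CompNode::get_mem_addr_alignment() and is assumed
        // to be a power of two.
        return posix_memalign(&ptr, align, size) == 0 ? ptr : nullptr;
    }
    void free(LiteDeviceType /*device_type*/, int /*device_id*/,
              void* ptr) override {
        ::free(ptr);
    }
};
```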
+ */ + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "network_impl.h" +#include "common.h" +#include "lite/network.h" +#include "memory_allocator.h" +#include "parse_model/model_parser.h" +#include "parse_info/parse_info_base.h" + +#include "megbrain/common.h" +#include "megbrain/comp_node.h" +#include "megbrain/comp_node_env.h" +#include "megbrain/gopt/inference.h" +#include "megbrain/graph.h" +#include "megbrain/graph/cg.h" +#include "megbrain/opr/io.h" +#include "megbrain/tensor.h" + +#if MGB_OPENCL +#include "megcore_opencl.h" +#endif + +#include +#include +#include + +using namespace lite; +using namespace mgb; + +LITE_DYN_TYPE_OBJ_FINAL_IMPL(NetworkImplDft); + +void NetworkImplDft::set_config(const Config& config) { + m_user_config = std::make_unique(); + *m_user_config = config; + m_load_config.comp_graph = mgb::ComputingGraph::make(); + m_compnode_locator = to_compnode_locator(m_user_config->device_type); + m_compnode_locator.device = config.device_id; +} + +void NetworkImplDft::shared_weight_with(const NetworkImplBase* src_network) { + application_config(); + const auto& src_impl = src_network->cast_final_safe(); + LITE_ASSERT(src_impl.m_loader, + "Clone network must after the network is loaded."); + m_load_result = src_impl.m_loader->load(m_load_config, true); + + //! flag weather the mode is cross compnode model + cross_compnode_model_detect(); + + //! update the IO of the network + update_io(); + + //! replace the IO when there is device input or output + compile_graph(); +} + +void NetworkImplDft::application_config() { + auto device_type = m_user_config->device_type; + m_compnode_locator.type = to_compnode_locator(device_type).type; + m_compnode_locator.device = m_user_config->device_id; + if (m_nr_threads > 1 && device_type == LiteDeviceType::LITE_CPU) { + m_compnode_locator.type = mgb::CompNode::DeviceType::MULTITHREAD; + m_compnode_locator.device = m_user_config->device_id; + } + //! 
model options +#define ConfigOption(mge_name, lite_name) \ + options.mge_name = m_user_config->options.lite_name; + + auto&& options = m_load_config.comp_graph->options(); + ConfigOption(graph_opt.weight_preprocess, weight_preprocess); + ConfigOption(graph_opt.fuse_preprocess, fuse_preprocess); + ConfigOption(fake_next_exec, fake_next_exec); + ConfigOption(var_sanity_check_first_run, var_sanity_check_first_run); + m_load_config.const_var_shape = m_user_config->options.const_shape; + ConfigOption(force_dynamic_alloc, force_dynamic_alloc); + ConfigOption(force_output_dynamic_alloc, force_output_dynamic_alloc); + ConfigOption(no_profiling_on_shape_change, no_profiling_on_shape_change); + LITE_ASSERT(m_user_config->options.jit_level == 0 || + (m_user_config->options.jit_level > 0 && + device_type == LiteDeviceType::LITE_CUDA), + "jit only support in cuda device."); + ConfigOption(graph_opt.jit, jit_level); + ConfigOption(comp_node_seq_record_level, comp_node_seq_record_level); + ConfigOption(graph_opt_level, graph_opt_level); + ConfigOption(async_exec_level, async_exec_level); + +#undef ConfigOption +#define ConfigOptionLayoutTransform(name) \ + if (m_user_config->options.name) { \ + options.graph_opt.name(); \ + } + ConfigOptionLayoutTransform(enable_nchw44); + ConfigOptionLayoutTransform(enable_nchw44_dot); + ConfigOptionLayoutTransform(enable_nchw88); + ConfigOptionLayoutTransform(enable_nhwcd4); + ConfigOptionLayoutTransform(enable_nchw4); + ConfigOptionLayoutTransform(enable_nchw32); + ConfigOptionLayoutTransform(enable_nchw64); +#undef ConfigOptionLayoutTransform + if (m_user_config->has_compression) { + m_load_config.tensor_value_loader = decompressed_tensor_value_loader; + } + + //! if device is LITE_NONE, the compnode information is stored in model + if (device_type != LiteDeviceType::LITE_DEVICE_DEFAULT) { + //! currently not set Locator type because an atlas mgb model is a + //! cross-compnode graph + if (device_type == LiteDeviceType::LITE_ATLAS) { + m_load_config.comp_node_mapper = + [this](mgb::CompNode::Locator& loc) { + if (loc.type == mgb::CompNode::DeviceType::ATLAS) { + loc.device = m_compnode_locator.device; + loc.stream = m_compnode_locator.stream; + } else if (loc.type == + mgb::CompNode::DeviceType::MULTITHREAD) { + loc.stream = m_nr_threads; + } + }; + } else { + m_load_config.comp_node_mapper = + [this](mgb::CompNode::Locator& loc) { + loc = m_compnode_locator; + }; + } + } +} + +void NetworkImplDft::set_memory_allocator( + std::shared_ptr user_allocator) { + auto allocator = std::make_shared(user_allocator); + LITE_ASSERT(m_load_config.comp_graph); + m_load_config.comp_graph->set_device_memory_allocator(allocator); +} + +//! 
share the runtime memory with other network, the weights is not shared +void NetworkImplDft::share_runtime_memory_with( + Network::NetworkImplBase* network_impl) { + LITE_ASSERT(network_impl); + LITE_ASSERT(m_load_config.comp_graph); + m_load_config.comp_graph->share_device_memory_with( + *(network_impl->cast_final_safe() + .m_load_config.comp_graph)); +} + +void NetworkImplDft::set_cpu_inplace_mode() { + LITE_ASSERT(m_user_config->device_type == LiteDeviceType::LITE_CPU, + "cpu inplace mode is only avaliable in CPU."); + m_is_cpu_inplace_mode = true; + if (m_compnode_locator.type == mgb::CompNode::DeviceType::CPU) { + m_compnode_locator.device = mgb::CompNode::Locator::DEVICE_CPU_DEFAULT; + } else { + LITE_ASSERT( + m_compnode_locator.type == CompNode::DeviceType::MULTITHREAD, + "cpu inplace mode is only avaliable in CPU."); + m_compnode_locator.device = + mgb::CompNode::Locator::DEVICE_MULTITHREAD_DEFAULT; + } +} + +void NetworkImplDft::set_cpu_threads_number(size_t nr_threads) { + LITE_ASSERT(m_user_config->device_type == LiteDeviceType::LITE_CPU, + "multi threads mode is only avaliable in CPU."); + if (nr_threads > 1) { + m_nr_threads = nr_threads; + m_compnode_locator.type = mgb::CompNode::DeviceType::MULTITHREAD; + m_compnode_locator.nr_threads = nr_threads; + } +} + +void NetworkImplDft::set_runtime_thread_affinity( + const ThreadAffinityCallback& thread_affinity_callback) { + LITE_ASSERT(m_user_config->device_type == LiteDeviceType::LITE_CPU, + "multi threads mode is only avaliable in CPU."); + mgb::CompNode::Locator loc; + m_load_config.comp_node_mapper(loc); + auto cn = mgb::CompNode::load(loc); + if (m_nr_threads > 1) { + mgb::CompNodeEnv::from_comp_node(cn).cpu_env().set_affinity( + thread_affinity_callback); + } else { + mgb::CompNodeEnv::from_comp_node(cn).cpu_env().dispatch( + [thread_affinity_callback](void) { + thread_affinity_callback(0); + }); + } +} + +void NetworkImplDft::set_device_id(int device_id) { + m_compnode_locator.device = device_id; + m_user_config->device_id = device_id; +} + +void NetworkImplDft::set_stream_id(int stream_id) { + m_compnode_locator.stream = stream_id; +} + +void NetworkImplDft::use_tensorrt() { + auto&& options = m_load_config.comp_graph->options(); + options.graph_opt.tensorrt = true; +} + +//! 
set the callback in async model +void NetworkImplDft::set_async_callback(const AsyncCallback& callback) { + LITE_ASSERT(!m_is_cpu_inplace_mode, + "cpu inplace mode not support async mode"); + LITE_ASSERT(m_user_config->device_type == LiteDeviceType::LITE_CPU || + m_user_config->device_type == LiteDeviceType::LITE_CUDA, + "Now only cpu and cuda>10.0 support async mode"); + m_async = true; + m_async_callback = std::move(callback); +} + +void NetworkImplDft::make_output_spec() { + m_output_spec.clear(); + for (auto&& out : m_network_io->outputs) { + if (m_load_result.output_var_map.count(out.name)) { + auto&& load_out = m_load_result.output_var_map[out.name]; + auto cb = [&out, this](const mgb::DeviceTensorND& dv) mutable { + mgb::CompNode comp_node = dv.comp_node(); + if (out.io_type == LiteIOType::LITE_IO_SHAPE) { + auto mgb_layout = dv.layout(); + out.lite_tensor->set_layout(to_lite_layout(mgb_layout)); + } else { + TensorHelper::implement(out.lite_tensor) + ->cast_final_safe() + .copy_from_mge_tensor(dv); + out.lite_tensor->update_from_implement(); + } + if (m_async) { + out.have_sync = true; + bool need_exec_cb = true; + for (auto&& j : m_network_io->outputs) { + if (!j.have_sync) { + need_exec_cb = false; + } + } + if (need_exec_cb) { + for (auto&& j : m_network_io->outputs) { + j.have_sync = false; + } + comp_node.add_callback([this]() { finish(); }); + } + } + }; + m_output_spec.emplace_back(load_out, std::move(cb)); + } else { + LITE_THROW(ssprintf("no output named : %s in the mode", + out.name.c_str())); + } + } +} + +void NetworkImplDft::replace_dev_input_pass() { + mgb::CompNode::Locator locator; + m_load_config.comp_node_mapper(locator); + //! CPU is not need use device input + if (locator.type == mgb::CompNode::DeviceType::CPU) { + return; + } + //! repalce the H2D with VolatileSharedDeviceTensor, and keep the dev tensor + //! in m_network_io.input, user can directly change the dev tensor + //! storage through m_network_io.input.lite_tensor->reset() befor forward + using DeviceTensorMap = + std::unordered_map>; + DeviceTensorMap name2dev_tensor; + + mgb::ThinHashMap host_val2var; + + //! 
construct host_val2var that maps from host tensor to corresponding var + auto on_opr = [&](mgb::cg::OperatorNodeBase* opr) { + if (opr->same_type()) { + mgb::HostTensorND* tensor = + opr->cast_final() + .host_data() + .get(); + host_val2var[tensor] = opr->output(0); + } + }; + mgb::cg::DepOprIter dep_iter{on_opr}; + for (auto i : m_load_result.output_var_list) { + dep_iter.add(i.node()->owner_opr()); + } + + mgb::ThinHashMap inp_var_map, out_var_map; + + mgb::SmallVector to_clear; + for (auto&& config_in : m_network_io->inputs) { + if (!config_in.is_host) { + auto host_val = m_load_result.tensor_map[config_in.name]; + auto dev_val = TensorHelper::implement(config_in.lite_tensor) + ->cast_final_safe() + .m_dev_tensor; + auto dev_var = mgb::opr::VolatileSharedDeviceTensor::make( + *m_load_result.graph, dev_val, {config_in.name}); + inp_var_map[host_val2var.at(host_val.get())] = dev_var; + name2dev_tensor[config_in.name] = dev_val; + } + } + auto new_ovar = + mgb::cg::replace_vars(m_load_result.output_var_list, inp_var_map); + for (size_t i = 0; i < new_ovar.size(); ++i) { + out_var_map[m_load_result.output_var_list[i]] = new_ovar[i]; + } + for (auto&& i : m_load_result.output_var_map) { + i.second = out_var_map.at(i.second); + } + for (auto&& i : m_load_result.output_var_map_id) { + i.second = out_var_map.at(i.second); + } + for (size_t i = 0; i < m_load_result.output_var_list.size(); i++) { + new_ovar[i].rename(m_load_result.output_var_list[i].node()->name()); + } + m_load_result.output_var_list = std::move(new_ovar); +} + +void NetworkImplDft::cross_compnode_model_detect() { + mgb::ThinHashSet nr_used_device_type; + auto on_opr = [&](mgb::cg::OperatorNodeBase* opr) { + for (auto j : opr->output()) { + if (j->comp_node() != mgb::CompNode::default_cpu()) { + nr_used_device_type.insert( + get_device_from_locator(j->comp_node().locator())); + } + } + }; + mgb::cg::DepOprIter dep_iter{on_opr}; + for (auto i : m_load_result.output_var_list) { + dep_iter.add(i.node()->owner_opr()); + } + m_nr_device_type = nr_used_device_type.size(); +} + +void NetworkImplDft::load_model( + std::shared_ptr model_mem, size_t size, + std::unordered_map separate_config_map) { + if (!m_loader) { + m_input_file = mgb::serialization::InputFile::make_mem_proxy( + model_mem, size, false); + auto format = + mgb::serialization::GraphLoader::identify_graph_dump_format( + *m_input_file); + if (!format.valid()) { + LITE_THROW("invalid model format"); + } + m_loader = mgb::serialization::GraphLoader::make( + std::move(m_input_file), format.val()); + } + + + //! applay the user configration to mge model + application_config(); + + //! config some flag get from json config file + if (separate_config_map.find("device_id") != separate_config_map.end()) { + set_device_id(separate_config_map["device_id"].unsafe_cast()); + } + if (separate_config_map.find("number_threads") != + separate_config_map.end() && + separate_config_map["number_threads"].unsafe_cast() > 1) { + set_cpu_threads_number( + separate_config_map["number_threads"].unsafe_cast()); + } + if (separate_config_map.find("enable_inplace_model") != + separate_config_map.end() && + separate_config_map["enable_inplace_model"].unsafe_cast()) { + set_cpu_inplace_mode(); + } + if (separate_config_map.find("use_tensorrt") != separate_config_map.end() && + separate_config_map["use_tensorrt"].unsafe_cast()) { + use_tensorrt(); + } + + m_load_result = m_loader->load(m_load_config, true); + + cross_compnode_model_detect(); + + //! update the IO of the network + update_io(); + + //! 
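`load_model` reads a small set of per-model overrides out of `separate_config_map`, which is typically filled in by the packed-model JSON parser; everything else in the map is ignored at this level. For reference, the recognized keys and the setters they reach:

```cpp
// Keys consumed from separate_config_map in load_model() above:
//
//   "device_id"             -> set_device_id(value)
//   "number_threads"        -> set_cpu_threads_number(value), only when value > 1
//   "enable_inplace_model"  -> set_cpu_inplace_mode(), only when the flag is true
//   "use_tensorrt"          -> use_tensorrt(), only when the flag is true
```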
replace the IO when there is device input or output + compile_graph(); +} + +void NetworkImplDft::compile_graph() { + modify_exection_policy(); + replace_dev_input_pass(); + make_output_spec(); + m_execute_func = m_load_result.graph_compile(m_output_spec); +} + +void NetworkImplDft::start() const { + if (m_start_callback) { + std::unordered_map>> + input_io_map; + for (auto&& io_inner : m_network_io->inputs) { + input_io_map[io_inner.name] = { + IO{io_inner.name, io_inner.is_host, io_inner.io_type, + io_inner.config_layout}, + io_inner.lite_tensor}; + } + m_start_callback(input_io_map); + } +} + +void NetworkImplDft::forward() { + start(); + LITE_ASSERT(m_execute_func, "forward must be called after network loaded."); + m_execute_func->execute(); +} + +void NetworkImplDft::wait() { + if (!m_async) { + m_execute_func->wait(); + } + finish(); +} + +void NetworkImplDft::finish() const { + if (m_async) { + LITE_ASSERT(m_async_callback, + "The callback func must set when async mode."); + m_async_callback(); + } + if (m_finish_callback) { + std::unordered_map>> + output_io_map; + for (auto&& io_inner : m_network_io->outputs) { + output_io_map[io_inner.name] = { + IO{io_inner.name, io_inner.is_host, io_inner.io_type, + io_inner.config_layout}, + io_inner.lite_tensor}; + } + m_finish_callback(output_io_map); + } + output_plugin_result(); +} + +void NetworkImplDft::set_io(const NetworkIO& network_io) { + m_network_io = std::make_unique(); + for (auto&& in : network_io.inputs) { + m_network_io->inputs.emplace_back(in); + } + for (auto&& out : network_io.outputs) { + m_network_io->outputs.emplace_back(out); + } +} + +void NetworkImplDft::update_io() { + update_input(); + update_output(); +} + +void NetworkImplDft::update_input() { + auto device_type = m_user_config->device_type; + auto device_id = m_compnode_locator.device; + auto stream_id = m_compnode_locator.stream; + //! if cpu all input and output are host + if (device_type == LiteDeviceType::LITE_CPU) { + for (auto&& in : m_network_io->inputs) { + in.is_host = true; + } + } + //! if cross compnode model, modify the device input if it is not valid + if (m_nr_device_type > 1) { + for (auto&& in_tensor_iter : m_load_result.tensor_map) { + for (auto&& config_in : m_network_io->inputs) { + //! if tensor is set to device input + if (in_tensor_iter.first == config_in.name && + !config_in.is_host) { + //! if the origin compnode of the tensor is not the device, + //! 
set the input to host + if (get_device_from_locator( + in_tensor_iter.second->comp_node().locator()) == + LiteDeviceType::LITE_CPU) { + config_in.is_host = true; + LITE_WARN( + "The input tensor %s of the cross device model " + "should not from device.", + config_in.name.c_str()); + } + } + } + } + } + for (auto&& in_tensor_iter : m_load_result.tensor_map) { + bool found = false; + for (auto&& config_in : m_network_io->inputs) { + if (in_tensor_iter.first == config_in.name) { + found = true; + if (config_in.is_host) { + config_in.lite_tensor = std::make_shared( + device_id, stream_id, device_type, true); + TensorHelper::implement(config_in.lite_tensor) + ->cast_final_safe() + .m_host_tensor = in_tensor_iter.second; + config_in.lite_tensor->update_from_implement(); + } else { + config_in.lite_tensor = std::make_shared( + device_id, stream_id, device_type); + config_in.lite_tensor->set_layout( + to_lite_layout(in_tensor_iter.second->layout())); + } + if (config_in.config_layout.ndim && + !(config_in.config_layout == + config_in.lite_tensor->get_layout())) { + config_in.lite_tensor->set_layout(config_in.config_layout); + } + } + } + if (!found) { + IOInner io_in; + io_in.name = in_tensor_iter.first; + io_in.lite_tensor = std::make_shared(device_id, stream_id, + device_type, true); + TensorHelper::implement(io_in.lite_tensor) + ->cast_final_safe() + .m_host_tensor = in_tensor_iter.second; + io_in.lite_tensor->update_from_implement(); + m_network_io->inputs.push_back(io_in); + } + } + //! delete the IO that is not the network + for (auto it = m_network_io->inputs.begin(); + it != m_network_io->inputs.end();) { + if (it->lite_tensor == nullptr) { + LITE_LOG("%s is not the network input, ignore it.", + it->name.c_str()); + it = m_network_io->inputs.erase(it); + } else { + it++; + } + } +} + +void NetworkImplDft::update_output() { + auto device_type = m_user_config->device_type; + auto device_id = m_compnode_locator.device; + auto stream_id = m_compnode_locator.stream; + if (device_type == LiteDeviceType::LITE_CPU) { + for (auto&& out : m_network_io->outputs) { + out.is_host = true; + } + } + //! delete the output that is not the network + for (auto out_it = m_network_io->outputs.begin(); + out_it != m_network_io->outputs.end();) { + if (std::find_if(m_load_result.output_var_list.begin(), + m_load_result.output_var_list.end(), + [out_it](const mgb::SymbolVar var) { + return var.node()->name() == out_it->name; + }) == m_load_result.output_var_list.end()) { + LITE_LOG("%s is not the network output, ignore it.", + out_it->name.c_str()); + out_it = m_network_io->outputs.erase(out_it); + } else { + out_it++; + } + } + //! user config the output tensor, so only compute the config output + if (m_compute_configured_output_only) { + LITE_ASSERT(m_network_io->outputs.size() > 0, + "compute configured output only with no configure output."); + for (auto out_it = m_network_io->outputs.begin(); + out_it != m_network_io->outputs.end(); out_it++) { + //! use pinned memory to copy form device + if (out_it->is_host) { + out_it->lite_tensor = std::make_shared( + device_id, stream_id, device_type, true); + } else { + out_it->lite_tensor = std::make_shared( + device_id, stream_id, device_type); + } + } + //! 
user not set, use default output + } else { + for (auto&& out : m_load_result.output_var_list) { + auto it = std::find_if(m_network_io->outputs.begin(), + m_network_io->outputs.end(), + [&out](const IOInner io) { + return io.name == out.node()->name(); + }); + if (it != m_network_io->outputs.end()) { + if (it->is_host) { + it->lite_tensor = std::make_shared( + device_id, stream_id, device_type, true); + } else { + it->lite_tensor = std::make_shared( + device_id, stream_id, device_type); + } + } else { + IOInner output; + output.name = out.node()->name(); + output.lite_tensor = std::make_shared( + device_id, stream_id, device_type, true); + m_network_io->outputs.push_back({output}); + } + } + } +} + +std::shared_ptr NetworkImplDft::get_io_tensor(std::string io_name, + LiteTensorPhase phase) { + if (phase == LiteTensorPhase::LITE_INPUT || + phase == LiteTensorPhase::LITE_IO) { + for (auto&& config_in : m_network_io->inputs) { + if (io_name == config_in.name) { + return config_in.lite_tensor; + } + } + } + if (phase == LiteTensorPhase::LITE_OUTPUT || + phase == LiteTensorPhase::LITE_IO) { + for (auto&& config_out : m_network_io->outputs) { + if (io_name == config_out.name) { + config_out.lite_tensor->update_from_implement(); + return config_out.lite_tensor; + } + } + } + LITE_THROW(mgb::ssprintf( + "tensor name must be %s input tensor name or the registered " + "output tensor name if NetworkIO is set, if NetworkIO is not set, " + "the output tensor is all the network output tensor, or the output " + "tensor is only the registered tensor.", + io_name.c_str())); + return nullptr; +} + +std::shared_ptr NetworkImplDft::get_input_tensor(size_t index) { + return get_io_tensor(get_input_name(index)); +} + +std::shared_ptr NetworkImplDft::get_output_tensor(size_t index) { + return get_io_tensor(get_output_name(index)); +} + +//! set opr algorithm selection strategy in the network +void NetworkImplDft::set_network_algo_policy(LiteAlgoSelectStrategy strategy, + uint32_t shared_batch_size, + bool binary_equal_between_batch) { + using S = megdnn::param::ExecutionPolicy::Strategy; + auto dst_strategy = static_cast(0); + if (static_cast(strategy) & + LiteAlgoSelectStrategy::LITE_ALGO_HEURISTIC) { + dst_strategy = dst_strategy | S::HEURISTIC; + } + if (static_cast(strategy) & + LiteAlgoSelectStrategy::LITE_ALGO_PROFILE) { + dst_strategy = dst_strategy | S::PROFILE; + } + if (static_cast(strategy) & + LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE) { + dst_strategy = dst_strategy | S::REPRODUCIBLE; + } + if (static_cast(strategy) & + LiteAlgoSelectStrategy::LITE_ALGO_OPTIMIZED) { + dst_strategy = dst_strategy | S::OPTIMIZED; + } + m_execution_policy = dst_strategy; + + auto&& fast_run_config = + m_load_config.comp_graph->options().fast_run_config; + fast_run_config.binary_equal_between_batch = binary_equal_between_batch; + fast_run_config.shared_batch_size = shared_batch_size; + + if (m_execute_func) { + LITE_WARN( + "set_network_algo_policy maybe cause error after loaded " + "network!!!!"); + modify_exection_policy(); + } +} + +void NetworkImplDft::modify_exection_policy() { + mgb::SymbolVarArray vars; + for (auto i : m_output_spec) { + vars.push_back(i.first); + } + if (static_cast(m_execution_policy) != 0) + mgb::gopt::modify_opr_algo_strategy_inplace(vars, m_execution_policy); +} + +//! 
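To complement `update_output()` and `get_io_tensor()` above, a hedged sketch of reading results after `wait()` returns; the float output dtype is an assumption of this example, not of the implementation.

```cpp
// Illustrative sketch: enumerate and print the network outputs.
#include "lite/network.h"
#include "lite/tensor.h"

#include <cstdio>
#include <memory>

void dump_outputs(std::shared_ptr<lite::Network> network) {
    for (auto&& name : network->get_all_output_name()) {
        auto out = network->get_io_tensor(name, LiteTensorPhase::LITE_OUTPUT);
        auto layout = out->get_layout();
        // float dtype is a placeholder assumption here
        auto* data = static_cast<float*>(out->get_memory_ptr());
        std::printf("output %s: ndim=%zu, first value %f\n", name.c_str(),
                    static_cast<size_t>(layout.ndim), data[0]);
    }
}
```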
set opr algorithm selection strategy in the network +void NetworkImplDft::set_network_algo_workspace_limit(size_t workspace_limit) { + mgb::SymbolVarArray vars; + for (auto i : m_output_spec) { + vars.push_back(i.first); + } + mgb::gopt::set_opr_algo_workspace_limit_inplace(vars, workspace_limit); +} + +//! get the input tensor name in the order of graph +std::vector NetworkImplDft::get_all_output_name() const { + std::vector output_names; + for (auto& output : m_network_io->outputs) { + output_names.push_back(output.name.c_str()); + } + return output_names; +} + +//! get the input tensor name in the order of graph +std::vector NetworkImplDft::get_all_input_name() const { + std::vector input_names; + for (auto& input : m_load_result.tensor_map) { + input_names.push_back(input.first.c_str()); + } + return input_names; +} + +//! get the output tensor name in the order of graph +const char* NetworkImplDft::get_output_name(size_t index) const { + LITE_ASSERT( + index < m_load_result.output_var_list.size(), + "The output tensor index is large than the total outputs number."); + return m_load_result.output_var_list[index].node()->name().c_str(); +} + +//! get the input tensor name in the order of graph +const char* NetworkImplDft::get_input_name(size_t index) const { + LITE_ASSERT( + index < m_load_result.tensor_map.size(), + "The input tensor index is large than the total inputs number."); + size_t i = 0; + for (auto& input : m_load_result.tensor_map) { + if (i == index) { + return input.first.c_str(); + } + i++; + } + LITE_THROW(ssprintf("no input tensor of index %zu.", index)); +} + +//! Plugin part +void NetworkImplDft::enable_profile_performance(std::string profile_json_file) { +#if MGB_ENABLE_JSON +#if MGB_OPENCL + mgb::CompNode::enable_opencl_profile(true); +#endif + m_profiler = std::make_unique( + m_load_config.comp_graph.get()); + m_profiler_output_file = profile_json_file; +#else + LITE_MARK_USED_VAR(profile_json_file); + LITE_THROW("JSON is disable at compile time."); +#endif +} + +void NetworkImplDft::enable_io_txt_dump(std::string io_txt_out_file) { + auto iodump = std::make_unique( + m_load_config.comp_graph.get(), io_txt_out_file.c_str()); + iodump->print_addr(false); + m_iodump = std::move(iodump); +} + +void NetworkImplDft::enable_io_bin_dump(std::string io_bin_out_dir) { + m_iodump = std::make_unique( + m_load_config.comp_graph.get(), io_bin_out_dir.c_str()); +} + +void inline NetworkImplDft::output_plugin_result() const { +#if MGB_ENABLE_JSON + if (m_profiler && m_execute_func) { + m_profiler->to_json_full(m_execute_func.get()) + ->writeto_fpath(m_profiler_output_file); + } +#endif +} +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/mge/network_impl.h b/lite/src/mge/network_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..fd466da82d4360e3b65792a1f08762223c64e208 --- /dev/null +++ b/lite/src/mge/network_impl.h @@ -0,0 +1,242 @@ +/** + * \file src/mge/network_impl.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
+ */ + +#pragma once + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "lite/network.h" +#include "network_impl_base.h" +#include "tensor_impl.h" + +#include "megbrain/graph/bases.h" +#include "megbrain/plugin/opr_io_dump.h" +#include "megbrain/plugin/profiler.h" +#include "megbrain/serialization/extern_c_opr.h" +#include "megbrain/serialization/file.h" +#include "megbrain/serialization/load_dump_config.h" +#include "megbrain/serialization/serializer.h" +#include "megbrain/utils/thin/hash_table.h" + +#include +#include + +namespace lite { + +/*! + * \brief implement the Network, contain the mgb related member + */ +class NetworkImplDft final : public Network::NetworkImplBase { + LITE_DYN_TYPE_OBJ_FINAL_DECL; + +public: + using S = megdnn::param::ExecutionPolicy::Strategy; + //! set the config of the network, include: + //! the inference device + //! the other inference options, such as record_level, weight_preprocess... + void set_config(const Config& config) override; + + //! set the special io infomation, if not set, default io tensor will used, + //! this is special for input/output is not host tensor, default the + //! input/output tensors are host tensor + void set_io(const NetworkIO& network_io) override; + + //! only compute the output tensor in user configured + void compute_only_configured_output() override { + m_compute_configured_output_only = true; + } + + //! get the network input and ouput tensor, the layout of which is + //! sync from mge tensor + std::shared_ptr get_io_tensor( + std::string io_name, + LiteTensorPhase phase = LiteTensorPhase::LITE_IO) override; + + //! get the input tensor by index in the load_result tensormap + std::shared_ptr get_input_tensor(size_t index) override; + + //! get the output tensor by index in the load_result output_var_list + std::shared_ptr get_output_tensor(size_t index) override; + + //! get all the input tensor name in the order in load return + std::vector get_all_input_name() const override; + + //! get all the output tensor name in the order in load return + std::vector get_all_output_name() const override; + + //! get the input tensor name in the order in load return + const char* get_input_name(size_t index) const override; + + //! get the output tensor name in the order in load return + const char* get_output_name(size_t index) const override; + + //! set the callback in async model + void set_async_callback(const AsyncCallback& callback) override; + + //! set the start callback which will execute before network forward + void set_start_callback(const StartCallback& callback) override { + m_start_callback = std::move(callback); + } + + //! set the finish callback which will execute after network forward + void set_finish_callback(const FinishCallback& callback) override { + m_finish_callback = std::move(callback); + } + + //! load the model and get the m_load_result + void load_model(std::shared_ptr model_mem, size_t size, + std::unordered_map + separate_config_map = {}) override; + + //! forward the network with filled input data and fill the output data + //! to the output tensor + void forward() override; + + //! in sync model, wait utile the inference finish + void wait() override; + + virtual LiteDeviceType get_device_type() const override { + return m_user_config->device_type; + } + + //! Set cpu default mode when device is CPU, in some low computation + //! 
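The async-related members declared here (`set_async_callback`, the `m_async` flag used by `wait()`/`finish()` above) suggest the following caller-side pattern. This is a sketch only: whether registering an async callback alone switches the network into async mode is an assumption, and the callback signature is assumed to take no arguments because `finish()` invokes it as `m_async_callback()`.

```cpp
// Illustrative sketch of asynchronous inference with a completion callback.
#include "lite/network.h"

#include <atomic>
#include <memory>

std::atomic<bool> g_inference_done{false};

void run_async(std::shared_ptr<lite::Network> network) {
    // the callback runs once the outputs are ready
    network->set_async_callback([]() { g_inference_done.store(true); });
    network->forward();
    // ... overlap other work here, then poll g_inference_done ...
}
```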
device or single core device, this mode will get good performace + void set_cpu_inplace_mode(); + bool is_cpu_inplace_mode() const { return m_is_cpu_inplace_mode; } + + //! When device is CPU, this interface will set the to be loaded model + //! run in multi thread mode with the given thread number. + void set_cpu_threads_number(size_t nr_threads); + size_t get_cpu_threads_number() const { return m_nr_threads; } + + //! set device id, default device id = 0 + void set_device_id(int device_id) override; + int get_device_id() const override { return m_compnode_locator.device; }; + + LiteBackend get_backend_type() const override { + return LiteBackend::LITE_DEFAULT; + } + //! set stream id, default stream id = 0 + void set_stream_id(int stream_id) override; + int get_stream_id() const override { return m_compnode_locator.stream; }; + + //! enable tensorrt + void use_tensorrt(); + + //! enable profile the network, a JSON format file will be generated + void enable_profile_performance( + std::string profile_json_file_path) override; + + /********************** mge special function ************************/ + //! load a new network which will share weights with src network + void shared_weight_with(const NetworkImplBase* src_network); + + //! share the runtime memory with other network, the weights is not shared + void share_runtime_memory_with(NetworkImplBase* network); + //! set threads affinity callback; + void set_runtime_thread_affinity( + const ThreadAffinityCallback& thread_affinity_callback); + + //! set the network memroy allocator, the allocator is defined by user + void set_memory_allocator(std::shared_ptr user_allocator); + + //! set opr algorithm selection strategy in the network + void set_network_algo_policy(LiteAlgoSelectStrategy strategy, + uint32_t shared_batch_size, + bool binary_equal_between_batch); + + //! set workspace_limit for oprs with multiple algorithms, set + //! workspace limitation can save memory but may influence the performance + void set_network_algo_workspace_limit(size_t workspace_limit); + + //! Dump input/output values of all internal variables to output file, + //! in text format + void enable_io_txt_dump(std::string io_txt_out_file); + + //! Dump input/output values of all internal variables to output + //! directory, in binary format + void enable_io_bin_dump(std::string io_bin_out_dir); + +private: + //! construct the outputspec according to the m_network_io, and set the + //! call_back to the outputspec + void make_output_spec(); + + //! modify the execution policy + void modify_exection_policy(); + + //! if the input is dev tensor, the pass will replace the H2D Opr to + //! VolatileSharedDeviceTensor Opr + void replace_dev_input_pass(); + + //! check whether the model is cross compnode + void cross_compnode_model_detect(); + + //! when the model have loaded, update the IO, if not set networkio, update + //! the networkio with the IO of loaded model + void update_io(); + + void update_input(); + void update_output(); + + //! when the model info have loaded, update the config according the model + //! info, finaly use it in compute graph + void application_config(); + + //! after finish forwarding the netwark, output the result of plugin to file + void output_plugin_result() const; + + //! when finish forwarding the network, the function will be called + void finish() const; + + //! before forwarding the network, the function will be called + void start() const; + + //! 
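The CPU options declared above are surfaced to users through the static `Runtime` helpers added later in this diff, and they must be applied before the model is loaded (the helpers assert this). A hedged sketch of the intended call order, with an arbitrary thread count:

```cpp
// Illustrative sketch: tune the CPU runtime, then load the model.
#include "lite/network.h"

#include <memory>
#include <string>

std::shared_ptr<lite::Network> make_cpu_network(const std::string& path) {
    lite::Config config;
    config.device_type = LiteDeviceType::LITE_CPU;
    auto network =
            std::make_shared<lite::Network>(config, lite::NetworkIO{});

    // a thread count of 4 is arbitrary; must be set before load_model
    lite::Runtime::set_cpu_threads_number(network, 4);
    // single-core or memory-constrained targets may prefer:
    // lite::Runtime::set_cpu_inplace_mode(network);

    network->load_model(path);
    return network;
}
```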
compile the graph to get the execute function + void compile_graph(); + +private: + bool m_async = false; + bool m_is_cpu_inplace_mode = false; + int m_nr_device_type = 0; + size_t m_nr_threads = 1; + bool m_compute_configured_output_only = false; + mgb::CompNode::Locator m_compnode_locator; + + AsyncCallback m_async_callback = nullptr; + std::unique_ptr m_network_io; + std::unique_ptr m_user_config; + std::unique_ptr m_execute_func; + + //! The model load related data + S m_execution_policy = static_cast(0); + std::unique_ptr m_input_file; + mgb::serialization::GraphLoadConfig m_load_config; + mgb::serialization::GraphLoader::LoadResult m_load_result; + mgb::ComputingGraph::OutputSpec m_output_spec; + std::shared_ptr m_loader; + + //! start and finish callback + StartCallback m_start_callback = nullptr; + FinishCallback m_finish_callback = nullptr; + + //! profile and io dump related data +#if MGB_ENABLE_JSON + std::unique_ptr m_profiler; + std::string m_profiler_output_file; +#endif + std::unique_ptr m_iodump; +}; + +} // namespace lite + +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/mge/tensor_impl.cpp b/lite/src/mge/tensor_impl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1c699a3d49b248d8ff9c4819014f3f16a7e46c75 --- /dev/null +++ b/lite/src/mge/tensor_impl.cpp @@ -0,0 +1,435 @@ +/** + * \file inlude/mge/tensor.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "tensor_impl.h" +#include "common.h" + +#include "lite/tensor.h" + +#include "megbrain/comp_node.h" +#include "megbrain/tensor.h" + +#include + +using namespace lite; + +/**********************TensorImpl****************************/ + +LITE_DYN_TYPE_OBJ_FINAL_IMPL(TensorImplDft); + +TensorImplDft::TensorImplDft() { + m_host_tensor = + std::make_shared(mgb::CompNode::default_cpu()); +} + +TensorImplDft::TensorImplDft(LiteDeviceType device, bool is_pinned_host) { + auto cn = mgb::CompNode::load(to_compnode_locator(device)); + if (device == LiteDeviceType::LITE_DEVICE_DEFAULT) { + device = LiteDeviceType::LITE_CPU; + } + if (device == LiteDeviceType::LITE_CPU) { + m_host_tensor = std::make_shared( + mgb::CompNode::default_cpu()); + } else if (is_pinned_host) { + m_host_tensor = std::make_shared(cn); + } else { + m_dev_tensor = std::make_shared(cn); + } +} + +TensorImplDft::TensorImplDft(LiteDeviceType device, const Layout& layout, + bool is_pinned_host) { + auto cn = mgb::CompNode::load(to_compnode_locator(device)); + auto mge_layout = to_impl_layout(layout); + if (device == LiteDeviceType::LITE_DEVICE_DEFAULT) { + device = LiteDeviceType::LITE_CPU; + } + if (device == LiteDeviceType::LITE_CPU) { + m_host_tensor = std::make_shared( + mgb::CompNode::default_cpu(), mge_layout); + } else if (is_pinned_host) { + m_host_tensor = std::make_shared(cn, mge_layout); + } else { + m_dev_tensor = std::make_shared(cn, mge_layout); + } +} + +TensorImplDft::TensorImplDft(int device_id, LiteDeviceType device_type, + const Layout& layout, bool is_pinned_host) { + auto locator = to_compnode_locator(device_type); + locator.device = device_id; + auto cn = mgb::CompNode::load(locator); + if (device_type == LiteDeviceType::LITE_DEVICE_DEFAULT) { + device_type = LiteDeviceType::LITE_CPU; + } + if (layout.ndim) { + auto mge_layout = to_impl_layout(layout); + if (device_type == 
LiteDeviceType::LITE_CPU) { + m_host_tensor = std::make_shared( + mgb::CompNode::default_cpu(), mge_layout); + } else if (is_pinned_host) { + m_host_tensor = std::make_shared(cn, mge_layout); + } else { + m_dev_tensor = + std::make_shared(cn, mge_layout); + } + } else { + if (device_type == LiteDeviceType::LITE_CPU) { + m_host_tensor = std::make_shared( + mgb::CompNode::default_cpu()); + } else if (is_pinned_host) { + m_host_tensor = std::make_shared(cn); + } else { + m_dev_tensor = std::make_shared(cn); + } + } +} + +TensorImplDft::TensorImplDft(int device_id, int stream_id, + LiteDeviceType device_type, bool is_pinned_host) { + auto locator = to_compnode_locator(device_type); + locator.device = device_id; + locator.stream = stream_id; + auto cn = mgb::CompNode::load(locator); + if (get_device_from_locator(locator) == LiteDeviceType::LITE_CPU) { + m_host_tensor = std::make_shared( + mgb::CompNode::default_cpu()); + } else if (is_pinned_host) { + m_host_tensor = std::make_shared(cn); + } else { + m_dev_tensor = std::make_shared(cn); + } +} + +LiteDeviceType TensorImplDft::get_device_type() const { + if (is_host()) { + return LiteDeviceType::LITE_CPU; + } else { + return get_device_from_locator(m_dev_tensor->comp_node().locator()); + } +} + +int TensorImplDft::get_device_id() const { + if (is_host()) { + return m_host_tensor->comp_node().locator().device; + } else { + return m_dev_tensor->comp_node().locator().device; + } +} + +bool TensorImplDft::is_pinned_host() const { + return is_host() && + get_device_from_locator(m_host_tensor->comp_node().locator()) != + LiteDeviceType::LITE_CPU; +} + +void TensorImplDft::set_mge_tensor_compnode(const mgb::CompNode& comp_node) { + if (is_host()) { + m_host_tensor->comp_node(comp_node, true); + } else { + m_dev_tensor->comp_node(comp_node, true); + } +} + +Layout TensorImplDft::get_layout() const { + if (is_host()) { + return to_lite_layout(m_host_tensor->layout()); + } else { + return to_lite_layout(m_dev_tensor->layout()); + } +} + +void* TensorImplDft::get_memory_ptr() const { + if (is_host()) { + return static_cast(m_host_tensor->raw_ptr()); + } else { + return static_cast(m_dev_tensor->raw_ptr()); + } +} + +void* TensorImplDft::get_memory_ptr(const std::vector& idx) const { + if (is_host()) { + auto elemsize_log = m_host_tensor->layout().dtype.size_log(); + switch (elemsize_log) { + case 0: + return static_cast( + m_host_tensor->ptr(idx.begin(), idx.end())); + break; + case 1: + return static_cast( + m_host_tensor->ptr(idx.begin(), idx.end())); + break; + case 2: + return static_cast( + m_host_tensor->ptr(idx.begin(), idx.end())); + break; + default: + LITE_THROW("not supported data_type."); + } + } else { + auto elemsize_log = m_dev_tensor->layout().dtype.size_log(); + switch (elemsize_log) { + case 0: + return static_cast( + m_dev_tensor->ptr(idx.begin(), idx.end())); + break; + case 1: + return static_cast( + m_dev_tensor->ptr(idx.begin(), idx.end())); + break; + case 2: + return static_cast( + m_dev_tensor->ptr(idx.begin(), idx.end())); + break; + default: + LITE_THROW("not supported data_type."); + } + } +} + +std::shared_ptr TensorImplDft::slice( + const std::vector& start, const std::vector& end, + const std::vector& step) { + Layout layout; + mgb::TensorLayout layout_mge; + if (is_host()) { + layout_mge = m_host_tensor->layout(); + layout = to_lite_layout(m_host_tensor->layout()); + } else { + layout_mge = m_dev_tensor->layout(); + layout = to_lite_layout(m_dev_tensor->layout()); + } + + size_t length = start.size(); + LITE_ASSERT(length 
== end.size() && length <= layout.ndim, + "The start and end must be the same size and less than layout " + "ndim."); + std::vector slices; + if (step.size()) { + LITE_ASSERT(length == step.size(), + "The start and step must be the same size."); + for (size_t i = 0; i < length; i++) { + slices.push_back(mgb::Slice{start[i], end[i], step[i]}); + } + } else { + for (size_t i = 0; i < length; i++) { + slices.push_back(mgb::Slice{start[i], end[i]}); + } + } + auto subspec = mgb::SubTensorSpec::make_from_offset_elem(layout_mge, 0); + size_t axis = 0; + for (auto&& i : slices) { + subspec.merge_with(i.apply(subspec.layout(), axis)); + axis++; + } + auto ret = std::make_shared(); + auto& impl = TensorHelper::implement(ret)->cast_final_safe(); + if (is_host()) { + *impl.m_host_tensor = m_host_tensor->sub(subspec); + } else { + impl.m_dev_tensor = std::make_shared( + m_dev_tensor->sub(subspec)); + impl.m_host_tensor = nullptr; + } + LITE_ASSERT(is_host() == impl.is_host()); + return ret; +} + +void TensorImplDft::fill_zero() { + if (is_host()) { + auto mge_layout = m_host_tensor->layout(); + if (m_host_tensor->layout().is_physical_contiguous()) { + auto ptr = get_memory_ptr(); + std::memset(ptr, 0, + mge_layout.dtype.size(mge_layout.total_nr_elems())); + } else { + TensorImplDft tmp(LiteDeviceType::LITE_CPU, + to_lite_layout(mge_layout), true); + tmp.fill_zero(); + this->copy_from(&tmp); + } + } else { + mgb::dev_tensor_memset(*m_dev_tensor, 0); + m_dev_tensor->sync(); + } +} + +void TensorImplDft::share_memory_with(const TensorImplBase* src_tensor_impl) { + auto src_dft_tensor = static_cast(src_tensor_impl); + LITE_ASSERT(is_host() == src_dft_tensor->is_host(), + "share memory must happen in same device"); + //! make shape the src memory is ready + src_tensor_impl->get_memory_ptr(); + if (is_host()) { + *m_host_tensor = *src_dft_tensor->m_host_tensor; + } else { + *m_dev_tensor = *src_dft_tensor->m_dev_tensor; + } +} + +void TensorImplDft::set_layout(const Layout& layout) { + bool host = is_host(); + auto mgb_layout = to_impl_layout(layout); + if (host) { + m_host_tensor->dtype(mgb_layout.dtype); + m_host_tensor->resize(mgb_layout); + } else { + m_dev_tensor->dtype(mgb_layout.dtype); + m_dev_tensor->resize(mgb_layout); + } +} + +void TensorImplDft::reshape(const Layout& layout) { + auto mgb_layout = to_impl_layout(layout); + bool host = is_host(); + if (host) { + m_host_tensor->resize(mgb_layout); + } else { + m_dev_tensor->resize(mgb_layout); + } +} + +void TensorImplDft::reset(void* prepared_data) { + auto raw_ptr = static_cast(prepared_data); + auto raw_storage = std::shared_ptr(raw_ptr, [](void*) {}); + bool host = is_host(); + if (host) { + auto cn = m_host_tensor->comp_node(); + auto mge_layout = m_host_tensor->layout(); + size_t size = mge_layout.span().dist_byte(); + mgb::HostTensorStorage storage; + storage.reset(cn, size, raw_storage); + m_host_tensor->reset(storage, mge_layout); + } else { + auto cn = m_dev_tensor->comp_node(); + auto mge_layout = m_dev_tensor->layout(); + size_t size = mge_layout.span().dist_byte(); + mgb::DeviceTensorStorage storage; + storage.reset(cn, size, raw_storage); + m_dev_tensor->reset(storage, mge_layout); + } +} + +void TensorImplDft::reset(void* prepared_data, const Layout& layout) { + set_layout(layout); + reset(prepared_data); +} + +bool TensorImplDft::is_continue_memory() const { + if (is_host()) { + return m_host_tensor->layout().is_physical_contiguous(); + } else { + return m_dev_tensor->layout().is_physical_contiguous(); + } +} + +void 
TensorImplDft::copy_from(const TensorImplBase* src_impl) { + if (is_continue_memory()) { + copy_from_continue(src_impl); + } else { + copy_from_fixlayout(src_impl); + } +} + +void TensorImplDft::copy_from_continue(const TensorImplBase* src_impl) { + auto src = static_cast(src_impl); + if (is_host()) { + //! host to host + if (src->is_host()) { + m_host_tensor->copy_from(*src->m_host_tensor); + //! device to host + } else { + auto src_cn = src->m_dev_tensor->comp_node(); + auto dst_cn = m_host_tensor->comp_node(); + if (src_cn != dst_cn && m_host_tensor->layout().ndim > 0) { + LITE_WARN( + "The dst tensor memroy is alloced before coping, " + "then pinned memroy would not use to optmize the " + "copy performance."); + //! When D2H in megbrain and the compnode of src and dst is not + //! equal, there must be one compnode that is cpu-default, so + //! here, we use temp tensor for transition + auto tmp_impl = std::make_shared(); + tmp_impl->set_mge_tensor_compnode(src_cn); + tmp_impl->m_host_tensor->copy_from(*src->m_dev_tensor).sync(); + m_host_tensor->copy_from(*tmp_impl->m_host_tensor); + } else { + //! if dst compnode is not valid(memory is not alloced), the + //! tensor is pinned host tensor + m_host_tensor->comp_node(src_cn, true); + m_host_tensor->copy_from(*src->m_dev_tensor).sync(); + } + } + } else { + //! host to device + if (src->is_host()) { + m_dev_tensor->copy_from(*src->m_host_tensor).sync(); + //! device to device + } else { + m_dev_tensor->copy_from(*src->m_dev_tensor).sync(); + } + } +} + +void TensorImplDft::copy_from_fixlayout(const TensorImplBase* src_impl) { + auto src = static_cast(src_impl); + if (is_host()) { + //! host to host + if (src->is_host()) { + m_host_tensor->copy_from_fixlayout(*src->m_host_tensor); + //! device to host + } else { + auto src_cn = src->m_dev_tensor->comp_node(); + auto dst_cn = m_host_tensor->comp_node(); + if (src_cn != dst_cn && m_host_tensor->layout().ndim > 0) { + LITE_WARN( + "The dst tensor memroy is alloced before coping, " + "then pinned memroy would not use to optmize the " + "copy performance."); + //! When D2H in megbrain and the compnode of src and dst is not + //! equal, there must be one compnode that is cpu-default, so + //! here, we use temp tensor for transition + auto tmp_impl = std::make_shared(); + tmp_impl->set_mge_tensor_compnode(src_cn); + tmp_impl->m_host_tensor->copy_from(*src->m_dev_tensor).sync(); + m_host_tensor->copy_from_fixlayout(*tmp_impl->m_host_tensor); + } else { + //! if dst compnode is not valid(memory is not alloced), the + //! tensor is pinned host tensor + m_host_tensor->comp_node(src_cn, true); + m_host_tensor->copy_from_fixlayout(*src->m_dev_tensor).sync(); + } + } + } else { + //! host to device + if (src->is_host()) { + m_dev_tensor->copy_from_fixlayout(*src->m_host_tensor).sync(); + //! 
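The warning in the copy path above points at the recommended pattern: create the destination on the CPU with no layout set, so `copy_from()` can adopt the source layout and use the pinned-memory D2H path instead of a staged copy. The sketch below is a rough illustration only; the public `lite::Tensor` constructor and `copy_from()` signatures are assumed to mirror `TensorImplDft` and may differ from the real `lite/tensor.h`.

```cpp
// Illustrative sketch: pull a device-side result back to the host.
#include "lite/tensor.h"

#include <memory>

std::shared_ptr<lite::Tensor> to_host(std::shared_ptr<lite::Tensor> dev_out) {
    // CPU destination with no layout yet, so the copy picks the layout
    // from the source and can use the fast D2H path
    auto host = std::make_shared<lite::Tensor>(LiteDeviceType::LITE_CPU);
    host->copy_from(dev_out);  // public signature assumed, see note above
    return host;
}
```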
device to device + } else { + m_dev_tensor->copy_from_fixlayout(*src->m_dev_tensor).sync(); + } + } +} + +void TensorImplDft::copy_from_mge_tensor(const mgb::DeviceTensorND& dv) { + if (is_host()) { + auto src_cn = dv.comp_node(); + m_host_tensor->comp_node(src_cn, true); + m_host_tensor->copy_from(dv); + } else { + m_dev_tensor->copy_from(dv); + } +} + +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/mge/tensor_impl.h b/lite/src/mge/tensor_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..74991d41c670210eed89da2ab251e682efec50ad --- /dev/null +++ b/lite/src/mge/tensor_impl.h @@ -0,0 +1,128 @@ +/** + * \file src/mge/tensor_impl.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "lite/tensor.h" +#include "tensor_impl_base.h" + +#include "megbrain/tensor.h" + +#include + +namespace lite { + +/*! + * \brief implement the Tensor in mge + */ +class TensorImplDft final : public Tensor::TensorImplBase { + LITE_DYN_TYPE_OBJ_FINAL_DECL; + +public: + TensorImplDft(); + TensorImplDft(LiteDeviceType device, bool is_pinned_host = false); + TensorImplDft(LiteDeviceType device, const Layout& layout, + bool is_pinned_host = false); + TensorImplDft(int device_id, LiteDeviceType device, + const Layout& layout = {}, bool is_pinned_host = false); + TensorImplDft(int device_id, int stream_id, LiteDeviceType device, + bool is_pinned_host = false); + + virtual ~TensorImplDft() = default; + + LiteDeviceType get_device_type() const override; + + int get_device_id() const override; + + LiteBackend get_backend_type() const override { + return LiteBackend::LITE_DEFAULT; + } + Layout get_layout() const override; + + bool is_pinned_host() const override; + + //! which will trigger memory alloc in tensor implement + void* get_memory_ptr() const override; + + //! which will trigger memory alloc in tensor implement if memory is not + //! allocated, and compute the ptr in the gaven idx + void* get_memory_ptr(const std::vector& idx) const override; + + //! set layout will change the layout and reallocate memory of the tensor + void set_layout(const Layout& layout) override; + + //! use the user allocated data to reset the memory of the tensor, the + //! memory will not be managed by the lite, later, the user should delete + //! it. + void reset(void* prepared_data) override; + + //! use the user allocated data and corresponding layout to reset the data + //! and layout of the tensor, the memory will not be managed by lite, later, + //! the user should delete it. + void reset(void* prepared_data, const Layout& layout) override; + + //! get a new tensor slice from the origin tensor + std::shared_ptr slice( + const std::vector& start, const std::vector& end, + const std::vector& step = {}) override; + + //! set the tensor memory with zero + void fill_zero() override; + + //! reshape the tensor with new shape, keep the data_type the same + void reshape(const Layout& layout) override; + + //! copy tensor form other tensor + //! Note: the best way for tensor copy is just set the dst device, left + //! layout empty, when copying the dst layout will be set the same with + //! src + void copy_from(const TensorImplBase* src_impl) override; + + //! share memory with other tensor + void share_memory_with(const TensorImplBase* src_impl) override; + + //! 
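A short sketch of the `slice()` interface declared in the tensor implement, assuming the public `lite::Tensor` forwards to the implementation shown earlier in this diff; the index element type is assumed to be `size_t` and the bounds are placeholders.

```cpp
// Illustrative sketch: take rows [0, 1) and columns [0, 4) of a 2-D tensor.
#include "lite/tensor.h"

#include <memory>
#include <vector>

std::shared_ptr<lite::Tensor> first_row(std::shared_ptr<lite::Tensor> t) {
    std::vector<size_t> start = {0, 0};
    std::vector<size_t> end = {1, 4};   // placeholder bounds
    auto row = t->slice(start, end);    // step omitted, contiguous slice
    // the slice is a sub-tensor sharing storage with t, no copy is made
    return row;
}
```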
whether the memory of tensor is continue + bool is_continue_memory() const override; + + //! get host tensor + std::shared_ptr host_tensor() const { + return m_host_tensor; + } + //! get device tensor + std::shared_ptr dev_tensor() const { + return m_dev_tensor; + } + //! copy from mgb tensor + void copy_from_mge_tensor(const mgb::DeviceTensorND& dv); + +public: + friend class NetworkImplDft; + +private: + bool is_host() const { return m_host_tensor != nullptr; }; + + void copy_from_continue(const TensorImplBase* src_impl); + + void copy_from_fixlayout(const TensorImplBase* src_impl); + + void set_mge_tensor_compnode(const mgb::CompNode& comp_node); + +private: + std::shared_ptr m_host_tensor; + std::shared_ptr m_dev_tensor; +}; + +} // namespace lite + +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/misc.cpp b/lite/src/misc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c44d024179fd8f6b9c08ad14ac89dfd7a4c276a9 --- /dev/null +++ b/lite/src/misc.cpp @@ -0,0 +1,154 @@ +/** + * \file inlude/misc.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "./misc.h" +#include "lite/global.h" + +#include +#include +#include + +#if LITE_BUILD_WITH_MGE +#include "megbrain/common.h" +#endif + +#ifdef __ANDROID__ +#include +#endif + +using namespace lite; + +namespace lite { +namespace log_detail { + +LiteLogLevel current_log_level = LiteLogLevel::ERROR; + +template +constexpr size_t countof(T (&)[N]) { + return N; +} +} // namespace log_detail +} // namespace lite + +namespace { +std::string svsprintf(const char* fmt, va_list ap_orig) { + int size = 100; /* Guess we need no more than 100 bytes */ + char* p; + + if ((p = (char*)malloc(size)) == nullptr) + return "svsprintf: malloc failed"; + + for (;;) { + va_list ap; + va_copy(ap, ap_orig); + int n = vsnprintf(p, size, fmt, ap); + va_end(ap); + + if (n < 0) + return "svsprintf: vsnprintf failed"; + + if (n < size) { + std::string rst(p); + free(p); + return rst; + } + + size = n + 1; + + char* np = (char*)realloc(p, size); + if (!np) { + free(p); + return "svsprintf: realloc failed"; + } else + p = np; + } +} +} // namespace + +void lite::set_log_level(LiteLogLevel l) { + log_detail::current_log_level = l; +#if LITE_BUILD_WITH_MGE + mgb::LogLevel lite_log_level = mgb::LogLevel::DEBUG; + switch (l) { + case LiteLogLevel::DEBUG: + lite_log_level = mgb::LogLevel::DEBUG; + break; + case LiteLogLevel::INFO: + lite_log_level = mgb::LogLevel::INFO; + break; + case LiteLogLevel::WARN: + lite_log_level = mgb::LogLevel::WARN; + break; + case LiteLogLevel::ERROR: + lite_log_level = mgb::LogLevel::ERROR; + break; + default: + LITE_THROW("unkonw loglevel"); + } + mgb::set_log_level(lite_log_level); +#endif +} + +LiteLogLevel lite::get_log_level() { + return log_detail::current_log_level; +} + +std::string lite::ssprintf(const char* format, ...) { + va_list ap; + va_start(ap, format); + auto ret = svsprintf(format, ap); + va_end(ap); + return ret; +} + +void lite::print_log(LiteLogLevel level, const char* format, ...) 
{ + if (static_cast(level) < static_cast(get_log_level())) { + return; + } + using namespace std::chrono; + + auto now = system_clock::now(); + auto now_time_t = system_clock::to_time_t(now); + + tm now_tm; + +#if _WIN32 + localtime_s(&now_tm, &now_time_t); +#else + localtime_r(&now_time_t, &now_tm); +#endif + + auto now_trunc_to_sec = system_clock::from_time_t(mktime(&now_tm)); + auto microsec = duration_cast(now - now_trunc_to_sec); + + char time_buffer[100]; + snprintf(time_buffer, log_detail::countof(time_buffer), + "%02d:%02d:%02d.%06ld ", now_tm.tm_hour, now_tm.tm_min, + now_tm.tm_sec, long(microsec.count())); + + const char* prefix[] = {"LITE[DBG] ", "LITE[INF] ", "LITE[WRN] ", + "LITE[ERR] "}; + std::string out; + out += prefix[int(level)]; + out += time_buffer; + + va_list ap; + va_start(ap, format); + auto ret = svsprintf(format, ap); + va_end(ap); + out += ret; + +#ifdef __ANDROID__ + __android_log_print(ANDROID_LOG_INFO, "lite", "%s", out.c_str()); +#else + fprintf(stderr, "%s\n", out.c_str()); +#endif +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/misc.h b/lite/src/misc.h new file mode 100644 index 0000000000000000000000000000000000000000..c6799feda0c6974a69b8703d84f99e5828ddb92b --- /dev/null +++ b/lite/src/misc.h @@ -0,0 +1,254 @@ +/** + * \file include/misc.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once +#include "lite_build_config.h" + +#include +#include +#include +#include +#include "lite/common_enum_c.h" +#include "lite/global.h" + +namespace lite { +#if LITE_ENABLE_EXCEPTION +/*! \brief The error class in lite. + * + * It can be used to represent both an error caused by the invalid + * input of the caller or an invalid runtime condition. + * + * The necessary presumption should be guaranteed by assertions instead of + * exceptions. + */ +class Error : public std::exception { +public: + Error(const std::string& msg) : m_msg("Error: " + msg) {} + const char* what() const noexcept override { return m_msg.c_str(); } + +private: + std::string m_msg; +}; +#endif + +std::string ssprintf(const char* fmt = 0, ...) + __attribute__((format(printf, 1, 2))); + +/*! + * \brief Print a message. + * + * The message is printed only if level is above or equals to the current log + * level. + */ +void print_log(LiteLogLevel level, const char* format = 0, ...) + __attribute__((format(printf, 2, 3))); +} // namespace lite + +#if LITE_ENABLE_LOGGING +#define LITE_LOG_(level, msg...) \ + do { \ + lite::print_log(LiteLogLevel::level, ##msg); \ + } while (0) +#else +#define LITE_LOG_(level, msg...) (void)0 +#endif + +#define LITE_LOG(fmt...) LITE_LOG_(DEBUG, fmt); +#define LITE_DEBUG(fmt...) LITE_LOG_(DEBUG, fmt); +#define LITE_WARN(fmt...) LITE_LOG_(WARN, fmt); +#define LITE_ERROR(fmt...) LITE_LOG_(ERROR, fmt); + +#if LITE_ENABLE_EXCEPTION +#define LITE_THROW(msg) throw lite::Error(msg) +#else +#define LITE_THROW(msg) \ + do { \ + LITE_ERROR(msg); \ + __builtin_trap(); \ + } while (0) +#endif + +#if LITE_ENABLE_EXCEPTION +#define LITE_ERROR_HANDLER_BEGIN try { +#define LITE_ERROR_HANDLER_END \ + } \ + catch (const ::lite::Error& e) { \ + std::string msg = std::string("Lite exception: ") + e.what(); \ + LITE_ERROR("%s.", msg.c_str()); \ + throw; \ + } + +#else +#define LITE_ERROR_HANDLER_BEGIN +#define LITE_ERROR_HANDLER_END +#endif + +/*! \brief Return an error if the given pointer is null pointer. 
+ * + * The macro is used to ensure the validity of the passing context pointer. + */ +#define LITE_CHECK_NON_NULL_POINTER(ptr) \ + LITE_ASSERT(ptr != nullptr, "Input ptr is null.") + +//! branch prediction hint: likely to take +#define lite_likely(v) __builtin_expect(static_cast(v), 1) + +//! branch prediction hint: unlikely to take +#define lite_unlikely(v) __builtin_expect(static_cast(v), 0) + +#if LITE_ENABLE_LOGGING +#if LITE_ASSERT_LOC +#define LITE_ASSERT(expr, msg...) \ + do { \ + if (lite_unlikely(!(expr))) { \ + auto info = lite::ssprintf(msg); \ + LITE_THROW( \ + lite::ssprintf("Assert \' %s \' failed at file : %s \n" \ + "line %d : %s,\nextra " \ + "message: %s", \ + #expr, __FILE__, __LINE__, \ + __PRETTY_FUNCTION__, info.c_str())); \ + } \ + } while (0) +#else +#define LITE_ASSERT(expr, msg...) \ + do { \ + if (lite_unlikely(!(expr))) { \ + auto info = lite::ssprintf(msg); \ + LITE_THROW(lite::ssprintf( \ + "Assert \' %s \' failed at file : %s \n" \ + "line %d : %s,\nextra " \ + "message: %s", \ + #expr, "about location info, please build with debug", \ + __LINE__, __PRETTY_FUNCTION__, info.c_str())); \ + } \ + } while (0) +#endif +#else +#define LITE_ASSERT(expr, msg...) \ + do { \ + if (lite_unlikely(!(expr))) { \ + auto msg_string = lite::ssprintf(msg); \ + LITE_THROW(msg_string.c_str()); \ + } \ + } while (0) +#endif + +#define LITE_MARK_USED_VAR(var) ((void)var) + +namespace lite { +class ScopedTimer { +public: + typedef std::chrono::system_clock Clock; + typedef std::chrono::nanoseconds Nsec; + + ScopedTimer(std::string name) : m_name(name) { m_start = Clock::now(); } + ~ScopedTimer() { + m_stop = Clock::now(); + std::chrono::duration elapsed = m_stop - m_start; + Nsec u = std::chrono::duration_cast(elapsed); + auto msg = ssprintf("%s used time %fms.", m_name.c_str(), + static_cast(u.count()) / 1000000.f); + LITE_LOG("%s", msg.c_str()); + } + +private: + std::chrono::time_point m_start, m_stop; + const std::string m_name; +}; + +class Timer { +public: + typedef std::chrono::system_clock Clock; + typedef std::chrono::nanoseconds Nsec; + + Timer(std::string name) : m_name(name) { m_start = Clock::now(); } + double get_used_time() { + m_stop = Clock::now(); + std::chrono::duration elapsed = m_stop - m_start; + Nsec u = std::chrono::duration_cast(elapsed); + return static_cast(u.count()) / 1000000.0; + } + void print_used_time(int iter) { + m_stop = Clock::now(); + std::chrono::duration elapsed = m_stop - m_start; + Nsec u = std::chrono::duration_cast(elapsed); + printf("%s used time %f ms\n", (m_name + std::to_string(iter)).c_str(), + static_cast(u.count()) / 1000000.0); + } + void reset_start() { m_start = Clock::now(); } + +private: + std::chrono::time_point m_start, m_stop; + const std::string m_name; +}; + +inline void mark_used_variable() {} +template +inline void mark_used_variable(T firstArg, Arg... args) { + LITE_MARK_USED_VAR(firstArg); + mark_used_variable(args...); +} +} // namespace lite + +#if defined(_WIN32) +#include +#include +#undef CONST +#define F_OK 0 +#define RTLD_LAZY 0 +// On the windows platform we use a lib_filename without a full path so +// the win-api "LoadLibrary" would uses a standard search strategy to +// find the lib module. As we cannot access to the lib_filename without a +// full path, we should not use "access(a, b)" to verify it. 
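For internal code inside `lite/src`, the assertion and timing helpers above combine as in the following sketch; the function and message are hypothetical and only illustrate the printf-style `LITE_ASSERT` and the scope-based logging of `ScopedTimer`.

```cpp
// Illustrative sketch using the helpers from misc.h inside lite/src.
#include "misc.h"

#include <cstddef>
#include <cstring>

void checked_copy(const void* src, void* dst, size_t size) {
    // printf-style message, thrown (or trapped) when the check fails
    LITE_ASSERT(src != nullptr && dst != nullptr,
                "invalid buffer, size=%zu", size);
    lite::ScopedTimer timer("checked_copy");
    std::memcpy(dst, src, size);
}  // ~ScopedTimer logs "checked_copy used time ...ms." via LITE_LOG
```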
+#define access(a, b) false +static inline void* dlopen(const char* file, int) { + return static_cast(LoadLibrary(file)); +} + +static inline char* dlerror() { + const char* errmsg = "dlerror not aviable in windows"; + return const_cast(errmsg); +} + +static inline void* dlsym(void* handle, const char* name) { + FARPROC symbol = GetProcAddress((HMODULE)handle, name); + return reinterpret_cast(symbol); +} +#elif __linux__ || __unix__ || __APPLE__ +#include +#include +#endif + +#if __DEPLOY_ON_XP_SP2__ +//! refer to +//! https://docs.microsoft.com/en-us/cpp/build/configuring-programs-for-windows-xp?view=msvc-160 +//! xp sp2 do not support vc runtime fully, casused by KERNEL32.dll do not +//! implement some base apis for c++ std function, for example, +//! std::mutex/std::thread/std::condition_variable as a workround, we will +//! disable some MegEngine feature on xp sp2 env, for exampe, multi-thread etc! +#define LITE_MUTEX size_t +#define LITE_RECURSIVE_MUTEX size_t +#define LITE_LOCK_GUARD(mtx) LITE_MARK_USED_VAR(mtx) +#define LITE_LOCK_GUARD_UNIQUE(mtx) LITE_MARK_USED_VAR(mtx) +#define LITE_LOCK_GUARD_SHARED(mtx) LITE_MARK_USED_VAR(LITE_MARK_USED_VAR) +#else +#define LITE_MUTEX std::mutex +#define LITE_RECURSIVE_MUTEX std::recursive_mutex +#define LITE_LOCK_GUARD(mtx) \ + std::lock_guard LITE_LOCK_GUARD_CTOR(mtx) + +#define LITE_LOCK_GUARD_UNIQUE(mtx) \ + std::unique_lock LITE_LOCK_GUARD_CTOR(mtx) + +#define LITE_LOCK_GUARD_SHARED(mtx) \ + std::shared_lock LITE_LOCK_GUARD_CTOR(mtx) +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/network.cpp b/lite/src/network.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8f779d8792c5d761548468d117d07418f1e4b5f9 --- /dev/null +++ b/lite/src/network.cpp @@ -0,0 +1,501 @@ +/** + * \file src/network.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "lite/network.h" +#include "function_base.h" +#include "network_impl_base.h" +#include "parse_info/parse_info_base.h" +#include "parse_model/model_parser.h" +#include "type_info.h" +#if LITE_BUILD_WITH_MGE +#include "mge/function_dft.h" +#include "mge/network_impl.h" +#endif + +#include +#include + +using namespace lite; + +/** + * \brief Construct the new work implement + * the order must be : + * 1. creeat the implement + * 2. config and load + * 3. set_io + */ +Network::Network(const Config& config, const NetworkIO& network_io) { + LITE_ERROR_HANDLER_BEGIN + m_config = config; + m_network_io = network_io; + if (config.backend == LiteBackend::LITE_DEFAULT) { + m_impl = call_func>( + "create_network"); + } else if (config.backend == LiteBackend::LITE_RK_NPU) { + m_impl = call_func>( + "create_network"); + } + m_impl->set_config(config); + m_impl->set_io(network_io); + LITE_ERROR_HANDLER_END +} + +Network::Network(const NetworkIO& network_io, const Config& config) { + LITE_ERROR_HANDLER_BEGIN + m_config = config; + m_network_io = network_io; + if (config.backend == LiteBackend::LITE_DEFAULT) { + m_impl = call_func>( + "create_network"); + } else if (config.backend == LiteBackend::LITE_RK_NPU) { + m_impl = call_func>( + "create_network"); + } + m_impl->set_config(config); + m_impl->set_io(network_io); + LITE_ERROR_HANDLER_END +} + +void Network::load_model(void* model_mem, size_t size) { + LITE_ERROR_HANDLER_BEGIN + LITE_CHECK_NON_NULL_POINTER(m_impl); + //! 
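A hedged sketch of the in-memory load path defined just below: the model buffer is wrapped in a non-owning `shared_ptr` with a no-op deleter, so ownership and lifetime remain with the caller.

```cpp
// Illustrative sketch: load a model from a caller-owned buffer.
#include "lite/network.h"

#include <memory>
#include <vector>

std::shared_ptr<lite::Network> load_from_memory(
        std::vector<char>& model_blob) {
    auto network = std::make_shared<lite::Network>(lite::Config{},
                                                   lite::NetworkIO{});
    // load_model(void*, size_t) does not take ownership; keeping
    // model_blob alive for the network's lifetime is the safe choice.
    network->load_model(model_blob.data(), model_blob.size());
    return network;
}
```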
this model_mem is managed by user + std::shared_ptr model{model_mem, [](void*) {}}; + prase_model(model, size); + LITE_ERROR_HANDLER_END +} + +void Network::load_model(std::string model_path) { + LITE_ERROR_HANDLER_BEGIN + LITE_CHECK_NON_NULL_POINTER(m_impl); + FILE* fin = fopen(model_path.c_str(), "rb"); + LITE_ASSERT(fin, "failed to open %s: %s", model_path.c_str(), + strerror(errno)); + fseek(fin, 0, SEEK_END); + size_t size = ftell(fin); + fseek(fin, 0, SEEK_SET); + void* ptr = malloc(size); + std::shared_ptr buf{ptr, ::free}; + auto nr = fread(buf.get(), 1, size, fin); + LITE_ASSERT(nr == size); + fclose(fin); + prase_model(buf, size); + LITE_ERROR_HANDLER_END +} + +void Network::prase_model(std::shared_ptr model_data, size_t size) { + std::unordered_map separate_config_map; + ModelParser model_parser(model_data, size); + //! parse the model info + if (model_parser.parse_model_info(m_config, m_network_io, + separate_config_map, m_extra_info)) { + if (m_config.backend == LiteBackend::LITE_DEFAULT && + m_impl->get_backend_type() != LiteBackend::LITE_DEFAULT) { + m_impl.reset(try_call_func( + "parse_model")); + } else if (m_config.backend == LiteBackend::LITE_RK_NPU && + m_impl->get_backend_type() != LiteBackend::LITE_RK_NPU) { + m_impl.reset(try_call_func( + "parse_model")); + } + m_impl->set_config(m_config); + m_impl->set_io(m_network_io); + } + //! decryption the model + size_t model_length; + auto&& model_shared_ptr = model_parser.parse_model(model_length, m_config); + + m_impl->load_model(model_shared_ptr, model_length, separate_config_map); + m_loaded = true; + update_from_implement(); +} + +Network::~Network() = default; + +void Network::update_from_implement() { + m_config.device_type = m_impl->get_device_type(); +} + +void Network::compute_only_configured_output() { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(!m_loaded, + "compute_only_configured_output should be used before model " + "loaded."); + LITE_CHECK_NON_NULL_POINTER(m_impl); + return m_impl->compute_only_configured_output(); + LITE_ERROR_HANDLER_END +} + +std::shared_ptr Network::get_io_tensor(std::string name, + LiteTensorPhase phase) { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(m_loaded, "get_io_tensor should be used after model loaded."); + LITE_CHECK_NON_NULL_POINTER(m_impl); + return m_impl->get_io_tensor(name, phase); + LITE_ERROR_HANDLER_END +} + +std::shared_ptr Network::get_input_tensor(size_t index) { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(m_loaded, + "get_input_tensor should be used after model loaded."); + LITE_CHECK_NON_NULL_POINTER(m_impl); + return m_impl->get_input_tensor(index); + LITE_ERROR_HANDLER_END +} + +std::shared_ptr Network::get_output_tensor(size_t index) { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(m_loaded, + "get_output_tensor should be used after model loaded."); + LITE_CHECK_NON_NULL_POINTER(m_impl); + return m_impl->get_output_tensor(index); + LITE_ERROR_HANDLER_END +} + +Network& Network::set_async_callback(const AsyncCallback& callback) { + LITE_ERROR_HANDLER_BEGIN + LITE_CHECK_NON_NULL_POINTER(m_impl); + m_impl->set_async_callback(std::move(callback)); + return *this; + LITE_ERROR_HANDLER_END +} + +Network& Network::set_start_callback(const StartCallback& callback) { + LITE_ERROR_HANDLER_BEGIN + LITE_CHECK_NON_NULL_POINTER(m_impl); + m_impl->set_start_callback(std::move(callback)); + return *this; + LITE_ERROR_HANDLER_END +} + +Network& Network::set_finish_callback(const FinishCallback& callback) { + LITE_ERROR_HANDLER_BEGIN + LITE_CHECK_NON_NULL_POINTER(m_impl); + 
m_impl->set_finish_callback(std::move(callback)); + return *this; + LITE_ERROR_HANDLER_END +} + +Network& Network::set_device_id(int device_id) { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(!m_loaded, "set_device_id should be used before model loaded."); + LITE_CHECK_NON_NULL_POINTER(m_impl); + m_impl->set_device_id(device_id); + return *this; + LITE_ERROR_HANDLER_END +} + +Network& Network::set_stream_id(int stream_id) { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(!m_loaded, "set_stream_id should be used before model loaded."); + LITE_CHECK_NON_NULL_POINTER(m_impl); + m_impl->set_stream_id(stream_id); + return *this; + LITE_ERROR_HANDLER_END +} + +void Network::forward() { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(m_loaded, "forward should be used after model loaded."); + LITE_CHECK_NON_NULL_POINTER(m_impl.get()); + m_impl->forward(); + LITE_ERROR_HANDLER_END +} + +void Network::wait() { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(m_loaded, "wait should be used after model loaded."); + LITE_CHECK_NON_NULL_POINTER(m_impl); + m_impl->wait(); + LITE_ERROR_HANDLER_END +} + +std::string Network::get_input_name(size_t index) const { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(m_loaded, "get_input_name should be used after model loaded."); + LITE_CHECK_NON_NULL_POINTER(m_impl); + return m_impl->get_input_name(index); + LITE_ERROR_HANDLER_END +} + +std::string Network::get_output_name(size_t index) const { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(m_loaded, "get_output_name should be used after model loaded."); + LITE_CHECK_NON_NULL_POINTER(m_impl); + return m_impl->get_output_name(index); + LITE_ERROR_HANDLER_END +} + +std::vector Network::get_all_input_name() const { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(m_loaded, + "get_all_input_name should be used after model loaded."); + LITE_CHECK_NON_NULL_POINTER(m_impl); + auto all_input_name = m_impl->get_all_input_name(); + std::vector all_names; + for (auto& name : all_input_name) { + all_names.push_back(name); + } + return all_names; + LITE_ERROR_HANDLER_END +} + +std::vector Network::get_all_output_name() const { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(m_loaded, + "get_all_output_name should be used after model loaded."); + LITE_CHECK_NON_NULL_POINTER(m_impl); + auto all_output_name = m_impl->get_all_output_name(); + std::vector all_names; + for (auto& name : all_output_name) { + all_names.push_back(name); + } + return all_names; + LITE_ERROR_HANDLER_END +} + +int Network::get_device_id() const { + LITE_ERROR_HANDLER_BEGIN + LITE_CHECK_NON_NULL_POINTER(m_impl); + return m_impl->get_device_id(); + LITE_ERROR_HANDLER_END +} + +int Network::get_stream_id() const { + LITE_ERROR_HANDLER_BEGIN + LITE_CHECK_NON_NULL_POINTER(m_impl); + return m_impl->get_stream_id(); + LITE_ERROR_HANDLER_END +} + +void Network::enable_profile_performance(std::string profile_file_path) { + LITE_ERROR_HANDLER_BEGIN + m_impl->enable_profile_performance(profile_file_path); + LITE_ERROR_HANDLER_END +} + +const std::string& Network::get_model_extra_info() { + LITE_ERROR_HANDLER_BEGIN + return m_extra_info; + LITE_ERROR_HANDLER_END +} + +LiteDeviceType Network::get_device_type() const { + LITE_ERROR_HANDLER_BEGIN + return m_impl->get_device_type(); + LITE_ERROR_HANDLER_END +} + +/*********************** MGE special network function ***************/ + +void Runtime::set_cpu_threads_number(std::shared_ptr network, + size_t nr_threads) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl = NetworkHelper::implement(network); + if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { + 
LITE_ASSERT( + !NetworkHelper::loaded(network), + "set_cpu_threads_number should be used before model loaded."); + call_func("set_cpu_threads_number", network_impl, + nr_threads); + return; + } + LITE_THROW("set_cpu_threads_number is not aviliable in the backend."); + LITE_ERROR_HANDLER_END +} + +void Runtime::use_tensorrt(std::shared_ptr network) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl = NetworkHelper::implement(network); + if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { + LITE_ASSERT(!NetworkHelper::loaded(network), + "use_tensorrt should be used before model loaded."); + call_func("use_tensorrt", network_impl); + return; + } + LITE_THROW("use_tensorrt is not aviliable in the backend."); + LITE_ERROR_HANDLER_END +} + +size_t Runtime::get_cpu_threads_number(const std::shared_ptr network) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl = NetworkHelper::implement(network); + if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { + return call_func("get_cpu_threads_number", + network_impl); + } + LITE_THROW("get_cpu_threads_number is not aviliable in the backend."); + LITE_ERROR_HANDLER_END +} + +void Runtime::set_runtime_thread_affinity( + std::shared_ptr network, + const ThreadAffinityCallback& thread_affinity_callback) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl = NetworkHelper::implement(network); + if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { + LITE_ASSERT(NetworkHelper::loaded(network), + "set_runtime_thread_affinity should be used after model " + "loaded."); + call_func("set_runtime_thread_affinity", + network_impl, thread_affinity_callback); + + return; + } + LITE_THROW("set_runtime_thread_affinity is not aviliable in the backend."); + LITE_ERROR_HANDLER_END +} + +void Runtime::set_cpu_inplace_mode(std::shared_ptr network) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl = NetworkHelper::implement(network); + if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { + LITE_ASSERT(!NetworkHelper::loaded(network), + "set_cpu_inplace_mode should be used before model loaded."); + call_func("set_cpu_inplace_mode", network_impl); + return; + } + LITE_THROW("set_cpu_inplace_mode is not aviliable in the backend."); + LITE_ERROR_HANDLER_END +} + +bool Runtime::is_cpu_inplace_mode(const std::shared_ptr network) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl = NetworkHelper::implement(network); + if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { + return call_func("is_cpu_inplace_mode", + network_impl); + } + LITE_THROW("is_cpu_inplace_mode is not aviliable in the backend."); + LITE_ERROR_HANDLER_END +} + +//! set opr algorithm selection strategy in the network +void Runtime::set_network_algo_policy(std::shared_ptr network, + LiteAlgoSelectStrategy strategy, + uint32_t shared_batch_size, + bool binary_equal_between_batch) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl = NetworkHelper::implement(network); + if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { + call_func("set_network_algo_policy", network_impl, + strategy, shared_batch_size, + binary_equal_between_batch); + return; + } + LITE_THROW("set_network_algo_policy is not aviliable in the backend."); + LITE_ERROR_HANDLER_END +} + +//! 
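A short sketch of the algorithm-policy knobs exposed through `Runtime`, with placeholder values. Note the ordering constraints visible in the implementation: the policy is best set before `load_model()` (the implement warns otherwise), while the workspace limit requires a loaded network.

```cpp
// Illustrative sketch of fast-run style algorithm selection.
#include "lite/network.h"

#include <memory>

void configure_fast_run(std::shared_ptr<lite::Network> network) {
    // best called before load_model(); the implement warns when the
    // policy changes after the graph has been compiled
    lite::Runtime::set_network_algo_policy(
            network, LiteAlgoSelectStrategy::LITE_ALGO_PROFILE,
            /*shared_batch_size=*/1,
            /*binary_equal_between_batch=*/false);
}

void cap_workspace(std::shared_ptr<lite::Network> network) {
    // requires a loaded network; the limit is in bytes and trades
    // speed for peak memory (100 MiB here is a placeholder)
    lite::Runtime::set_network_algo_workspace_limit(network,
                                                    100 * 1024 * 1024);
}
```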
set opr algorithm selection strategy in the network +void Runtime::set_network_algo_workspace_limit(std::shared_ptr network, + size_t workspace_limit) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl = NetworkHelper::implement(network); + if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { + LITE_ASSERT(NetworkHelper::loaded(network), + "set_network_algo_policy should be used after model " + "loaded."); + call_func("set_network_algo_workspace_limit", + network_impl, workspace_limit); + return; + } + LITE_THROW( + "set_network_algo_workspace_limit is not aviliable in the " + "backend."); + LITE_ERROR_HANDLER_END +} + +//! set the network memroy allocator, the allocator is defined by user +void Runtime::set_memory_allocator(std::shared_ptr network, + std::shared_ptr user_allocator) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl = NetworkHelper::implement(network); + if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { + LITE_ASSERT(!NetworkHelper::loaded(network), + "set_memory_allocator should be used before model loaded."); + call_func("set_memory_allocator", network_impl, + user_allocator); + return; + } + LITE_THROW("set_memory_allocator is not aviliable in the backend."); + LITE_ERROR_HANDLER_END +} + +void Runtime::share_runtime_memory_with(std::shared_ptr dst_network, + std::shared_ptr src_network) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl_dst = NetworkHelper::implement(dst_network); + if (network_impl_dst->get_backend_type() == LiteBackend::LITE_DEFAULT) { + LITE_ASSERT(!NetworkHelper::loaded(dst_network), + "share_runtime_memory_with should be used before model " + "loaded."); + call_func("share_runtime_memory_with", + network_impl_dst, + NetworkHelper::implement(src_network)); + return; + } + LITE_THROW("share_runtime_memory_with is not aviliable in the backend."); + LITE_ERROR_HANDLER_END +} + +void Runtime::enable_io_txt_dump(std::shared_ptr network, + std::string io_txt_out_file) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl = NetworkHelper::implement(network); + if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { + call_func("enable_io_txt_dump", network_impl, + io_txt_out_file); + return; + } + LITE_THROW("enable_io_txt_dump is not aviliable in the backend."); + LITE_ERROR_HANDLER_END +} + +void Runtime::enable_io_bin_dump(std::shared_ptr network, + std::string io_bin_out_dir) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl = NetworkHelper::implement(network); + if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { + call_func("enable_io_bin_dump", network_impl, + io_bin_out_dir); + return; + } + LITE_THROW("enable_io_bin_dump is not aviliable in the backend."); + LITE_ERROR_HANDLER_END +} + +void Runtime::shared_weight_with_network( + std::shared_ptr dst_network, + const std::shared_ptr src_network) { + LITE_ERROR_HANDLER_BEGIN + auto network_impl_dst = NetworkHelper::implement(dst_network); + if (network_impl_dst->get_backend_type() == LiteBackend::LITE_DEFAULT) { + LITE_ASSERT(NetworkHelper::loaded(src_network), + "shared_weight_with_network should be used after the src " + "network " + "loaded."); + auto src_implment = NetworkHelper::implement(src_network); + call_func("shared_weight_with", network_impl_dst, + src_implment); + NetworkHelper::loaded(dst_network, true); + return; + } + LITE_THROW("shared_weight_with_network is not aviliable in the backend."); + LITE_ERROR_HANDLER_END +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/network_impl_base.h 
b/lite/src/network_impl_base.h new file mode 100644 index 0000000000000000000000000000000000000000..c90af5b5d316482f1704764b6901983ab69945b2 --- /dev/null +++ b/lite/src/network_impl_base.h @@ -0,0 +1,161 @@ +/** + * \file src/network_impl_base.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "lite/network.h" +#include "misc.h" +#include "tensor_impl_base.h" +#include "type_info.h" + +#include + +namespace lite { + +/*! + * \brief the Inner IO data struct, add some inner data from IO + */ +class IOInner : public IO { +public: + //! use to flag the corresponding lite_tensor is filled, when the + //! value of lite_tensor is filled, the have_sync is true, other wise false, + //! this is used in async mode + bool have_sync = false; + //! Real input and output data location + std::shared_ptr lite_tensor = nullptr; + + IOInner() = default; + IOInner(const IO& io) { + name = io.name; + is_host = io.is_host; + io_type = io.io_type; + config_layout = io.config_layout; + } +}; + +/*! + * \brief the realy network IO info when network run + */ +struct NetworkIOInner { + std::vector inputs; + std::vector outputs; +}; + +/*! + * \brief implement the Network, contain the mgb related member + */ +class Network::NetworkImplBase : public DynTypeObj { +public: + virtual ~NetworkImplBase() = default; + + //! set the config of the network, include: + //! the inference device + //! the other inference options, such as record_level, weight_preprocess... + virtual void set_config(const Config& config) = 0; + + //! set the special io infomation, if not set, default io tensor will used, + //! this is special for input/output is not host tensor, default the + //! input/output tensors are host tensor + virtual void set_io(const NetworkIO& network_io) = 0; + + //! only compute the output tensor in user configured + virtual void compute_only_configured_output() = 0; + + //! get the network input and ouput tensor, the layout of which is + //! sync from mge tensor + virtual std::shared_ptr get_io_tensor( + std::string io_name, + LiteTensorPhase phase = LiteTensorPhase::LITE_IO) = 0; + + //! get the input tensor by index in the load_result tensormap + virtual std::shared_ptr get_input_tensor(size_t index) = 0; + + //! get the output tensor by index in the load_result output_var_list + virtual std::shared_ptr get_output_tensor(size_t index) = 0; + + //! get all the input tensor name in the order in load return + virtual std::vector get_all_input_name() const = 0; + + //! get all the output tensor name in the order in load return + virtual std::vector get_all_output_name() const = 0; + + //! get the input tensor name in the order in load return + virtual const char* get_input_name(size_t index) const = 0; + + //! get the output tensor name in the order in load return + virtual const char* get_output_name(size_t index) const = 0; + + //! set the callback in async model + virtual void set_async_callback(const AsyncCallback& callback) = 0; + + //! set the start callback which will execute before network forward + virtual void set_start_callback(const StartCallback& callback) = 0; + + //! set the finish callback which will execute after network forward + virtual void set_finish_callback(const FinishCallback& callback) = 0; + + //! 
load the model and get the m_load_result + virtual void load_model(std::shared_ptr model_mem, size_t size, + std::unordered_map + separate_config_map = {}) = 0; + + //! forward the network with filled input data and fill the output data + //! to the output tensor + virtual void forward() = 0; + + //! in sync model, wait utile the inference finish + virtual void wait() = 0; + + //! set device id, default device id = 0 + virtual void set_device_id(int device_id) = 0; + virtual int get_device_id() const = 0; + virtual LiteBackend get_backend_type() const = 0; + //! set stream id, default stream id = 0 + virtual void set_stream_id(int stream_id) = 0; + virtual int get_stream_id() const = 0; + + virtual LiteDeviceType get_device_type() const = 0; + + //! enable profile the network, a file will be generated + virtual void enable_profile_performance(std::string profile_file_path) = 0; +}; + +/******************************** friend class *****************************/ +/*! + * \brief friend class of Network, for convenient accessing the Network members + */ +class NetworkHelper { +public: + static bool loaded(const std::shared_ptr network) { + LITE_ASSERT(network); + return network->m_loaded; + } + static void loaded(const std::shared_ptr network, bool loaded) { + LITE_ASSERT(network); + network->m_loaded = loaded; + } + static Network::NetworkImplBase* implement(const Network* network) { + LITE_ASSERT(network); + return network->m_impl.get(); + } + static Network::NetworkImplBase* implement( + const std::shared_ptr network) { + LITE_ASSERT(network); + return network->m_impl.get(); + } + static void implement(const std::shared_ptr network, + std::unique_ptr impl) { + LITE_ASSERT(network); + network->m_impl = std::move(impl); + } +}; + +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/parse_info/default_parse.h b/lite/src/parse_info/default_parse.h new file mode 100644 index 0000000000000000000000000000000000000000..921b6354d566e6cdc4813cf63244dc2af7bd5c41 --- /dev/null +++ b/lite/src/parse_info/default_parse.h @@ -0,0 +1,246 @@ +/** + * \file src/parse_info/default_parse.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "../misc.h" + +#include "lite/global.h" +#include "lite/network.h" +#include "nlohmann/json.hpp" + +namespace lite { +//! The LITE_default parse info function +bool default_parse_info( + const void* info_ptr, size_t length, const std::string& model_name, + Config& config, NetworkIO& network_io, + std::unordered_map& separate_config_map, + std::string& extra_info) { + using json = nlohmann::json; + std::string json_string(static_cast(info_ptr), length); + auto info = json::parse(json_string); + + if (!info["valid"]) { + return false; + } + auto info_model_name = info["name"]; + if (info_model_name != model_name) { + LITE_THROW( + ssprintf("infomation of model name is not match, packed model " + "is %s, but json info get %s.", + model_name.c_str(), + static_cast(info_model_name).c_str())); + } + //! 
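// The NetworkHelper above is a "friend accessor": Network keeps its members
// private and befriends exactly one helper class that internal code uses to
// reach them, so the public API stays small.  A self-contained sketch of the
// same idiom (illustrative only, unrelated to the lite types):
#include <cassert>

class Widget {
public:
    int value() const { return m_value; }

private:
    friend class WidgetHelper;  // the only external write path to m_value
    int m_value = 0;
};

class WidgetHelper {
public:
    static void set_value(Widget& w, int v) { w.m_value = v; }
};

inline int widget_helper_demo() {
    Widget w;
    WidgetHelper::set_value(w, 42);  // what NetworkHelper::loaded() does for Network
    assert(w.value() == 42);
    return w.value();
}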
check version + std::string model_version = info["version"]; + int major = std::stoi(model_version.substr(0, model_version.find("."))); + int start = model_version.find(".") + 1; + int minor = std::stoi( + model_version.substr(start, model_version.find(".", start))); + start = model_version.find(".", start) + 1; + int patch = std::stoi(model_version.substr(start)); + int lite_major, lite_minor, lite_patch; + lite::get_version(lite_major, lite_minor, lite_patch); + size_t model_version_sum = (major * 10000 + minor) * 100 + patch; + size_t lite_version_sum = + (lite_major * 10000 + lite_minor) * 100 + lite_patch; + if (model_version_sum > lite_version_sum) { + LITE_WARN("Lite load the future version model !!!!!!!!!!!!!"); + } + + if (info.contains("has_compression")) { + config.has_compression = info["has_compression"]; + } + if (info.contains("backend")) { + if (info["backend"] == "MGE") { + config.backend = LiteBackend::LITE_DEFAULT; + } + if (info["backend"] == "RK") { + config.backend = LiteBackend::LITE_RK_NPU; + } + } + + auto get_device_type = [](std::string type) -> LiteDeviceType { + if (type == "CPU") + return LiteDeviceType::LITE_CPU; + if (type == "CUDA") + return LiteDeviceType::LITE_CUDA; + if (type == "OPENCL") + return LiteDeviceType::LITE_OPENCL; + if (type == "ATLAS") + return LiteDeviceType::LITE_ATLAS; + if (type == "NPU") + return LiteDeviceType::LITE_NPU; + else { + LITE_THROW(ssprintf("LITE not support device type of %s.", + type.c_str())); + } + }; + if (info.contains("device")) { + auto device_json = info["device"]; + config.device_type = get_device_type(device_json["type"]); + if (device_json.contains("device_id")) { + separate_config_map["device_id"] = + static_cast(device_json["device_id"]); + } + if (device_json.contains("number_threads")) { + separate_config_map["number_threads"] = + static_cast(device_json["number_threads"]); + } + if (device_json.contains("enable_inplace_model")) { + separate_config_map["enable_inplace_model"] = + static_cast(device_json["enable_inplace_model"]); + } + if (device_json.contains("use_tensorrt")) { + separate_config_map["use_tensorrt"] = + static_cast(device_json["use_tensorrt"]); + } + } + //! 
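// The version check above collapses "major.minor.patch" into one ordinal via
// (major * 10000 + minor) * 100 + patch.  A small illustrative sketch of that
// encoding (assumes minor < 10000 and patch < 100 so the ordering stays
// monotonic):
#include <cstddef>

inline size_t encode_version(int major, int minor, int patch) {
    return (static_cast<size_t>(major) * 10000 + minor) * 100 + patch;
}
// A model written by 8.10.1 loaded with lite 8.9.2:
//   encode_version(8, 10, 1) == 8001001 > encode_version(8, 9, 2) == 8000902,
// so the "future version model" warning above would be emitted.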
options + if (info.contains("options")) { + auto options = info["options"]; + if (options.contains("weight_preprocess")) + config.options.weight_preprocess = options["weight_preprocess"]; + if (options.contains("fuse_preprocess")) + config.options.fuse_preprocess = options["fuse_preprocess"]; + if (options.contains("fake_next_exec")) + config.options.fake_next_exec = options["fake_next_exec"]; + if (options.contains("var_sanity_check_first_run")) + config.options.var_sanity_check_first_run = + options["var_sanity_check_first_run"]; + if (options.contains("const_shape")) + config.options.const_shape = options["const_shape"]; + if (options.contains("force_dynamic_alloc")) + config.options.force_dynamic_alloc = options["force_dynamic_alloc"]; + if (options.contains("force_output_dynamic_alloc")) + config.options.force_output_dynamic_alloc = + options["force_output_dynamic_alloc"]; + if (options.contains("no_profiling_on_shape_change")) + config.options.no_profiling_on_shape_change = + options["no_profiling_on_shape_change"]; + if (options.contains("jit_level")) + config.options.jit_level = options["jit_level"]; + if (options.contains("comp_node_seq_record_level")) + config.options.comp_node_seq_record_level = + options["comp_node_seq_record_level"]; + if (options.contains("graph_opt_level")) + config.options.graph_opt_level = options["graph_opt_level"]; + if (options.contains("async_exec_level")) + config.options.async_exec_level = options["async_exec_level"]; + } + //! IO + auto get_io_type = [](std::string type) -> LiteIOType { + if (type == "value") + return LiteIOType::LITE_IO_VALUE; + if (type == "shape") + return LiteIOType::LITE_IO_SHAPE; + else { + LITE_THROW( + ssprintf("LITE not support IO type of %s.", type.c_str())); + } + }; + auto get_data_type = [](std::string type) -> LiteDataType { + if (type == "float32") + return LiteDataType::LITE_FLOAT; + if (type == "float16") + return LiteDataType::LITE_HALF; + if (type == "int32") + return LiteDataType::LITE_INT; + if (type == "int16") + return LiteDataType::LITE_INT16; + if (type == "int8") + return LiteDataType::LITE_INT8; + if (type == "uint8") + return LiteDataType::LITE_UINT8; + else { + LITE_THROW(ssprintf("LITE not support data type of %s.", + type.c_str())); + } + }; +#define SET_SHAPE(shape_json_, config_) \ + do { \ + int ndim = 0; \ + for (int i = 0; i < 4; i++) { \ + if (shape_json_.contains(shape_name[i])) { \ + ndim++; \ + config_.config_layout.shapes[i] = shape_json_[shape_name[i]]; \ + } else { \ + break; \ + } \ + } \ + config_.config_layout.ndim = ndim; \ + } while (0) + +#define Config_IO(io_json_, io_config_) \ + if (io_json_.contains("is_host")) \ + io_config_.is_host = io_json_["is_host"]; \ + if (io_json_.contains("io_type")) \ + io_config_.io_type = get_io_type(io_json_["io_type"]); \ + if (io_json_.contains("dtype")) \ + io_config_.config_layout.data_type = get_data_type(io_json_["dtype"]); \ + if (io_json_.contains("shape")) { \ + auto shape_json = io_json_["shape"]; \ + SET_SHAPE(shape_json, io_config_); \ + } + + const std::string shape_name[] = {"dim0", "dim1", "dim2", "dim3"}; + if(info.contains("IO")){ + auto IOs = info["IO"]; + if(IOs.contains("inputs")){ + auto inputs = IOs["inputs"]; + for (size_t i = 0; i < inputs.size(); i++) { + auto input_json = inputs[i]; + bool found = false; + for (auto&& io_config : network_io.inputs) { + if (io_config.name == input_json["name"]) { + found = true; + Config_IO(input_json, io_config); + } + } + if (!found) { + IO input; + input.name = input_json["name"]; + 
Config_IO(input_json, input); + network_io.inputs.push_back(input); + } + } + } + if (IOs.contains("outputs")) { + auto outputs = IOs["outputs"]; + for (size_t i = 0; i < outputs.size(); i++) { + auto output_json = outputs[i]; + bool found = false; + for (auto&& io_config : network_io.outputs) { + if (io_config.name == output_json["name"]) { + found = true; + Config_IO(output_json, io_config); + } + } + if (!found) { + IO output; + output.name = output_json["name"]; + Config_IO(output_json, output); + network_io.outputs.push_back(output); + } + } + } + } + //! extra_info + if (info.contains("extra_info")) { + extra_info = info["extra_info"].dump(); + } + return true; +#undef GET_BOOL +#undef Config_IO +} + +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/parse_info/parse_info_base.h b/lite/src/parse_info/parse_info_base.h new file mode 100644 index 0000000000000000000000000000000000000000..d54ed05410fc06d605e43ec1e049ec7fcbfa02f0 --- /dev/null +++ b/lite/src/parse_info/parse_info_base.h @@ -0,0 +1,40 @@ +/** + * \file src/parse_info/parse_info_base.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once +#include "lite/global.h" +#include "mutex" + +namespace lite { + +struct ParseInfoStaticData { + std::unordered_map parse_info_methods; + LITE_MUTEX map_mutex; +}; + +ParseInfoStaticData& parse_info_static_data(); + +template +struct ParseInfoRegister; +} // namespace lite + +#define REGIST_PARSE_INFO_FUNCTION(name_, func_) \ + REGIST_PARSE_INFO_FUNCTION_WITH_NUM(__COUNTER__, name_, func_) + +#define REGIST_PARSE_INFO_FUNCTION_WITH_NUM(number_, name_, func_) \ + template <> \ + struct ParseInfoRegister { \ + ParseInfoRegister() { register_parse_info_func(name_, func_); } \ + }; \ + namespace { \ + ParseInfoRegister parse_info_##number_; \ + } + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/parse_model/model_parser.cpp b/lite/src/parse_model/model_parser.cpp new file mode 100644 index 0000000000000000000000000000000000000000..de45f48409f162e5920ada98f8077c804e9e6141 --- /dev/null +++ b/lite/src/parse_model/model_parser.cpp @@ -0,0 +1,134 @@ +/** + * \file src/model_parser.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "model_parser.h" +#include "decryption/decrypt_base.h" +#include "parse_info/parse_info_base.h" + +using namespace lite; +using namespace model_parse; + +std::string ModelParser::sm_model_tag = "packed_model"; + +void ModelParser::parse_header() { + size_t tag_length = sm_model_tag.size(); + + //! parse model tag + const char* ptr = static_cast(m_model.get()); + std::string tag(static_cast(ptr), tag_length); + if (sm_model_tag == tag) { + m_is_bare_model = false; + } else { + //! 
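// An example of the JSON info document that default_parse_info() above
// understands -- illustrative values only; every section except "valid",
// "name" and "version" is optional, and "extra_info" may carry arbitrary JSON
// that is passed through to the caller as a string.
static const char* const example_model_info = R"json({
    "valid": true,
    "name": "shufflenet",
    "version": "8.9.1",
    "has_compression": false,
    "backend": "MGE",
    "device": {"type": "CPU", "device_id": 0, "number_threads": 2},
    "options": {"weight_preprocess": true, "var_sanity_check_first_run": false},
    "IO": {
        "inputs":  [{"name": "data", "is_host": true, "dtype": "float32",
                     "shape": {"dim0": 1, "dim1": 3, "dim2": 224, "dim3": 224}}],
        "outputs": [{"name": "prob", "is_host": true}]
    }
})json";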
if no tag, the model is bare model, return + m_is_bare_model = true; + return; + } + + uint8_t* buffer = static_cast(m_model.get()) + tag_length; + auto packed_model = GetPackModel(buffer); + auto models = packed_model->models(); + LITE_ASSERT(models->size() == 1, "Now only support one model"); + auto model = models->Get(0); + m_model_name = model->header()->name()->c_str(); + m_model_decryption_name = + model->header()->model_decryption_method()->c_str(); + m_info_decryption_name = model->header()->info_decryption_method()->c_str(); + m_info_parse_func_name = model->header()->info_parse_method()->c_str(); + + m_info = model->info(); + m_model_data = model->data(); +} + +bool ModelParser::parse_model_info( + Config& network_config, NetworkIO& network_io, + std::unordered_map& isolated_config_map, + std::string& extra_info) const { + //! no model info, no parse, direct return + if (m_is_bare_model || !m_info) { + return false; + } + size_t info_length = m_info->data()->size(); + const uint8_t* info_data = m_info->data()->Data(); + //! decryption the info + auto info_ptr = decrypt_memory(info_data, info_length, + m_info_decryption_name, info_length); + //! parse the info + LITE_LOCK_GUARD(parse_info_static_data().map_mutex); + auto it_parse = parse_info_static_data().parse_info_methods.find( + m_info_parse_func_name); + if (it_parse == parse_info_static_data().parse_info_methods.end()) { + LITE_THROW(ssprintf("can't find model info parse function %s.", + m_info_parse_func_name.c_str())); + } + auto model_info_parse_func = + parse_info_static_data().parse_info_methods[m_info_parse_func_name]; + //! convert for NetworkIOInner to NetworkIO + if (model_info_parse_func) { + model_info_parse_func(info_ptr.get(), info_length, m_model_name, + network_config, network_io, isolated_config_map, + extra_info); + } else { + LITE_THROW(ssprintf("model info parse function of %s is empty", + m_info_parse_func_name.c_str())); + } + return true; +} + +std::shared_ptr ModelParser::parse_model(size_t& model_length, + const Config& config) const { + if (m_is_bare_model) { + if (config.bare_model_cryption_name.size() == 0) { + model_length = m_total_length; + return m_model; + } else { + return decrypt_memory( + static_cast(m_model.get()), m_total_length, + config.bare_model_cryption_name, model_length); + } + } + LITE_ASSERT(m_model_data, "packed model parse error!"); + model_length = m_model_data->data()->size(); + const uint8_t* model_data = m_model_data->data()->Data(); + LITE_ASSERT(model_length > 0, "The loaded model is of zero length."); + return decrypt_memory(model_data, model_length, m_model_decryption_name, + model_length); +} + +std::shared_ptr ModelParser::decrypt_memory( + const uint8_t* data, size_t length, const std::string decryption_name, + size_t& result_length) const { + const uint8_t* memory_ptr = data; + if (decryption_name == "NONE") { + result_length = length; + return std::shared_ptr(const_cast(memory_ptr), + [](void*) {}); + } + LITE_LOCK_GUARD(decryption_static_data().map_mutex); + auto it = decryption_static_data().decryption_methods.find(decryption_name); + if (it == decryption_static_data().decryption_methods.end()) { + LITE_THROW(ssprintf("The decryption method %s is not registed yet.", + decryption_name.c_str())); + } + auto&& func = it->second.first; + auto&& key = it->second.second; + if (func) { + auto model_vector = func(memory_ptr, length, *key); + result_length = model_vector.size(); + auto tmp_model_vector = + new std::vector(std::move(model_vector)); + return 
std::shared_ptr( + tmp_model_vector->data(), + [tmp_model_vector](void*) { delete tmp_model_vector; }); + } else { + LITE_THROW(ssprintf("No decryption function in %s method.", + decryption_name.c_str())); + } +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/parse_model/model_parser.h b/lite/src/parse_model/model_parser.h new file mode 100644 index 0000000000000000000000000000000000000000..d6edb8f0eefa12a7b32d0687f601d04483e46647 --- /dev/null +++ b/lite/src/parse_model/model_parser.h @@ -0,0 +1,75 @@ +/** + * \file src/model_parser.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once +#include "lite/global.h" +#include "../network_impl_base.h" + +#include "pack_model_generated.h" +#include + +#include + +namespace lite { + +/*! + * \brief parse the model and decyt + */ +class ModelParser { +public: + ModelParser(std::shared_ptr model_ptr, size_t model_length) + : m_model(model_ptr), m_total_length(model_length) { + //! parse the header + parse_header(); + } + + //! parse the Info part of the model, update the network_config and + //! network_io + bool parse_model_info( + Config& network_config, NetworkIO& network_io, + std::unordered_map& isolated_config_map, + std::string& extra_info) const; + + //! parse the model and decrypt the model + std::shared_ptr parse_model(size_t& model_length, + const Config& config) const; + +private: + //! parse the header of the model and store the model related information + //! to the menber data + void parse_header(); + + //! decrypt a memory with length of length and decryption method name + //! decrypt_name + std::shared_ptr decrypt_memory(const uint8_t* data, size_t length, + const std::string decryption_name, + size_t& result_length) const; + +private: + std::string m_model_name; + //! the info and model decryption method name, the + //! decryption func can be found through this name + std::string m_info_decryption_name; + std::string m_model_decryption_name; + //! the function name to parse the model info + std::string m_info_parse_func_name; + //! if a model is not added json info to the model is not crypted, the + //! 
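// Typical call sequence for ModelParser -- a hedged sketch.  Template
// arguments are stripped in this diff, so std::shared_ptr<void> for the model
// memory and LiteAny as the map's value type are assumptions inferred from the
// surrounding code.
#include <memory>
#include <string>
#include <unordered_map>
#include "model_parser.h"

inline void parse_packed_model_example(std::shared_ptr<void> model_mem,
                                       size_t size) {
    lite::Config config;
    lite::NetworkIO io;
    std::unordered_map<std::string, lite::LiteAny> separate_config;
    std::string extra_info;

    lite::ModelParser parser(model_mem, size);  // reads the "packed_model" tag
    parser.parse_model_info(config, io, separate_config, extra_info);

    size_t model_length = 0;
    auto raw_model = parser.parse_model(model_length, config);
    // raw_model / model_length now hold the (decrypted) bare model bytes.
    (void)raw_model;
}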
model is a bare model + bool m_is_bare_model = true; + + const model_parse::ModelInfo* m_info = nullptr; + const model_parse::ModelData* m_model_data = nullptr; + + std::shared_ptr m_model; + size_t m_total_length; + + static std::string sm_model_tag; +}; +} // namespace lite + // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/parse_model/pack_model.fbs b/lite/src/parse_model/pack_model.fbs new file mode 100644 index 0000000000000000000000000000000000000000..d0bc442eadafcaaf9db2bc993bc803126a9400f1 --- /dev/null +++ b/lite/src/parse_model/pack_model.fbs @@ -0,0 +1,28 @@ +namespace model_parse; + +table ModelHeader { + name:string; + info_decryption_method:string; + info_parse_method:string; + model_decryption_method:string; +} + +table ModelInfo { + data:[ubyte]; +} + +table ModelData { + data:[ubyte]; +} + +table Model { + header:ModelHeader; + info:ModelInfo; + data:ModelData; +} + +table PackModel { + models:[Model]; +} + +root_type PackModel; diff --git a/lite/src/tensor.cpp b/lite/src/tensor.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6bda653f646ec5b95e5b5b80cc94a439248fa2cf --- /dev/null +++ b/lite/src/tensor.cpp @@ -0,0 +1,339 @@ +/** + * \file src/tensor.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "lite/tensor.h" +#include "function_base.h" +#include "tensor_impl_base.h" +#if LITE_BUILD_WITH_MGE +#include "megbrain/comp_node.h" +#include "megbrain/tensor.h" +#include "mge/function_dft.h" +#include "mge/tensor_impl.h" +#endif + +#include + +using namespace lite; + +size_t Layout::get_elem_size() const { + size_t elesize = 1; + switch (data_type) { + case LiteDataType::LITE_INT64: + elesize = 8; + break; + case LiteDataType::LITE_FLOAT: + case LiteDataType::LITE_INT: + case LiteDataType::LITE_UINT: + elesize = 4; + break; + case LiteDataType::LITE_HALF: + case LiteDataType::LITE_INT16: + case LiteDataType::LITE_UINT16: + elesize = 2; + break; + case LiteDataType::LITE_INT8: + case LiteDataType::LITE_UINT8: + elesize = 1; + break; + default: + LITE_THROW("not support data type."); + } + return elesize; +} + +bool Layout::operator==(const Layout& other) const { + bool equal = true; + equal &= (ndim == other.ndim); + equal &= (data_type == other.data_type); + for (size_t i = 0; i < ndim; i++) { + equal &= (shapes[i] == other.shapes[i]); + } + return equal; +} + +Tensor::~Tensor() = default; + +Tensor::Tensor() { + LITE_ERROR_HANDLER_BEGIN + m_tensor_impl = call_func>( + "create_tensor"); + LITE_ERROR_HANDLER_END +} + +Tensor::Tensor(LiteDeviceType device_type, bool is_pinned_host) + : m_is_pinned_host(is_pinned_host), m_device_type(device_type) { + LITE_ERROR_HANDLER_BEGIN + m_tensor_impl = call_func>( + "create_tensor", device_type, is_pinned_host); + LITE_ERROR_HANDLER_END +} + +Tensor::Tensor(LiteDeviceType device_type, const Layout& layout, + bool is_pinned_host) + : m_is_pinned_host(is_pinned_host), + m_layout(layout), + m_device_type(device_type) { + LITE_ERROR_HANDLER_BEGIN + m_tensor_impl = call_func>( + "create_tensor", device_type, layout, is_pinned_host); + LITE_ERROR_HANDLER_END +} + +Tensor::Tensor(int device_id, LiteDeviceType device_type, const Layout& layout, + bool is_pinned_host) + : m_is_pinned_host(is_pinned_host), + m_device_id(device_id), + m_layout(layout), + m_device_type(device_type) { + LITE_ERROR_HANDLER_BEGIN + m_tensor_impl = call_func>( + 
"create_tensor", device_id, device_type, layout, is_pinned_host); + LITE_ERROR_HANDLER_END +} + +Tensor::Tensor(int device_id, int stream_id, LiteDeviceType device_type, + bool is_pinned_host) + : m_is_pinned_host(is_pinned_host), + m_device_id(device_id), + m_device_type(device_type) { + LITE_ERROR_HANDLER_BEGIN + m_tensor_impl = call_func>( + "create_tensor", device_id, stream_id, device_type, is_pinned_host); + LITE_ERROR_HANDLER_END +} + +Tensor::Tensor(LiteBackend backend, LiteDeviceType device_type, int device_id, + const Layout& layout, bool is_pinned_host) { + if (backend == LiteBackend::LITE_DEFAULT) { + m_tensor_impl = + call_func>( + "create_tensor", device_id, device_type, layout, + is_pinned_host); + } else { + LITE_MARK_USED_VAR(device_type); + LITE_MARK_USED_VAR(is_pinned_host); + LITE_MARK_USED_VAR(layout); + LITE_MARK_USED_VAR(device_id); + LITE_THROW("unknow backend, enum id is : %d."); + } +} + +void Tensor::reshape(const std::vector& shape) { + LITE_ASSERT(m_layout.ndim > 0, "The tensor to be reshape is empty."); + uint32_t length = shape.size(); + LITE_ASSERT(length < Layout::MAXDIM, + "The ndim of reshape input is too large."); + Layout new_layout = m_layout; + new_layout.ndim = length; + size_t total_length = + get_tensor_total_size_in_byte() / m_layout.get_elem_size(); + uint32_t unfixed_number = 0; + uint32_t unfixed_index = 0; + for (uint32_t i = 0; i < length; i++) { + if (shape[i] == -1) { + unfixed_number += 1; + unfixed_index = i; + } else { + LITE_ASSERT(shape[i] > 0, "The reshape inputs invalid."); + new_layout.shapes[i] = shape[i]; + } + } + LITE_ASSERT(unfixed_number <= 1, "The reshape inputs invalid."); + if (unfixed_number) { + size_t left = total_length; + for (uint32_t i = 0; i < length; i++) { + if (i == unfixed_index) { + continue; + } else { + LITE_ASSERT(left > 0 && (left % new_layout.shapes[i] == 0), + "The reshape inputs invalid."); + left = left / new_layout.shapes[i]; + } + } + LITE_ASSERT(left > 0, "The reshape inputs invalid."); + new_layout.shapes[unfixed_index] = left; + } + size_t new_total = 1; + for (uint32_t i = 0; i < length; i++) { + new_total *= new_layout.shapes[i]; + } + LITE_ASSERT(new_total == total_length, "The reshape inputs invalid."); + m_layout = new_layout; + m_tensor_impl->reshape(m_layout); +} + +size_t Tensor::get_tensor_total_size_in_byte() const { + LITE_ERROR_HANDLER_BEGIN + size_t elemsize = m_layout.get_elem_size(); + size_t total = m_layout.ndim == 0 ? 
0 : 1; + for (size_t i = 0; i < m_layout.ndim; i++) { + total *= m_layout.shapes[i]; + } + return total * elemsize; + LITE_ERROR_HANDLER_END +} + +void* Tensor::get_memory_ptr() const { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(m_layout.ndim != 0, + "Tensor layout is not valid when get memory ptr."); + return m_tensor_impl->get_memory_ptr(); + LITE_ERROR_HANDLER_END +} + +void* Tensor::get_memory_ptr(const std::vector& idx) const { + LITE_ERROR_HANDLER_BEGIN + return m_tensor_impl->get_memory_ptr(idx); + LITE_ERROR_HANDLER_END +} + +std::shared_ptr Tensor::slice(const std::vector& start, + const std::vector& end, + const std::vector& step) { + LITE_ERROR_HANDLER_BEGIN + auto ret = m_tensor_impl->slice(start, end, step); + ret->update_from_implement(); + return ret; + LITE_ERROR_HANDLER_END +} + +void Tensor::fill_zero() { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(m_layout.ndim > 0, + "fill_zero can't apply on a tensor with empty layout."); + m_tensor_impl->fill_zero(); + LITE_ERROR_HANDLER_END +} + +void Tensor::share_memory_with(const Tensor& src_tensor) { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(src_tensor.m_layout.ndim > 0, + "To be shared tensor with empty layout."); + m_tensor_impl->share_memory_with(src_tensor.m_tensor_impl.get()); + update_from_implement(); + LITE_ERROR_HANDLER_END +} + +void Tensor::set_layout(const Layout& layout) { + LITE_ERROR_HANDLER_BEGIN + m_layout = layout; + m_tensor_impl->set_layout(layout); + LITE_ERROR_HANDLER_END +} + +void Tensor::reset(void* prepared_data, size_t data_length_in_byte) { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(m_layout.ndim, + "Tensor layout is empty, please reset with layout"); + LITE_ASSERT(data_length_in_byte >= get_tensor_total_size_in_byte(), + "the memory reset to the tensor is too small."); + m_tensor_impl->reset(prepared_data); + LITE_ERROR_HANDLER_END +} + +void Tensor::reset(void* prepared_data, const Layout& layout) { + LITE_ERROR_HANDLER_BEGIN + m_layout = layout; + m_tensor_impl->reset(prepared_data, layout); + LITE_ERROR_HANDLER_END +} + +bool Tensor::is_continue_memory() const { + LITE_ERROR_HANDLER_BEGIN + return m_tensor_impl->is_continue_memory(); + LITE_ERROR_HANDLER_END +} + +void Tensor::copy_from(const Tensor& src) { + LITE_ERROR_HANDLER_BEGIN + LITE_ASSERT(src.get_layout().ndim != 0, + "when tensor copy, the src tensor layout is empty."); + m_tensor_impl->copy_from(src.m_tensor_impl.get()); + update_from_implement(); + LITE_ERROR_HANDLER_END +} + +void Tensor::update_from_implement() { + LITE_ERROR_HANDLER_BEGIN + m_layout = m_tensor_impl->get_layout(); + m_device_type = m_tensor_impl->get_device_type(); + m_device_id = m_tensor_impl->get_device_id(); + m_is_pinned_host = m_tensor_impl->is_pinned_host(); + LITE_ERROR_HANDLER_END +} + +void LiteAny::type_missmatch(size_t expect, size_t get) const { + LITE_THROW(ssprintf( + "The type store in LiteAny is not match the visit type, type of " + "storage length is %zu, type of visit length is %zu.", + expect, get)); +} + +std::shared_ptr TensorUtils::concat(const std::vector& tensors, + int dim, LiteDeviceType dst_device, + int dst_device_id) { + if (tensors.size() <= 0) { + return std::make_shared(); + } + if (dst_device == LiteDeviceType::LITE_DEVICE_DEFAULT) { + dst_device = tensors.front().get_device_type(); + } + if (dst_device_id == -1) { + dst_device_id = tensors.front().get_device_id(); + } + bool is_pinned_host = tensors.front().is_pinned_host(); + auto layout = tensors.front().get_layout(); + LITE_ASSERT(static_cast(layout.ndim) > dim, + "the dim in concat is 
error."); + size_t sum_in_dim = layout.shapes[dim]; + for (size_t i = 1; i < tensors.size(); ++i) { + auto other_layout = tensors[i].get_layout(); + LITE_ASSERT(other_layout.ndim == layout.ndim, + "the dim size of tensors is not same!"); + LITE_ASSERT(other_layout.data_type == layout.data_type, + "the dtype of tensors is not same!"); + for (size_t j = 0; j < other_layout.ndim; ++j) { + if (dim == static_cast(j)) { + sum_in_dim += other_layout.shapes[j]; + continue; + } + LITE_ASSERT(other_layout.shapes[j] == layout.shapes[j], + "the shape of tensors is not same!"); + } + } + layout.shapes[dim] = sum_in_dim; + auto result = std::make_shared(dst_device_id, dst_device, layout, + is_pinned_host); + size_t index = 0; + std::vector start(dim + 1, 0); + std::vector end(dim + 1, 0); + for (int i = 0; i < dim; i++) { + end[i] = layout.shapes[i]; + } + for (size_t i = 0; i < tensors.size(); ++i) { + auto&& tensor = tensors[i]; + auto layout = tensor.get_layout(); + if (layout.shapes[dim] == 0) + continue; + start[dim] = index; + end[dim] = index + layout.shapes[dim]; + auto&& sub_dst = result->slice(start, end); + sub_dst->copy_from(tensor); + index += layout.shapes[dim]; + } + return result; +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/tensor_impl_base.h b/lite/src/tensor_impl_base.h new file mode 100644 index 0000000000000000000000000000000000000000..a7f1bf0dbca937b248a03859bde64061826ee3d0 --- /dev/null +++ b/lite/src/tensor_impl_base.h @@ -0,0 +1,101 @@ +/** + * \file src/tensor_impl_base.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "lite/tensor.h" +#include "misc.h" +#include "type_info.h" + +#include + +namespace lite { + +/*! + * \brief implement the Tensor + */ +class Tensor::TensorImplBase : public DynTypeObj { +public: + virtual ~TensorImplBase() = default; + + virtual LiteDeviceType get_device_type() const = 0; + + virtual int get_device_id() const = 0; + + virtual LiteBackend get_backend_type() const = 0; + + virtual Layout get_layout() const = 0; + + virtual bool is_pinned_host() const = 0; + + virtual void* get_memory_ptr() const = 0; + + virtual void* get_memory_ptr(const std::vector& idx) const = 0; + + virtual void set_layout(const Layout& layout) = 0; + + //! use the user allocated data to reset the memory of the tensor, the + //! memory will not be managed by the lite, later, the user should delete + //! it. + virtual void reset(void* prepared_data) = 0; + + //! use the user allocated data and corresponding layout to reset the data + //! and layout of the tensor, the memory will not be managed by lite, later, + //! the user should delete it. + virtual void reset(void* prepared_data, const Layout& layout) = 0; + + //! reshape the tensor with new shape, keep the data_type the same + virtual void reshape(const Layout& layout) = 0; + + //! get a new tensor slice from the origin tensor + virtual std::shared_ptr slice( + const std::vector& start, const std::vector& end, + const std::vector& step = {}) = 0; + + //! set the tensor memory with zero + virtual void fill_zero() = 0; + + //! copy tensor form other tensor + //! Note: the best way for tensor copy is just set the dst device, left + //! layout empty, when copying the dst layout will be set the same with + //! src + virtual void copy_from(const TensorImplBase* src_impl) = 0; + + //! 
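// Usage sketch for TensorUtils::concat above (assumes "lite/tensor.h" and that
// Tensor is copyable): two {1, 3, 224, 224} float tensors concatenated along
// dim 0 yield {2, 3, 224, 224}; all other dimensions and the dtype must match.
#include <memory>
#include "lite/tensor.h"

inline std::shared_ptr<lite::Tensor> concat_example() {
    lite::Layout layout;
    layout.ndim = 4;
    layout.data_type = lite::LiteDataType::LITE_FLOAT;
    layout.shapes[0] = 1;
    layout.shapes[1] = 3;
    layout.shapes[2] = 224;
    layout.shapes[3] = 224;

    lite::Tensor a(lite::LiteDeviceType::LITE_CPU, layout);
    lite::Tensor b(lite::LiteDeviceType::LITE_CPU, layout);
    a.fill_zero();
    b.fill_zero();
    return lite::TensorUtils::concat({a, b}, /*dim=*/0);  // {2, 3, 224, 224}
}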
share memory with other tensor + virtual void share_memory_with(const TensorImplBase* src_impl) = 0; + + //! whether the memory of tensor is continue + virtual bool is_continue_memory() const = 0; +}; + +/*! + * \brief friend class of Tensor, for convenient accessing the Network members + */ +class TensorHelper { +public: + static inline std::shared_ptr implement( + const std::shared_ptr tensor) { + LITE_ASSERT(tensor); + return tensor->m_tensor_impl; + } + static inline std::shared_ptr implement( + const Tensor* tensor) { + LITE_ASSERT(tensor); + return tensor->m_tensor_impl; + } + static inline void implement(const std::shared_ptr tensor, + std::shared_ptr impl) { + LITE_ASSERT(tensor); + tensor->m_tensor_impl = impl; + } +}; + +} // namespace lite + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/type_info.h b/lite/src/type_info.h new file mode 100644 index 0000000000000000000000000000000000000000..28785beade407b000f9cad298c9d485c536b0cd5 --- /dev/null +++ b/lite/src/type_info.h @@ -0,0 +1,97 @@ +/** + * \file src/type_info.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "misc.h" + +namespace lite { +/*! + * \brief an object to represent a type + * + * LITE has a lightweight RTTI system. Each type is represented by the + * address of a Typeinfo object, which is stored in the .bss segment. + * + * LITE_TYPEINFO_OBJ_DECL should be placed into the definition of classes that + * need compile-time type support. + * + * For classes that need RTTI, they should be derived from DynTypeObj + */ +struct Typeinfo { + //! name of the corresponding type; nullptr if MGB_VERBOSE_TYPEINFO_NAME==0 + const char* const name; + + /*! + * \brief whether this is the type of given object + * \tparam T a class with static typeinfo() method + */ + template + bool is() const { + return T::typeinfo() == this; + } +}; + +/*! + * \brief base class to emulate RTTI without compiler support + */ +class DynTypeObj { +public: + virtual Typeinfo* dyn_typeinfo() const = 0; + + //! cast this to a final object with type check + template + T& cast_final_safe() { + LITE_ASSERT(T::typeinfo() == dyn_typeinfo(), + "can not convert type %s to %s", dyn_typeinfo()->name, + T::typeinfo()->name); + return *static_cast(this); + } + + template + const T& cast_final_safe() const { + return const_cast(this)->cast_final_safe(); + } + + //! check whether this is same to given type + template + bool same_type() const { + return dyn_typeinfo() == T::typeinfo(); + } + +protected: + ~DynTypeObj() = default; +}; + +//! put in the declaration of a final class inherited from DynTypeObj +#define LITE_DYN_TYPE_OBJ_FINAL_DECL \ +public: \ + ::lite::Typeinfo* dyn_typeinfo() const override final; \ + static inline ::lite::Typeinfo* typeinfo() { return &sm_typeinfo; } \ + \ +private: \ + static ::lite::Typeinfo sm_typeinfo + +#if LITE_ENABLE_LOGGING +//! get class name from class object +#define _LITE_TYPEINFO_CLASS_NAME(_cls) #_cls +#else +#define _LITE_TYPEINFO_CLASS_NAME(_cls) nullptr +#endif + +//! put in the impl file of a class that needs static typeinfo() +#define LITE_TYPEINFO_OBJ_IMPL(_cls) \ + ::lite::Typeinfo _cls::sm_typeinfo { _LITE_TYPEINFO_CLASS_NAME(_cls) } + +//! 
put in the impl file of a final class inherited from DynTypeObj +#define LITE_DYN_TYPE_OBJ_FINAL_IMPL(_cls) \ + ::lite::Typeinfo* _cls::dyn_typeinfo() const { return &sm_typeinfo; } \ + LITE_TYPEINFO_OBJ_IMPL(_cls) + +} // namespace lite +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/src/version_lite.ld b/lite/src/version_lite.ld new file mode 100644 index 0000000000000000000000000000000000000000..66f30d7694c59a2c2c8cea75b1598a1cba7f532f --- /dev/null +++ b/lite/src/version_lite.ld @@ -0,0 +1,10 @@ +{ +global: + extern "C++" {lite::*;}; + Lite*; + LITE*; + default_config; + default_network_io; + +local: *; +}; diff --git a/lite/test/CMakeLists.txt b/lite/test/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c5e9d13343ef6a097f814b86ee9dd6c4ad96714 --- /dev/null +++ b/lite/test/CMakeLists.txt @@ -0,0 +1,23 @@ +if (MGE_WITH_TEST) + file (GLOB_RECURSE SOURCES ./*.cpp main.cpp) + add_executable (lite_test ${SOURCES}) + + target_link_libraries(lite_test gtest) + target_link_libraries(lite_test lite_static) + if(LITE_BUILD_WITH_MGE) + # lite_test will depends megbrain interface + target_link_libraries(lite_test megbrain) + endif() + + if(UNIX) + if(APPLE OR ANDROID) + target_link_libraries(lite_test dl) + else() + target_link_libraries(lite_test dl rt) + endif() + endif() + + install (TARGETS lite_test + EXPORT ${LITE_EXPORT_TARGETS} + RUNTIME DESTINATION lite/bin) +endif() diff --git a/lite/test/main.cpp b/lite/test/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..af75a6c4d1f167df27fed7ace1a658a44414c87a --- /dev/null +++ b/lite/test/main.cpp @@ -0,0 +1,33 @@ +/** + * \file test/main.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include +#include "../src/misc.h" +#include "lite/global.h" + +namespace { + +class ResetSeedListener : public ::testing::EmptyTestEventListener { + void OnTestStart(const ::testing::TestInfo&) override {} +}; + +} // namespace + +int main(int argc, char** argv) { + ResetSeedListener listener; + auto&& listeners = ::testing::UnitTest::GetInstance()->listeners(); + ::testing::InitGoogleTest(&argc, argv); + listeners.Append(&listener); + lite::set_log_level(LiteLogLevel::WARN); + auto ret = RUN_ALL_TESTS(); + listeners.Release(&listener); + return ret; +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/test/npy.h b/lite/test/npy.h new file mode 100644 index 0000000000000000000000000000000000000000..552cda78f7cc203624b58a16ec0213bc65735b60 --- /dev/null +++ b/lite/test/npy.h @@ -0,0 +1,638 @@ +/* + Copyright 2017 Leon Merten Lohse + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
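// Usage sketch for the lightweight RTTI above -- illustrative only; "MyImpl"
// is a made-up final class and the sketch assumes the src/type_info.h header.
// The DECL macro goes in the class body, the IMPL macro in exactly one .cpp,
// after which cast_final_safe<>() performs a checked downcast.
class MyImpl final : public lite::DynTypeObj {
    LITE_DYN_TYPE_OBJ_FINAL_DECL;

public:
    int answer() const { return 42; }
};
LITE_DYN_TYPE_OBJ_FINAL_IMPL(MyImpl);  // defines MyImpl::sm_typeinfo

inline int rtti_demo(lite::DynTypeObj& obj) {
    if (obj.same_type<MyImpl>())
        return obj.cast_final_safe<MyImpl>().answer();
    return -1;
}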
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#ifndef NPY_H +#define NPY_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace npy { + +/* Compile-time test for byte order. + If your compiler does not define these per default, you may want to define + one of these constants manually. + Defaults to little endian order. */ +#if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN || \ + defined(__BIG_ENDIAN__) || defined(__ARMEB__) || \ + defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(_MIBSEB) || \ + defined(__MIBSEB) || defined(__MIBSEB__) +const bool big_endian = true; +#else +const bool big_endian = false; +#endif + +const char magic_string[] = "\x93NUMPY"; +const size_t magic_string_length = 6; + +const char little_endian_char = '<'; +const char big_endian_char = '>'; +const char no_endian_char = '|'; + +constexpr char host_endian_char = + (big_endian ? 
big_endian_char : little_endian_char); + +/* npy array length */ +typedef unsigned long int ndarray_len_t; + +inline void write_magic(std::ostream& ostream, unsigned char v_major = 1, + unsigned char v_minor = 0) { + ostream.write(magic_string, magic_string_length); + ostream.put(v_major); + ostream.put(v_minor); +} + +inline void read_magic(std::istream& istream, unsigned char& v_major, + unsigned char& v_minor) { + char buf[magic_string_length + 2]; + istream.read(buf, magic_string_length + 2); + + if (!istream) { + fprintf(stderr, "io error: failed reading file"); + } + + if (0 != std::memcmp(buf, magic_string, magic_string_length)) { + fprintf(stderr, "this file does not have a valid npy format."); + } + + v_major = buf[magic_string_length]; + v_minor = buf[magic_string_length + 1]; +} + +// typestring magic +struct Typestring { +private: + char c_endian; + char c_type; + int len; + +public: + inline std::string str() { + const size_t max_buflen = 16; + char buf[max_buflen]; + std::sprintf(buf, "%c%c%u", c_endian, c_type, len); + return std::string(buf); + } + + Typestring(const std::vector&) + : c_endian{host_endian_char}, c_type{'f'}, len{sizeof(float)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, c_type{'f'}, len{sizeof(double)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, + c_type{'f'}, + len{sizeof(long double)} {} + + Typestring(const std::vector&) + : c_endian{no_endian_char}, c_type{'i'}, len{sizeof(char)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(short)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(int)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(long)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(long long)} {} + + Typestring(const std::vector&) + : c_endian{no_endian_char}, + c_type{'u'}, + len{sizeof(unsigned char)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, + c_type{'u'}, + len{sizeof(unsigned short)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, + c_type{'u'}, + len{sizeof(unsigned int)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, + c_type{'u'}, + len{sizeof(unsigned long)} {} + Typestring(const std::vector&) + : c_endian{host_endian_char}, + c_type{'u'}, + len{sizeof(unsigned long long)} {} + + Typestring(const std::vector>&) + : c_endian{host_endian_char}, + c_type{'c'}, + len{sizeof(std::complex)} {} + Typestring(const std::vector>&) + : c_endian{host_endian_char}, + c_type{'c'}, + len{sizeof(std::complex)} {} + Typestring(const std::vector>&) + : c_endian{host_endian_char}, + c_type{'c'}, + len{sizeof(std::complex)} {} +}; + +inline void parse_typestring(std::string typestring) { + std::regex re("'([<>|])([ifuc])(\\d+)'"); + std::smatch sm; + + std::regex_match(typestring, sm, re); + + if (sm.size() != 4) { + fprintf(stderr, "invalid typestring"); + } +} + +namespace pyparse { + +/** + Removes leading and trailing whitespaces + */ +inline std::string trim(const std::string& str) { + const std::string whitespace = " \t"; + auto begin = str.find_first_not_of(whitespace); + + if (begin == std::string::npos) + return ""; + + auto end = str.find_last_not_of(whitespace); + + return str.substr(begin, end - begin + 1); +} + +inline std::string get_value_from_map(const std::string& mapstr) { + size_t sep_pos = mapstr.find_first_of(":"); + if (sep_pos == std::string::npos) 
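// The Typestring helper above builds numpy dtype strings of the form
// "<endianness><kind><itemsize>".  A tiny standalone illustration:
#include <string>

inline std::string make_typestring(char endian, char kind, int itemsize) {
    return std::string{endian, kind} + std::to_string(itemsize);
}
// make_typestring('<', 'f', 4) -> "<f4"  (little-endian float32)
// make_typestring('|', 'u', 1) -> "|u1"  (single-byte uint8, endianness irrelevant)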
+ return ""; + + std::string tmp = mapstr.substr(sep_pos + 1); + return trim(tmp); +} + +/** + Parses the string representation of a Python dict + + The keys need to be known and may not appear anywhere else in the data. + */ +inline std::unordered_map parse_dict( + std::string in, std::vector& keys) { + std::unordered_map map; + + if (keys.size() == 0) + return map; + + in = trim(in); + + // unwrap dictionary + if ((in.front() == '{') && (in.back() == '}')) + in = in.substr(1, in.length() - 2); + else { + fprintf(stderr, "Not a Python dictionary."); + } + + std::vector> positions; + + for (auto const& value : keys) { + size_t pos = in.find("'" + value + "'"); + + if (pos == std::string::npos) { + fprintf(stderr, "Missing %s key.", value.c_str()); + } + + std::pair position_pair{pos, value}; + positions.push_back(position_pair); + } + + // sort by position in dict + std::sort(positions.begin(), positions.end()); + + for (size_t i = 0; i < positions.size(); ++i) { + std::string raw_value; + size_t begin{positions[i].first}; + size_t end{std::string::npos}; + + std::string key = positions[i].second; + + if (i + 1 < positions.size()) + end = positions[i + 1].first; + + raw_value = in.substr(begin, end - begin); + + raw_value = trim(raw_value); + + if (raw_value.back() == ',') + raw_value.pop_back(); + + map[key] = get_value_from_map(raw_value); + } + + return map; +} + +/** + Parses the string representation of a Python boolean + */ +inline bool parse_bool(const std::string& in) { + if (in == "True") + return true; + if (in == "False") + return false; + + fprintf(stderr, "Invalid python boolan."); + return false; +} + +/** + Parses the string representation of a Python str + */ +inline std::string parse_str(const std::string& in) { + if ((in.front() == '\'') && (in.back() == '\'')) + return in.substr(1, in.length() - 2); + + fprintf(stderr, "Invalid python string."); + return ""; +} + +/** + Parses the string represenatation of a Python tuple into a vector of its items + */ +inline std::vector parse_tuple(std::string in) { + std::vector v; + const char seperator = ','; + + in = trim(in); + + if ((in.front() == '(') && (in.back() == ')')) + in = in.substr(1, in.length() - 2); + else { + fprintf(stderr, "Invalid Python tuple."); + } + + std::istringstream iss(in); + + for (std::string token; std::getline(iss, token, seperator);) { + v.push_back(token); + } + + return v; +} + +template +inline std::string write_tuple(const std::vector& v) { + if (v.size() == 0) + return ""; + + std::ostringstream ss; + + if (v.size() == 1) { + ss << "(" << v.front() << ",)"; + } else { + const std::string delimiter = ", "; + // v.size() > 1 + ss << "("; + std::copy(v.begin(), v.end() - 1, + std::ostream_iterator(ss, delimiter.c_str())); + ss << v.back(); + ss << ")"; + } + + return ss.str(); +} + +inline std::string write_boolean(bool b) { + if (b) + return "True"; + else + return "False"; +} + +} // namespace pyparse + +inline void parse_header(std::string header, std::string& descr) { + /* + The first 6 bytes are a magic string: exactly "x93NUMPY". + The next 1 byte is an unsigned byte: the major version number of the file + format, e.g. x01. The next 1 byte is an unsigned byte: the minor version + number of the file format, e.g. x00. Note: the version of the file format + is not tied to the version of the numpy package. The next 2 bytes form a + little-endian unsigned short int: the length of the header data + HEADER_LEN. The next HEADER_LEN bytes form the header data describing the + array's format. 
It is an ASCII string which contains a Python literal + expression of a dictionary. It is terminated by a newline ('n') and + padded with spaces + ('x20') to make the total length of the magic string + 4 + HEADER_LEN be + evenly divisible by 16 for alignment purposes. The dictionary contains + three keys: + + "descr" : dtype.descr + An object that can be passed as an argument to the numpy.dtype() + constructor to create the array's dtype. For repeatability and + readability, this dictionary is formatted using pprint.pformat() so the + keys are in alphabetic order. + */ + + // remove trailing newline + if (header.back() != '\n') + fprintf(stderr, "invalid header"); + header.pop_back(); + + // parse the dictionary + std::vector keys{"descr"}; + auto dict_map = npy::pyparse::parse_dict(header, keys); + + if (dict_map.size() == 0) + fprintf(stderr, "invalid dictionary in header"); + + std::string descr_s = dict_map["descr"]; + parse_typestring(descr_s); + // remove + descr = npy::pyparse::parse_str(descr_s); + return; +} + +inline void parse_header(std::string header, std::string& descr, + bool& fortran_order, + std::vector& shape) { + /* + The first 6 bytes are a magic string: exactly "x93NUMPY". + The next 1 byte is an unsigned byte: the major version number of the file + format, e.g. x01. The next 1 byte is an unsigned byte: the minor version + number of the file format, e.g. x00. Note: the version of the file format + is not tied to the version of the numpy package. The next 2 bytes form a + little-endian unsigned short int: the length of the header data + HEADER_LEN. The next HEADER_LEN bytes form the header data describing the + array's format. It is an ASCII string which contains a Python literal + expression of a dictionary. It is terminated by a newline ('n') and + padded with spaces + ('x20') to make the total length of the magic string + 4 + HEADER_LEN be + evenly divisible by 16 for alignment purposes. The dictionary contains + three keys: + + "descr" : dtype.descr + An object that can be passed as an argument to the numpy.dtype() + constructor to create the array's dtype. "fortran_order" : bool Whether + the array data is Fortran-contiguous or not. Since Fortran-contiguous + arrays are a common form of non-C-contiguity, we allow them to be written + directly to disk for efficiency. "shape" : tuple of int The shape of the + array. For repeatability and readability, this dictionary is formatted + using pprint.pformat() so the keys are in alphabetic order. 
+ */ + + // remove trailing newline + if (header.back() != '\n') + fprintf(stderr, "invalid header"); + header.pop_back(); + + // parse the dictionary + std::vector keys{"descr", "fortran_order", "shape"}; + auto dict_map = npy::pyparse::parse_dict(header, keys); + + if (dict_map.size() == 0) + fprintf(stderr, "invalid dictionary in header"); + + std::string descr_s = dict_map["descr"]; + std::string fortran_s = dict_map["fortran_order"]; + std::string shape_s = dict_map["shape"]; + + // TODO: extract info from typestring + parse_typestring(descr_s); + // remove + descr = npy::pyparse::parse_str(descr_s); + + // convert literal Python bool to C++ bool + fortran_order = npy::pyparse::parse_bool(fortran_s); + + // parse the shape tuple + auto shape_v = npy::pyparse::parse_tuple(shape_s); + if (shape_v.size() == 0) + fprintf(stderr, "invalid shape tuple in header"); + + for (auto item : shape_v) { + ndarray_len_t dim = static_cast(std::stoul(item)); + shape.push_back(dim); + } +} + +inline std::string write_header_dict(const std::string& descr, + bool fortran_order, + const std::vector& shape) { + std::string s_fortran_order = npy::pyparse::write_boolean(fortran_order); + std::string shape_s = npy::pyparse::write_tuple(shape); + + return "{'descr': '" + descr + "', 'fortran_order': " + s_fortran_order + + ", 'shape': " + shape_s + ", }"; +} + +inline void write_header(std::ostream& out, const std::string& descr, + bool fortran_order, + const std::vector& shape_v) { + std::string header_dict = write_header_dict(descr, fortran_order, shape_v); + + size_t length = magic_string_length + 2 + 2 + header_dict.length() + 1; + + unsigned char version[2] = {1, 0}; + if (length >= 255 * 255) { + length = magic_string_length + 2 + 4 + header_dict.length() + 1; + version[0] = 2; + version[1] = 0; + } + size_t padding_len = 16 - length % 16; + std::string padding(padding_len, ' '); + + // write magic + write_magic(out, version[0], version[1]); + + // write header length + if (version[0] == 1 && version[1] == 0) { + char header_len_le16[2]; + uint16_t header_len = static_cast(header_dict.length() + + padding.length() + 1); + + header_len_le16[0] = (header_len >> 0) & 0xff; + header_len_le16[1] = (header_len >> 8) & 0xff; + out.write(reinterpret_cast(header_len_le16), 2); + } else { + char header_len_le32[4]; + uint32_t header_len = static_cast(header_dict.length() + + padding.length() + 1); + + header_len_le32[0] = (header_len >> 0) & 0xff; + header_len_le32[1] = (header_len >> 8) & 0xff; + header_len_le32[2] = (header_len >> 16) & 0xff; + header_len_le32[3] = (header_len >> 24) & 0xff; + out.write(reinterpret_cast(header_len_le32), 4); + } + + out << header_dict << padding << '\n'; +} + +inline std::string read_header(std::istream& istream) { + // check magic bytes an version number + unsigned char v_major, v_minor; + read_magic(istream, v_major, v_minor); + + uint32_t header_length = 0; + if (v_major == 1 && v_minor == 0) { + char header_len_le16[2]; + istream.read(header_len_le16, 2); + header_length = (header_len_le16[0] << 0) | (header_len_le16[1] << 8); + + if ((magic_string_length + 2 + 2 + header_length) % 16 != 0) { + // TODO: display warning + } + } else if (v_major == 2 && v_minor == 0) { + char header_len_le32[4]; + istream.read(header_len_le32, 4); + + header_length = (header_len_le32[0] << 0) | (header_len_le32[1] << 8) | + (header_len_le32[2] << 16) | (header_len_le32[3] << 24); + + if ((magic_string_length + 2 + 4 + header_length) % 16 != 0) { + // TODO: display warning + } + } else { + 
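// write_header() above pads the dictionary with spaces so that
// magic(6) + version(2) + length field + dict + '\n' lands on a 16-byte
// boundary.  A standalone sketch of that rule (illustrative; note the code
// above pads a full extra block of 16 spaces when the length is already
// aligned, while this helper returns 0 in that case):
#include <cstddef>

inline size_t npy_padding(size_t magic_and_len_bytes, size_t dict_len) {
    size_t total = magic_and_len_bytes + dict_len + 1;  // +1 for the trailing '\n'
    return (16 - total % 16) % 16;
}
// e.g. npy_padding(6 + 2 + 2, 50) == 3: three ' ' bytes go in before the '\n'.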
fprintf(stderr, "unsupported file format version"); + } + + auto buf_v = std::vector(); + buf_v.reserve(header_length); + istream.read(buf_v.data(), header_length); + std::string header(buf_v.data(), header_length); + + return header; +} + +inline ndarray_len_t comp_size(const std::vector& shape) { + ndarray_len_t size = 1; + for (ndarray_len_t i : shape) + size *= i; + + return size; +} + +template +inline void SaveArrayAsNumpy(const std::string& filename, bool fortran_order, + unsigned int n_dims, const unsigned long shape[], + const std::vector& data) { + Typestring typestring_o(data); + std::string typestring = typestring_o.str(); + + std::ofstream stream(filename, std::ofstream::binary); + if (!stream) { + fprintf(stderr, "io error: failed to open a file."); + } + + std::vector shape_v(shape, shape + n_dims); + write_header(stream, typestring, fortran_order, shape_v); + + auto size = static_cast(comp_size(shape_v)); + + stream.write(reinterpret_cast(data.data()), + sizeof(Scalar) * size); +} + +template +inline void LoadArrayFromNumpy(const std::string& filename, + std::vector& shape, + std::vector& data) { + bool fortran_order; + LoadArrayFromNumpy(filename, shape, fortran_order, data); +} + +template +inline void LoadArrayFromNumpy(const std::string& filename, + std::vector& shape, + bool& fortran_order, std::vector& data) { + std::ifstream stream(filename, std::ifstream::binary); + if (!stream) { + fprintf(stderr, "io error: failed to open a file."); + } + + std::string header = read_header(stream); + + // parse header + std::string typestr; + + parse_header(header, typestr, fortran_order, shape); + + // check if the typestring matches the given one + Typestring typestring_o{data}; + std::string expect_typestr = typestring_o.str(); + if (typestr != expect_typestr) { + fprintf(stderr, "formatting error: typestrings not matching"); + } + + // compute the data size based on the shape + auto size = static_cast(comp_size(shape)); + data.resize(size); + + // read the data + stream.read(reinterpret_cast(data.data()), sizeof(Scalar) * size); +} + +inline void LoadArrayFromNumpy(const std::string& filename, + std::string& type_str, + std::vector& shape, + std::vector& data) { + std::ifstream stream(filename, std::ifstream::binary); + if (!stream) { + fprintf(stderr, "io error: failed to open a file."); + } + + std::string header = read_header(stream); + bool fortran_order; + // parse header + parse_header(header, type_str, fortran_order, shape); + + // check if the typestring matches the given one + std::string size_str = type_str.substr(type_str.size() - 1); + size_t elem_size = atoi(size_str.c_str()); + + // compute the data size based on the shape + auto byte_size = elem_size * static_cast(comp_size(shape)); + data.resize(byte_size); + + // read the data + stream.read(reinterpret_cast(data.data()), byte_size); +} + +} // namespace npy + +#endif // NPY_H diff --git a/lite/test/test_common.h b/lite/test/test_common.h new file mode 100644 index 0000000000000000000000000000000000000000..dae6fb296b5f5a864c2a6bec1dcc5758c084d17d --- /dev/null +++ b/lite/test/test_common.h @@ -0,0 +1,184 @@ +/** + * \file test/test_common.h + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
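// Round-trip usage sketch for the npy save/load helpers above (illustrative;
// "./example.npy" is a placeholder path):
#include <vector>

inline void npy_roundtrip_example() {
    std::vector<float> data(2 * 3, 1.f);
    const unsigned long shape[] = {2, 3};
    npy::SaveArrayAsNumpy("./example.npy", /*fortran_order=*/false,
                          /*n_dims=*/2, shape, data);

    std::vector<npy::ndarray_len_t> loaded_shape;
    std::vector<float> loaded;
    npy::LoadArrayFromNumpy("./example.npy", loaded_shape, loaded);
    // loaded_shape == {2, 3}, loaded.size() == 6, all values 1.f
}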
+ */ + +#pragma once + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "../src/misc.h" +#include "../src/mge/network_impl.h" +#include "../src/mge/common.h" +#include "lite/network.h" +#include "lite/tensor.h" +#include "megbrain/tensor.h" +#include "megbrain/graph/bases.h" +#include "megbrain/plugin/opr_io_dump.h" +#include "megbrain/plugin/profiler.h" +#include "megbrain/serialization/extern_c_opr.h" +#include "megbrain/serialization/file.h" +#include "megbrain/serialization/load_dump_config.h" +#include "megbrain/serialization/serializer.h" +#include "megbrain/utils/thin/hash_table.h" +#include "npy.h" + +#include + +#include +#include +#include +#include + +namespace lite { + +template +static ::testing::AssertionResult compare_memory(const void* memory0, + const void* memory1, + size_t length, + float maxerr = 1e-3) { + const T* data_ptr0 = static_cast(memory0); + const T* data_ptr1 = static_cast(memory1); + for (size_t i = 0; i < length; i++) { + auto diff = std::abs(data_ptr0[i] - data_ptr1[i]); + if (diff > maxerr) { + return ::testing::AssertionFailure() + << "Unequal value:\n" + << "value 0 = " << data_ptr0[i] << "\n" + << "value 1 = " << data_ptr1[i] << "\n" + << "At index: " << i << "\n"; + } + } + return ::testing::AssertionSuccess(); +} + +template +void compare_lite_tensor(std::shared_ptr tensor0, + std::shared_ptr tensor1, float maxerr = 1e-3) { + size_t elemsize = tensor0->get_layout().get_elem_size(); + T* data_ptr0 = static_cast(tensor0->get_memory_ptr()); + T* data_ptr1 = static_cast(tensor1->get_memory_ptr()); + size_t length = tensor0->get_tensor_total_size_in_byte() / elemsize; + EXPECT_TRUE(compare_memory(data_ptr0, data_ptr1, length, maxerr)); +} + +__attribute__((unused)) static std::shared_ptr get_input_data( + std::string path) { + std::string type_str; + std::vector stl_shape; + std::vector raw; + npy::LoadArrayFromNumpy(path, type_str, stl_shape, raw); + + auto lite_tensor = std::make_shared(LiteDeviceType::LITE_CPU); + Layout layout; + layout.ndim = stl_shape.size(); + const std::map type_map = { + {"f4", LiteDataType::LITE_FLOAT}, + {"i4", LiteDataType::LITE_INT}, + {"i1", LiteDataType::LITE_INT8}, + {"u1", LiteDataType::LITE_UINT8}}; + layout.shapes[0] = 1; + for (size_t i = 0; i < stl_shape.size(); i++) { + layout.shapes[i] = static_cast(stl_shape[i]); + } + for (auto& item : type_map) { + if (type_str.find(item.first) != std::string::npos) { + layout.data_type = item.second; + break; + } + } + lite_tensor->set_layout(layout); + size_t length = lite_tensor->get_tensor_total_size_in_byte(); + void* dest = lite_tensor->get_memory_ptr(); + memcpy(dest, raw.data(), length); + return lite_tensor; +} + +__attribute__((unused)) static std::shared_ptr mgelite_lar( + std::string model_path, const Config& config, std::string, + std::shared_ptr input) { + std::unique_ptr network = std::make_unique(config); + + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_input_tensor(0); + + auto src_ptr = input->get_memory_ptr(); + auto src_layout = input->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + network->forward(); + network->wait(); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + Layout out_layout = output_tensor->get_layout(); + auto ret = std::make_shared(LiteDeviceType::LITE_CPU, out_layout); + void* out_data = output_tensor->get_memory_ptr(); + void* dst_data = ret->get_memory_ptr(); + memcpy(dst_data, out_data, ret->get_tensor_total_size_in_byte()); + return ret; +} + 
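+//! Reference runner: load the same bare model through MegBrain's
+//! GraphLoader and execute it directly (no Lite wrapper), returning the
+//! first output as a lite::Tensor. Tests compare Lite results against
+//! this output. Encrypted models are rejected by the assert below.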
+__attribute__((unused)) static std::shared_ptr mgb_lar( + std::string model_path, const Config& config, std::string input_name, + std::shared_ptr input) { + LITE_ASSERT(config.bare_model_cryption_name.size() == 0); + using namespace mgb; + serialization::GraphLoader::LoadConfig mgb_config; + mgb_config.comp_node_mapper = [config](CompNode::Locator& loc) { + loc = to_compnode_locator(config.device_type); + }; + mgb_config.comp_graph = ComputingGraph::make(); + auto&& graph_opt = mgb_config.comp_graph->options(); + if (config.options.weight_preprocess) { + graph_opt.graph_opt.enable_weight_preprocess(); + } + graph_opt.comp_node_seq_record_level = + config.options.comp_node_seq_record_level; + + auto inp_file = mgb::serialization::InputFile::make_fs(model_path.c_str()); + auto format = + serialization::GraphLoader::identify_graph_dump_format(*inp_file); + mgb_assert(format.valid(), + "invalid model: unknown model format, please make sure input " + "file is generated by GraphDumper"); + auto loader = + serialization::GraphLoader::make(std::move(inp_file), format.val()); + auto load_ret = loader->load(mgb_config, false); + + ComputingGraph::OutputSpec out_spec; + std::vector output_tensors(load_ret.output_var_list.size()); + for (size_t i = 0; i < load_ret.output_var_list.size(); i++) { + auto cb = [&output_tensors, i](const DeviceTensorND& dv) mutable { + output_tensors[i].copy_from(dv); + }; + out_spec.emplace_back(load_ret.output_var_list[i], std::move(cb)); + } + auto func = load_ret.graph_compile(out_spec); + + auto& in = load_ret.tensor_map.find(input_name)->second; + in->copy_from(*TensorHelper::implement(input) + ->cast_final_safe() + .host_tensor()); + func->execute(); + func->wait(); + + std::shared_ptr ret = std::make_shared( + LiteDeviceType::LITE_CPU, + to_lite_layout(output_tensors[0].layout())); + auto mge_tensor = TensorHelper::implement(ret) + ->cast_final_safe() + .host_tensor(); + mge_tensor->copy_from(output_tensors[0]); + return ret; +} +} // namespace lite + +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/test/test_misc.cpp b/lite/test/test_misc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d2a0a40f2f1efc0798d45e9cb1827a08dd50eb90 --- /dev/null +++ b/lite/test/test_misc.cpp @@ -0,0 +1,115 @@ +/** + * \file test/test_misc.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "test_common.h" +#include "../src/decryption/decrypt_base.h" +#include "../src/network_impl_base.h" + +#include "megbrain/opr/io.h" +#include "megbrain/tensor.h" +#include "megbrain/utils/metahelper.h" + +#include + +#include +#include +#include +#include + +using namespace lite; + +TEST(TestMisc, DecryptionRegister) { + size_t number = decryption_static_data().decryption_methods.size(); + //! 
At least one method is register by lite + ASSERT_GE(number, 1); + DecryptionFunc func; + register_decryption_and_key("AllForTest0", func, {}); + + ASSERT_EQ(number + 1, decryption_static_data().decryption_methods.size()); +} + +TEST(TestMisc, DecryptionUpdate) { + DecryptionFunc func; + register_decryption_and_key("AllForTest1", func, {}); + func = [](const void*, size_t, + const std::vector&) -> std::vector { + return {}; + }; + update_decryption_or_key("AllForTest1", func, {}); + ASSERT_NE(decryption_static_data().decryption_methods["AllForTest1"].first, + nullptr); + ASSERT_EQ(decryption_static_data() + .decryption_methods["AllForTest1"] + .second->size(), + 0); + update_decryption_or_key("AllForTest1", {}, {1, 2, 3}); + ASSERT_EQ(decryption_static_data() + .decryption_methods["AllForTest1"] + .second->size(), + 3); +} + +TEST(TestMisc, SharedSameDeviceTensor) { + using namespace mgb; + serialization::GraphLoader::LoadConfig mgb_config; + mgb_config.comp_node_mapper = [](CompNode::Locator& loc) { + loc = to_compnode_locator(LiteDeviceType::LITE_CPU); + }; + mgb_config.comp_graph = ComputingGraph::make(); + std::string model_path = "./shufflenet.mge"; + + auto inp_file = mgb::serialization::InputFile::make_fs(model_path.c_str()); + auto format = + serialization::GraphLoader::identify_graph_dump_format(*inp_file); + mgb_assert(format.valid(), + "invalid model: unknown model format, please make sure input " + "file is generated by GraphDumper"); + auto loader = + serialization::GraphLoader::make(std::move(inp_file), format.val()); + auto load_ret_1 = loader->load(mgb_config, true); + auto load_ret_2 = loader->load(mgb_config, true); + ASSERT_EQ(load_ret_1.output_var_list.size(), + load_ret_2.output_var_list.size()); + + ComputingGraph::OutputSpec out_spec_1, out_spec_2; + for (size_t i = 0; i < load_ret_1.output_var_list.size(); i++) { + out_spec_1.emplace_back(load_ret_1.output_var_list[i], nullptr); + out_spec_2.emplace_back(load_ret_2.output_var_list[i], nullptr); + } + auto func_1 = load_ret_1.graph_compile(out_spec_1); + auto func_2 = load_ret_2.graph_compile(out_spec_1); + std::vector oprs_1, oprs_2; + func_1->iter_opr_seq([&oprs_1](cg::OperatorNodeBase* opr) -> bool { + if (opr->try_cast_final()) { + oprs_1.push_back(opr); + } + return true; + }); + func_1->iter_opr_seq([&oprs_2](cg::OperatorNodeBase* opr) -> bool { + if (opr->try_cast_final()) { + oprs_2.push_back(opr); + } + return true; + }); + ASSERT_EQ(oprs_1.size(), oprs_2.size()); + for (size_t i = 0; i < oprs_1.size(); i++) { + auto tensor_1 = + oprs_1[i]->try_cast_final()->value(); + auto tensor_2 = + oprs_2[i]->try_cast_final()->value(); + ASSERT_EQ(tensor_1.raw_ptr(), tensor_2.raw_ptr()); + } +} + +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/test/test_network.cpp b/lite/test/test_network.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4770965a60a2973b8372a59537d54c578e7dbb84 --- /dev/null +++ b/lite/test/test_network.cpp @@ -0,0 +1,1007 @@ +/** + * \file test/test_network.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "./test_common.h" +#include "megbrain/tensor.h" + +#include +#include +#include +#include +using namespace lite; + +namespace { +class CheckAllocator : public lite::Allocator { +public: + //! 
allocate memory of size in the given device with the given align + void* allocate(LiteDeviceType device, int, size_t size, + size_t align) override { + LITE_ASSERT(device == LiteDeviceType::LITE_CPU); + m_nr_left++; + m_nr_allocated++; +#ifdef WIN32 + return _aligned_malloc(size, align); +#elif defined(__ANDROID__) || defined(ANDROID) + return memalign(align, size); +#else + void* ptr = nullptr; + auto err = posix_memalign(&ptr, align, size); + mgb_assert(!err, "failed to malloc %zubytes with align %zu", size, + align); + return ptr; +#endif + }; + + //! free the memory pointed by ptr in the given device + void free(LiteDeviceType device, int, void* ptr) override { + m_nr_left--; + LITE_ASSERT(device == LiteDeviceType::LITE_CPU); +#ifdef WIN32 + _aligned_free(ptr); +#else + ::free(ptr); +#endif + }; + std::atomic_size_t m_nr_left{0}; + std::atomic_size_t m_nr_allocated{0}; +}; +} // namespace + +TEST(TestNetWork, Basic) { + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + auto result_lite = mgelite_lar(model_path, config, "data", lite_tensor); + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor); + compare_lite_tensor(result_lite, result_mgb); +} + +TEST(TestNetWork, SetDeviceId) { + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::shared_ptr network = std::make_shared(config); + network->set_device_id(4); + network->load_model(model_path); + std::shared_ptr input_tensor = network->get_input_tensor(0); + std::shared_ptr output_tensor = network->get_output_tensor(0); + network->forward(); + network->wait(); + ASSERT_EQ(input_tensor->get_device_id(), 4); + ASSERT_EQ(output_tensor->get_device_id(), 4); +} + +TEST(TestNetWork, GetAllName) { + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::shared_ptr network = std::make_shared(config); + + network->load_model(model_path); + auto input_names = network->get_all_input_name(); + auto output_names = network->get_all_output_name(); + + ASSERT_EQ(input_names.size(), 1); + ASSERT_EQ(output_names.size(), 1); + ASSERT_TRUE(input_names[0] == "data"); + ASSERT_TRUE(output_names[0] == + "TRUE_DIV(EXP[12065],reduce0[12067])[12077]"); +} + +TEST(TestNetWork, BasicInplaceAndSingleThreadAffinity) { + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor); + + std::shared_ptr network = std::make_shared(config); + Runtime::set_cpu_inplace_mode(network); + + network->load_model(model_path); + std::shared_ptr input_tensor = network->get_input_tensor(0); + + int affinity_set = false; + Runtime::set_runtime_thread_affinity(network, [&affinity_set](int id) { + ASSERT_EQ(id, 0); + affinity_set = true; + }); + + auto src_ptr = lite_tensor->get_memory_ptr(); + auto src_layout = lite_tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + //! 
inplace mode not support async mode + ASSERT_THROW(network->set_async_callback([]() {}), std::exception); + + network->forward(); + network->wait(); + std::shared_ptr output_tensor = network->get_output_tensor(0); + + ASSERT_EQ(affinity_set, true); + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, NetworkShareWeights) { + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor); + + std::shared_ptr network = std::make_shared(config); + network->load_model(model_path); + std::shared_ptr input_tensor = network->get_input_tensor(0); + + std::shared_ptr network2 = std::make_shared(config); + Runtime::set_cpu_inplace_mode(network2); + + Runtime::shared_weight_with_network(network2, network); + + std::shared_ptr input_tensor2 = network2->get_input_tensor(0); + + auto src_ptr = lite_tensor->get_memory_ptr(); + auto src_layout = lite_tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + input_tensor2->reset(src_ptr, src_layout); + ASSERT_NE(input_tensor, input_tensor2); + + network->forward(); + network->wait(); + + network2->forward(); + network2->wait(); + std::shared_ptr output_tensor = network->get_output_tensor(0); + std::shared_ptr output_tensor2 = network2->get_output_tensor(0); + + ASSERT_NE(output_tensor->get_memory_ptr(), + output_tensor2->get_memory_ptr()); + compare_lite_tensor(output_tensor, result_mgb); + compare_lite_tensor(output_tensor2, result_mgb); +} + +TEST(TestNetWork, SharedRuntimeMem) { + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor); + + std::shared_ptr network_src = std::make_shared(config); + std::shared_ptr network_dst = std::make_shared(config); + Runtime::share_runtime_memory_with(network_dst, network_src); + network_src->load_model(model_path); + network_dst->load_model(model_path); +} + +TEST(TestNetWork, UserAllocator) { + auto allocator = std::make_shared(); + { + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor); + std::shared_ptr network = std::make_shared(config); + + Runtime::set_memory_allocator(network, allocator); + + network->load_model(model_path); + std::shared_ptr input_tensor = network->get_input_tensor(0); + + auto src_ptr = lite_tensor->get_memory_ptr(); + auto src_layout = lite_tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + network->forward(); + network->wait(); + + ASSERT_GE(allocator->m_nr_allocated, 1); + std::shared_ptr output_tensor = network->get_output_tensor(0); + + compare_lite_tensor(output_tensor, result_mgb); + } + ASSERT_EQ(allocator->m_nr_left, 0); +} + +TEST(TestNetWork, BasicMultiThread) { + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor); + + std::shared_ptr network = std::make_shared(config); + Runtime::set_cpu_threads_number(network, 2); + + network->load_model(model_path); + std::shared_ptr input_tensor = network->get_input_tensor(0); + + auto src_ptr = lite_tensor->get_memory_ptr(); + auto src_layout = lite_tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + network->forward(); + network->wait(); + 
std::shared_ptr output_tensor = network->get_output_tensor(0); + + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, ThreadAffinity) { + size_t nr_threads = 4; + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor); + + std::shared_ptr network = std::make_shared(config); + Runtime::set_cpu_threads_number(network, nr_threads); + + ASSERT_THROW(Runtime::set_runtime_thread_affinity(network, [](int) {}), + std::exception); + network->load_model(model_path); + std::vector thread_ids(nr_threads); + auto affinity = [&](int id) { + thread_ids[id] = std::this_thread::get_id(); + }; + Runtime::set_runtime_thread_affinity(network, affinity); + + std::shared_ptr input_tensor = network->get_input_tensor(0); + auto src_ptr = lite_tensor->get_memory_ptr(); + auto src_layout = lite_tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + network->forward(); + network->wait(); + + for (size_t i = 0; i < nr_threads; i++) { + for (size_t j = i + 1; j < nr_threads; j++) { + ASSERT_NE(thread_ids[i], thread_ids[j]); + } + } + + std::shared_ptr output_tensor = network->get_output_tensor(0); + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, BasicCryptAes) { + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string model_crypt_path = "./shufflenet_crypt_aes.mge"; + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor); + config.bare_model_cryption_name = "AES_default"; + auto result_lite = + mgelite_lar(model_crypt_path, config, "data", lite_tensor); + compare_lite_tensor(result_lite, result_mgb); +} + +TEST(TestNetWork, BasicCryptRc4) { + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string model_crypt_path = "./shufflenet_crypt_rc4.mge"; + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor); + config.bare_model_cryption_name = "RC4_default"; + auto result_lite = + mgelite_lar(model_crypt_path, config, "data", lite_tensor); + compare_lite_tensor(result_lite, result_mgb); +} + +TEST(TestNetWork, PackedCryptRc4) { + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string model_crypt_path = "./test_packed_model_rc4.lite"; + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor); + auto result_lite = + mgelite_lar(model_crypt_path, config, "data", lite_tensor); + compare_lite_tensor(result_lite, result_mgb); +} + +TEST(TestNetWork, BasicCryptSfRc4) { + Config config; + auto lite_tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string model_crypt_path = "./shufflenet_crypt_sfrc4.mge"; + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor); + config.bare_model_cryption_name = "SIMPLE_FAST_RC4_default"; + auto result_lite = + mgelite_lar(model_crypt_path, config, "data", lite_tensor); + compare_lite_tensor(result_lite, result_mgb); +} + +TEST(TestNetWork, ResetInput) { + Config config; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + std::shared_ptr network = std::make_shared(config); + + network->load_model(model_path); + std::shared_ptr 
input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + network->forward(); + network->wait(); + std::shared_ptr output_tensor = network->get_output_tensor(0); + + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, ChangeInputShape) { + Config config; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + std::shared_ptr network = std::make_shared(config); + + network->load_model(model_path); + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_layout = Layout{{2, 3, 200, 200}, 4, LiteDataType::LITE_FLOAT}; + input_tensor->set_layout(src_layout); + std::shared_ptr input_tensor2 = network->get_io_tensor(input_name); + //! Check memory is equal + ASSERT_EQ(input_tensor->get_memory_ptr(), input_tensor2->get_memory_ptr()); + + network->forward(); + network->wait(); + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto output_layout = output_tensor->get_layout(); + ASSERT_EQ(output_layout.shapes[0], 2); + ASSERT_EQ(output_layout.shapes[1], 1000); +} + +TEST(TestNetWork, ResetOutput) { + Config config; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + std::shared_ptr network = std::make_shared(config); + + network->load_model(model_path); + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + network->forward(); + network->wait(); + + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, AsyncExec) { + Config config; + config.options.var_sanity_check_first_run = false; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + std::shared_ptr network = std::make_shared(config); + + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + //! 
set async mode and callback + volatile bool finished = false; + network->set_async_callback([&finished]() { finished = true; }); + + network->forward(); + size_t count = 0; + while (finished == false) { + count++; + } + ASSERT_GT(count, 0); + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, CPUDeviceInput) { + auto tensor = get_input_data("./input_data.npy"); + Layout layout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT}; + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, {}, input_name, tensor); + + NetworkIO IO; + bool is_host = false; + IO.inputs.push_back({input_name, is_host}); + std::shared_ptr network = std::make_shared(IO); + + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + input_tensor->reset(src_ptr, layout); + + network->forward(); + network->wait(); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, ShareTensorWith) { + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, {}, input_name, tensor); + + std::shared_ptr network = std::make_shared(); + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + input_tensor->share_memory_with(*tensor); + + network->forward(); + network->wait(); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, InputCallBack) { + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, {}, input_name, tensor); + + NetworkIO ios; + bool is_host = false; + ios.inputs.push_back({input_name, is_host}); + std::shared_ptr network = std::make_shared(ios); + network->load_model(model_path); + + volatile bool finised_check_input = false; + auto input_callback = + [&tensor, &finised_check_input, + input_name](const std::unordered_map< + std::string, std::pair>>& + input_map) { + ASSERT_EQ(input_map.size(), 1); + auto tensor_input = input_map.at(input_name).second; + compare_lite_tensor(tensor_input, tensor); + finised_check_input = true; + }; + + network->set_start_callback(input_callback); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + input_tensor->share_memory_with(*tensor); + + network->forward(); + network->wait(); + + ASSERT_TRUE(finised_check_input); + std::shared_ptr output_tensor = network->get_output_tensor(0); + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, OutputCallBack) { + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, {}, input_name, tensor); + + std::shared_ptr network = std::make_shared(); + network->load_model(model_path); + auto output_name = network->get_output_name(0); + + volatile bool finised_check_output = false; + auto output_callback = + [&result_mgb, &finised_check_output, + output_name](const std::unordered_map< + std::string, std::pair>>& + output_map) { + ASSERT_EQ(output_map.size(), 1); + auto tensor_output = output_map.at(output_name).second; + compare_lite_tensor(tensor_output, result_mgb); + 
finised_check_output = true; + }; + + network->set_finish_callback(output_callback); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + input_tensor->share_memory_with(*tensor); + + network->forward(); + network->wait(); + + ASSERT_TRUE(finised_check_output); + std::shared_ptr output_tensor = network->get_output_tensor(0); + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, OutputShapeOnly) { + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + std::string output_name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]"; + + NetworkIO IO; + bool is_host = true; + IO.outputs.push_back({output_name, is_host, LiteIOType::LITE_IO_SHAPE}); + Config config; + std::shared_ptr network = std::make_shared(config, IO); + + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + std::shared_ptr output_tensor = network->get_io_tensor(output_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + network->forward(); + network->wait(); + ASSERT_EQ(output_tensor->get_tensor_total_size_in_byte() / sizeof(float), + 1000); +} + +TEST(TestNetWork, ProfileIOdump) { + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + + NetworkIO IO; + Config config; + std::shared_ptr network = std::make_shared(config, IO); + network->enable_profile_performance("./profile.json"); + network->load_model(model_path); + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + network->forward(); + network->wait(); + ASSERT_TRUE(fopen("./profile.json", "r")); + + Runtime::enable_io_txt_dump(network, "./io_txt_dump.txt"); + network->forward(); + network->wait(); + ASSERT_TRUE(fopen("./io_txt_dump.txt", "r")); +} + +TEST(TestNetWork, LoadPackedModel) { + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./test_packed_model.lite"; + std::string input_name = "data"; + + NetworkIO IO; + Config config; + std::shared_ptr network = std::make_shared(config, IO); + network->load_model(model_path); + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + network->forward(); + network->wait(); +} + +TEST(TestNetWork, GetDeviceType) { + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + + Config config; + std::shared_ptr network = std::make_shared(config); + network->load_model(model_path); + ASSERT_TRUE(network->get_device_type() == LiteDeviceType::LITE_CPU); +} + +TEST(TestNetWork, GetModelExtraInfo) { + std::string model_path = "./track_640_320_pack_model_rc4_with_info.lite"; + Config config; + std::shared_ptr network = std::make_shared(config); + network->load_model(model_path); + auto& extra_info = network->get_model_extra_info(); + ASSERT_TRUE(extra_info.size() > 0); + printf("extra_info %s \n", extra_info.c_str()); +} + +#if LITE_WITH_CUDA + +TEST(TestNetWork, BasicDevice) { + auto lite_tensor = get_input_data("./input_data.npy"); + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + std::string model_path = "./shufflenet.mge"; 
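+    //! run the model through Lite and through raw MegBrain with the CUDA
+    //! config and check that the two outputs agree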
+ auto result_lite = mgelite_lar(model_path, config, "data", lite_tensor); + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor); + compare_lite_tensor(result_lite, result_mgb); +} + +TEST(TestNetWork, DeviceInput) { + auto tensor = get_input_data("./input_data.npy"); + Layout layout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT}; + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, {}, input_name, tensor); + + NetworkIO IO; + bool is_host = false; + IO.inputs.push_back({input_name, is_host}); + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + std::shared_ptr network = std::make_shared(config, IO); + + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto tensor_cuda = Tensor(LiteDeviceType::LITE_CUDA, layout); + tensor_cuda.copy_from(*tensor); + + auto src_ptr = tensor_cuda.get_memory_ptr(); + input_tensor->reset(src_ptr, layout); + + network->forward(); + network->wait(); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, ChangeInputShapeDevice) { + Config config; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + config.device_type = LiteDeviceType::LITE_CUDA; + std::shared_ptr network = std::make_shared(config); + + network->load_model(model_path); + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_layout = Layout{{2, 3, 200, 200}, 4, LiteDataType::LITE_FLOAT}; + input_tensor->set_layout(src_layout); + std::shared_ptr input_tensor2 = network->get_io_tensor(input_name); + //! 
Check memory is equal + ASSERT_EQ(input_tensor->get_memory_ptr(), input_tensor2->get_memory_ptr()); + + network->forward(); + network->wait(); + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto output_layout = output_tensor->get_layout(); + ASSERT_EQ(output_layout.shapes[0], 2); + ASSERT_EQ(output_layout.shapes[1], 1000); +} + +TEST(TestNetWork, DeviceOutput) { + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + std::string output_name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]"; + auto result_mgb = mgb_lar(model_path, {}, input_name, tensor); + + NetworkIO IO; + bool is_host = false; + IO.outputs.push_back({output_name, is_host}); + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + std::shared_ptr network = std::make_shared(config, IO); + + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + std::shared_ptr output_tensor_cuda = + network->get_io_tensor(output_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + network->forward(); + network->wait(); + auto output_tensor = std::make_shared(); + output_tensor->copy_from(*output_tensor_cuda); + + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, WrongIONameDevice) { + auto tensor = get_input_data("./input_data.npy"); + Layout layout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT}; + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + std::string input_name_wrong = "data0"; + std::string output_name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]"; + std::string output_name_wrong = + "w_TRUE_DIV(EXP[12065],reduce0[12067])[12077]"; + auto result_mgb = mgb_lar(model_path, {}, input_name, tensor); + + NetworkIO IO; + bool is_host = false; + IO.inputs.push_back({input_name, is_host}); + IO.outputs.push_back({output_name, is_host}); + IO.outputs.push_back({output_name_wrong, is_host}); + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + std::shared_ptr network = std::make_shared(config, IO); + + network->load_model(model_path); + + auto tensor_cuda = Tensor(LiteDeviceType::LITE_CUDA, layout); + tensor_cuda.copy_from(*tensor); + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + auto src_ptr = tensor_cuda.get_memory_ptr(); + auto src_layout = tensor_cuda.get_layout(); + input_tensor->reset(src_ptr, src_layout); + + std::shared_ptr output_tensor_cuda = + network->get_io_tensor(output_name); + + network->forward(); + network->wait(); + auto output_tensor = std::make_shared(); + output_tensor->copy_from(*output_tensor_cuda); + + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWork, ConfigIONameDevice) { + std::string model_path = "./model.mgb"; + + NetworkIO IO; + bool is_host = false; + IO.outputs.push_back({"clsfy", is_host}); + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + std::shared_ptr network = std::make_shared(config, IO); + network->compute_only_configured_output(); + network->load_model(model_path); + + ASSERT_EQ(network->get_all_output_name().size(), 1); + ASSERT_EQ(network->get_all_output_name()[0], "clsfy"); + + std::shared_ptr network2 = std::make_shared(config, IO); + network2->load_model(model_path); + + ASSERT_EQ(network2->get_all_output_name().size(), 2); +} + +TEST(TestNetWork, SetDeviceIdDeviceTest) { +#if LITE_WITH_CUDA + 
if(get_device_count(LITE_CUDA) <= 1) + return; +#endif + std::string model_path = "./model.mgb"; + + NetworkIO IO; + bool is_host = false; + IO.inputs.push_back({"data", is_host}); + IO.outputs.push_back({"clsfy", is_host}); + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + std::shared_ptr network = std::make_shared(config, IO); + network->set_device_id(1); + network->load_model(model_path); + auto inputs_names = network->get_all_input_name(); + for (auto name : inputs_names) { + auto tensor = network->get_io_tensor(name); + ASSERT_EQ(tensor->get_device_id(), 1); + if (name == "idx") { + int* index_ptr = static_cast(tensor->get_memory_ptr()); + for (int i = 0; i < 23; i++) { + index_ptr[i] = i % 3; + } + } + if (name == "landmark") { + float* landmakrk_ptr = + static_cast(tensor->get_memory_ptr()); + for (int i = 0; i < 23 * 18 * 2; i++) { + landmakrk_ptr[i] = 0.1f; + } + } + } + auto outputs_names = network->get_all_output_name(); + for (auto name : outputs_names) { + auto tensor = network->get_io_tensor(name); + ASSERT_EQ(tensor->get_device_id(), 1); + } + network->forward(); + network->wait(); +} + +TEST(TestNetWork, SetStreamIdDeviceTest) { + std::string model_path = "./model.mgb"; + + NetworkIO IO; + bool is_host = false; + IO.inputs.push_back({"data", is_host}); + IO.outputs.push_back({"clsfy", is_host}); + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + std::shared_ptr network = std::make_shared(config, IO); + network->set_stream_id(1); + network->load_model(model_path); + auto inputs_names = network->get_all_input_name(); + for (auto name : inputs_names) { + auto tensor = network->get_io_tensor(name); + if (name == "idx") { + int* index_ptr = static_cast(tensor->get_memory_ptr()); + for (int i = 0; i < 23; i++) { + index_ptr[i] = i % 3; + } + } + if (name == "landmark") { + float* landmakrk_ptr = + static_cast(tensor->get_memory_ptr()); + for (int i = 0; i < 23 * 18 * 2; i++) { + landmakrk_ptr[i] = 0.1f; + } + } + } + network->forward(); + network->wait(); +} + +#if CUDART_VERSION >= 10000 +TEST(TestNetWork, DeviceAsyncExec) { + auto tensor = get_input_data("./input_data.npy"); + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + config.options.var_sanity_check_first_run = false; + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + std::shared_ptr network = std::make_shared(config); + + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + //! 
set async mode and callback + volatile bool finished = false; + network->set_async_callback([&finished]() { finished = true; }); + + network->forward(); + size_t count = 0; + while (finished == false) { + count++; + } + + ASSERT_GT(count, 0); + compare_lite_tensor(output_tensor, result_mgb); +} + +#endif +#endif +#if MGB_ATLAS +TEST(TestNetWork, AtlasLoadNoDevice) { + lite::Config config; + config.device_type = LiteDeviceType::LITE_DEVICE_DEFAULT; + auto network = std::make_shared(config); + network->load_model("./model_atlas.mgb"); + network->forward(); + network->wait(); +} + +TEST(TestNetWork, AtlasLoadDeviceInput) { + lite::NetworkIO networkio; + lite::IO input_data_io = {}; + input_data_io.name = "data"; + input_data_io.is_host = false; + networkio.inputs.emplace_back(input_data_io); + lite::IO input_input0_io = {}; + input_input0_io.name = "input0"; + input_input0_io.is_host = false; + networkio.inputs.emplace_back(input_input0_io); + lite::Config config; + config.device_type = LiteDeviceType::LITE_DEVICE_DEFAULT; + auto network = std::make_shared(config, networkio); + network->load_model("./model_atlas.mgb"); + network->forward(); + network->wait(); +} + +TEST(TestNetWork, AtlasLoadAtlas) { + lite::Config config; + config.device_type = LiteDeviceType::LITE_ATLAS; + auto network = std::make_shared(config); + network->load_model("./model_atlas.mgb"); + network->forward(); + network->wait(); +} + +TEST(TestNetWork, AtlasLoadAtlasDeviceInput) { + lite::NetworkIO networkio; + lite::IO input_data_io = {}; + input_data_io.name = "data"; + input_data_io.is_host = false; + networkio.inputs.emplace_back(input_data_io); + lite::IO input_input0_io = {}; + input_input0_io.name = "input0"; + input_input0_io.is_host = false; + networkio.inputs.emplace_back(input_input0_io); + lite::Config config; + config.device_type = LiteDeviceType::LITE_ATLAS; + auto network = std::make_shared(config, networkio); + network->load_model("./model_atlas.mgb"); + network->forward(); + network->wait(); +} + +TEST(TestNetWork, AtlasDeviceID) { + lite::Config config; + config.device_type = LiteDeviceType::LITE_ATLAS; + auto network = std::make_shared(config); + network->set_device_id(1); + network->load_model("./model_atlas.mgb"); + std::shared_ptr input_tensor = network->get_input_tensor(0); + std::shared_ptr output_tensor = network->get_output_tensor(0); + network->forward(); + network->wait(); + ASSERT_EQ(output_tensor->get_device_id(), 1); +} +#endif +#endif +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/test/test_network_c.cpp b/lite/test/test_network_c.cpp new file mode 100644 index 0000000000000000000000000000000000000000..419bc6c7a6e9a7c115fbf27a6f10ae00dab36ba5 --- /dev/null +++ b/lite/test/test_network_c.cpp @@ -0,0 +1,895 @@ +/** + * \file test/test_network_c.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
+ */ + +#include "../src/misc.h" + +#if LITE_BUILD_WITH_MGE +#include "../src/common.h" +#include "../src/mge/network_impl.h" + +#include "../lite-c/src/common.h" +#include "lite-c/global_c.h" +#include "lite-c/network_c.h" +#include "lite-c/tensor_c.h" + +#include "./test_common.h" +#include "megbrain/tensor.h" + +#include +#include +#include +#include +#include + +namespace { + +int affinity_set = false; +int single_thread_affinity(int) { + affinity_set = true; + return 0; +} + +std::atomic_size_t m_nr_left{0}; +std::atomic_size_t m_nr_allocated{0}; + +void* allocate(LiteDeviceType device, int, size_t size, size_t align) { + LITE_ASSERT(device == LiteDeviceType::LITE_CPU); + m_nr_left++; + m_nr_allocated++; +#ifdef WIN32 + return _aligned_malloc(size, align); +#elif defined(__ANDROID__) || defined(ANDROID) + return memalign(align, size); +#else + void* ptr = nullptr; + auto err = posix_memalign(&ptr, align, size); + mgb_assert(!err, "failed to malloc %zu bytes with align %zu", size, align); + return ptr; +#endif +} + +void free(LiteDeviceType device, int, void* ptr) { + m_nr_left--; + LITE_ASSERT(device == LiteDeviceType::LITE_CPU); +#ifdef WIN32 + _aligned_free(ptr); +#else + ::free(ptr); +#endif +}; + +#define NUMBER_THREDS (4) +std::vector thread_ids(NUMBER_THREDS); +int multi_thread_affinity(int id) { + thread_ids[id] = std::this_thread::get_id(); + return 0; +}; + +volatile bool finished = false; +int finish_callback() { + finished = true; + return 0; +} + +volatile bool start_checked = false; +int start_callback(const LiteIO* inputs, const LiteTensor* input_tensors, + size_t size) { + start_checked = true; + auto check_func = [&]() { + ASSERT_EQ(size, 1); + ASSERT_EQ(std::string(inputs->name), "data"); + LiteLayout layout; + LITE_get_tensor_layout(*input_tensors, &layout); + ASSERT_EQ(layout.ndim, 4); + ASSERT_EQ(layout.shapes[1], 3); + ASSERT_EQ(layout.shapes[2], 224); + ASSERT_EQ(layout.shapes[3], 224); + }; + check_func(); + return 0; +} + +volatile bool finish_checked = false; +int finish_callback(const LiteIO* outputs, const LiteTensor* output_tensors, + size_t size) { + finish_checked = true; + auto check_func = [&]() { + ASSERT_EQ(size, 1); + ASSERT_EQ(std::string(outputs->name), + "TRUE_DIV(EXP[12065],reduce0[12067])[12077]"); + LiteLayout layout; + LITE_get_tensor_layout(*output_tensors, &layout); + ASSERT_EQ(layout.shapes[1], 1000); + }; + check_func(); + return 0; +} + +} // namespace + +#define LITE_CAPI_CHECK(_expr) \ + do { \ + int _ret = (_expr); \ + if (_ret) { \ + LITE_THROW(LITE_get_last_error()); \ + } \ + } while (0) + +#define ForwardMgb \ + lite::Config config; \ + auto lite_tensor = lite::get_input_data("./input_data.npy"); \ + size_t data_length_in_byte = lite_tensor->get_tensor_total_size_in_byte(); \ + std::string model_path = "./shufflenet.mge"; \ + auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor) + +#define MakeNetwork \ + LiteNetwork c_network; \ + LITE_CAPI_CHECK(LITE_make_network(&c_network, *default_config(), \ + *default_network_io())) + +#define LoadNetwork \ + LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, model_path.c_str())) + +#define SetInput \ + LiteTensor c_input_tensor, c_output_tensor; \ + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, "data", LITE_INPUT, \ + &c_input_tensor)); \ + LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, \ + lite_tensor->get_memory_ptr(), \ + data_length_in_byte)) + +#define ForwardNetwork \ + LITE_CAPI_CHECK(LITE_forward(c_network)); \ + LITE_CAPI_CHECK(LITE_wait(c_network)) + 
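+//! Most C-API tests below chain these helper macros in order:
+//! ForwardMgb -> MakeNetwork -> LoadNetwork -> SetInput -> ForwardNetwork
+//! -> GetOutput -> CompareResult, then LITE_destroy_network.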
+#define GetOutput \ + const char* output_name; \ + LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name)); \ + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_OUTPUT, \ + &c_output_tensor)); \ + void* output_ptr; \ + LITE_CAPI_CHECK(LITE_get_tensor_memory(c_output_tensor, &output_ptr)) + +#define CompareResult \ + EXPECT_TRUE(lite::compare_memory( \ + output_ptr, result_mgb->get_memory_ptr(), \ + result_mgb->get_tensor_total_size_in_byte() / sizeof(float))) + +TEST(TestCapiNetWork, BasicResetInput) { + ForwardMgb; + LiteNetwork c_network; + LITE_CAPI_CHECK(LITE_make_default_network(&c_network)); + LoadNetwork; + SetInput; + ForwardNetwork; + GetOutput; + CompareResult; + LITE_destroy_network(c_network); +} + +TEST(TestCapiNetWork, GetAllName) { + std::string model_path = "./shufflenet.mge"; + LiteNetwork c_network; + LITE_CAPI_CHECK(LITE_make_default_network(&c_network)); + LoadNetwork; + size_t input_size, output_size; + LITE_get_all_input_name(c_network, &input_size, nullptr); + LITE_get_all_output_name(c_network, &output_size, nullptr); + + std::vector input_names(input_size); + LITE_get_all_input_name(c_network, nullptr, input_names.data()); + ASSERT_EQ(input_names.size(), 1); + ASSERT_TRUE(std::string(input_names[0]) == "data"); + + std::vector output_names(output_size); + LITE_get_all_output_name(c_network, nullptr, output_names.data()); + ASSERT_TRUE(std::string(output_names[0]) == + "TRUE_DIV(EXP[12065],reduce0[12067])[12077]"); + ASSERT_EQ(output_names.size(), 1); + LITE_destroy_network(c_network); +} + +#if LITE_BUILD_WITH_RKNPU + +static int GetTop(float* pfProb, float* pfMaxProb, uint32_t* pMaxClass, + uint32_t outputCount, uint32_t topNum) { + uint32_t i, j; + +#define MAX_TOP_NUM 20 + if (topNum > MAX_TOP_NUM) + return 0; + + memset(pfMaxProb, 0, sizeof(float) * topNum); + memset(pMaxClass, 0xff, sizeof(float) * topNum); + + for (j = 0; j < topNum; j++) { + for (i = 0; i < outputCount; i++) { + if ((i == *(pMaxClass + 0)) || (i == *(pMaxClass + 1)) || + (i == *(pMaxClass + 2)) || (i == *(pMaxClass + 3)) || + (i == *(pMaxClass + 4))) { + continue; + } + + if (pfProb[i] > *(pfMaxProb + j)) { + *(pfMaxProb + j) = pfProb[i]; + *(pMaxClass + j) = i; + } + } + } + + return 1; +} + +TEST(TestCapiNetWork, rknntest_set_info) { +#define SET_INFO_SIZE 2 +#define TENSOR_TYPE_UINT8 3 +#define TENSOR_FORMAT_NHWC 1 + LiteConfig config; + config.backend = LiteBackend::LITE_RK_NPU; + config.device_type = LiteDeviceType::LITE_NPU; + config.bare_model_cryption_name = nullptr; + auto lite_tensor = lite::get_input_data("./model/cat_224x224.npy"); + auto true_tensor = lite::get_input_data("./output_data.npy"); + auto rknn_model = "./model/mobilenet_v1.rknn"; + + LiteNetwork c_network; + LITE_CAPI_CHECK(LITE_make_network_config(&c_network, config)); + LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, rknn_model)); + + size_t input_size, output_size; + LITE_get_all_input_name(c_network, &input_size, nullptr); + LITE_get_all_output_name(c_network, &output_size, nullptr); + + std::vector input_names(input_size); + std::vector output_names(output_size); + LiteTensor c_input_tensor, c_output_tensor; + + LITE_get_all_input_name(c_network, nullptr, input_names.data()); + LITE_get_all_output_name(c_network, nullptr, output_names.data()); + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, input_names[0], LITE_IO, + &c_input_tensor)); + + size_t input_length = 0; + LITE_get_tensor_total_size_in_byte(c_input_tensor, &input_length); + + size_t data_length_in_byte = 
lite_tensor->get_tensor_total_size_in_byte(); + { + LiteLayout input_layout; + LITE_get_tensor_layout(c_input_tensor, &input_layout); + ASSERT_TRUE(input_layout.data_type == LITE_INT8); + std::vector input_shape={1,224,224,3}; + for (size_t i = 0; i < input_layout.ndim; i++) { + ASSERT_TRUE(input_layout.shapes[i]=input_shape[i]); + } + } + + { + int size_attr = 0; + LITE_CAPI_CHECK(LITE_get_tensor_attribute(c_input_tensor, nullptr, nullptr, + &size_attr)); + ASSERT_TRUE(size_attr > 0); + const char* keys[size_attr]; + void* values[size_attr]; + LITE_CAPI_CHECK(LITE_get_tensor_attribute(c_input_tensor, keys, values, + &size_attr)); + ASSERT_TRUE(size_attr > 5); + std::unordered_map result_map = { + {"zp", 0}, + {"index", 0}, + {"size_with_stride", 150528}, + {"stride", 224}, + {"n_size", 150528}, + {"n_elems", 150528}, + {"qnt_type", 2}, + {"n_dims", 4}, + {"type", 2}, + {"fmt", 1}, + {"dims0", 1}, + {"dims1", 224}, + {"dims2", 224}, + {"dims3", 3}, + }; + for (int i = 0; i < size_attr; i++) { + std::string key(keys[i]); + if (key == "names") { + ASSERT_TRUE(std::string("input") == + std::string(static_cast(values[i]))); + } else if (key == "scale") { + float scale = *static_cast(values[i]); + ASSERT_TRUE(std::fabs(scale - 0.007812) < 0.00001); + } else if (key == "fl" || key == "pass_through") { + uint8_t val = *static_cast(values[i]); + if (key == "fl") { + ASSERT_TRUE(val == 0); + } else { + ASSERT_TRUE(val == 1); + } + } else { + uint32_t val = *static_cast(values[i]); + ASSERT_TRUE(result_map[std::string(keys[i])]==val); + } + } + } + const char* keys[] = {"type", "fmt"}; + int info_size = SET_INFO_SIZE; + int type = TENSOR_TYPE_UINT8; + int fmt = TENSOR_FORMAT_NHWC; + void* values[] = {static_cast(&type), static_cast(&fmt)}; + LITE_CAPI_CHECK(LITE_set_tensor_information(c_input_tensor, keys, values, + info_size)); + ASSERT_TRUE(std::string(output_names[0]) == + std::string("MobilenetV1/Predictions/Reshape_1")); + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_names[0], LITE_IO, + &c_output_tensor)); + + LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, + lite_tensor->get_memory_ptr(), + data_length_in_byte)); + + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_names[0], LITE_IO, + &c_output_tensor)); + //LiteLayout tmp_output_layout; + //LITE_get_tensor_layout(c_output_tensor, &tmp_output_layout); + //tmp_output_layout.data_type = LiteDataType::LITE_FLOAT; + + //LITE_set_tensor_layout(c_output_tensor, tmp_output_layout); + { + const char* keys[] = {"want_float"}; + uint8_t want_float = 1; + void* values[] = {static_cast(&want_float)}; + LITE_CAPI_CHECK( + LITE_set_tensor_information(c_output_tensor, keys, values, 1)); + } + + LITE_CAPI_CHECK(LITE_forward(c_network)); + LITE_CAPI_CHECK(LITE_wait(c_network)); + + ASSERT_TRUE(std::string(output_names[0]) == "MobilenetV1/Predictions/Reshape_1"); + ASSERT_EQ(output_names.size(), 1); + { + LiteLayout output_layout; + LITE_get_tensor_layout(c_output_tensor, &output_layout); + ASSERT_TRUE(output_layout.data_type == LITE_FLOAT); + int size_attr = 0; + + LITE_CAPI_CHECK(LITE_get_tensor_attribute(c_output_tensor, nullptr, nullptr, + &size_attr)); + ASSERT_TRUE(size_attr > 0); + const char* keys[size_attr]; + void* values[size_attr]; + LITE_CAPI_CHECK(LITE_get_tensor_attribute(c_output_tensor, keys, values, + &size_attr)); + ASSERT_TRUE(size_attr > 5); + std::unordered_map result_map = { + {"zp", 0}, + {"index", 0}, + {"size_with_stride", 2002}, + {"stride", 0}, + {"n_size", 2002}, + {"n_elems", 1001}, + {"qnt_type", 2}, + 
{"n_dims", 2}, + {"type", 0}, + {"fmt", 2}, + {"dims0", 1}, + {"dims1", 1001}, + }; + for (int i = 0; i < size_attr; i++) { + std::string key(keys[i]); + if (key == "names") { + ASSERT_TRUE("MobilenetV1/Predictions/Reshape_1" == + std::string(static_cast(values[i]))); + + } else if (key == "scale") { + float scale = *static_cast(values[i]); + ASSERT_TRUE(std::fabs(scale - 1.0) < 0.00001); + } else if (key == "fl" || key == "pass_through") { + uint8_t val = *static_cast(values[i]); + ASSERT_TRUE(val == 0); + } else { + uint32_t val = *static_cast(values[i]); + ASSERT_TRUE(result_map[std::string(keys[i])]==val); + } + } + } + { + uint32_t MaxClass[5]; + float fMaxProb[5]; + void* output_ptr; + LITE_get_tensor_memory(c_output_tensor, &output_ptr); + float* buffer = (float*)output_ptr; + uint32_t sz = true_tensor->get_tensor_total_size_in_byte() / sizeof(float); + + GetTop(buffer, fMaxProb, MaxClass, sz, 5); + + std::vector result_class = { + 286, 464, 282, 357, 285, + }; + std::vector result_prob = { + 0.407227, 0.365723, 0.090454, 0.018051, 0.013069, + }; + + for (int i = 0; i < 5; i++) { + ASSERT_TRUE(result_class[i] == MaxClass[i]); + ASSERT_TRUE(std::fabs(result_prob[i] - fMaxProb[i]) < 0.0001); + } + } + + { + float* true_data = static_cast(true_tensor->get_memory_ptr()); + void* output_ptr; + LITE_get_tensor_memory(c_output_tensor, &output_ptr); + float* data1 = static_cast(output_ptr); + size_t length = + true_tensor->get_tensor_total_size_in_byte() / sizeof(float); + for (size_t i = 0; i < length; i++) { + ASSERT_LT(std::abs(data1[i] - true_data[i]), 1e-3); + } + } + LITE_destroy_network(c_network); +#undef SET_INFO_SIZE +#undef TENSOR_FORMAT_NHWC +#undef TENSOR_TYPE_UINT8 +} + +TEST(TestCapiNetWork, rknntest_set_info_two_input) { +#define SET_INFO_SIZE 2 +#define TENSOR_TYPE_UINT8 3 +#define TENSOR_FORMAT_NHWC 1 + LiteConfig config; + config.backend = LiteBackend::LITE_RK_NPU; + config.device_type = LiteDeviceType::LITE_NPU; + config.bare_model_cryption_name = nullptr; + auto lite_tensor = lite::get_input_data("./model/cat_224x224.npy"); + auto lite_tensor_dog = lite::get_input_data("./model/dog_224x224.npy"); + auto true_tensor = lite::get_input_data("./output_data.npy"); + auto rknn_model = "./model/mobilenet_v1.rknn"; + + LiteNetwork c_network; + LITE_CAPI_CHECK(LITE_make_network_config(&c_network, config)); + LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, rknn_model)); + + size_t input_size, output_size; + LITE_get_all_input_name(c_network, &input_size, nullptr); + LITE_get_all_output_name(c_network, &output_size, nullptr); + + std::vector input_names(input_size); + std::vector output_names(output_size); + LiteTensor c_input_tensor, c_output_tensor; + + LITE_get_all_input_name(c_network, nullptr, input_names.data()); + LITE_get_all_output_name(c_network, nullptr, output_names.data()); + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, input_names[0], LITE_IO, + &c_input_tensor)); + + size_t input_length = 0; + LITE_get_tensor_total_size_in_byte(c_input_tensor, &input_length); + + size_t data_length_in_byte = lite_tensor->get_tensor_total_size_in_byte(); + { + LiteLayout input_layout; + LITE_get_tensor_layout(c_input_tensor, &input_layout); + ASSERT_TRUE(input_layout.data_type == LITE_INT8); + std::vector input_shape = {1, 224, 224, 3}; + for (size_t i = 0; i < input_layout.ndim; i++) { + ASSERT_TRUE(input_layout.shapes[i] = input_shape[i]); + } + } + + const char* keys[] = {"type", "fmt"}; + int info_size = SET_INFO_SIZE; + int type = TENSOR_TYPE_UINT8; + int fmt = 
TENSOR_FORMAT_NHWC; + void* values[] = {static_cast(&type), static_cast(&fmt)}; + LITE_CAPI_CHECK(LITE_set_tensor_information(c_input_tensor, keys, values, + info_size)); + ASSERT_TRUE(std::string(output_names[0]) == + std::string("MobilenetV1/Predictions/Reshape_1")); + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_names[0], LITE_IO, + &c_output_tensor)); + + LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, + lite_tensor->get_memory_ptr(), + data_length_in_byte)); + + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_names[0], LITE_IO, + &c_output_tensor)); + { + const char* keys[] = {"want_float"}; + uint8_t want_float = 1; + void* values[] = {static_cast(&want_float)}; + LITE_CAPI_CHECK( + LITE_set_tensor_information(c_output_tensor, keys, values, 1)); + } + + LITE_CAPI_CHECK(LITE_forward(c_network)); + LITE_CAPI_CHECK(LITE_wait(c_network)); + + ASSERT_TRUE(std::string(output_names[0]) == + "MobilenetV1/Predictions/Reshape_1"); + ASSERT_EQ(output_names.size(), 1); + { + uint32_t MaxClass[5]; + float fMaxProb[5]; + void* output_ptr; + LITE_get_tensor_memory(c_output_tensor, &output_ptr); + float* buffer = (float*)output_ptr; + uint32_t sz = + true_tensor->get_tensor_total_size_in_byte() / sizeof(float); + + GetTop(buffer, fMaxProb, MaxClass, sz, 5); + + std::vector result_class = { + 286, 464, 282, 357, 285, + }; + std::vector result_prob = { + 0.407227, 0.365723, 0.090454, 0.018051, 0.013069, + }; + + for (int i = 0; i < 5; i++) { + ASSERT_TRUE(result_class[i] == MaxClass[i]); + ASSERT_TRUE(std::fabs(result_prob[i] - fMaxProb[i]) < 0.0001); + } + } + + { + float* true_data = static_cast(true_tensor->get_memory_ptr()); + void* output_ptr; + LITE_get_tensor_memory(c_output_tensor, &output_ptr); + float* data1 = static_cast(output_ptr); + size_t length = + true_tensor->get_tensor_total_size_in_byte() / sizeof(float); + for (size_t i = 0; i < length; i++) { + ASSERT_LT(std::abs(data1[i] - true_data[i]), 1e-3); + } + } + + LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, + lite_tensor_dog->get_memory_ptr(), + data_length_in_byte)); + LITE_CAPI_CHECK(LITE_forward(c_network)); + LITE_CAPI_CHECK(LITE_wait(c_network)); + ASSERT_TRUE(std::string(output_names[0]) == + "MobilenetV1/Predictions/Reshape_1"); + ASSERT_EQ(output_names.size(), 1); + { + uint32_t MaxClass[5]; + float fMaxProb[5]; + void* output_ptr; + LITE_get_tensor_memory(c_output_tensor, &output_ptr); + float* buffer = (float*)output_ptr; + uint32_t sz = + true_tensor->get_tensor_total_size_in_byte() / sizeof(float); + + GetTop(buffer, fMaxProb, MaxClass, sz, 5); + + std::vector result_prob = { + 0.407227, 0.365723, 0.090454, 0.018051, 0.013069, + }; + + for (int i = 0; i < 5; i++) { + ASSERT_FALSE(std::fabs(result_prob[i] - fMaxProb[i]) < 0.0001); + } + } + + LITE_destroy_network(c_network); +#undef SET_INFO_SIZE +#undef TENSOR_FORMAT_NHWC +#undef TENSOR_TYPE_UINT8 +} +#endif + +TEST(TestCapiNetWork, BasicResetOutput) { + ForwardMgb; + LiteNetwork c_network; + LITE_CAPI_CHECK(LITE_make_default_network(&c_network)); + LoadNetwork; + SetInput; + LiteLayout output_layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}; + std::shared_ptr ptr(new float[1000], + [](float* ptr) { delete[] ptr; }); + const char* output_name; + LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name)); + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO, + &c_output_tensor)); + LITE_CAPI_CHECK( + LITE_reset_tensor(c_output_tensor, output_layout, ptr.get())); + + ForwardNetwork; + + EXPECT_TRUE(lite::compare_memory( + 
ptr.get(), result_mgb->get_memory_ptr(), + result_mgb->get_tensor_total_size_in_byte() / sizeof(float))); + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, BasicInplaceAndSingleThreadAffinity) { + ForwardMgb; + MakeNetwork; + //! config the network with cpu inplace mode + LITE_CAPI_CHECK(LITE_set_cpu_inplace_mode(c_network)); + LoadNetwork; + //! set single thread affinith callback + LITE_CAPI_CHECK(LITE_set_runtime_thread_affinity(c_network, + single_thread_affinity)); + SetInput; + ForwardNetwork; + ASSERT_EQ(affinity_set, true); + affinity_set = false; + GetOutput; + CompareResult; + LITE_destroy_network(c_network); +} + +TEST(TestCapiNetWork, UserAllocator) { + ForwardMgb; + MakeNetwork; + LITE_CAPI_CHECK(LITE_set_memory_allocator(c_network, allocate, free)); + LoadNetwork; + SetInput; + ForwardNetwork; + + ASSERT_GE(m_nr_allocated, 1); + GetOutput; + CompareResult; + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); + ASSERT_EQ(m_nr_left, 0); +} + +TEST(TestCapiNetWork, BasicMultiThread) { + ForwardMgb; + MakeNetwork; + LITE_CAPI_CHECK(LITE_set_cpu_threads_number(c_network, NUMBER_THREDS)); + LoadNetwork; + LITE_CAPI_CHECK( + LITE_set_runtime_thread_affinity(c_network, multi_thread_affinity)); + SetInput; + ForwardNetwork; + for (size_t i = 0; i < NUMBER_THREDS; i++) { + for (size_t j = i + 1; j < NUMBER_THREDS; j++) { + ASSERT_NE(thread_ids[i], thread_ids[j]); + } + } + for (size_t i = 0; i < NUMBER_THREDS; i++) { + thread_ids[i] = std::thread::id(); + } + GetOutput; + CompareResult; + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, DeviceIO) { + ForwardMgb; + LiteNetwork c_network; + LiteIO input_io = default_io; + input_io.is_host = true; + input_io.name = "data"; + LiteNetworkIO network_io = *default_network_io(); + network_io.inputs = &input_io; + network_io.input_size = 1; + LITE_CAPI_CHECK(LITE_make_network(&c_network, *default_config(), network_io)); + LoadNetwork; + SetInput; + ForwardNetwork; + GetOutput; + CompareResult; + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, StartCallBack) { + ForwardMgb; + MakeNetwork; + LoadNetwork; + LITE_CAPI_CHECK(LITE_set_start_callback(c_network, start_callback)); + SetInput; + ForwardNetwork; + GetOutput; + CompareResult; + ASSERT_TRUE(start_checked); + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, FinishCallBack) { + ForwardMgb; + MakeNetwork; + LoadNetwork; + LITE_CAPI_CHECK(LITE_set_finish_callback(c_network, finish_callback)); + SetInput; + ForwardNetwork; + GetOutput; + CompareResult; + ASSERT_TRUE(finish_checked); + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, BasicCryptAes) { + ForwardMgb; + + LiteConfig c_config = *default_config(); + c_config.bare_model_cryption_name = "AES_default"; + LiteNetwork c_network; + LITE_CAPI_CHECK( + LITE_make_network(&c_network, c_config, *default_network_io())); + std::string model_crypt_path = "./shufflenet_crypt_aes.mge"; + + LITE_CAPI_CHECK( + LITE_load_model_from_path(c_network, model_crypt_path.c_str())); + + SetInput; + ForwardNetwork; + GetOutput; + CompareResult; + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, PackedCryptRc4) { + ForwardMgb; + MakeNetwork; + + std::string model_crypt_path = "./test_packed_model_rc4.lite"; + LITE_CAPI_CHECK( + LITE_load_model_from_path(c_network, model_crypt_path.c_str())); + + SetInput; + ForwardNetwork; + GetOutput; + CompareResult; + 
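+    //! note: the packed .lite file records its own decryption method in the
+    //! pack header (see tools/pack_model/pack_model_and_info.py), so unlike
+    //! the AES case above no bare_model_cryption_name has to be set here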
LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, AsyncExec) { + finished = false; + ForwardMgb; + LiteNetwork c_network; + LiteConfig c_config = *default_config(); + c_config.options.var_sanity_check_first_run = false; + LITE_CAPI_CHECK( + LITE_make_network(&c_network, c_config, *default_network_io())); + LITE_CAPI_CHECK(LITE_set_async_callback(c_network, finish_callback)); + LoadNetwork; + SetInput; + + LITE_forward(c_network); + size_t count = 0; + while (finished == false) { + count++; + } + ASSERT_GT(count, 0); + finished = false; + + GetOutput; + CompareResult; + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, OutputShapeOnly) { + ForwardMgb; + LiteNetwork c_network; + LiteNetworkIO c_network_io = *default_network_io(); + LiteIO io_output = default_io; + io_output.io_type = LiteIOType::LITE_IO_SHAPE; + io_output.name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]"; + c_network_io.outputs = &io_output; + c_network_io.output_size = 1; + LITE_CAPI_CHECK( + LITE_make_network(&c_network, *default_config(), c_network_io)); + LoadNetwork; + SetInput; + ForwardNetwork; + GetOutput; + size_t length = 0; + LITE_CAPI_CHECK( + LITE_get_tensor_total_size_in_byte(c_output_tensor, &length)); + ASSERT_EQ(length / sizeof(float), 1000); + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, ProfileIOdump) { + ForwardMgb; + MakeNetwork; + LITE_CAPI_CHECK( + LITE_enable_profile_performance(c_network, "./profile.json")); + LoadNetwork; + SetInput; + ForwardNetwork; + ASSERT_TRUE(fopen("./profile.json", "r")); + + LITE_CAPI_CHECK(LITE_enable_io_txt_dump(c_network, "./io_txt_dump.txt")); + ForwardNetwork; + ASSERT_TRUE(fopen("./io_txt_dump.txt", "r")); + + GetOutput; + CompareResult; + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, GetDeviceType) { + lite::Config config; + auto lite_tensor = lite::get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + MakeNetwork; + LoadNetwork; + LiteDeviceType devicetype; + LITE_CAPI_CHECK(LITE_get_device_type(c_network, &devicetype)); + ASSERT_TRUE(devicetype == LiteDeviceType::LITE_CPU); + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, GetModelExtraInfo) { + lite::Config config; + std::string model_path = "./track_640_320_pack_model_rc4_with_info.lite"; + MakeNetwork; + LITE_load_model_from_path(c_network, model_path.c_str()); + const char* info = nullptr; + int info_size = 0; + LITE_CAPI_CHECK(LITE_get_model_extra_info(c_network, &info, &info_size)); + ASSERT_TRUE(info_size > 0); + printf("info %s \n", info); + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, TestWorkSpaceLimit) { + lite::Config config; + auto lite_tensor = lite::get_input_data("./input_data.npy"); + size_t data_length_in_byte = lite_tensor->get_tensor_total_size_in_byte(); + std::string model_path = "./shufflenet.mge"; + MakeNetwork; + LoadNetwork; + printf("go to config workspace limit\n"); + LITE_CAPI_CHECK(LITE_set_network_algo_workspace_limit(c_network, 1000)); + SetInput; + ForwardNetwork; + + GetOutput; + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + +TEST(TestCapiNetWork, TestShareWeights) { + ForwardMgb; + MakeNetwork; + LoadNetwork; + SetInput; + ForwardNetwork; + + GetOutput; + CompareResult; + + LiteNetwork c_network2; + LITE_CAPI_CHECK( + LITE_make_network(&c_network2, *default_config(), *default_network_io())); + LITE_CAPI_CHECK(LITE_set_cpu_inplace_mode(c_network2)); + 
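+    //! let the second network reuse the weights of the already loaded
+    //! network instead of reading the model file again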
LITE_CAPI_CHECK(LITE_shared_weight_with_network(c_network2, c_network)); + int is_cpu_inplace_mode = false; + LITE_CAPI_CHECK(LITE_is_cpu_inplace_mode(c_network2, &is_cpu_inplace_mode)); + ASSERT_EQ(is_cpu_inplace_mode, true); + + LiteTensor c_input_tensor2, c_output_tensor2; + LITE_CAPI_CHECK( + LITE_get_io_tensor(c_network2, "data", LITE_IO, &c_input_tensor2)); + LITE_CAPI_CHECK(LITE_reset_tensor_memory( + c_input_tensor2, lite_tensor->get_memory_ptr(), + lite_tensor->get_tensor_total_size_in_byte())); + LITE_CAPI_CHECK(LITE_forward(c_network2)); + LITE_CAPI_CHECK(LITE_wait(c_network2)); + LITE_CAPI_CHECK(LITE_get_io_tensor(c_network2, output_name, LITE_IO, + &c_output_tensor2)); + void* output_ptr2; + LITE_CAPI_CHECK(LITE_get_tensor_memory(c_output_tensor2, &output_ptr2)); + + EXPECT_TRUE(lite::compare_memory( + output_ptr2, result_mgb->get_memory_ptr(), + result_mgb->get_tensor_total_size_in_byte() / sizeof(float))); + + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); + LITE_CAPI_CHECK(LITE_destroy_network(c_network2)); +} + +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/test/test_network_options.cpp b/lite/test/test_network_options.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0ff344ff2254f587ec50cecf3e130bc84ce8bfba --- /dev/null +++ b/lite/test/test_network_options.cpp @@ -0,0 +1,351 @@ +/** + * \file test/test_network_options.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "../src/common.h" +#include "../src/misc.h" +#include "../src/mge/network_impl.h" +#include "lite/global.h" + +#include "megbrain/tensor.h" +#include "test_common.h" + +#include +#include +#include +#include + +using namespace lite; + +TEST(TestNetWorkOptions, no_var_sanity_check_and_record) { + Config config; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + config.options.var_sanity_check_first_run = false; + config.options.comp_node_seq_record_level = 1; + + std::shared_ptr network = std::make_shared(config); + network->load_model(model_path); + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + network->forward(); + network->wait(); + + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWorkOptions, const_shape) { + Config config; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + config.options.var_sanity_check_first_run = false; + config.options.const_shape = true; + std::shared_ptr network = std::make_shared(config); + + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + 
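+    //! reset() points the input tensor at the host buffer directly, so the
+    //! input data is consumed without an extra copy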
auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + network->forward(); + network->wait(); + + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWorkOptions, NCHW44) { + Config config; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + config.options.var_sanity_check_first_run = false; + config.options.enable_nchw44 = true; + std::shared_ptr network = std::make_shared(config); + + Runtime::set_network_algo_policy( + network, LiteAlgoSelectStrategy::LITE_ALGO_PROFILE | + LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE); + + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + network->forward(); + network->wait(); + + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWorkOptions, test_cache) { + Config config; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + std::shared_ptr network = std::make_shared(config); + + set_persistent_cache("./algo_cache.txt", true); + network->load_model(model_path); + Runtime::set_network_algo_policy( + network, LiteAlgoSelectStrategy::LITE_ALGO_PROFILE | + LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + network->forward(); + network->wait(); + + compare_lite_tensor(output_tensor, result_mgb); + + dump_persistent_cache("./algo_cache.txt"); + ASSERT_TRUE(fopen("./algo_cache.txt", "r")); + + set_persistent_cache("./algo_cache.txt"); + network->forward(); + network->wait(); + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWorkOptions, FastRunIgnorBatch) { + Config config; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + std::shared_ptr network = std::make_shared(config); + + set_persistent_cache("./algo_cache.txt"); + network->load_model(model_path); + Runtime::set_network_algo_policy( + network, + 
LiteAlgoSelectStrategy::LITE_ALGO_PROFILE | + LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE, + 1, true); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + network->forward(); + network->wait(); + + compare_lite_tensor(output_tensor, result_mgb); + + dump_persistent_cache("./algo_cache.txt"); + ASSERT_TRUE(fopen("./algo_cache.txt", "r")); +} + +#if LITE_WITH_CUDA +TEST(TestNetWorkOptions, NCHW4) { + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + config.options.enable_nchw4 = 1; + std::shared_ptr network = std::make_shared(config); + + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + network->forward(); + network->wait(); + + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWorkOptions, NCHW32) { + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + config.options.enable_nchw32 = 1; + std::shared_ptr network = std::make_shared(config); + Runtime::set_network_algo_policy( + network, LiteAlgoSelectStrategy::LITE_ALGO_PROFILE | + LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE); + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + network->forward(); + network->wait(); + compare_lite_tensor(output_tensor, result_mgb); +} + +TEST(TestNetWorkOptions, jit_level) { + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + config.options.jit_level = 1; + std::shared_ptr network = std::make_shared(config); + + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + 
auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + network->forward(); + network->wait(); + + compare_lite_tensor(output_tensor, result_mgb); +} +#endif + +#if MGB_ENABLE_TENSOR_RT && LITE_WITH_CUDA +TEST(TestNetWorkOptions, TensorRT) { + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + auto tensor = get_input_data("./input_data.npy"); + std::string model_path = "./shufflenet.mge"; + std::string input_name = "data"; + auto result_mgb = mgb_lar(model_path, config, input_name, tensor); + + std::shared_ptr network = std::make_shared(config); + Runtime::use_tensorrt(network); + + set_tensor_rt_cache("./tensorrt_cache.txt"); + network->load_model(model_path); + + std::shared_ptr input_tensor = network->get_io_tensor(input_name); + + auto src_ptr = tensor->get_memory_ptr(); + auto src_layout = tensor->get_layout(); + input_tensor->reset(src_ptr, src_layout); + + std::shared_ptr output_tensor = network->get_output_tensor(0); + auto result_tensor = std::make_shared( + LiteDeviceType::LITE_CPU, + Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); + + void* out_data = result_tensor->get_memory_ptr(); + output_tensor->reset(out_data, result_tensor->get_layout()); + + network->forward(); + network->wait(); + dump_tensor_rt_cache(); + ASSERT_TRUE(fopen("./tensorrt_cache.txt", "r")); + compare_lite_tensor(output_tensor, result_mgb); +} +#endif +#endif +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/test/test_tensor.cpp b/lite/test/test_tensor.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b5ce61b0e986be3deb1cc3ed12d18ec2c95a46ce --- /dev/null +++ b/lite/test/test_tensor.cpp @@ -0,0 +1,589 @@ +/** + * \file test/test_tensor.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + */ + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "../src/misc.h" +#include "../src/mge/common.h" +#include "../src/mge/network_impl.h" +#include "lite/tensor.h" + +#include + +#include +#include + +using namespace lite; + +TEST(TestTensor, Basic) { + Layout layout{{1, 3, 224, 224}, 4}; + Tensor tensor1(LiteDeviceType::LITE_CPU); + Tensor tensor2(LiteDeviceType::LITE_CPU, layout); + Tensor tensor3(LiteDeviceType::LITE_CPU, layout); + //! mge tensor has created + ASSERT_TRUE(TensorHelper::implement(&tensor1)); + ASSERT_TRUE(TensorHelper::implement(&tensor2)); + ASSERT_TRUE(TensorHelper::implement(&tensor3)); + //! check member + ASSERT_EQ(tensor2.get_device_type(), LiteDeviceType::LITE_CPU); + ASSERT_EQ(tensor2.get_layout(), layout); + ASSERT_EQ(tensor3.get_layout(), layout); + //! 
check the real tensor + ASSERT_EQ(tensor2.get_tensor_total_size_in_byte(), 1 * 3 * 224 * 224 * 4); + ASSERT_EQ(tensor3.get_tensor_total_size_in_byte(), 1 * 3 * 224 * 224 * 4); + + ASSERT_TRUE(TensorHelper::implement(&tensor1) + ->cast_final_safe() + .host_tensor()); + + ASSERT_FALSE(TensorHelper::implement(&tensor1) + ->cast_final_safe() + .dev_tensor()); + ASSERT_FALSE(TensorHelper::implement(&tensor1) + ->cast_final_safe() + .dev_tensor()); + ASSERT_TRUE(TensorHelper::implement(&tensor1) + ->cast_final_safe() + .host_tensor()); +} + +TEST(TestTensor, SetLayoutReAlloc) { + Layout layout{{1, 3, 224, 224}, 4}; + Tensor tensor1; + Tensor tensor2(LiteDeviceType::LITE_CPU, layout); + Tensor tensor3(LiteDeviceType::LITE_CPU, layout); + auto old_ptr2 = tensor2.get_memory_ptr(); + auto old_ptr3 = tensor3.get_memory_ptr(); + + //! layout set through + Layout layout1{{1, 3, 100, 100}, 4, LiteDataType::LITE_INT8}; + tensor1.set_layout(layout1); + tensor2.set_layout(layout1); + tensor3.set_layout(layout1); + ASSERT_EQ(tensor2.get_tensor_total_size_in_byte(), 1 * 3 * 100 * 100); + ASSERT_EQ(tensor3.get_tensor_total_size_in_byte(), 1 * 3 * 100 * 100); + auto layout2 = TensorHelper::implement(&tensor2) + ->cast_final_safe() + .host_tensor() + ->layout(); + auto layout3 = TensorHelper::implement(&tensor3) + ->cast_final_safe() + .host_tensor() + ->layout(); + ASSERT_EQ(to_lite_layout(layout2), layout1); + ASSERT_EQ(to_lite_layout(layout3), layout1); + + auto new_ptr2 = tensor2.get_memory_ptr(); + auto new_ptr3 = tensor3.get_memory_ptr(); + + ASSERT_EQ(old_ptr2, new_ptr2); + ASSERT_EQ(old_ptr3, new_ptr3); +} + +TEST(TestTensor, Reset) { + Layout layout{{3, 20}, 2, LiteDataType::LITE_FLOAT}; + Tensor tensor1; + Tensor tensor2(LiteDeviceType::LITE_CPU, layout); + Tensor tensor3(LiteDeviceType::LITE_CPU, layout); + + auto old_ptr2 = tensor2.get_memory_ptr(); + auto old_ptr3 = tensor3.get_memory_ptr(); + //! make sure memory is allocted + ASSERT_NO_THROW(memcpy(old_ptr2, old_ptr3, 3 * 20 * 2)); + + std::shared_ptr new_ptr2(new float[3 * 20], + [](float* ptr) { delete[] ptr; }); + std::shared_ptr new_ptr3(new float[3 * 20], + [](float* ptr) { delete[] ptr; }); + tensor1.reset(new_ptr2.get(), layout); + tensor2.reset(new_ptr2.get(), 3 * 20 * 4); + tensor3.reset(new_ptr3.get(), 3 * 20 * 4); + //! After reset the original mem is freed + /*ASSERT_EXIT((memcpy(old_ptr2, old_ptr3, 3 * 20 * 2), exit(0)), + ::testing::KilledBySignal(SIGSEGV), ".*");*/ + + ASSERT_EQ(tensor2.get_memory_ptr(), new_ptr2.get()); + ASSERT_EQ(tensor3.get_memory_ptr(), new_ptr3.get()); + + ASSERT_NO_THROW(memcpy(new_ptr2.get(), new_ptr3.get(), 3 * 20 * 2)); + + Layout layout1{{6, 20}, 2, LiteDataType::LITE_FLOAT}; + std::shared_ptr ptr2(new float[6 * 20], + [](float* ptr) { delete[] ptr; }); + std::shared_ptr ptr3(new float[6 * 20], + [](float* ptr) { delete[] ptr; }); + tensor2.reset(ptr2.get(), layout1); + tensor3.reset(ptr3.get(), layout1); + + //! 
memory is not freed by Tensor reset + ASSERT_NO_THROW(memcpy(new_ptr2.get(), new_ptr3.get(), 3 * 20 * 2)); + auto host_layout2 = TensorHelper::implement(&tensor2) + ->cast_final_safe() + .host_tensor() + ->layout(); + auto host_layout3 = TensorHelper::implement(&tensor3) + ->cast_final_safe() + .host_tensor() + ->layout(); + + ASSERT_EQ(to_lite_layout(host_layout2), layout1); + ASSERT_EQ(to_lite_layout(host_layout3), layout1); +} + +TEST(TestTensor, CrossCNCopy) { + Layout layout{{1, 3, 224, 224}, 4}; + Tensor tensor1(LiteDeviceType::LITE_CPU); + Tensor tensor2(LiteDeviceType::LITE_CPU, layout); + Tensor tensor3(LiteDeviceType::LITE_CPU, layout); + tensor2.copy_from(tensor3); + tensor3.copy_from(tensor2); + auto old_ptr2 = tensor2.get_memory_ptr(); + auto old_ptr3 = tensor3.get_memory_ptr(); + + //! test source tenor is empty + ASSERT_THROW(tensor2.copy_from(tensor1), std::exception); + tensor1.copy_from(tensor2); + tensor2.copy_from(tensor3); + tensor3.copy_from(tensor2); + + ASSERT_EQ(tensor2.get_memory_ptr(), old_ptr2); + ASSERT_EQ(tensor3.get_memory_ptr(), old_ptr3); +} + +TEST(TestTensor, SharedTensorMemory) { + Layout layout{{1, 3, 224, 224}, 4}; + Tensor tensor1(LiteDeviceType::LITE_CPU); + { + Tensor tensor2(LiteDeviceType::LITE_CPU, layout); + tensor1.share_memory_with(tensor2); + auto ptr1 = tensor1.get_memory_ptr(); + auto ptr2 = tensor2.get_memory_ptr(); + ASSERT_EQ(ptr1, ptr2); + } + // check after tensor2 destroy, tensor1 can also visit + auto ptr1 = static_cast(tensor1.get_memory_ptr()); + size_t length = tensor1.get_tensor_total_size_in_byte() / + tensor1.get_layout().get_elem_size(); + for (size_t i = 0; i < length; i++) { + ptr1[i] = i; + } +} + +TEST(TestTensor, Reshape) { + Layout layout{{1, 3, 224, 224}, 4}; + Tensor tensor2(LiteDeviceType::LITE_CPU, layout); + auto ptr = tensor2.get_memory_ptr(); + + //! test wrong case + ASSERT_THROW(tensor2.reshape({-1, -1, 3 * 224 * 224}), std::exception); + ASSERT_THROW(tensor2.reshape({-1, 3, 3 * 224 * 224}), std::exception); + ASSERT_THROW(tensor2.reshape({1, 3, 3 * 224 * 224}), std::exception); + ASSERT_THROW(tensor2.reshape({3, 3, 3 * 224 * 224}), std::exception); + + tensor2.reshape({3 * 224 * 224}); + ASSERT_EQ(tensor2.get_layout().ndim, 1); + ASSERT_EQ(tensor2.get_layout().data_type, LiteDataType::LITE_FLOAT); + ASSERT_EQ(tensor2.get_layout().shapes[0], 3 * 224 * 224); + tensor2.reshape({-1, 224, 224}); + ASSERT_EQ(tensor2.get_layout().ndim, 3); + ASSERT_EQ(tensor2.get_layout().shapes[0], 3); + ASSERT_EQ(tensor2.get_layout().shapes[1], 224); + + ASSERT_EQ(tensor2.get_memory_ptr(), ptr); +} + +TEST(TestTensor, Slice) { + Layout layout{{20, 20}, 2}; + Tensor tensor2(LiteDeviceType::LITE_CPU, layout); + auto ptr = tensor2.get_memory_ptr(); + + //! 
test source tenor is empty + ASSERT_THROW(tensor2.slice({5, 10, 10}, {10, 15}), std::exception); + ASSERT_THROW(tensor2.slice({5, 10}, {10, 15}, {5}), std::exception); + ASSERT_THROW(tensor2.slice({5, 10}, {10, 15, 10}), std::exception); + for (int i = 0; i < 20 * 20; i++) { + *(static_cast(ptr) + i) = i; + } + auto check = [&](size_t start, size_t end, size_t step) { + Tensor tensor3; + tensor3.copy_from( + *tensor2.slice({start, start}, {end, end}, {step, step})); + float* new_ptr = static_cast(tensor3.get_memory_ptr()); + for (size_t i = start; i < end; i += step) { + for (size_t j = start; j < end; j += step) { + ASSERT_EQ(float(i * 20 + j), *new_ptr); + ++new_ptr; + } + } + }; + check(5, 10, 1); + check(5, 11, 2); + check(2, 18, 4); + + Tensor tensor3; + tensor3.copy_from(*tensor2.slice({3}, {9}, {2})); + float* new_ptr = static_cast(tensor3.get_memory_ptr()); + for (size_t i = 3; i < 9; i += 2) { + for (size_t j = 0; j < 20; j++) { + ASSERT_EQ(float(i * 20 + j), *new_ptr); + ++new_ptr; + } + } +} + +TEST(TestTensor, SliceCopy) { + Layout layout{{20, 20}, 2}; + Tensor tensor(LiteDeviceType::LITE_CPU, layout); + //! alloc memory + auto ptr = static_cast(tensor.get_memory_ptr()); + + Layout layout_slice{{20, 10}, 2}; + Tensor tensor0(LiteDeviceType::LITE_CPU, layout_slice); + auto ptr0 = tensor0.get_memory_ptr(); + for (int i = 0; i < 10 * 20; i++) { + *(static_cast(ptr0) + i) = i; + } + Tensor tensor1(LiteDeviceType::LITE_CPU, layout_slice); + auto ptr1 = tensor1.get_memory_ptr(); + for (int i = 0; i < 10 * 20; i++) { + *(static_cast(ptr1) + i) = i + 200; + } + + auto slice0 = tensor.slice({0, 0}, {20, 10}); + auto slice1 = tensor.slice({0, 10}, {20, 20}); + + slice0->copy_from(tensor0); + slice1->copy_from(tensor1); + + ASSERT_FALSE(slice0->is_continue_memory()); + ASSERT_FALSE(slice1->is_continue_memory()); + + for (size_t i = 0; i < 20; i++) { + for (size_t j = 0; j < 10; j++) { + ASSERT_EQ(float(i * 10 + j), *ptr); + ++ptr; + } + for (size_t j = 0; j < 10; j++) { + ASSERT_EQ(float(i * 10 + j + 200), *ptr); + ++ptr; + } + } + slice0->fill_zero(); + Tensor tmp; + tmp.copy_from(*slice0); + float* tmp_ptr = static_cast(tmp.get_memory_ptr()); + for (size_t i = 0; i < 20; i++) { + for (size_t j = 0; j < 10; j++) { + ASSERT_EQ(float(0), *tmp_ptr); + ++tmp_ptr; + } + } +} + +TEST(TestTensor, GetPtrOffset) { + Layout layout{{20, 20}, 2}; + Tensor tensor(LiteDeviceType::LITE_CPU, layout); + //! 
alloc memory + auto ptr = static_cast(tensor.get_memory_ptr()); + + auto ptr_offset = tensor.get_memory_ptr({10, 10}); + ASSERT_EQ(ptr_offset, ptr + 10 * 20 + 10); + + auto slice0 = tensor.slice({0, 0}, {20, 10}); + auto slice1 = tensor.slice({0, 10}, {20, 20}); + + ASSERT_FALSE(slice0->is_continue_memory()); + ASSERT_FALSE(slice1->is_continue_memory()); + + auto ptr_offset_slice0 = slice0->get_memory_ptr({6, 5}); + auto ptr_offset_slice1 = slice1->get_memory_ptr({2, 5}); + + ASSERT_EQ(ptr_offset_slice0, ptr + 6 * 20 + 5); + ASSERT_EQ(ptr_offset_slice1, ptr + 2 * 20 + 10 + 5); +} + +TEST(TestTensor, Concat) { + Layout layout{{5, 5, 5}, 3}; + std::vector tensors; + for (int i = 0; i < 4; i++) { + Tensor tensor(LiteDeviceType::LITE_CPU, layout); + auto ptr = static_cast(tensor.get_memory_ptr()); + for (int n = 0; n < 5 * 5 * 5; n++) { + ptr[n] = i; + } + tensors.push_back(tensor); + } + auto check = [&](int dim) { + auto new_tensor = TensorUtils::concat(tensors, dim); + auto ptr = static_cast(new_tensor->get_memory_ptr()); + size_t stride = std::pow(5, (3 - dim)); + for (int i = 0; i < 4; i++) { + for (size_t j = 0; j < stride; j++) { + ASSERT_EQ(ptr[i * stride + j], i); + } + } + }; + check(0); + check(1); + check(2); +} + +#if LITE_WITH_CUDA +TEST(TestTensor, BasicDevice) { + Layout layout{{1, 3, 224, 224}, 4}; + Tensor tensor1(LiteDeviceType::LITE_CUDA, layout); + Tensor tensor2(LiteDeviceType::LITE_CPU, layout); + //! mge tensor has created + ASSERT_TRUE(TensorHelper::implement(&tensor1)); + ASSERT_TRUE(TensorHelper::implement(&tensor2)); + + //! check member + ASSERT_EQ(tensor1.get_device_type(), LiteDeviceType::LITE_CUDA); + ASSERT_EQ(tensor2.get_device_type(), LiteDeviceType::LITE_CPU); + ASSERT_EQ(tensor2.get_layout(), layout); + //! check the real tensor + ASSERT_EQ(tensor1.get_tensor_total_size_in_byte(), 1 * 3 * 224 * 224 * 4); + ASSERT_EQ(tensor2.get_tensor_total_size_in_byte(), 1 * 3 * 224 * 224 * 4); + + ASSERT_TRUE(TensorHelper::implement(&tensor2) + ->cast_final_safe() + .host_tensor()); + + ASSERT_FALSE(TensorHelper::implement(&tensor2) + ->cast_final_safe() + .dev_tensor()); + ASSERT_TRUE(TensorHelper::implement(&tensor1) + ->cast_final_safe() + .dev_tensor()); + ASSERT_FALSE(TensorHelper::implement(&tensor1) + ->cast_final_safe() + .host_tensor()); +} + +TEST(TestTensor, SetLayoutReAllocDevice) { + Layout layout{{1, 3, 224, 224}, 4}; + Tensor tensor2(LiteDeviceType::LITE_CUDA, layout); + auto old_ptr2 = tensor2.get_memory_ptr(); + + //! 
layout set through + Layout layout1{{1, 3, 100, 100}, 4, LiteDataType::LITE_INT8}; + tensor2.set_layout(layout1); + ASSERT_EQ(tensor2.get_tensor_total_size_in_byte(), 1 * 3 * 100 * 100); + auto layout2 = TensorHelper::implement(&tensor2) + ->cast_final_safe() + .dev_tensor() + ->layout(); + ASSERT_EQ(to_lite_layout(layout2), layout1); + + auto new_ptr2 = tensor2.get_memory_ptr(); + + ASSERT_EQ(old_ptr2, new_ptr2); +} + +TEST(TestTensor, CrossCNCopyDevice) { + Layout layout{{1, 3, 224, 224}, 4}; + Tensor tensor0; + Tensor tensor1(LiteDeviceType::LITE_CPU); + Tensor tensor2(LiteDeviceType::LITE_CPU, layout); + Tensor tensor3(LiteDeviceType::LITE_CUDA, layout); + + tensor2.copy_from(tensor3); + tensor3.copy_from(tensor2); + + auto old_ptr2 = tensor2.get_memory_ptr(); + auto old_ptr3 = tensor3.get_memory_ptr(); + ASSERT_THROW(tensor3.copy_from(tensor1), std::exception); + + tensor1.copy_from(tensor3); + tensor0.copy_from(tensor3); + + tensor2.copy_from(tensor3); + tensor3.copy_from(tensor2); + + ASSERT_EQ(tensor2.get_memory_ptr(), old_ptr2); + ASSERT_EQ(tensor3.get_memory_ptr(), old_ptr3); +} + +TEST(TestTensor, PinnedHostMem) { + Layout layout{{1, 3, 224, 224}, 4}; + Tensor tensor1(LiteDeviceType::LITE_CPU); + bool is_pinned_host = true; + Tensor tensor2(LiteDeviceType::LITE_CUDA, layout, is_pinned_host); + Tensor tensor3(LiteDeviceType::LITE_CUDA, layout); + tensor2.copy_from(tensor3); + tensor3.copy_from(tensor2); + + ASSERT_EQ(tensor2.is_pinned_host(), true); + ASSERT_EQ(tensor3.is_pinned_host(), false); + + auto old_ptr2 = tensor2.get_memory_ptr(); + auto old_ptr3 = tensor3.get_memory_ptr(); + + //! test source tenor is empty + ASSERT_THROW(tensor2.copy_from(tensor1), std::exception); + tensor1.copy_from(tensor2); + tensor2.copy_from(tensor3); + tensor3.copy_from(tensor2); + + ASSERT_EQ(tensor2.get_memory_ptr(), old_ptr2); + ASSERT_EQ(tensor3.get_memory_ptr(), old_ptr3); +} + +TEST(TestTensor, DeviceId) { + if(get_device_count(LITE_CUDA) <= 1) + return; + Layout layout{{1, 3, 224, 224}, 4}; + Tensor tensor2(0, LiteDeviceType::LITE_CUDA, layout); + Tensor tensor3(1, LiteDeviceType::LITE_CUDA, layout); + + tensor2.copy_from(tensor3); + tensor3.copy_from(tensor2); + + Tensor tensor1; + tensor1.copy_from(tensor2); + tensor1.copy_from(tensor3); +} + +TEST(TestTensor, SliceDevice) { + Layout layout{{20, 20}, 2}; + Tensor host_tensor0; + Tensor dev_tensor0(LiteDeviceType::LITE_CUDA, layout); + host_tensor0.copy_from(dev_tensor0); + auto ptr = host_tensor0.get_memory_ptr(); + + for (int i = 0; i < 20 * 20; i++) { + *(static_cast(ptr) + i) = i; + } + dev_tensor0.copy_from(host_tensor0); + + auto check = [&](size_t start, size_t end, size_t step) { + Tensor host_tensor; + host_tensor.copy_from( + *dev_tensor0.slice({start, start}, {end, end}, {step, step})); + float* new_ptr = static_cast(host_tensor.get_memory_ptr()); + for (size_t i = start; i < end; i += step) { + for (size_t j = start; j < end; j += step) { + ASSERT_EQ(float(i * 20 + j), *new_ptr); + ++new_ptr; + } + } + }; + check(5, 10, 1); + check(5, 11, 2); + check(2, 18, 4); +} + +TEST(TestTensor, MemSetDevice) { + Layout layout{{20, 20}, 2, LiteDataType::LITE_INT8}; + Tensor host_tensor0(LiteDeviceType::LITE_CPU, layout); + Tensor dev_tensor0(LiteDeviceType::LITE_CUDA, layout); + auto check = [&](uint8_t val, const Tensor& tensor) { + auto ptr = static_cast(tensor.get_memory_ptr()); + for (int i = 0; i < 20 * 20; i++) { + ASSERT_EQ(val, *(ptr + i)); + } + }; + host_tensor0.fill_zero(); + check(0, host_tensor0); + + Tensor host_tensor1; + 
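+    //! zero the CUDA tensor on the device, then copy it back into a host
+    //! tensor so the values can be verified on the CPU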
dev_tensor0.fill_zero(); + host_tensor1.copy_from(dev_tensor0); + check(0, host_tensor1); +} + +TEST(TestTensor, DeviceSliceCopy) { + Layout layout{{20, 20}, 2}; + Tensor tensor(LiteDeviceType::LITE_CUDA, layout); + //! alloc memory + tensor.get_memory_ptr(); + + Layout layout_slice{{20, 10}, 2}; + Tensor tensor0(LiteDeviceType::LITE_CPU, layout_slice); + auto ptr0 = tensor0.get_memory_ptr(); + for (int i = 0; i < 10 * 20; i++) { + *(static_cast(ptr0) + i) = i; + } + Tensor tensor1(LiteDeviceType::LITE_CPU, layout_slice); + auto ptr1 = tensor1.get_memory_ptr(); + for (int i = 0; i < 10 * 20; i++) { + *(static_cast(ptr1) + i) = i + 200; + } + + auto slice0 = tensor.slice({0, 0}, {20, 10}); + auto slice1 = tensor.slice({0, 10}, {20, 20}); + + slice0->copy_from(tensor0); + slice1->copy_from(tensor1); + + ASSERT_FALSE(slice0->is_continue_memory()); + ASSERT_FALSE(slice1->is_continue_memory()); + + Tensor host_tensor; + host_tensor.copy_from(tensor); + auto ptr = static_cast(host_tensor.get_memory_ptr()); + + for (size_t i = 0; i < 20; i++) { + for (size_t j = 0; j < 10; j++) { + ASSERT_EQ(float(i * 10 + j), *ptr); + ++ptr; + } + for (size_t j = 0; j < 10; j++) { + ASSERT_EQ(float(i * 10 + j + 200), *ptr); + ++ptr; + } + } + slice0->fill_zero(); + Tensor tmp; + tmp.copy_from(*slice0); + float* tmp_ptr = static_cast(tmp.get_memory_ptr()); + for (size_t i = 0; i < 20; i++) { + for (size_t j = 0; j < 10; j++) { + ASSERT_EQ(float(0), *tmp_ptr); + ++tmp_ptr; + } + } +} + +TEST(TestTensor, ConcatDevice) { + Layout layout{{5, 5, 5}, 3}; + std::vector tensors; + for (int i = 0; i < 4; i++) { + Tensor tensor(LiteDeviceType::LITE_CPU, layout); + auto ptr = static_cast(tensor.get_memory_ptr()); + for (int n = 0; n < 5 * 5 * 5; n++) { + ptr[n] = i; + } + tensors.push_back(tensor); + } + auto check = [&](int dim) { + auto new_tensor = + TensorUtils::concat(tensors, dim, LiteDeviceType::LITE_CUDA, 0); + + Tensor tensor(LiteDeviceType::LITE_CPU); + tensor.copy_from(*new_tensor); + auto ptr = static_cast(tensor.get_memory_ptr()); + size_t stride = std::pow(5, (3 - dim)); + for (int i = 0; i < 4; i++) { + for (size_t j = 0; j < stride; j++) { + ASSERT_EQ(ptr[i * stride + j], i); + } + } + ASSERT_EQ(new_tensor->get_device_type(), LiteDeviceType::LITE_CUDA); + ASSERT_EQ(new_tensor->get_device_id(), 0); + }; + check(0); + check(1); + check(2); +} +#endif +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/test/test_tensor_c.cpp b/lite/test/test_tensor_c.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2c206043516ff36d38730451e2b5a989d0989d66 --- /dev/null +++ b/lite/test/test_tensor_c.cpp @@ -0,0 +1,316 @@ +/** + * \file test/test_tensor_c.cpp + * + * This file is part of MegEngine, a deep learning framework developed by + * Megvii. + * + * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. 
+ */ + +#include "lite_build_config.h" + +#if LITE_BUILD_WITH_MGE +#include "../src/misc.h" +#include "lite-c/global_c.h" +#include "lite-c/tensor_c.h" + +#include +#include + +TEST(TestCapiTensor, Basic) { + LiteTensor c_tensor0, c_tensor1; + LiteTensorDesc description = default_desc; + LITE_make_tensor(description, &c_tensor0); + int is_pinned_host = false; + LITE_is_pinned_host(c_tensor0, &is_pinned_host); + ASSERT_FALSE(is_pinned_host); + LiteDeviceType device_type; + LITE_get_tensor_device_type(c_tensor0, &device_type); + ASSERT_EQ(device_type, LiteDeviceType::LITE_CPU); + size_t length = 0; + LITE_get_tensor_total_size_in_byte(c_tensor0, &length); + ASSERT_EQ(length, 0); + + LiteLayout layout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT}; + description.device_type = LiteDeviceType::LITE_CPU; + description.layout = layout; + description.is_pinned_host = true; + LITE_make_tensor(description, &c_tensor1); + LITE_is_pinned_host(c_tensor1, &is_pinned_host); + ASSERT_TRUE(is_pinned_host); + LITE_get_tensor_total_size_in_byte(c_tensor1, &length); + ASSERT_EQ(length, 1 * 3 * 224 * 224 * 4); + + LiteLayout get_layout; + LITE_get_tensor_layout(c_tensor1, &get_layout); + ASSERT_EQ(get_layout.ndim, layout.ndim); + ASSERT_EQ(get_layout.data_type, layout.data_type); + ASSERT_EQ(get_layout.shapes[0], layout.shapes[0]); + ASSERT_EQ(get_layout.shapes[1], layout.shapes[1]); + ASSERT_EQ(get_layout.shapes[2], layout.shapes[2]); + ASSERT_EQ(get_layout.shapes[3], layout.shapes[3]); + + //! test error + ASSERT_EQ(LITE_is_pinned_host(c_tensor0, nullptr), -1); + ASSERT_NE(strlen(LITE_get_last_error()), 0); + printf("The last error is: %s\n", LITE_get_last_error()); + + LITE_destroy_tensor(c_tensor0); + LITE_destroy_tensor(c_tensor1); +} + +TEST(TestCapiTensor, SetLayoutReAlloc) { + LiteTensor c_tensor0; + LiteTensorDesc description = default_desc; + description.layout = + LiteLayout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT}; + LITE_make_tensor(description, &c_tensor0); + void *old_ptr, *new_ptr; + LITE_get_tensor_memory(c_tensor0, &old_ptr); + + LiteLayout new_layout = + LiteLayout{{1, 3, 100, 100}, 4, LiteDataType::LITE_INT8}; + LITE_set_tensor_layout(c_tensor0, new_layout); + LITE_get_tensor_memory(c_tensor0, &new_ptr); + + size_t length = 0; + LITE_get_tensor_total_size_in_byte(c_tensor0, &length); + + ASSERT_EQ(length, 1 * 3 * 100 * 100); + ASSERT_EQ(old_ptr, new_ptr); +} + +TEST(TestCapiTensor, Reset) { + LiteTensor c_tensor0, c_tensor1; + LiteTensorDesc description = default_desc; + description.layout = LiteLayout{{3, 20}, 2, LiteDataType::LITE_FLOAT}; + LITE_make_tensor(description, &c_tensor0); + LITE_make_tensor(description, &c_tensor1); + void *old_ptr0, *old_ptr1; + LITE_get_tensor_memory(c_tensor0, &old_ptr0); + LITE_get_tensor_memory(c_tensor1, &old_ptr1); + //! 
make sure memory is allocted + ASSERT_NO_THROW(memcpy(old_ptr0, old_ptr1, 3 * 20 * 4)); + + std::shared_ptr new_ptr0(new float[3 * 20], + [](float* ptr) { delete[] ptr; }); + std::shared_ptr new_ptr1(new float[3 * 20], + [](float* ptr) { delete[] ptr; }); + LITE_reset_tensor_memory(c_tensor0, new_ptr0.get(), 3 * 20 * 4); + LITE_reset_tensor_memory(c_tensor1, new_ptr1.get(), 3 * 20 * 4); + void *tmp_ptr0, *tmp_ptr1; + LITE_get_tensor_memory(c_tensor0, &tmp_ptr0); + LITE_get_tensor_memory(c_tensor1, &tmp_ptr1); + ASSERT_EQ(tmp_ptr0, new_ptr0.get()); + ASSERT_EQ(tmp_ptr1, new_ptr1.get()); + + ASSERT_NO_THROW(memcpy(new_ptr0.get(), new_ptr1.get(), 3 * 20 * 4)); + + LiteLayout layout1{{6, 20}, 2, LiteDataType::LITE_FLOAT}; + std::shared_ptr ptr2(new float[6 * 20], + [](float* ptr) { delete[] ptr; }); + std::shared_ptr ptr3(new float[6 * 20], + [](float* ptr) { delete[] ptr; }); + LITE_reset_tensor(c_tensor0, layout1, new_ptr0.get()); + LITE_reset_tensor(c_tensor1, layout1, new_ptr1.get()); + + //! memory is not freed by Tensor reset + ASSERT_NO_THROW(memcpy(new_ptr0.get(), new_ptr1.get(), 3 * 20 * 4)); + + LiteLayout tmp_layout0, tmp_layout1; + LITE_get_tensor_layout(c_tensor0, &tmp_layout0); + LITE_get_tensor_layout(c_tensor1, &tmp_layout1); + ASSERT_EQ(tmp_layout0.ndim, tmp_layout1.ndim); + ASSERT_EQ(tmp_layout0.data_type, tmp_layout1.data_type); + ASSERT_EQ(tmp_layout0.shapes[0], tmp_layout1.shapes[0]); + ASSERT_EQ(tmp_layout0.shapes[1], tmp_layout1.shapes[1]); + + LITE_destroy_tensor(c_tensor0); + LITE_destroy_tensor(c_tensor1); +} + +TEST(TestCapiTensor, CrossCNCopy) { + LiteTensor c_tensor0, c_tensor1, c_tensor2; + LiteTensorDesc description = default_desc; + LITE_make_tensor(description, &c_tensor0); + + description.layout = + LiteLayout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT}; + LITE_make_tensor(description, &c_tensor1); + LITE_make_tensor(description, &c_tensor2); + + LITE_tensor_copy(c_tensor1, c_tensor2); + LITE_tensor_copy(c_tensor2, c_tensor1); + void *old_ptr1, *old_ptr2, *new_ptr1, *new_ptr2; + LITE_get_tensor_memory(c_tensor1, &old_ptr1); + LITE_get_tensor_memory(c_tensor2, &old_ptr2); + + //! 
test source tenor is empty + ASSERT_EQ(LITE_tensor_copy(c_tensor1, c_tensor0), -1); + ASSERT_NE(strlen(LITE_get_last_error()), 0); + printf("The last error is: %s\n", LITE_get_last_error()); + + LITE_tensor_copy(c_tensor0, c_tensor1); + LITE_tensor_copy(c_tensor1, c_tensor2); + LITE_tensor_copy(c_tensor2, c_tensor0); + + LITE_get_tensor_memory(c_tensor1, &new_ptr1); + LITE_get_tensor_memory(c_tensor2, &new_ptr2); + + ASSERT_EQ(old_ptr1, new_ptr1); + ASSERT_EQ(old_ptr2, new_ptr2); + + LITE_destroy_tensor(c_tensor0); + LITE_destroy_tensor(c_tensor1); + LITE_destroy_tensor(c_tensor2); +} + +TEST(TestCapiTensor, ShareMemoryWith) { + LiteTensor c_tensor0, c_tensor1; + LiteTensorDesc description = default_desc; + LITE_make_tensor(description, &c_tensor0); + + description.layout = + LiteLayout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT}; + LITE_make_tensor(description, &c_tensor1); + + ASSERT_EQ(LITE_tensor_share_memory_with(c_tensor1, c_tensor0), -1); + LITE_tensor_share_memory_with(c_tensor0, c_tensor1); + void *ptr0, *ptr1; + LITE_get_tensor_memory(c_tensor0, &ptr0); + LITE_get_tensor_memory(c_tensor1, &ptr1); + + ASSERT_EQ(ptr0, ptr1); + + LITE_destroy_tensor(c_tensor0); + LITE_destroy_tensor(c_tensor1); +} + +TEST(TestCapiTensor, Reshape) { + LiteTensor c_tensor0; + LiteTensorDesc description = default_desc; + description.layout = + LiteLayout{{8, 8, 100, 100}, 4, LiteDataType::LITE_FLOAT}; + LITE_make_tensor(description, &c_tensor0); + void* old_ptr; + LITE_get_tensor_memory(c_tensor0, &old_ptr); + + auto check = [&](std::vector expect, const LiteTensor& tensor) { + LiteLayout get_layout; + LITE_get_tensor_layout(tensor, &get_layout); + ASSERT_EQ(get_layout.ndim, expect.size()); + for (size_t i = 0; i < expect.size(); i++) { + ASSERT_EQ(get_layout.shapes[i], expect[i]); + } + void* new_ptr; + LITE_get_tensor_memory(tensor, &new_ptr); + ASSERT_EQ(old_ptr, new_ptr); + }; + { + int shape[2] = {-1, 50}; + LITE_tensor_reshape(c_tensor0, shape, 2); + check({8 * 8 * 100 * 2, 50}, c_tensor0); + } + { + int shape[3] = {64, 100, 100}; + LITE_tensor_reshape(c_tensor0, shape, 3); + check({8 * 8, 100, 100}, c_tensor0); + } + { + int shape[3] = {16, 100, -1}; + LITE_tensor_reshape(c_tensor0, shape, 3); + check({16, 100, 400}, c_tensor0); + } + LITE_destroy_tensor(c_tensor0); +} + +TEST(TestCapiTensor, Slice) { + LiteTensor c_tensor0; + LiteTensorDesc description = default_desc; + description.layout = LiteLayout{{20, 20}, 2, LiteDataType::LITE_FLOAT}; + LITE_make_tensor(description, &c_tensor0); + void* old_ptr; + LITE_get_tensor_memory(c_tensor0, &old_ptr); + for (size_t i = 0; i < 20 * 20; i++) { + *(static_cast(old_ptr) + i) = i; + } + auto check = [&](size_t start, size_t end, size_t step, bool have_step) { + LiteTensor tensor, slice_tensor; + LITE_make_tensor(default_desc, &tensor); + size_t start_ptr[2] = {start, start}; + size_t end_ptr[2] = {end, end}; + size_t step_ptr[2] = {step, step}; + + if (have_step) { + LITE_tensor_slice(c_tensor0, start_ptr, end_ptr, step_ptr, 2, + &slice_tensor); + } else { + LITE_tensor_slice(c_tensor0, start_ptr, end_ptr, nullptr, 2, + &slice_tensor); + } + int is_continue = true; + LITE_is_memory_continue(slice_tensor, &is_continue); + ASSERT_FALSE(is_continue); + + LITE_tensor_copy(tensor, slice_tensor); + void* new_ptr; + LITE_get_tensor_memory(tensor, &new_ptr); + float* ptr = static_cast(new_ptr); + for (size_t i = start; i < end; i += step) { + for (size_t j = start; j < end; j += step) { + ASSERT_EQ(float(i * 20 + j), *ptr); + ++ptr; + } + } + 
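+        //! free the temporary tensor that received the contiguous copy of the slice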
LITE_destroy_tensor(tensor); + }; + check(1, 8, 1, true); + check(1, 8, 1, false); + check(2, 10, 2, true); + check(10, 18, 4, true); + check(10, 18, 1, false); + LITE_destroy_tensor(c_tensor0); +} + +TEST(TestCapiTensor, Memset) { + LiteTensor c_tensor0; + LiteTensorDesc description = default_desc; + description.layout = LiteLayout{{20, 20}, 2, LiteDataType::LITE_FLOAT}; + LITE_make_tensor(description, &c_tensor0); + void* ptr; + uint8_t* uint8_ptr; + LITE_get_tensor_memory(c_tensor0, &ptr); + LITE_tensor_fill_zero(c_tensor0); + uint8_ptr = static_cast(ptr); + for (size_t i = 0; i < 20 * 20; i++) { + ASSERT_EQ(0, *uint8_ptr); + uint8_ptr++; + } + + LITE_destroy_tensor(c_tensor0); +} + +TEST(TestCapiTensor, GetMemoryByIndex) { + LiteTensor c_tensor0; + LiteTensorDesc description = default_desc; + description.layout = LiteLayout{{20, 20}, 2, LiteDataType::LITE_FLOAT}; + LITE_make_tensor(description, &c_tensor0); + void *ptr0, *ptr1, *ptr2, *ptr3; + LITE_get_tensor_memory(c_tensor0, &ptr0); + size_t index0[] = {3, 4}; + LITE_get_tensor_memory_with_index(c_tensor0, &index0[0], 2, &ptr1); + size_t index1[] = {5, 7}; + LITE_get_tensor_memory_with_index(c_tensor0, &index1[0], 2, &ptr2); + size_t index2[] = {5}; + LITE_get_tensor_memory_with_index(c_tensor0, &index2[0], 1, &ptr3); + + ASSERT_EQ(ptr1, static_cast(ptr0) + 3 * 20 + 4); + ASSERT_EQ(ptr2, static_cast(ptr0) + 5 * 20 + 7); + ASSERT_EQ(ptr3, static_cast(ptr0) + 5 * 20); + + LITE_destroy_tensor(c_tensor0); +} + +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/lite/tools/aes_encrypt.sh b/lite/tools/aes_encrypt.sh new file mode 100755 index 0000000000000000000000000000000000000000..37900e1a25ae1e4ae69ed76b161fe064cec3b5b8 --- /dev/null +++ b/lite/tools/aes_encrypt.sh @@ -0,0 +1,26 @@ +#! /bin/bash -e +set -e + +if [ $# -lt 2 ] ; then +echo "USAGE: $0 src dst" +echo " e.g.: $0 ~/xxx.mdl ~/xxx.encrypted.mdl" +echo " e.g.: $0 ~/xxx.mdl ~/xxx.encrypted.mdl key" +exit 1; +fi + +IV=`openssl rand -hex 16` + +Key=000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F +if [ $# == 3 ] ; then +Key=$3 +fi + +# get file size +size=`wc -c $1` + +echo "encrypt aes-256-cbc ..." +openssl enc -e -aes-256-cbc -in $1 -out $1.tmp -K $Key -iv $IV +echo $IV | xxd -r -p | cat - $1.tmp > $2 +# write size into file +printf "%016x" ${size%\ *} | xxd -r -p >> $2 +rm -f $1.tmp diff --git a/lite/tools/dump_model_mgb.py b/lite/tools/dump_model_mgb.py new file mode 100755 index 0000000000000000000000000000000000000000..0f34d1b8c4dac9f14fbb46d3862720ad480a7e6b --- /dev/null +++ b/lite/tools/dump_model_mgb.py @@ -0,0 +1,134 @@ +#!/usr/bin/env mdl +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2020-2021 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
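+#
+# Dump a pickled megskull/MegBrain model to a serialized MegBrain graph file,
+# optionally applying the inference optimizations selected by the flags below.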
+ +from megskull.graph import NodeFilter, FpropEnv +from megskull.opr.all import AssertEqual, DataProvider, BatchNormalization +from megskull.utils.logconf import get_logger +from meghair.utils import io +import megbrain as mgb + +import argparse +import struct +import re +import os + +import numpy as np +import cv2 + +logger = get_logger(__name__) + +def optimize_for_inference(args, outputs): + args_map = { + 'enable_io16xc32': 'f16_io_f32_comp', + 'enable_ioc16': 'f16_io_comp', + 'enable_hwcd4': 'use_nhwcd4', + 'enable_nchw4': 'use_nchw4', + 'enable_nchw88': 'use_nchw88', + 'enable_nchw44': 'use_nchw44', + 'enable_nchw44_dot': 'use_nchw44_dot', + 'enable_nchw32': 'use_nchw32', + 'enable_chwn4': 'use_chwn4', + 'enable_fuse_conv_bias_nonlinearity': 'fuse_conv_bias_nonlinearity', + 'enable_fuse_conv_bias_with_z': 'fuse_conv_bias_with_z', + } + kwargs = {} + for k, v in args_map.items(): + if getattr(args, k): + assert args.optimize_for_inference, ( + 'optimize_for_inference should be set when {} is given'.format( + k)) + kwargs[v] = True + + if args.optimize_for_inference: + return mgb.optimize_for_inference(outputs, **kwargs) + + return outputs + +def main(): + parser = argparse.ArgumentParser( + description='Dump the Python Megbrain model to C++ model, by the way ' + 'optimizing for inference', + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument('input', help='input pkl model file ') + parser.add_argument('-o', '--output', help='output file', required=True) + parser.add_argument('--init-bn', action='store_true', + help='initialize untrained batch-normalization, to ' + 'avoid NaN or Inf results') + parser.add_argument('--silent', action='store_true', + help='set verbose to False in AssertEqual opr') + parser.add_argument('--optimize-for-inference', action='store_true', + help='enbale optimization for inference') + parser.add_argument('--discard-var-name', action='store_true', + help='discard variable and param names in the ' + 'generated output') + parser.add_argument('--output-strip-info', action='store_true', + help='output code strip information') + parser.add_argument('--enable-io16xc32', action='store_true', + help='transform the mode to float16 io float32 compute') + parser.add_argument('--enable-ioc16', action='store_true', + help='transform the dtype of the model to float16 io ' + 'and compute') + parser.add_argument('--enable-fuse-conv-bias-nonlinearity', + action='store_true', + help='fuse convolution bias and nonlinearity opr to a ' + 'conv_bias opr and compute') + parser.add_argument('--enable-hwcd4', action='store_true', + help='transform the model format from NCHW to NHWCD4 ' + 'for inference; you may need to disable CUDA and set ' + 'MGB_USE_MEGDNN_DBG=2') + parser.add_argument('--enable-nchw4', action='store_true', + help='transform the model format from NCHW to NCHW4 ' + 'for inference') + parser.add_argument('--enable-nchw88', action='store_true', + help='transform the model format from NCHW to NCHW88 ' + 'for inference') + parser.add_argument('--enable-nchw44', action='store_true', + help='transform the model format from NCHW to NCHW44 ' + 'for inference') + parser.add_argument('--enable-nchw44-dot', action='store_true', + help='transform the model format from NCHW to NCHW44_DOT ' + 'for optimizing armv8.2 dot in inference') + parser.add_argument('--enable-chwn4', action='store_true', + help='transform the model format to CHWN4 ' + 'for inference, mainly used for nvidia tensorcore') + parser.add_argument('--enable-nchw32', 
action='store_true', + help='transform the model format from NCHW4 to NCHW32 ' + 'for inference on nvidia TensoCore') + parser.add_argument('--enable-fuse-conv-bias-with-z', action='store_true', + help='fuse conv_bias with z input for inference on ' + 'nvidia GPU (this optimization pass will result in mismatch ' + 'of the precision of output of training and inference)') + args = parser.parse_args() + + env = FpropEnv(verbose_fprop=False) + + + outputs = io.load_network(args.input).outputs + + output_mgbvars = list(map(env.get_mgbvar, outputs)) + + output_mgbvars = optimize_for_inference(args, output_mgbvars) + + if args.discard_var_name: + sereg_kwargs = dict(keep_var_name=0, keep_param_name=False) + else: + sereg_kwargs = dict(keep_var_name=2, keep_param_name=True) + + stat = mgb.serialize_comp_graph_to_file( + args.output, output_mgbvars, append=False, + output_strip_info=args.output_strip_info, + **sereg_kwargs) + logger.info('graph dump sizes: tot_size={:.3f}KiB overhead={:.3f}KiB'. + format(stat.tot_bytes / 1024, + (stat.tot_bytes - stat.tensor_value_bytes) / 1024)) + +if __name__ == '__main__': + main() diff --git a/lite/tools/pack_model/encrypt_info_and_model.sh b/lite/tools/pack_model/encrypt_info_and_model.sh new file mode 100755 index 0000000000000000000000000000000000000000..b1e18fa519045077acc876d4ed5c9cceb32ea501 --- /dev/null +++ b/lite/tools/pack_model/encrypt_info_and_model.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +set -e + +function usage() { + echo "$0 args1 args2 .." + echo "available args detail:" + echo "-i info.json : input info.json file" + echo "-m model: model name" + echo "-e encryption mode: encryption mode rc4 encrypt_predefined_rc4 " + echo "-o output name: output name" + echo "-n input model name: input model name match with info.json" + echo "-h : show usage" + exit -1 +} + +while getopts "i:m:e:o:n:h" arg +do + case $arg in + i) + INFO_NAME=$OPTARG + ;; + m) + MODEL_NAME=$OPTARG + ;; + n) + INPUT_MODEL_NAME=$OPTARG + ;; + e) + ENCRYPT_MODE=$OPTARG + ;; + o) + OUTPUT_NAME=$OPTARG + ;; + h) + usage + ;; + \?) + echo "show usage" + usage + ;; + esac +done +echo "----------------------------------------------------" +echo "commad args summary:" +echo "INFO_NAME: $INFO_NAME" +echo "MODEL_NAME: $MODEL_NAME" +echo "ENCRYPT_MODE: $ENCRYPT_MODE" +echo "OUTPUT_NAME: $OUTPUT_NAME" +echo "INPUT_MODEL_NAME: $INPUT_MODEL_NAME" +echo "----------------------------------------------------" + +if [[ $INFO_NAME == '' ]]; then + echo "INFO_NAME is NULL,exit now..." + exit -1 +fi +if [[ $MODEL_NAME == '' ]]; then + echo "MODEL_NAME is NULL,exit now..." + exit -1 +fi +if [[ $INPUT_MODEL_NAME == '' ]]; then + echo "INPUT_MODEL_NAME is NULL,exit now..." + exit -1 +fi +if [[ $OUTPUT_NAME == '' ]]; then + echo "OUTPUT_NAME is NULL,exit now..." 
+ exit -1 +fi +ENCRYPT_INFO_NAME=$INFO_NAME.pr_rc4.emod +ENCRYPT_MODEL_NAME=$MODEL_NAME.pr_rc4.emod +./rc4_encryptor $ENCRYPT_MODE $INFO_NAME $INFO_NAME.pr_rc4.emod +./rc4_encryptor $ENCRYPT_MODE $MODEL_NAME $MODEL_NAME.pr_rc4.emod + + +ENCRYPT_INFO_NAME=$INFO_NAME.pr_rc4.emod +python3 pack_model_and_info.py --input-model=$ENCRYPT_MODEL_NAME --model-name=$INPUT_MODEL_NAME --model-cryption="RC4_default" --info-cryption="RC4_default" --input-info=$ENCRYPT_INFO_NAME --info-parser="LITE_default" -o $OUTPUT_NAME diff --git a/lite/tools/pack_model/pack_model_and_info.py b/lite/tools/pack_model/pack_model_and_info.py new file mode 100644 index 0000000000000000000000000000000000000000..06378f05f06a0491c98e6111a787920e47e96930 --- /dev/null +++ b/lite/tools/pack_model/pack_model_and_info.py @@ -0,0 +1,135 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +# +# This file is part of MegEngine, a deep learning framework developed by +# Megvii. +# +# copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. + +import argparse +import struct +import os +import subprocess + +import flatbuffers + +def generate_flatbuffer(): + status, path = subprocess.getstatusoutput('which flatc') + if not status: + cwd = os.path.dirname(os.path.dirname(__file__)) + fbs_file = os.path.abspath(os.path.join(cwd, + "../../src/parse_model/pack_model.fbs")) + cmd = path + ' -p -b '+fbs_file + ret, _ = subprocess.getstatusoutput(str(cmd)) + if ret: + raise Exception("flatc generate error!") + else: + raise Exception('no flatc in current environment, please build flatc ' + 'and put in the system PATH!') + +def main(): + parser = argparse.ArgumentParser( + description='load a encrypted or not encrypted model and a ' + 'json format of the infomation of the model, pack them to a file ' + 'which can be loaded by lite.') + parser.add_argument('--input-model', help='input a encrypted or not encrypted model') + parser.add_argument('--input-info', help='input a encrypted or not encrypted ' + 'json format file.') + parser.add_argument('--model-name', help='the model name, this must match ' + 'with the model name in model info', default = 'NONE') + parser.add_argument('--model-cryption', help='the model encryption method ' + 'name, this is used to find the right decryption method. e.g. ' + '--model_cryption = "AES_default", default is NONE.', default = + 'NONE') + parser.add_argument('--info-cryption', help='the info encryption method ' + 'name, this is used to find the right decryption method. e.g. ' + '--model_cryption = "AES_default", default is NONE.', default = + 'NONE') + parser.add_argument('--info-parser', help='The information parse method name ' + 'default is "LITE_default". ', default = 'LITE_default') + parser.add_argument('--append', '-a', help='append another model to a ' + 'packed model.') + parser.add_argument('--output', '-o', help='output file of packed model.') + + args = parser.parse_args() + + generate_flatbuffer() + assert not args.append, ('--append is not support yet') + assert args.input_model, ('--input_model must be given') + with open(args.input_model, 'rb') as fin: + raw_model = fin.read() + + model_length = len(raw_model) + + if args.input_info: + with open(args.input_info, 'rb') as fin: + raw_info = fin.read() + info_length = len(raw_info) + else: + raw_info = None + info_length = 0 + + # Generated by `flatc`. 
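+    # (the `model_parse` package is produced by the generate_flatbuffer() call
+    #  above, which runs `flatc -p` on pack_model.fbs, so this import only
+    #  works after that step has succeeded)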
+    from model_parse import Model, ModelData, ModelHeader, ModelInfo, PackModel
+
+    builder = flatbuffers.Builder(1024)
+
+    model_name = builder.CreateString(args.model_name)
+    model_cryption = builder.CreateString(args.model_cryption)
+    info_cryption = builder.CreateString(args.info_cryption)
+    info_parser = builder.CreateString(args.info_parser)
+
+    # byte vectors must be created before the tables that reference them
+    info_data = builder.CreateByteVector(raw_info) if raw_info is not None else None
+    arr_data = builder.CreateByteVector(raw_model)
+
+    # model header
+    ModelHeader.ModelHeaderStart(builder)
+    ModelHeader.ModelHeaderAddName(builder, model_name)
+    ModelHeader.ModelHeaderAddModelDecryptionMethod(builder, model_cryption)
+    ModelHeader.ModelHeaderAddInfoDecryptionMethod(builder, info_cryption)
+    ModelHeader.ModelHeaderAddInfoParseMethod(builder, info_parser)
+    model_header = ModelHeader.ModelHeaderEnd(builder)
+
+    # model info
+    ModelInfo.ModelInfoStart(builder)
+    if info_data is not None:
+        ModelInfo.ModelInfoAddData(builder, info_data)
+    model_info = ModelInfo.ModelInfoEnd(builder)
+
+    # model data
+    ModelData.ModelDataStart(builder)
+    ModelData.ModelDataAddData(builder, arr_data)
+    model_data = ModelData.ModelDataEnd(builder)
+
+    Model.ModelStart(builder)
+    Model.ModelAddHeader(builder, model_header)
+    Model.ModelAddData(builder, model_data)
+    Model.ModelAddInfo(builder, model_info)
+    model = Model.ModelEnd(builder)
+
+    PackModel.PackModelStartModelsVector(builder, 1)
+    builder.PrependUOffsetTRelative(model)
+    models = builder.EndVector(1)
+
+    PackModel.PackModelStart(builder)
+    PackModel.PackModelAddModels(builder, models)
+    packed_model = PackModel.PackModelEnd(builder)
+
+    builder.Finish(packed_model)
+    buff = builder.Output()
+
+    # prefix the flatbuffer output with the "packed_model" tag
+    result = struct.pack(str(len("packed_model")) + 's', "packed_model".encode('ascii'))
+    result += buff
+
+    assert args.output, ('--output must be given')
+    with open(args.output, 'wb') as fout:
+        fout.write(result)
+
+    print("Model packed successfully!")
+    print("model name is: {}.".format(args.model_name))
+    print("model encryption method is: {}.".format(args.model_cryption))
+    print("model json information encryption method is: {}.".format(args.info_cryption))
+    print("model json information parse method is: {}.".format(args.info_parser))
+    print("packed model is written to {}".format(args.output))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/lite/tools/rc4_encrypt.cpp b/lite/tools/rc4_encrypt.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..75d3ff9c9ffa03bb4890e5d9acccb38e5b448207
--- /dev/null
+++ b/lite/tools/rc4_encrypt.cpp
@@ -0,0 +1,211 @@
+/** \file tools/rc4_encrypt.cpp
+ *
+ * This file is part of MegEngine, a deep learning framework developed by
+ * Megvii.
+ *
+ * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "../src/decryption/rc4/rc4_cryption_base.h"
+#include "../src/decryption/rc4_cryption.h"
+
+using namespace lite;
+
+// read a whole file into a self-freeing heap buffer and report its size
+std::shared_ptr<void> read_file(std::string file_path, size_t& size) {
+    FILE* fin = fopen(file_path.c_str(), "rb");
+    if (!fin) {
+        printf("failed to open %s.\n", file_path.c_str());
+        exit(1);
+    }
+    fseek(fin, 0, SEEK_END);
+    size = ftell(fin);
+    fseek(fin, 0, SEEK_SET);
+    void* ptr = malloc(size);
+    std::shared_ptr<void> buf{ptr, ::free};
+    fread(buf.get(), 1, size, fin);
+    fclose(fin);
+    return buf;
+}
+
+void write_file(std::string file_path, const std::vector<uint8_t>& data) {
+    FILE* fout = fopen(file_path.c_str(), "wb");
+    if (!fout) {
+        printf("failed to open %s.\n", file_path.c_str());
+        exit(1);
+    }
+    fwrite(data.data(), 1, data.size(), fout);
+    fclose(fout);
+}
+
+typedef int (*CommandHandler)(int, char**);
+
+const char* usage =
+        "Usage:\n"
+        "  rc4_encryptor encrypt_predefined_rc4 <input file> <output file>\n"
+        "  rc4_encryptor encrypt_rc4 <hash key> <enc key> <input file> <output file>\n"
+        "  rc4_encryptor encrypt_predefined_sfrc4 <input file> <output file>\n"
+        "  rc4_encryptor encrypt_sfrc4 <hash key> <enc key> <input file> <output file>\n"
+        "  rc4_encryptor hash <input file>\n";
+
+int command_encrypt_predefined_rc4(int argc, char** argv) {
+    if (argc != 4) {
+        printf("Invalid encrypt_predefined_rc4 arguments.\n");
+        return 1;
+    }
+
+    const char* input_file_path = argv[2];
+    const char* output_file_path = argv[3];
+
+    size_t size = 0;
+    auto keys = RC4::get_decrypt_key();
+    auto input = read_file(input_file_path, size);
+    printf("Reading input file ...\n");
+    auto output = RC4::encrypt_model(input.get(), size, keys);
+
+    write_file(output_file_path, output);
+
+    printf("Done.\n");
+    return 0;
+}
+
+int command_encrypt_rc4(int argc, char** argv) {
+    if (argc != 6) {
+        printf("Invalid encrypt_rc4 arguments.\n");
+        return 1;
+    }
+
+    uint64_t hash_key = std::stoull(argv[2], 0, 0);
+    uint64_t enc_key = std::stoull(argv[3], 0, 0);
+    const char* input_file_path = argv[4];
+    const char* output_file_path = argv[5];
+
+    // first 8 bytes hold the hash key, the next 8 bytes the encryption key
+    std::vector<uint8_t> keys(128, 0);
+    uint64_t* data = reinterpret_cast<uint64_t*>(keys.data());
+    data[0] = hash_key;
+    data[1] = enc_key;
+
+    size_t size = 0;
+    auto input = read_file(input_file_path, size);
+    printf("Reading input file ...\n");
+    auto output = RC4::encrypt_model(input.get(), size, keys);
+
+    printf("Encrypting ...\n");
+    write_file(output_file_path, output);
+
+    printf("Done.\n");
+    return 0;
+}
+
+int command_encrypt_predefined_sfrc4(int argc, char** argv) {
+    if (argc != 4) {
+        printf("Invalid encrypt_predefined_sfrc4 arguments.\n");
+        return 1;
+    }
+
+    const char* input_file_path = argv[2];
+    const char* output_file_path = argv[3];
+
+    size_t size = 0;
+    auto keys = SimpleFastRC4::get_decrypt_key();
+    auto input = read_file(input_file_path, size);
+    printf("Reading input file ...\n");
+    auto output = SimpleFastRC4::encrypt_model(input.get(), size, keys);
+
+    write_file(output_file_path, output);
+
+    printf("Done.\n");
+    return 0;
+}
+
+int command_encrypt_sfrc4(int argc, char** argv) {
+    if (argc != 6) {
+        printf("Invalid encrypt_sfrc4 arguments.\n");
+        return 1;
+    }
+
+    uint64_t hash_key = std::stoull(argv[2], 0, 0);
+    uint64_t enc_key = std::stoull(argv[3], 0, 0);
+    const char* input_file_path = argv[4];
+    const char* output_file_path = argv[5];
+
+    std::vector<uint8_t> keys(128, 0);
+    uint64_t* data = reinterpret_cast<uint64_t*>(keys.data());
+    data[0] = hash_key;
+    data[1] = enc_key;
+
+    size_t size = 0;
+    auto input = read_file(input_file_path, size);
+    printf("Reading input file ...\n");
+    auto output = SimpleFastRC4::encrypt_model(input.get(), size, keys);
+
+    printf("Encrypting ...\n");
+    write_file(output_file_path, output);
+
+    printf("Done.\n");
+    return 0;
+}
+
+int command_hash(int argc, char** argv) {
+    if (argc != 3) {
+        printf("Invalid hash arguments.\n");
+        return 1;
+    }
+
+    const char* input_file_path = argv[2];
+
+    size_t len = 0;
+    auto input = read_file(input_file_path, len);
+
+    rc4::FastHash64 hasher(rc4::key_gen_hash_key());
+    auto start = static_cast<const char*>(input.get());
+
+    // feed the input to the hasher 8 bytes at a time
+    auto ptr = reinterpret_cast<const uint64_t*>(start);
+    while (reinterpret_cast<const char*>(ptr + 1) <= start + len) {
+        hasher.feed(*ptr);
+        ++ptr;
+    }
+
+    // feed any trailing bytes, zero-padded to 8 bytes
+    auto cptr = reinterpret_cast<const char*>(ptr);
+    if (cptr < start + len) {
+        uint64_t v = 0;
+        std::copy(cptr, start + len, reinterpret_cast<char*>(&v));
+        hasher.feed(v);
+    }
+
+    printf("%llx\n", static_cast<unsigned long long>(hasher.get()));
+    return 0;
+}
+
+std::unordered_map<std::string, CommandHandler> commands = {
+        {"encrypt_predefined_rc4", command_encrypt_predefined_rc4},
+        {"encrypt_rc4", command_encrypt_rc4},
+        {"encrypt_predefined_sfrc4", command_encrypt_predefined_sfrc4},
+        {"encrypt_sfrc4", command_encrypt_sfrc4},
+        {"hash", command_hash},
+};
+
+int main(int argc, char** argv) {
+    if (argc == 1) {
+        printf("%s", usage);
+        return 1;
+    }
+
+    auto it = commands.find(argv[1]);
+    if (it == commands.end()) {
+        printf("Invalid command arguments.\n");
+        printf("%s", usage);
+        return 1;
+    }
+    return it->second(argc, argv);
+}
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/scripts/whl/macos/macos_build_whl.sh b/scripts/whl/macos/macos_build_whl.sh
index e1bafb7448c823fdcb543d6cd7807dcf087545e9..86f3aaa35570233de977ec83f51d6ac6759acb58 100755
--- a/scripts/whl/macos/macos_build_whl.sh
+++ b/scripts/whl/macos/macos_build_whl.sh
@@ -209,6 +209,35 @@ function do_build() {
     echo "comapt whl name: ${compat_whl_name}"
     cp ${BUILD_DIR}/staging/dist/Meg*.whl ${MACOS_WHL_HOME}/${compat_whl_name}
 
+    # handle megenginelite
+    cd ${BUILD_DIR}
+    rm -rf lite_staging
+    mkdir -p lite_staging/megenginelite
+    cp ${SRC_DIR}/lite/pylite/megenginelite/* lite_staging/megenginelite/
+    cp ${SRC_DIR}/lite/pylite/setup.py lite_staging/
+    cp ${SRC_DIR}/lite/pylite/requires.txt lite_staging/
+    VER_FILE=${SRC_DIR}/imperative/python/megengine/version.py
+    if [ -f ${VER_FILE} ];then
+        cp ${VER_FILE} lite_staging/megenginelite
+    else
+        echo "ERROR: can not find version file"
+        exit -1
+    fi
+    mkdir -p ${BUILD_DIR}/lite_staging/megenginelite/libs
+    LITE_LIB=${BUILD_DIR}/lite_staging/megenginelite/libs/liblite_shared.dylib
+    cp ${SRC_DIR}/build_dir/host/MGE_WITH_CUDA_OFF/MGE_INFERENCE_ONLY_OFF/Release/build/lite/liblite_shared.dylib ${LITE_LIB}
+    llvm-strip -s ${LITE_LIB}
+
+    cd ${BUILD_DIR}/lite_staging/
+    ${PYTHON_DIR}/bin/python3 setup.py bdist_wheel
+    cd ${BUILD_DIR}/lite_staging/dist/
+    org_whl_name=`ls Meg*.whl`
+    index=`awk -v a="${org_whl_name}" -v b="-macosx" 'BEGIN{print index(a,b)}'`
+    compat_whl_name=`echo ${org_whl_name} |cut -b -$index`macosx_10_14_x86_64.whl
+    echo "megenginelite org whl name: ${org_whl_name}"
+    echo "megenginelite compat whl name: ${compat_whl_name}"
+    cp ${BUILD_DIR}/lite_staging/dist/Meg*.whl ${MACOS_WHL_HOME}/${compat_whl_name}
+
     cd ${SRC_DIR}
     echo ""
     echo "##############################################################################################"
diff --git a/scripts/whl/manylinux2014/do_build_common.sh b/scripts/whl/manylinux2014/do_build_common.sh
index 0f1fc771e3359f907b92e344586c298989eab789..57e6b391857cf7bfc805045ff2530d445d544616 100755
--- a/scripts/whl/manylinux2014/do_build_common.sh
+++ b/scripts/whl/manylinux2014/do_build_common.sh
@@ -155,6 +155,33 @@ do
     echo "comapt whl name: ${compat_whl_name}"
    mv ${org_whl_name} ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${SDK_NAME}/${compat_whl_name}
 
+    # handle megenginelite
+    cd ${BUILD_DIR}
+    rm -rf lite_staging
+    mkdir -p lite_staging/megenginelite
+    cp ${SRC_DIR}/lite/pylite/megenginelite/* lite_staging/megenginelite/
+    cp ${SRC_DIR}/lite/pylite/setup.py lite_staging/
+    cp ${SRC_DIR}/lite/pylite/requires.txt lite_staging/
+    VER_FILE=${SRC_DIR}/imperative/python/megengine/version.py
+    if [ -f ${VER_FILE} ];then
+        cp ${VER_FILE} lite_staging/megenginelite
+    else
+        echo "ERROR: can not find version file"
+        exit -1
+    fi
+    patch_elf_depend_lib_megenginelite
+
+    cd ${BUILD_DIR}/lite_staging/
+    ${PYTHON_DIR}/bin/python setup.py bdist_wheel
+    cd /home/output
+    mkdir -p ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${SDK_NAME}
+    cd ${BUILD_DIR}/lite_staging/dist/
+    org_whl_name=`ls Meg*${ver}*.whl`
+    compat_whl_name=`echo ${org_whl_name} | sed 's/linux/manylinux2014/'`
+    echo "megenginelite org whl name: ${org_whl_name}"
+    echo "megenginelite compat whl name: ${compat_whl_name}"
+    mv ${org_whl_name} ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${SDK_NAME}/${compat_whl_name}
+
     cd /home/output
     chown -R ${UID}.${UID} .
     # compat for root-less docker env to remove output at host side
diff --git a/scripts/whl/windows/windows_build_whl.sh b/scripts/whl/windows/windows_build_whl.sh
index c917c4895c2432f2bb8f7ab3c6686f5682ed42a7..4896071c5cd70e7be21955ba15e12cdf172148b9 100755
--- a/scripts/whl/windows/windows_build_whl.sh
+++ b/scripts/whl/windows/windows_build_whl.sh
@@ -106,6 +106,23 @@ function copy_more_dll() {
         depend_real_copy ${CP_WHL_DST_IMP}
     fi
 }
+
+function lite_copy_more_dll() {
+    # for python whl real use
+    echo "config megenginelite core lib dir"
+    CP_WHL_DST_IMP=${BUILD_DIR}/lite_staging/megenginelite/libs
+
+    if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then
+        echo "copy nvidia lib to whl use...."
+        depend_real_copy ${CP_WHL_DST_IMP}
+        if [ ${IN_CI} = "true" ]; then
+            echo "copy lib for lite for ci test"
+            IMP_TEST_DST=${SRC_DIR}/build_dir/host/build/lite/test/
+            depend_real_copy ${IMP_TEST_DST}
+        fi
+    fi
+}
+
 BUILD_DIR=${SRC_DIR}/build_dir/host/build/
 
 # here we just treat cu file should not in the increment build file list
@@ -196,6 +213,32 @@ function do_build() {
         ${PYTHON_DIR}/python3 setup.py bdist_wheel
         cp ${BUILD_DIR}/staging/dist/Meg*.whl ${WINDOWS_WHL_HOME}/
 
+        # handle megenginelite
+        cd ${BUILD_DIR}
+        rm -rf lite_staging
+        mkdir -p lite_staging/megenginelite
+        cp ${SRC_DIR}/lite/pylite/megenginelite/* lite_staging/megenginelite/
+        cp ${SRC_DIR}/lite/pylite/setup.py lite_staging/
+        cp ${SRC_DIR}/lite/pylite/requires.txt lite_staging/
+        VER_FILE=${SRC_DIR}/imperative/python/megengine/version.py
+        if [ -f ${VER_FILE} ];then
+            cp ${VER_FILE} lite_staging/megenginelite
+        else
+            echo "ERROR: can not find version file"
+            exit -1
+        fi
+
+        LITE_CORE_LIB_DIR=${BUILD_DIR}/lite_staging/megenginelite/libs/
+        mkdir -p ${LITE_CORE_LIB_DIR}
+        cd ${LITE_CORE_LIB_DIR}
+        cp ${BUILD_DIR}/lite/lite_shared.dll liblite_shared.dll
+        llvm-strip -s liblite_shared.dll
+        lite_copy_more_dll
+
+        cd ${BUILD_DIR}/lite_staging/
+        ${PYTHON_DIR}/python3 setup.py bdist_wheel
+        cp ${BUILD_DIR}/lite_staging/dist/Meg*.whl ${WINDOWS_WHL_HOME}/
+
         echo ""
         echo "##############################################################################################"
         echo "windows whl package location: ${WINDOWS_WHL_HOME}"
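
For context, the packing tools added above (`rc4_encryptor`, `pack_model_and_info.py`, `encrypt_info_and_model.sh`) are meant to be chained: encrypt the model and its JSON info first, then pack both into a single file that lite can load. The sketch below is illustrative only; the file names are placeholders, and it assumes `rc4_encryptor` has already been built and that `flatc` is on the PATH (required by `pack_model_and_info.py`).

```bash
# 1. Encrypt the model and its JSON info with the predefined RC4 key.
./rc4_encryptor encrypt_predefined_rc4 model.mge model.mge.pr_rc4.emod
./rc4_encryptor encrypt_predefined_rc4 info.json info.json.pr_rc4.emod

# 2. Pack the encrypted files into one model loadable by lite.
python3 pack_model_and_info.py \
    --input-model=model.mge.pr_rc4.emod \
    --input-info=info.json.pr_rc4.emod \
    --model-name=model_name_in_info_json \
    --model-cryption="RC4_default" \
    --info-cryption="RC4_default" \
    --info-parser="LITE_default" \
    -o packed_model.lite

# Or run both steps at once with the wrapper script:
./encrypt_info_and_model.sh -i info.json -m model.mge \
    -e encrypt_predefined_rc4 -n model_name_in_info_json -o packed_model.lite
```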