From 6ca756f3235d1125a1a3d9d3e88edb197b94598e Mon Sep 17 00:00:00 2001 From: jackzhang235 <56432943+jackzhang235@users.noreply.github.com> Date: Thu, 9 Apr 2020 09:59:37 +0800 Subject: [PATCH] [MLU] add mlu related pass, kernels and gtests; modify api in paddle_api.h (#3307) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [MLU] add some basic support for MLU, including related passes, kernels, gtests and some api in paddle_api.h Passes: mlu_subgraph_pass, mlu_postprocess_pass Kernels: act, batch_norm, concat, conv, elementwise, fc, interpolate, pool, scale, softmax --- CMakeLists.txt | 5 + cmake/configure.cmake | 4 + cmake/lite.cmake | 32 +- lite/CMakeLists.txt | 1 + lite/api/CMakeLists.txt | 15 +- lite/api/cxx_api_impl.cc | 9 + lite/api/opt.cc | 2 + lite/api/paddle_api.cc | 33 ++ lite/api/paddle_api.h | 34 ++ lite/api/paddle_place.cc | 5 +- lite/api/paddle_place.h | 4 +- lite/api/paddle_use_passes.h | 2 + lite/api/python/pybind/pybind.cc | 32 ++ lite/backends/CMakeLists.txt | 1 + lite/core/CMakeLists.txt | 3 +- lite/core/arena/CMakeLists.txt | 2 +- lite/core/context.h | 97 +++++ lite/core/device_info.cc | 83 +++- lite/core/device_info.h | 66 ++- lite/core/kernel.h | 3 + lite/core/memory.cc | 16 + lite/core/memory.h | 9 + lite/core/mir/CMakeLists.txt | 1 + lite/core/mir/mlu_postprocess_pass.cc | 179 ++++++-- lite/core/mir/mlu_postprocess_pass.h | 10 + lite/core/mir/ssa_graph.cc | 38 ++ lite/core/mir/ssa_graph.h | 5 + lite/core/mir/subgraph/subgraph_detector.cc | 21 +- lite/core/mir/subgraph/subgraph_pass.cc | 16 + lite/core/mir/subgraph/subgraph_pass.h | 5 + lite/core/mir/subgraph_cast_display_pass.cc | 111 ----- lite/core/optimizer.h | 6 + lite/core/workspace.h | 7 + lite/kernels/CMakeLists.txt | 1 + lite/kernels/mlu/CMakeLists.txt | 1 + lite/kernels/mlu/bridges/CMakeLists.txt | 27 +- lite/kernels/mlu/bridges/act_op.cc | 33 +- lite/kernels/mlu/bridges/act_op_test.cc | 13 +- lite/kernels/mlu/bridges/batch_norm_op.cc | 2 +- .../kernels/mlu/bridges/batch_norm_op_test.cc | 8 +- lite/kernels/mlu/bridges/concat_op.cc | 73 ++++ lite/kernels/mlu/bridges/concat_op_test.cc | 154 +++++++ lite/kernels/mlu/bridges/conv_op.cc | 112 +++-- lite/kernels/mlu/bridges/conv_op_test.cc | 14 +- lite/kernels/mlu/bridges/elementwise_ops.cc | 4 +- .../mlu/bridges/elementwise_ops_test.cc | 18 +- lite/kernels/mlu/bridges/fc_op.cc | 9 +- lite/kernels/mlu/bridges/fc_op_test.cc | 35 +- lite/kernels/mlu/bridges/graph.cc | 4 +- lite/kernels/mlu/bridges/graph.h | 89 +++- lite/kernels/mlu/bridges/interpolate_op.cc | 99 +++++ .../mlu/bridges/interpolate_op_test.cc | 406 ++++++++++++++++++ lite/kernels/mlu/bridges/paddle_use_bridges.h | 4 + lite/kernels/mlu/bridges/pool_op.cc | 29 +- lite/kernels/mlu/bridges/pool_op_test.cc | 11 +- lite/kernels/mlu/bridges/scale_op.cc | 74 ++++ lite/kernels/mlu/bridges/scale_op_test.cc | 147 +++++++ lite/kernels/mlu/bridges/softmax_op.cc | 3 +- lite/kernels/mlu/bridges/softmax_op_test.cc | 8 +- lite/kernels/mlu/bridges/tensor.h | 2 + lite/kernels/mlu/bridges/test_helper.cc | 8 +- lite/kernels/mlu/bridges/utility.h | 2 +- lite/kernels/mlu/io_copy_compute.cc | 19 - lite/kernels/mlu/layout_compute.cc | 108 +++++ lite/kernels/mlu/layout_compute.h | 175 ++++++++ lite/kernels/mlu/subgraph_compute.h | 58 ++- lite/kernels/npu/bridges/CMakeLists.txt | 2 +- lite/kernels/x86/cast_compute.cc | 11 + lite/tests/cv/CMakeLists.txt | 2 +- lite/tests/kernels/CMakeLists.txt | 2 +- lite/tests/math/CMakeLists.txt | 2 +- lite/tools/build_mlu.sh | 15 +- 72 files changed,
2259 insertions(+), 382 deletions(-) delete mode 100644 lite/core/mir/subgraph_cast_display_pass.cc create mode 100644 lite/kernels/mlu/bridges/concat_op.cc create mode 100644 lite/kernels/mlu/bridges/concat_op_test.cc create mode 100644 lite/kernels/mlu/bridges/interpolate_op.cc create mode 100644 lite/kernels/mlu/bridges/interpolate_op_test.cc create mode 100644 lite/kernels/mlu/bridges/scale_op.cc create mode 100644 lite/kernels/mlu/bridges/scale_op_test.cc create mode 100644 lite/kernels/mlu/layout_compute.cc create mode 100644 lite/kernels/mlu/layout_compute.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 12b077f11a..aefe8cc19c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,6 +59,7 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF) lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON) lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF) lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF) +lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF) lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF) lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU) lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF) @@ -178,6 +179,10 @@ if(LITE_WITH_XPU) include(device/xpu) endif() +if(LITE_WITH_MLU) + include(mlu) +endif() + include(external/mklml) # download mklml package include(external/xbyak) # download xbyak package include(external/libxsmm) # download, build, install libxsmm diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 57ffab98df..caf4563670 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -153,6 +153,10 @@ if (LITE_WITH_BM) add_definitions("-DLITE_WITH_BM") endif() +if (LITE_WITH_MLU) +add_definitions("-DLITE_WITH_MLU") +endif() + if (LITE_WITH_PROFILE) add_definitions("-DLITE_WITH_PROFILE") endif() diff --git a/cmake/lite.cmake b/cmake/lite.cmake index 780cdea445..a07edaa575 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -22,7 +22,7 @@ endfunction() function (lite_deps TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS) + set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS) cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps ${lite_deps_DEPS}) @@ -100,6 +100,12 @@ function (lite_deps TARGET) endforeach(var) endif() + if (LITE_WITH_MLU) + foreach(var ${lite_deps_MLU_DEPS}) + set(deps ${deps} ${var}) + endforeach(var) + endif() + set(${TARGET} ${deps} PARENT_SCOPE) endfunction() @@ -125,7 +131,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean function(lite_cc_library TARGET) set(options SHARED shared STATIC static MODULE module) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -144,6 +150,7 @@ function(lite_cc_library TARGET) PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} + MLU_DEPS ${args_MLU_DEPS} ) if (args_SHARED OR ARGS_shared) @@ -170,7 +177,7 @@ function(lite_cc_binary TARGET) 
set(options " -g ") endif() set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -189,6 +196,7 @@ function(lite_cc_binary TARGET) LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} CV_DEPS ${CV_DEPS} + MLU_DEPS ${args_MLU_DEPS} ) cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps}) target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) @@ -218,7 +226,7 @@ function(lite_cc_test TARGET) endif() set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS COMPILE_LEVEL # (basic|extra) @@ -245,6 +253,7 @@ function(lite_cc_test TARGET) LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} CV_DEPS ${args_CV_DEPS} + MLU_DEPS ${args_MLU_DEPS} ) _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS}) # strip binary target to reduce size @@ -269,6 +278,7 @@ set(cuda_kernels CACHE INTERNAL "cuda kernels") set(fpga_kernels CACHE INTERNAL "fpga kernels") set(npu_kernels CACHE INTERNAL "npu kernels") set(xpu_kernels CACHE INTERNAL "xpu kernels") +set(mlu_kernels CACHE INTERNAL "mlu kernels") set(bm_kernels CACHE INTERNAL "bm kernels") set(opencl_kernels CACHE INTERNAL "opencl kernels") set(host_kernels CACHE INTERNAL "host kernels") @@ -285,12 +295,12 @@ if(LITE_BUILD_TAILOR) file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list) endif() # add a kernel for some specific device -# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA, BM) +# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM) # level: one of (basic, extra) function(add_kernel TARGET device level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -369,6 +379,12 @@ function(add_kernel TARGET device level) endif() set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "") endif() + if ("${device}" STREQUAL "MLU") + if (NOT LITE_WITH_MLU) + return() + endif() + set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "") + endif() if ("${device}" STREQUAL "OPENCL") if (NOT LITE_WITH_OPENCL) foreach(src ${args_SRCS}) @@ -409,6 +425,7 @@ function(add_kernel TARGET device level) NPU_DEPS ${args_NPU_DEPS} XPU_DEPS ${args_XPU_DEPS} BM_DEPS ${args_BM_DEPS} + MLU_DEPS ${args_MLU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -427,7 +444,7 @@ endif() function(add_operator TARGET level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS 
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -462,6 +479,7 @@ function(add_operator TARGET level) NPU_DEPS ${args_NPU_DEPS} XPU_DEPS ${args_XPU_DEPS} BM_DEPS ${args_BM_DEPS} + MLU_DEPS ${args_MLU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index 9f7aa73eb5..e7c4e5fcc5 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -10,6 +10,7 @@ message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}") message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}") message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}") message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") +message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}") message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}") message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}") message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}") diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index f33140bada..4ce95776f3 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -67,7 +67,8 @@ if (WITH_TESTING) CUDA_DEPS ${cuda_kernels} X86_DEPS ${x86_kernels} XPU_DEPS ${xpu_kernels} - BM_DEPS ${bm_kernels}) + BM_DEPS ${bm_kernels} + MLU_DEPS ${mlu_kernels}) endif() if(LITE_WITH_FPGA) set(light_api_deps ${light_api_deps} ${fpga_deps}) @@ -89,6 +90,7 @@ message(STATUS "get NPU kernels ${npu_kernels}") message(STATUS "get XPU kernels ${xpu_kernels}") message(STATUS "get FPGA kernels ${fpga_kernels}") message(STATUS "get BM kernels ${bm_kernels}") +message(STATUS "get MLU kernels ${mlu_kernels}") # for full api if (NOT LITE_ON_TINY_PUBLISH) @@ -126,7 +128,8 @@ lite_cc_library(light_api SRCS light_api.cc XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} - BM_DEPS ${bm_kernels}) + BM_DEPS ${bm_kernels} + MLU_DEPS ${mlu_kernels}) include(ExternalProject) set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING @@ -145,6 +148,7 @@ if(WITH_TESTING) CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} + MLU_DEPS ${mlu_kernels} EXCLUDE_COMPILE_DEPS "ON" ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) @@ -291,6 +295,7 @@ lite_cc_test(test_apis SRCS apis_test.cc XPU_DEPS ${xpu_kernels} FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} + MLU_DEPS ${mlu_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) @@ -328,6 +333,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle X86_DEPS ${x86_kernels} FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} + MLU_DEPS ${mlu_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL) if (WITH_TESTING) add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz) @@ -341,6 +347,7 @@ if(NOT IOS) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} @@ -353,6 +360,7 @@ if(NOT IOS) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} @@ -365,6 +373,7 @@ if(NOT IOS) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} @@ -377,6 +386,7 @@ 
if(NOT IOS) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} @@ -388,6 +398,7 @@ if(NOT IOS) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc index 133b7f7ccf..ccd7c98138 100644 --- a/lite/api/cxx_api_impl.cc +++ b/lite/api/cxx_api_impl.cc @@ -42,6 +42,15 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { } } #endif +#ifdef LITE_WITH_MLU + Env<TARGET(kMLU)>::Init(); + lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(), + config.mlu_core_number(), + config.mlu_use_first_conv(), + config.mlu_first_conv_mean(), + config.mlu_first_conv_std(), + config.mlu_input_layout()); +#endif // LITE_WITH_MLU std::vector<std::string> passes{}; auto use_layout_preprocess_pass = config.model_dir().find("OPENCL_PRE_PRECESS"); diff --git a/lite/api/opt.cc b/lite/api/opt.cc index 12003050af..7a8cd7f1ef 100644 --- a/lite/api/opt.cc +++ b/lite/api/opt.cc @@ -109,6 +109,8 @@ std::vector<Place> ParserValidPlaces() { valid_places.emplace_back(TARGET(kNPU)); } else if (target_repr == "xpu") { valid_places.emplace_back(TARGET(kXPU)); + } else if (target_repr == "mlu") { + valid_places.emplace_back(TARGET(kMLU)); } else { LOG(FATAL) << lite::string_format( "Wrong target '%s' found, please check the command flag " diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc index ab23918240..daef2c66dd 100644 --- a/lite/api/paddle_api.cc +++ b/lite/api/paddle_api.cc @@ -204,6 +204,39 @@ void ConfigBase::set_threads(int threads) { #endif } +#ifdef LITE_WITH_MLU +void CxxConfig::set_mlu_core_version(lite_api::MLUCoreVersion core_version) { + mlu_core_version_ = core_version; +} +void CxxConfig::set_mlu_core_number(int core_number) { + mlu_core_number_ = core_number; +} +void CxxConfig::set_mlu_input_layout(DataLayoutType layout) { + mlu_input_layout_ = layout; +} +void CxxConfig::set_mlu_use_first_conv(bool use_first_conv) { + mlu_use_first_conv_ = use_first_conv; +} +void CxxConfig::set_mlu_first_conv_mean(const std::vector<float> &mean) { + mlu_first_conv_mean_ = mean; +} +void CxxConfig::set_mlu_first_conv_std(const std::vector<float> &std) { + mlu_first_conv_std_ = std; +} +lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const { + return mlu_core_version_; +} +int CxxConfig::mlu_core_number() const { return mlu_core_number_; } +DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; } +bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; } +const std::vector<float> &CxxConfig::mlu_first_conv_mean() const { + return mlu_first_conv_mean_; +} +const std::vector<float> &CxxConfig::mlu_first_conv_std() const { + return mlu_first_conv_std_; +} +#endif + void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) { #ifdef LITE_WITH_XPU lite::Context<TargetType::kXPU>::SetWorkspaceL3Size(l3_size); diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index 82cfc75eb2..ce0f0e15d8 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -136,6 +136,14 @@ class LITE_API CxxConfig : public ConfigBase { #ifdef LITE_WITH_X86 int x86_math_library_math_threads_ = 1; #endif +#ifdef LITE_WITH_MLU + lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270}; + int mlu_core_number_{1}; + DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)}; + bool mlu_use_first_conv_{false}; + std::vector<float>
mlu_first_conv_mean_; + std::vector<float> mlu_first_conv_std_; +#endif public: void set_valid_places(const std::vector<Place>& x) { valid_places_ = x; } @@ -163,6 +171,32 @@ class LITE_API CxxConfig : public ConfigBase { return x86_math_library_math_threads_; } #endif + +#ifdef LITE_WITH_MLU + // set MLU core version, which is used when compiling MLU kernels + void set_mlu_core_version(lite_api::MLUCoreVersion core_version); + // set MLU core number, which is used when compiling MLU kernels + void set_mlu_core_number(int core_number); + // set MLU input layout. User can specify layout of input data to be NHWC, + // default is NCHW + void set_mlu_input_layout(DataLayoutType layout); + // whether to use MLU's first conv kernel. First conv is a special kernel + // provided by MLU, its input is uint8, and it also needs two 3-dimensional + // vectors which save all inputs' mean and std values + void set_mlu_use_first_conv(bool use_first_conv); + // set the 3-dimensional mean vector used by MLU's first conv + void set_mlu_first_conv_mean(const std::vector<float>& mean); + // set the 3-dimensional std vector used by MLU's first conv + void set_mlu_first_conv_std(const std::vector<float>& std); + + lite_api::MLUCoreVersion mlu_core_version() const; + int mlu_core_number() const; + DataLayoutType mlu_input_layout() const; + bool mlu_use_first_conv() const; + const std::vector<float>& mlu_first_conv_mean() const; + const std::vector<float>& mlu_first_conv_std() const; +#endif + // XPU only, set the size of the workspace memory from L3 cache for the // current thread. void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00); diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc index dba65656cb..aceb047b64 100644 --- a/lite/api/paddle_place.cc +++ b/lite/api/paddle_place.cc @@ -71,7 +71,8 @@ const std::string& TargetToStr(TargetType target) { "fpga", "npu", "xpu", - "bm"}; + "bm", + "mlu"}; auto x = static_cast<int>(target); CHECK_LT(x, static_cast<int>(TARGET(NUM))); return target2string[x]; @@ -111,7 +112,8 @@ const std::string& TargetRepr(TargetType target) { "kFPGA", "kNPU", "kXPU", - "kBM"}; + "kBM", + "kMLU"}; auto x = static_cast<int>(target); CHECK_LT(x, static_cast<int>(TARGET(NUM))); @@ -153,6 +155,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) { TARGET(kNPU), TARGET(kXPU), TARGET(kBM), + TARGET(kMLU), TARGET(kFPGA)}); if (target == TARGET(kAny)) { return valid_set; diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h index c9e4597839..f57b9832f2 100644 --- a/lite/api/paddle_place.h +++ b/lite/api/paddle_place.h @@ -53,8 +53,8 @@ enum class TargetType : int { kNPU = 8, kXPU = 9, kBM = 10, - kAny = 6, // any target kMLU = 11, + kAny = 6, // any target NUM = 12, // number of fields.
}; enum class PrecisionType : int { @@ -89,6 +89,8 @@ typedef enum { LITE_POWER_RAND_LOW = 5 } PowerMode; +typedef enum { MLU_220 = 0, MLU_270 = 1 } MLUCoreVersion; + enum class ActivationType : int { kIndentity = 0, kRelu = 1, diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index ad34fe77ee..219952bd2a 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -45,6 +45,8 @@ USE_MIR_PASS(memory_optimize_pass); USE_MIR_PASS(elementwise_mul_constant_eliminate_pass) USE_MIR_PASS(npu_subgraph_pass); USE_MIR_PASS(xpu_subgraph_pass); +USE_MIR_PASS(mlu_subgraph_pass); +USE_MIR_PASS(mlu_postprocess_pass); USE_MIR_PASS(weight_quantization_preprocess_pass); USE_MIR_PASS(quantized_op_attributes_inference_pass); USE_MIR_PASS(__xpu__resnet_fuse_pass); diff --git a/lite/api/python/pybind/pybind.cc b/lite/api/python/pybind/pybind.cc index e86d570e18..5512e7bc43 100644 --- a/lite/api/python/pybind/pybind.cc +++ b/lite/api/python/pybind/pybind.cc @@ -47,6 +47,7 @@ using lite_api::TargetType; using lite_api::PrecisionType; using lite_api::DataLayoutType; using lite_api::Place; +using lite_api::MLUCoreVersion; using lite::LightPredictorImpl; using lite_api::OptBase; @@ -76,6 +77,7 @@ static void BindLiteMobileConfig(py::module *m); static void BindLitePowerMode(py::module *m); static void BindLitePlace(py::module *m); static void BindLiteTensor(py::module *m); +static void BindLiteMLUCoreVersion(py::module *m); void BindLiteApi(py::module *m) { BindLiteCxxConfig(m); @@ -83,6 +85,7 @@ void BindLiteApi(py::module *m) { BindLitePowerMode(m); BindLitePlace(m); BindLiteTensor(m); + BindLiteMLUCoreVersion(m); #ifndef LITE_ON_TINY_PUBLISH BindLiteCxxPredictor(m); #endif @@ -124,6 +127,14 @@ void BindLiteCxxConfig(py::module *m) { .def("set_power_mode", &CxxConfig::set_power_mode) .def("power_mode", &CxxConfig::power_mode); #endif +#ifdef LITE_WITH_MLU + cxx_config.def("set_mlu_core_version", &CxxConfig::set_mlu_core_version) + .def("set_mlu_core_number", &CxxConfig::set_mlu_core_number) + .def("set_mlu_input_layout", &CxxConfig::set_mlu_input_layout) + .def("set_mlu_use_first_conv", &CxxConfig::set_mlu_use_first_conv) + .def("set_mlu_first_conv_mean", &CxxConfig::set_mlu_first_conv_mean) + .def("set_mlu_first_conv_std", &CxxConfig::set_mlu_first_conv_std); +#endif } // TODO(sangoly): Should MobileConfig be renamed to LightConfig ?? 
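Taken together, the MLU options above extend CxxConfig on both the C++ side and, via the pybind bindings below, the Python side. A minimal C++ sketch of how a client would drive the new API (the model path, place list, and mean/std values are illustrative, not taken from this patch):

#include <vector>
#include "paddle_api.h"  // CxxConfig, Place, CreatePaddlePredictor

int main() {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir("./mobilenet_v1");  // illustrative model path
  config.set_valid_places(
      {paddle::lite_api::Place{TARGET(kMLU), PRECISION(kFloat), DATALAYOUT(kNHWC)},
       paddle::lite_api::Place{TARGET(kX86), PRECISION(kFloat)}});
  // Knobs introduced by this patch:
  config.set_mlu_core_version(paddle::lite_api::MLUCoreVersion::MLU_270);
  config.set_mlu_core_number(1);
  config.set_mlu_input_layout(DATALAYOUT(kNHWC));
  // First-conv mode consumes uint8 input plus 3-element mean/std vectors.
  config.set_mlu_use_first_conv(true);
  config.set_mlu_first_conv_mean({124.0f, 117.0f, 104.0f});  // illustrative values
  config.set_mlu_first_conv_std({59.0f, 57.0f, 57.0f});      // illustrative values
  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
  return 0;
}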
@@ -155,6 +166,12 @@ void BindLitePowerMode(py::module *m) { .value("LITE_POWER_RAND_LOW", PowerMode::LITE_POWER_RAND_LOW); } +void BindLiteMLUCoreVersion(py::module *m) { + py::enum_<MLUCoreVersion>(*m, "MLUCoreVersion") + .value("LITE_MLU_220", MLUCoreVersion::MLU_220) + .value("LITE_MLU_270", MLUCoreVersion::MLU_270); +} + void BindLitePlace(py::module *m) { // TargetType py::enum_<TargetType>(*m, "TargetType") @@ -165,6 +182,7 @@ void BindLitePlace(py::module *m) { .value("OpenCL", TargetType::kOpenCL) .value("FPGA", TargetType::kFPGA) .value("NPU", TargetType::kNPU) + .value("MLU", TargetType::kMLU) .value("Any", TargetType::kAny); // PrecisionType @@ -245,6 +263,20 @@ void BindLiteTensor(py::module *m) { DO_GETTER_ONCE(data_type__, name__##_data) DATA_GETTER_SETTER_ONCE(int8_t, int8); +#ifdef LITE_WITH_MLU + tensor.def("set_uint8_data", + [](Tensor &self, + const std::vector<uint8_t> &data, + TargetType type = TargetType::kHost) { + if (type == TargetType::kHost) { + self.CopyFromCpu<uint8_t, TargetType::kHost>(data.data()); + } + }, + py::arg("data"), + py::arg("type") = TargetType::kHost); + + DO_GETTER_ONCE(uint8_t, "uint8_data"); +#endif DATA_GETTER_SETTER_ONCE(int32_t, int32); DATA_GETTER_SETTER_ONCE(float, float); #undef DO_GETTER_ONCE diff --git a/lite/backends/CMakeLists.txt b/lite/backends/CMakeLists.txt index e351746481..fb459ae362 100644 --- a/lite/backends/CMakeLists.txt +++ b/lite/backends/CMakeLists.txt @@ -6,4 +6,5 @@ add_subdirectory(fpga) add_subdirectory(host) add_subdirectory(npu) add_subdirectory(xpu) +add_subdirectory(mlu) add_subdirectory(bm) diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index f75f87660a..278f971b0b 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -8,7 +8,8 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc XPU_DEPS target_wrapper_xpu CL_DEPS cl_target_wrapper FPGA_DEPS fpga_target_wrapper - BM_DEPS target_wrapper_bm) + BM_DEPS target_wrapper_bm + MLU_DEPS target_wrapper_mlu) lite_cc_library(memory SRCS memory.cc DEPS target_wrapper CL_DEPS cl_target_wrapper) diff --git a/lite/core/arena/CMakeLists.txt b/lite/core/arena/CMakeLists.txt index 0f3f36768b..afc1040736 100644 --- a/lite/core/arena/CMakeLists.txt +++ b/lite/core/arena/CMakeLists.txt @@ -6,5 +6,5 @@ endif() lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest) if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) - lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() diff --git a/lite/core/context.h b/lite/core/context.h index bdf0a93c90..061638d63f 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -24,6 +24,11 @@ #include "lite/backends/opencl/cl_context.h" #include "lite/backends/opencl/cl_runtime.h" #endif +#ifdef LITE_WITH_MLU +#include <cnml.h> +#include <cnrt.h> +#include "lite/backends/mlu/mlu_utils.h" +#endif #ifdef LITE_WITH_XPU #include "lite/backends/xpu/xpu_header_sitter.h" #endif @@ -202,6 +207,85 @@ class Context { }; #endif +#ifdef LITE_WITH_MLU +template <> +class Context<TargetType::kMLU> { + public: + typename Env<TargetType::kMLU>::Devs& devs = Env<TargetType::kMLU>::Global(); + + void InitOnce() {} + + MLUContext& operator=(const MLUContext& ctx) { + this->Init(ctx.device_id_, ctx.exec_queue_id_, ctx.io_queue_id_); + return
*this; + } + + void Init(int dev_id, int exec_queue_id = 0, int io_queue_id = 0) { + CHECK_GT(devs.size(), 0UL) + << "Env is not initialized or current target is not exit!"; + if (dev_id >= static_cast(devs.size())) { + LOG(WARNING) << "device index exceeds the number of devices, set to " + "default device(0)!"; + device_id_ = 0; + } else { + device_id_ = dev_id; + } + SetMluDevice(device_id_); + if (io_queue_id >= devs[dev_id].max_queue()) { + LOG(WARNING) << "data queue index exceeds the maximum queue number, " + "set to default qeueu(0)!"; + io_queue_id = 0; + } + if (exec_queue_id >= devs[dev_id].max_queue()) { + LOG(WARNING) << "exec queue index exceeds the maximum queue number, " + "set to default qeueu(0)!"; + exec_queue_id = 0; + } + io_queue_ = devs[dev_id].io_queues()[io_queue_id]; + exec_queue_ = devs[dev_id].exec_queues()[exec_queue_id]; + + exec_queue_id_ = exec_queue_id; + io_queue_id_ = io_queue_id; + } + + void CopySharedTo(MLUContext* ctx) { ctx->forward_param_ = forward_param_; } + + const cnrtQueue_t& exec_queue() const { return exec_queue_; } + void SetExecQueue(cnrtQueue_t queue) { exec_queue_ = queue; } + + const cnrtQueue_t& io_queue() const { return io_queue_; } + void SetIoQueue(cnrtQueue_t queue) { io_queue_ = queue; } + + cnmlCoreVersion_t MLUCoreVersion() { + return DeviceInfo::Global().MLUCoreVersion(); + } + + int MLUCoreNumber() { return DeviceInfo::Global().MLUCoreNumber(); } + + u32_t affinity() { return affinity_; } + + cnrtInvokeFuncParam_t forward_param() { return forward_param_; } + + int device_id() { return device_id_; } + + std::string name() const { return "MLUContext"; } + + private: + int device_id_; + // overall information + int exec_queue_id_; + int io_queue_id_; + cnrtQueue_t io_queue_; + cnrtQueue_t exec_queue_; + + std::vector input_notifiers_; + std::vector output_notifiers_; + + cnrtInvokeFuncParam_t forward_param_; + u32_t affinity_ = 0x01; +}; +#endif // LITE_WITH_MLU + #ifdef LITE_WITH_CUDA // Only works with CUDA kernels. 
template <> @@ -428,6 +512,16 @@ class ContextScheduler { kernel_contexts_[TargetType::kBM].As<BMContext>().CopySharedTo( &ctx->As<BMContext>()); break; +#endif +#ifdef LITE_WITH_MLU + case TARGET(kMLU): { + int dev_id = TargetWrapper<TargetType::kMLU>::GetCurDevice(); + auto& context = ctx->As<MLUContext>(); + context.Init(dev_id); + kernel_contexts_[TargetType::kMLU].As<MLUContext>().CopySharedTo( + &context); + LOG(INFO) << "New Context for MLU"; + } break; #endif default: #if (!defined LITE_ON_MODEL_OPTIMIZE_TOOL) && (!defined LITE_WITH_PYTHON) @@ -469,6 +563,9 @@ class ContextScheduler { #endif #ifdef LITE_WITH_BM InitContext<TargetType::kBM, BMContext>(); +#endif +#ifdef LITE_WITH_MLU + InitContext<TargetType::kMLU, MLUContext>(); #endif } diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc index 6e0d743fb9..29ac96ed74 100644 --- a/lite/core/device_info.cc +++ b/lite/core/device_info.cc @@ -58,7 +58,7 @@ namespace paddle { namespace lite { -#ifdef LITE_WITH_ARM +#if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU)) thread_local lite_api::PowerMode DeviceInfo::mode_; thread_local ARMArch DeviceInfo::arch_; thread_local int DeviceInfo::mem_size_; @@ -66,6 +66,15 @@ thread_local std::vector<int> DeviceInfo::active_ids_; thread_local TensorLite DeviceInfo::workspace_; thread_local int64_t DeviceInfo::count_ = 0; +#ifdef LITE_WITH_MLU +thread_local cnmlCoreVersion_t DeviceInfo::mlu_core_version_{CNML_MLU270}; +thread_local int DeviceInfo::mlu_core_number_{1}; +thread_local bool DeviceInfo::use_first_conv_{false}; +thread_local std::vector<float> DeviceInfo::mean_vec_; +thread_local std::vector<float> DeviceInfo::std_vec_; +thread_local DataLayoutType DeviceInfo::input_layout_{DATALAYOUT(kNCHW)}; +#endif + #ifdef TARGET_IOS const int DEFAULT_L1_CACHE_SIZE = 64 * 1024; const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024; @@ -1080,6 +1089,45 @@ int DeviceInfo::Setup() { return 0; } +#ifdef LITE_WITH_MLU +void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version, + int core_number, + bool use_first_conv, + const std::vector<float>& mean_vec, + const std::vector<float>& std_vec, + DataLayoutType input_layout) { + switch (core_version) { + case (lite_api::MLUCoreVersion::MLU_220): + mlu_core_version_ = CNML_MLU220; + break; + case (lite_api::MLUCoreVersion::MLU_270): + mlu_core_version_ = CNML_MLU270; + break; + default: + mlu_core_version_ = CNML_MLU270; + break; + } + mlu_core_number_ = core_number; + use_first_conv_ = use_first_conv; + mean_vec_ = mean_vec; + std_vec_ = std_vec; + input_layout_ = input_layout; +} + +cnmlCoreVersion_t DeviceInfo::MLUCoreVersion() { return mlu_core_version_; } + +int DeviceInfo::MLUCoreNumber() { return mlu_core_number_; } + +bool DeviceInfo::UseFirstConv() { return use_first_conv_; } + +const std::vector<float>& DeviceInfo::MeanVec() const { return mean_vec_; } + +const std::vector<float>& DeviceInfo::StdVec() const { return std_vec_; } + +DataLayoutType DeviceInfo::InputLayout() const { return input_layout_; } + +#endif // LITE_WITH_MLU + void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) { #ifdef ARM_WITH_OMP thread_num = std::min(thread_num, core_num_); @@ -1159,6 +1207,39 @@ bool DeviceInfo::ExtendWorkspace(size_t size) { #endif // LITE_WITH_ARM +#ifdef LITE_WITH_MLU +void SetMluDevice(int device_id) { + LOG(INFO) << "Set mlu device " << device_id; + cnrtDev_t dev_handle; + CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, device_id)); + CNRT_CALL(cnrtSetCurrentDevice(dev_handle)); +} + +void Device<TargetType::kMLU>::Init() { + SetMluDevice(idx_); + GetInfo(); + CreateQueue(); +} + +void Device<TargetType::kMLU>::GetInfo() {} + +void Device<TargetType::kMLU>::CreateQueue() { + exec_queue_.clear(); + io_queue_.clear(); + for (size_t i = 0; i
< max_queue_; ++i) { + cnrtQueue_t exec_queue; + cnrtQueue_t io_queue; + cnrtCreateQueue(&exec_queue); + cnrtCreateQueue(&io_queue); + exec_queue_.push_back(exec_queue); + io_queue_.push_back(io_queue); + + cnrtCreateQueue(&exec_queue); + exec_queue_.push_back(exec_queue); + } +} +#endif // LITE_WITH_MLU + #ifdef LITE_WITH_CUDA void Device<TargetType::kCUDA>::Init() { diff --git a/lite/core/device_info.h b/lite/core/device_info.h index 5727933f47..a108ae3d4b 100644 --- a/lite/core/device_info.h +++ b/lite/core/device_info.h @@ -19,11 +19,14 @@ #include #include "lite/core/tensor.h" #include "lite/utils/cp_logging.h" +#ifdef LITE_WITH_MLU +#include "lite/backends/mlu/mlu_utils.h" +#endif namespace paddle { namespace lite { -#ifdef LITE_WITH_ARM +#if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU)) typedef enum { kAPPLE = 0, @@ -52,6 +55,20 @@ class DeviceInfo { int Setup(); void SetRunMode(lite_api::PowerMode mode, int thread_num); +#ifdef LITE_WITH_MLU + void SetMLURunMode(lite_api::MLUCoreVersion core_version, + int core_number, + bool use_first_conv, + const std::vector<float>& mean_vec, + const std::vector<float>& std_vec, + DataLayoutType input_layout); + cnmlCoreVersion_t MLUCoreVersion(); + int MLUCoreNumber(); + bool UseFirstConv(); + const std::vector<float>& MeanVec() const; + const std::vector<float>& StdVec() const; + DataLayoutType InputLayout() const; +#endif void SetCache(int l1size, int l2size, int l3size); void SetArch(ARMArch arch) { arch_ = arch; } @@ -103,6 +120,15 @@ class DeviceInfo { static thread_local TensorLite workspace_; static thread_local int64_t count_; +#ifdef LITE_WITH_MLU + static thread_local cnmlCoreVersion_t mlu_core_version_; + static thread_local int mlu_core_number_; + static thread_local bool use_first_conv_; + static thread_local std::vector<float> mean_vec_; + static thread_local std::vector<float> std_vec_; + static thread_local DataLayoutType input_layout_; +#endif + void SetDotInfo(int argc, ...); void SetFP16Info(int argc, ...); void SetFP32Info(int argc, ...); @@ -134,6 +160,9 @@ class Env { return *devs; } static void Init(int max_stream = 4) { +#ifdef LITE_WITH_MLU + CNRT_CALL(cnrtInit(0)); +#endif Devs& devs = Global(); if (devs.size() > 0) { return; } @@ -156,6 +185,41 @@ class Env { } }; +#ifdef LITE_WITH_MLU +void SetMluDevice(int device_id); + +template <> +class Device<TargetType::kMLU> { + public: + Device(int dev_id, int max_queue = 1) : idx_(dev_id), max_queue_(max_queue) {} + void Init(); + + int id() { return idx_; } + int max_queue() { return max_queue_; } + void SetId(int idx) { idx_ = idx; } + std::string name() { return "MLU"; } + int core_num() { return 16; } + float max_memory() { return 16 * 1024; } + std::vector<cnrtQueue_t> io_queues() { return io_queue_; } + std::vector<cnrtQueue_t> exec_queues() { return exec_queue_; } + + private: + void CreateQueue(); + void GetInfo(); + + private: + int idx_{0}; + int max_queue_; + std::string device_name_; + float max_memory_; + + std::vector<cnrtQueue_t> io_queue_; + std::vector<cnrtQueue_t> exec_queue_; +}; + +template class Env<TargetType::kMLU>; +#endif // LITE_WITH_MLU + #ifdef LITE_WITH_CUDA template <> class Device<TargetType::kCUDA> { diff --git a/lite/core/kernel.h b/lite/core/kernel.h index 18a1243c11..ff848dae9e 100644 --- a/lite/core/kernel.h +++ b/lite/core/kernel.h @@ -83,6 +83,9 @@ class KernelBase { #if defined(LITE_WITH_CUDA) WorkSpace::Global_CUDA().AllocReset(); #endif +#if defined(LITE_WITH_MLU) + WorkSpace::Global_MLU().AllocReset(); +#endif #ifdef LITE_WITH_PROFILE profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get()); profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get()); diff --git
a/lite/core/memory.cc b/lite/core/memory.cc index f0b00140ec..1f2f7fed7d 100644 --- a/lite/core/memory.cc +++ b/lite/core/memory.cc @@ -45,6 +45,11 @@ void* TargetMalloc(TargetType target, size_t size) { data = TargetWrapper<TARGET(kBM)>::Malloc(size); break; #endif +#ifdef LITE_WITH_MLU + case TargetType::kMLU: + data = TargetWrapper<TARGET(kMLU)>::Malloc(size); + break; +#endif // LITE_WITH_MLU #ifdef LITE_WITH_XPU case TargetType::kXPU: data = TargetWrapperXPU::Malloc(size); break; @@ -88,6 +93,11 @@ void TargetFree(TargetType target, void* data, std::string free_flag) { TargetWrapper<TARGET(kBM)>::Free(data); break; #endif +#ifdef LITE_WITH_MLU + case TargetType::kMLU: + TargetWrapper<TARGET(kMLU)>::Free(data); + break; +#endif // LITE_WITH_MLU #ifdef LITE_WITH_XPU case TargetType::kXPU: TargetWrapperXPU::Free(data); break; @@ -124,6 +134,12 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) { TargetWrapper<TARGET(kCUDA)>::MemcpySync(dst, src, size, IoDirection::DtoD); break; #endif +#ifdef LITE_WITH_MLU + case TargetType::kMLU: + TargetWrapper<TARGET(kMLU)>::MemcpySync( + dst, src, size, IoDirection::HtoD); + break; +#endif #ifdef LITE_WITH_OPENCL case TargetType::kOpenCL: TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD); diff --git a/lite/core/memory.h b/lite/core/memory.h index 20780ea17a..a101391001 100644 --- a/lite/core/memory.h +++ b/lite/core/memory.h @@ -31,6 +31,10 @@ #include "lite/backends/bm/target_wrapper.h" #endif // LITE_WITH_BM +#ifdef LITE_WITH_MLU +#include "lite/backends/mlu/target_wrapper.h" +#endif // LITE_WITH_MLU + #ifdef LITE_WITH_XPU #include "lite/backends/xpu/target_wrapper.h" #endif // LITE_WITH_XPU @@ -79,6 +83,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) { TargetWrapperCL::MemcpySync(dst, src, size, dir); break; #endif // LITE_WITH_OPENCL +#ifdef LITE_WITH_MLU + case TARGET(kMLU): + TargetWrapperMlu::MemcpySync(dst, src, size, dir); + break; +#endif #ifdef LITE_WITH_FPGA case TARGET(kFPGA): TargetWrapper<TARGET(kFPGA)>::MemcpySync(dst, src, size, dir); diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt index 4685bea9d7..91accc907e 100644 --- a/lite/core/mir/CMakeLists.txt +++ b/lite/core/mir/CMakeLists.txt @@ -37,6 +37,7 @@ lite_cc_library(mir_passes demo_pass.cc runtime_context_assign_pass.cc memory_optimize_pass.cc + mlu_postprocess_pass.cc weight_quantization_preprocess_pass.cc quantized_op_attributes_inference_pass.cc DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs}) diff --git a/lite/core/mir/mlu_postprocess_pass.cc b/lite/core/mir/mlu_postprocess_pass.cc index d6240888d0..15f62f36b0 100644 --- a/lite/core/mir/mlu_postprocess_pass.cc +++ b/lite/core/mir/mlu_postprocess_pass.cc @@ -15,7 +15,6 @@ #include "lite/core/mir/mlu_postprocess_pass.h" #include #include -#include #include #include #include @@ -50,10 +49,9 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, op_desc.SetAttr("out_dtype", 4); // FP16 op_desc.SetInput("X", {cur_node->AsArg().name}); op_desc.SetOutput("Out", {cast_arg_name}); - } else if (op_type == "transpose") { + } else if (op_type == "layout") { // NCHW -> NHWC - op_desc.SetAttr<std::vector<int>>("axis", {0, 2, 3, 1}); - op_desc.SetInput("X", {cur_node->AsArg().name}); + op_desc.SetInput("Input", {cur_node->AsArg().name}); op_desc.SetOutput("Out", {cast_arg_name}); } else if (op_type == "io_copy") { op_desc.SetInput("Input", {cur_node->AsArg().name}); op_desc.SetOutput("Out", {cast_arg_name}); @@ -72,8 +70,15 @@ if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) { is_found = true; }
else if (op_type == "transpose") { - is_found = true; + } else if (op_type == "layout") { + const Type* in_arg_ty = kernel->GetInputDeclType("Input"); + const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); + if (DataLayoutCompatible(*in_arg_ty, *cur_node->AsArg().type) && + DataLayoutCompatible(*out_arg_ty, *cast_type) && + // for first conv + PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) { + is_found = true; + } } else if (op_type == "io_copy") { const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); @@ -89,8 +94,13 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, // we pick the kernel cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op); auto& stmt = cast_inst->AsStmt(); - stmt.picked_kernel().SetContext( - ContextScheduler::Global().NewContext(stmt.picked_kernel().target())); + if (op_type == "layout") { + stmt.picked_kernel().SetContext( + ContextScheduler::Global().NewContext(TARGET(kX86))); + } else { + stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + stmt.picked_kernel().target())); + } break; } } @@ -113,7 +123,7 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, cast_arg->AsArg().type = cast_type; auto* var = inst_node->AsStmt().op()->scope()->Var(cast_arg_name); // for CastAfter manully set the tensor's type - var->GetMutable<::paddle::lite::Tensor>(); + var->GetMutable(); // create the stmt node auto* cast_inst = graph->NewInstructNode(); @@ -127,10 +137,9 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, op_desc.SetAttr("out_dtype", 5); // FP16 op_desc.SetInput("X", {cast_arg_name}); op_desc.SetOutput("Out", {cur_node->AsArg().name}); - } else if (op_type == "transpose") { + } else if (op_type == "layout") { // NHWC -> NCHW - op_desc.SetAttr>("axis", {0, 3, 1, 2}); - op_desc.SetInput("X", {cast_arg_name}); + op_desc.SetInput("Input", {cast_arg_name}); op_desc.SetOutput("Out", {cur_node->AsArg().name}); } else if (op_type == "io_copy") { op_desc.SetInput("Input", {cast_arg_name}); @@ -151,8 +160,13 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, if (PrecisionCompatibleTo(*in_arg_ty, *cast_type)) { is_found = true; } - } else if (op_type == "transpose") { - is_found = true; + } else if (op_type == "layout") { + const Type* in_arg_ty = kernel->GetInputDeclType("Input"); + const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); + if (DataLayoutCompatible(*in_arg_ty, *cast_type) && + DataLayoutCompatible(*out_arg_ty, *cur_node->AsArg().type)) { + is_found = true; + } } else if (op_type == "io_copy") { const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); @@ -168,8 +182,13 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, // we pick the kernel cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op); auto& stmt = cast_inst->AsStmt(); - stmt.picked_kernel().SetContext( - ContextScheduler::Global().NewContext(stmt.picked_kernel().target())); + if (op_type == "layout") { + stmt.picked_kernel().SetContext( + ContextScheduler::Global().NewContext(TARGET(kX86))); + } else { + stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + stmt.picked_kernel().target())); + } break; } } @@ -193,24 +212,28 @@ void MLUPostprocessPass::InsertBefore(SSAGraph* graph, auto* cur_node = head_node; const auto name_prefix = head_node->AsArg().name + string_format("_%p", inst_node) + 
"/trans_"; + bool is_first_conv_head = + std::find(first_conv_nodes_.begin(), + first_conv_nodes_.end(), + head_node->AsArg().name) != first_conv_nodes_.end(); - // layout cast node - if (head_type->layout() != inst_type->layout()) { + // precision cast node + if (head_type->precision() != inst_type->precision() && !is_first_conv_head) { cur_node = InsertCastBefore( - "transpose", - name_prefix + "transpose", + "cast", + name_prefix + "cast", graph, cur_node, inst_node, LiteType::GetTensorTy( - head_type->target(), head_type->precision(), inst_type->layout())); + head_type->target(), inst_type->precision(), head_type->layout())); } - // precision cast node - if (head_type->precision() != inst_type->precision()) { + // layout cast node + if (head_type->layout() != inst_type->layout()) { cur_node = InsertCastBefore( - "cast", - name_prefix + "cast", + "layout", + name_prefix + "layout", graph, cur_node, inst_node, @@ -260,7 +283,7 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node, // get subgraph's valid precision const auto& places = graph->valid_places(); - std::set<::paddle::lite_api::PrecisionType> prec_set; + std::set prec_set; for (const auto& place : places) { if (place.target == TARGET(kMLU)) { prec_set.insert(place.precision); @@ -343,23 +366,23 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph, const auto name_prefix = tail_node->AsArg().name + string_format("_%p", inst_node) + "/trans_"; - // layout cast node - if (tail_type->layout() != inst_type->layout()) { + // precision cast node + if (tail_type->precision() != inst_type->precision()) { cur_node = InsertCastAfter( - "transpose", - name_prefix + "transpose", + "cast", + name_prefix + "cast", graph, cur_node, inst_node, LiteType::GetTensorTy( - tail_type->target(), tail_type->precision(), inst_type->layout())); + tail_type->target(), inst_type->precision(), tail_type->layout())); } - // precision cast node - if (tail_type->precision() != inst_type->precision()) { + // layout cast node + if (tail_type->layout() != inst_type->layout()) { cur_node = InsertCastAfter( - "cast", - name_prefix + "cast", + "layout", + name_prefix + "layout", graph, cur_node, inst_node, @@ -392,6 +415,14 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph, auto* sub_block_op_desc = sub_block_desc->GetOp(i); UpdateOutputTo( sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name); + /* graph like this + * subgraph_op_0 + * / \ + * / \ + * subgraph_op_1 host_op + */ + UpdateInputTo( + sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name); } // recreate the op @@ -415,6 +446,56 @@ void MLUPostprocessPass::RecreateOp(Node* inst_node, SSAGraph* graph) { } } +bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) { + auto* block_desc = + static_cast(inst->AsStmt().op().get()) + ->GetSubBlock(); + for (int op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) { + auto op_desc = block_desc->GetOp(op_idx); + CHECK(op_desc); + if (op_desc->Type() == "conv2d") { + for (auto& names : op_desc->inputs()) { + if (std::find(names.second.begin(), + names.second.end(), + arg_node->AsArg().name) != names.second.end()) { + return true; + } + } + } + } + return false; +} + +bool MLUPostprocessPass::IsFirstConvNode(Node* arg_node) { + CHECK(arg_node->IsArg()); + for (auto& inst : arg_node->outlinks) { + if (inst->AsStmt().op_type() == "subgraph") { + return IsFirstConvInSubgraph(arg_node, inst); + } + } + return false; +} + +void MLUPostprocessPass::GatherAndModifyFirstConvNodes(SSAGraph* graph) { + for 
(auto& node : graph->mutable_nodes()) { + if (!node.IsStmt()) continue; + if (node.AsStmt().op_type() == "feed") { + for (auto& out : node.outlinks) { + if (IsFirstConvNode(out)) { + first_conv_nodes_.insert(out->AsArg().name); + // modify first conv nodes' type + const auto* old_type = out->AsArg().type; + out->AsArg().type = + LiteType::GetTensorTy(old_type->target(), + paddle::lite_api::PrecisionType::kInt8, + old_type->layout(), + old_type->device()); + } + } + } + } +} + void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { for (auto& node : graph->mutable_nodes()) { if (!node.IsStmt()) continue; @@ -432,7 +513,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { out->AsArg().type = LiteType::GetTensorTy(old_type->target(), old_type->precision(), - ::paddle::lite_api::DataLayoutType::kNHWC, + paddle::lite_api::DataLayoutType::kNHWC, old_type->device()); } } @@ -451,7 +532,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { inp->AsArg().type = LiteType::GetTensorTy(old_type->target(), old_type->precision(), - ::paddle::lite_api::DataLayoutType::kNHWC, + paddle::lite_api::DataLayoutType::kNHWC, old_type->device()); } } } } void MLUPostprocessPass::Apply(const std::unique_ptr<SSAGraph>& graph) { - // currently for non-persistent input and output args, mlu subgraph op - // only support float16/float32 data type - - // in two situations as folllows: - // 1: feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch; - // arg_in and arg_out are assumed to be NHWC which user should be aware of. - // Thus here we change these args' layout to NHWC - ModifyLayout(graph.get()); +// currently for non-persistent input and output args, mlu subgraph op +// only supports float16/float32 data type + +// in two situations as follows: +// 1: feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch; +// arg_in and arg_out are assumed to be NHWC which user should be aware of.
+// Thus here we change these args' layout to NHWC #ifdef LITE_WITH_MLU + if (lite::DeviceInfo::Global().InputLayout() == DATALAYOUT(kNHWC)) { + ModifyLayout(graph.get()); + } + + if (lite::DeviceInfo::Global().UseFirstConv()) { + GatherAndModifyFirstConvNodes(graph.get()); + } +#endif // insert io_copy, layout and precision cast of subgraph's inputs and outputs for (auto& node : graph->mutable_nodes()) { diff --git a/lite/core/mir/mlu_postprocess_pass.h b/lite/core/mir/mlu_postprocess_pass.h index 8ffcbc952a..688dd06fb5 100644 --- a/lite/core/mir/mlu_postprocess_pass.h +++ b/lite/core/mir/mlu_postprocess_pass.h @@ -15,6 +15,7 @@ #pragma once #include +#include <set> #include #include #include "lite/core/mir/pass.h" @@ -107,6 +108,15 @@ class MLUPostprocessPass : public ProgramPass { const Type* cast_type); void RecreateOp(Node* inst_node, SSAGraph* graph); + + void GatherAndModifyFirstConvNodes(SSAGraph* graph); + + bool IsFirstConvNode(Node* arg_node); + + bool IsFirstConvInSubgraph(Node* arg_node, Node* inst); + + private: + std::set<std::string> first_conv_nodes_; }; } // namespace mir diff --git a/lite/core/mir/ssa_graph.cc b/lite/core/mir/ssa_graph.cc index 95434a3c07..54f5f4d46c 100644 --- a/lite/core/mir/ssa_graph.cc +++ b/lite/core/mir/ssa_graph.cc @@ -64,6 +64,26 @@ std::map<mir::Node *, std::set<mir::Node *>> SSAGraph::BuildOperationAdjList() { return adj_list; } +std::map<mir::Node *, std::set<mir::Node *>> SSAGraph::BuildNodeAdjList() { + std::map<mir::Node *, std::set<mir::Node *>> adj_list; + + for (auto &n : mutable_nodes()) { + if (adj_list.find(&n) == adj_list.end()) { + adj_list[&n] = std::set<mir::Node *>(); + } + std::vector<mir::Node *> nodes; + for (auto &var : n.inlinks) { + nodes.push_back(var); + } + std::sort(nodes.begin(), + nodes.end(), + [](mir::Node *node1, mir::Node *node2) { return node1 > node2; }); + adj_list[&n].insert(std::make_move_iterator(nodes.begin()), + std::make_move_iterator(nodes.end())); + } + return adj_list; +} + void SSAGraph::SortHelper( const std::map<mir::Node *, std::set<mir::Node *>> &adj_list, mir::Node *node, @@ -98,6 +118,24 @@ std::vector<mir::Node *> SSAGraph::StmtTopologicalOrder() { return res; } +std::vector<mir::Node *> SSAGraph::NodeTopologicalOrder() { + CheckBidirectionalConnection(); + + std::stack<mir::Node *> stack; + std::set<mir::Node *> visited; + std::vector<mir::Node *> res; + + auto adj_list = BuildNodeAdjList(); + + for (auto adj : adj_list) { + if (visited.find(adj.first) == visited.end()) { + SortHelper(adj_list, adj.first, &visited, &res); + } + } + + return res; +} + Node *SSAGraph::GraphCreateInstructNode( const std::shared_ptr<OpLite> &op, const std::vector<Place> &valid_places) { node_storage_.emplace_back(); diff --git a/lite/core/mir/ssa_graph.h b/lite/core/mir/ssa_graph.h index b5b9fb1cb2..e2967cf96a 100644 --- a/lite/core/mir/ssa_graph.h +++ b/lite/core/mir/ssa_graph.h @@ -42,6 +42,8 @@ class SSAGraph : GraphBase { std::vector<mir::Node *> StmtTopologicalOrder(); + std::vector<mir::Node *> NodeTopologicalOrder(); + // The inputs of the graph. std::vector<mir::Node *> inputs(); @@ -86,6 +88,9 @@ class SSAGraph : GraphBase { // Build operator inlink edge table. std::map<mir::Node *, std::set<mir::Node *>> BuildOperationAdjList(); // Build node inlink edge table.
+ std::map<mir::Node *, std::set<mir::Node *>> BuildNodeAdjList(); + void SortHelper(const std::map<mir::Node *, std::set<mir::Node *>> &adj_list, mir::Node *node, std::set<mir::Node *> *visited, diff --git a/lite/core/mir/subgraph/subgraph_detector.cc b/lite/core/mir/subgraph/subgraph_detector.cc index 4e1a4ab3bb..b61f7f365f 100644 --- a/lite/core/mir/subgraph/subgraph_detector.cc +++ b/lite/core/mir/subgraph/subgraph_detector.cc @@ -312,8 +312,14 @@ void SubgraphDetector::InitNodes(node_map_t *nodes) { std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubgraphs( node_map_t *nodes) { - for (auto &it : *nodes) { - node_dat_t *node = it.second; + for (auto &ordered_node : graph_->NodeTopologicalOrder()) { + // different orders when traversing nodes in graph may lead to + // different subgraph divisions, which may generate different results + // on devices such as MLU. These different results are all "right" + // but a little confusing. Thus the topological order is used instead + // of the address of the node in graph. + CHECK(nodes->find(ordered_node) != nodes->end()); + node_dat_t *node = (*nodes)[ordered_node]; if (!node->marked) { continue; } @@ -571,13 +577,14 @@ void ExtractInputsOutputs(const std::vector<Node *> &op_nodes, unused_var_nodes->insert(var_node); continue; } - // Var can have more than one next op node, So, if any one in the - // op_nodes then continue - bool next_op_in_nodes = false; + // Var can have more than one next op node, so if all next nodes are in + // op_nodes then it should be put into local_var_nodes + bool next_op_in_nodes = true; for (auto &next_op_node : var_node->outlinks) { - if (std::find(op_nodes.begin(), op_nodes.end(), next_op_node) != + if (std::find(op_nodes.begin(), op_nodes.end(), next_op_node) == op_nodes.end()) { - next_op_in_nodes = true; + next_op_in_nodes = false; + break; } } if (next_op_in_nodes) { diff --git a/lite/core/mir/subgraph/subgraph_pass.cc b/lite/core/mir/subgraph/subgraph_pass.cc index 229941c4b4..eecd9348ae 100644 --- a/lite/core/mir/subgraph/subgraph_pass.cc +++ b/lite/core/mir/subgraph/subgraph_pass.cc @@ -69,6 +69,20 @@ void BMSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { fuser(); } +void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { + std::unordered_set<std::string> supported_lists; +#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); +#include "lite/kernels/mlu/bridges/paddle_use_bridges.h" +#undef USE_SUBGRAPH_BRIDGE + auto teller = [&](Node* node) { + if (!node->IsStmt()) return false; + auto& stmt = node->AsStmt(); + return supported_lists.count(stmt.op_type()) != 0; + }; + SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */); + fuser(); +} + } // namespace mir } // namespace lite } // namespace paddle @@ -79,3 +93,5 @@ REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass) .BindTargets({TARGET(kXPU)}); REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass) .BindTargets({TARGET(kBM)}); +REGISTER_MIR_PASS(mlu_subgraph_pass, paddle::lite::mir::MLUSubgraphPass) + .BindTargets({TARGET(kMLU)}); diff --git a/lite/core/mir/subgraph/subgraph_pass.h b/lite/core/mir/subgraph/subgraph_pass.h index 1ba0f2ab4a..f83448df42 100644 --- a/lite/core/mir/subgraph/subgraph_pass.h +++ b/lite/core/mir/subgraph/subgraph_pass.h @@ -37,6 +37,11 @@ class BMSubgraphPass : public ProgramPass { void Apply(const std::unique_ptr<SSAGraph>& graph) override; }; +class MLUSubgraphPass : public ProgramPass { + public: + void Apply(const std::unique_ptr<SSAGraph>& graph) override; +}; + } // namespace mir } // namespace lite } // namespace paddle diff --git
a/lite/core/mir/subgraph_cast_display_pass.cc b/lite/core/mir/subgraph_cast_display_pass.cc deleted file mode 100644 index 3a2c94d232..0000000000 --- a/lite/core/mir/subgraph_cast_display_pass.cc +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/pass.h" -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -class SubgraphCastDisplayPass : public DebugPass { - public: - void Apply(const std::unique_ptr& graph) override { - VLOG(3) << "== Argument types =="; - for (auto& node : graph->mutable_nodes()) { - if (!node.IsArg()) continue; - - auto* type = node.AsArg().type; - if (type) { - VLOG(3) << "* ARG " << node.AsArg().name << " type: " << *type; - } else { - VLOG(3) << "* ARG " << node.AsArg().name << " type: UNK"; - } - } - VLOG(3) << "---------------------"; - - // - VLOG(0) << "== SubgraphOp Debug Info =="; - for (auto& node : graph->mutable_nodes()) { - if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") { - VLOG(0) << "FOUND SUBGRAPH OP"; - display_debug_info(node, "subgraph"); - break; - } - } - VLOG(0) << "---------------------"; - } - - void display_debug_info(const Node& node, - std::string op_type, - bool display_in_nodes = true, - bool display_out_nodes = true) { - CHECK(node.IsStmt()); - VLOG(0) << node.AsStmt(); - if (display_in_nodes) { - for (auto p_in_arg_node : node.inlinks) { - CHECK(p_in_arg_node->IsArg()); - VLOG(0) << "* ARG[IN] " << p_in_arg_node->AsArg().name - << " type: " << *p_in_arg_node->AsArg().type - << " is_weight: " << p_in_arg_node->AsArg().is_weight - << " is_persist: " << p_in_arg_node->AsArg().is_persist - << " input_count: " << p_in_arg_node->inlinks.size(); - if (p_in_arg_node->inlinks.size() == 0) { - VLOG(0) << "** END with No Op"; - } - for (auto p_in_stmt_node : p_in_arg_node->inlinks) { - CHECK(p_in_stmt_node->IsStmt()); - std::string stmt_op_type = p_in_stmt_node->AsStmt().op_type(); - if (stmt_op_type == "cast" || stmt_op_type == "transpose" || - stmt_op_type == "io_copy") { - display_debug_info(*p_in_stmt_node, stmt_op_type, true, false); - } else { - VLOG(0) << "** END with op type: " << stmt_op_type; - } - } - } - } - if (display_out_nodes) { - for (auto p_out_arg_node : node.outlinks) { - CHECK(p_out_arg_node->IsArg()); - VLOG(0) << "* ARG[OUT] " << p_out_arg_node->AsArg().name - << " type: " << *p_out_arg_node->AsArg().type - << " is_weight: " << p_out_arg_node->AsArg().is_weight - << " is_persist: " << p_out_arg_node->AsArg().is_persist - << " output_count: " << p_out_arg_node->outlinks.size(); - if (p_out_arg_node->outlinks.size() == 0) { - VLOG(0) << "** END with No Op"; - } - for (auto p_out_stmt_node : p_out_arg_node->outlinks) { - CHECK(p_out_stmt_node->IsStmt()); - std::string stmt_op_type = p_out_stmt_node->AsStmt().op_type(); - if (stmt_op_type == "cast" || stmt_op_type == "transpose" || - stmt_op_type == "io_copy") { - 
display_debug_info(*p_out_stmt_node, stmt_op_type, false, true); - } else { - VLOG(0) << "** END with op type: " << stmt_op_type; - } - } - } - } - } -}; - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(subgraph_cast_display_pass, - paddle::lite::mir::SubgraphCastDisplayPass) - .BindTargets({TARGET(kAny)}); diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index 224dc33f89..80c2bd553f 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -117,9 +117,15 @@ class Optimizer { "variable_place_inference_pass", // "argument_type_display_pass", + "mlu_subgraph_pass", + "runtime_context_assign_pass", "argument_type_display_pass", + + "mlu_postprocess_pass", + "memory_optimize_pass"}}; + if (passes.size() == 1) { passes_local.push_back(passes[0]); } diff --git a/lite/core/workspace.h b/lite/core/workspace.h index 117b80aaa7..54efb6699a 100644 --- a/lite/core/workspace.h +++ b/lite/core/workspace.h @@ -69,6 +69,13 @@ class WorkSpace { } #endif +#if defined(LITE_WITH_MLU) + static WorkSpace& Global_MLU() { + thread_local std::unique_ptr x(new WorkSpace(TARGET(kMLU))); + return *x; + } +#endif + private: explicit WorkSpace(TargetType x) : target_(x) {} diff --git a/lite/kernels/CMakeLists.txt b/lite/kernels/CMakeLists.txt index 4e0092b392..78bb8d10b7 100644 --- a/lite/kernels/CMakeLists.txt +++ b/lite/kernels/CMakeLists.txt @@ -10,4 +10,5 @@ add_subdirectory(opencl) add_subdirectory(fpga) add_subdirectory(npu) add_subdirectory(xpu) +add_subdirectory(mlu) add_subdirectory(bm) diff --git a/lite/kernels/mlu/CMakeLists.txt b/lite/kernels/mlu/CMakeLists.txt index 1c41f05ca0..f9395d45cc 100644 --- a/lite/kernels/mlu/CMakeLists.txt +++ b/lite/kernels/mlu/CMakeLists.txt @@ -6,3 +6,4 @@ add_subdirectory(bridges) add_kernel(subgraph_compute_mlu MLU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} ${mlu_subgraph_bridges}) add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${math_mlu}) add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps} ${math_mlu}) +add_kernel(layout_compute_mlu MLU basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} ${math_mlu}) diff --git a/lite/kernels/mlu/bridges/CMakeLists.txt b/lite/kernels/mlu/bridges/CMakeLists.txt index 302d580ee1..82510ab9b6 100644 --- a/lite/kernels/mlu/bridges/CMakeLists.txt +++ b/lite/kernels/mlu/bridges/CMakeLists.txt @@ -15,6 +15,9 @@ lite_cc_library(subgraph_bridge_elementwise_ops_mlu SRCS elementwise_ops.cc DEPS lite_cc_library(subgraph_bridge_pool_op_mlu SRCS pool_op.cc DEPS ${subgraph_bridge_deps_mlu}) lite_cc_library(subgraph_bridge_softmax_op_mlu SRCS softmax_op.cc DEPS ${subgraph_bridge_deps_mlu}) lite_cc_library(subgraph_bridge_fc_op_mlu SRCS fc_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_scale_op_mlu SRCS scale_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_interp_op_mlu SRCS interpolate_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_concat_op_mlu SRCS concat_op.cc DEPS ${subgraph_bridge_deps_mlu}) set(mlu_subgraph_bridges subgraph_bridge_registry subgraph_bridge_utility_mlu @@ -26,16 +29,20 @@ set(mlu_subgraph_bridges subgraph_bridge_softmax_op_mlu subgraph_bridge_fc_op_mlu subgraph_bridge_batch_norm_op_mlu + subgraph_bridge_scale_op_mlu + subgraph_bridge_interp_op_mlu + subgraph_bridge_concat_op_mlu CACHE INTERNAL "mlu_subgraph_bridges") - -# lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges}) -# 
lite_cc_test(test_conv_converter_mlu SRCS conv_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) -# lite_cc_test(test_act_converter_mlu SRCS act_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) -# lite_cc_test(test_batch_norm_converter_mlu SRCS batch_norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) -# lite_cc_test(test_elementwise_converter_mlu SRCS elementwise_ops_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) -# lite_cc_test(test_pool_converter_mlu SRCS pool_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) -# lite_cc_test(test_softmax_converter_mlu SRCS softmax_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) -# lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) - +lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges}) +lite_cc_test(test_conv_converter_mlu SRCS conv_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_act_converter_mlu SRCS act_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_batch_norm_converter_mlu SRCS batch_norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_elementwise_converter_mlu SRCS elementwise_ops_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_pool_converter_mlu SRCS pool_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_softmax_converter_mlu SRCS softmax_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_scale_converter_mlu SRCS scale_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_interp_converter_mlu SRCS interpolate_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_concat_converter_mlu SRCS concat_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) message(STATUS "+++++ mlu_subgraph_bridges: ${mlu_subgraph_bridges}") diff --git 
a/lite/kernels/mlu/bridges/act_op.cc b/lite/kernels/mlu/bridges/act_op.cc
index 50291ec297..286195d9d5 100644
--- a/lite/kernels/mlu/bridges/act_op.cc
+++ b/lite/kernels/mlu/bridges/act_op.cc
@@ -31,20 +31,34 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   VLOG(3) << "[MLU] Converting " + op_type + "...";
 
   // Create act node and set params from op
+  auto fp_type = graph->FPType();
   auto x_var_name = op_info->Input("X").front();
   auto out_var_name = op_info->Output("Out").front();
   auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
   auto output_dims = output->dims().Vectorize();
   auto output_tensor = graph->AddNode(
-      out_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType());
+      out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, fp_type);
   CHECK(graph->HasNode(x_var_name));
   auto input_tensor = graph->GetNode(x_var_name);
-  cnmlActiveFunction_t act_type = OpTypeToCNMLActType(op_type);
   cnmlBaseOp_t activation_op;
-  CNML_CALL(cnmlCreateActiveOp(&activation_op,
-                               act_type,
-                               input_tensor->mlu_tensor(),
-                               output_tensor->mlu_tensor()));
+  if (op_type == "leaky_relu") {
+    auto alpha = op_info->GetAttr<float>("alpha");
+    std::vector<int64_t> shape = {1, 1, 1, 1};
+    std::string alpha_var_name = string_format("leaky_relu_alpha_%p", op);
+    auto alpha_tensor =
+        graph->AddNode(alpha_var_name, shape, CNML_CONST, CNML_NHWC, fp_type);
+    graph->BindConstRawData(alpha_var_name, &alpha, 1, true);
+    CNML_CALL(cnmlCreatePreluOp(&activation_op,
+                                input_tensor->mlu_tensor(),
+                                output_tensor->mlu_tensor(),
+                                alpha_tensor->mlu_tensor()));
+  } else {
+    cnmlActiveFunction_t act_type = OpTypeToCNMLActType(op_type);
+    CNML_CALL(cnmlCreateActiveOp(&activation_op,
+                                 act_type,
+                                 input_tensor->mlu_tensor(),
+                                 output_tensor->mlu_tensor()));
+  }
   graph->FuseOp(activation_op);
   return SUCCESS;
 }
@@ -54,4 +68,11 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
 }  // namespace lite
 }  // namespace paddle
 
+REGISTER_SUBGRAPH_BRIDGE(sigmoid,
+                         kMLU,
+                         paddle::lite::subgraph::mlu::ActConverter);
 REGISTER_SUBGRAPH_BRIDGE(relu, kMLU, paddle::lite::subgraph::mlu::ActConverter);
+REGISTER_SUBGRAPH_BRIDGE(tanh, kMLU, paddle::lite::subgraph::mlu::ActConverter);
+REGISTER_SUBGRAPH_BRIDGE(leaky_relu,
+                         kMLU,
+                         paddle::lite::subgraph::mlu::ActConverter);
diff --git a/lite/kernels/mlu/bridges/act_op_test.cc b/lite/kernels/mlu/bridges/act_op_test.cc
index 51cdc52dc6..7cec0529e4 100644
--- a/lite/kernels/mlu/bridges/act_op_test.cc
+++ b/lite/kernels/mlu/bridges/act_op_test.cc
@@ -25,8 +25,6 @@ namespace lite {
 namespace subgraph {
 namespace mlu {
 
-int ActConverter(void* ctx, OpLite* op);
-
 template <typename dtype>
 void FillTensor(Tensor* x, float lower = -2, float upper = -2);
 
@@ -136,7 +134,7 @@ void test_act(std::vector<int64_t> x_shape, std::string op_type) {
 TEST(MLUBridges, activation) {
   std::vector<std::vector<int64_t>> shapes{{1}, {2, 3}, {1, 2, 3, 4}};
-  std::vector<std::string> types{"sigmoid", "relu", "tanh"};
+  std::vector<std::string> types{"sigmoid", "relu", "tanh", "leaky_relu"};
   for (auto x_shape : shapes) {
     for (auto op_type : types) {
       test_act(x_shape, op_type);
@@ -149,8 +147,7 @@
 }  // namespace lite
 }  // namespace paddle
 
-REGISTER_SUBGRAPH_BRIDGE(MLU, relu, paddle::lite::subgraph::mlu::ActConverter);
-REGISTER_SUBGRAPH_BRIDGE(MLU,
-                         sigmoid,
-                         paddle::lite::subgraph::mlu::ActConverter);
-REGISTER_SUBGRAPH_BRIDGE(MLU, tanh, paddle::lite::subgraph::mlu::ActConverter);
+USE_SUBGRAPH_BRIDGE(sigmoid, kMLU)
+USE_SUBGRAPH_BRIDGE(relu, kMLU)
+USE_SUBGRAPH_BRIDGE(tanh, kMLU)
+USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU)
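Because the generic cnmlCreateActiveOp path has no leaky_relu activation, the converter above lowers leaky_relu to a PReLU op whose const slope tensor holds a single alpha value, which is broadcast over the input. The reference semantics being matched (a plain host-side sketch, not MLU code):

// With a scalar slope, PReLU degenerates to leaky_relu:
//   f(x) = x          for x >= 0
//   f(x) = alpha * x  for x <  0
inline float leaky_relu_ref(float x, float alpha) {
  return x >= 0.f ? x : alpha * x;
}

diff --git 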
a/lite/kernels/mlu/bridges/batch_norm_op.cc b/lite/kernels/mlu/bridges/batch_norm_op.cc
index d95a5115c9..7353a685dd 100644
--- a/lite/kernels/mlu/bridges/batch_norm_op.cc
+++ b/lite/kernels/mlu/bridges/batch_norm_op.cc
@@ -42,7 +42,7 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto output = scope->FindVar(y_var_name)->GetMutable<Tensor>();
   auto output_dims = output->dims().Vectorize();
   auto output_tensor = graph->AddNode(
-      y_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType());
+      y_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
 
   CHECK(graph->HasNode(x_var_name));
 
diff --git a/lite/kernels/mlu/bridges/batch_norm_op_test.cc b/lite/kernels/mlu/bridges/batch_norm_op_test.cc
index 47e291bf3d..65b24a0a72 100644
--- a/lite/kernels/mlu/bridges/batch_norm_op_test.cc
+++ b/lite/kernels/mlu/bridges/batch_norm_op_test.cc
@@ -23,8 +23,6 @@ namespace lite {
 namespace subgraph {
 namespace mlu {
 
-int BatchNormConverter(void* ctx, OpLite* op);
-
 template <typename dtype>
 void batch_norm_ref(const std::shared_ptr<operators::BatchNormOp> op) {
   Scope* scope = op->scope();
@@ -139,9 +137,7 @@ void test_batch_norm(
       {bs, ic, ih, iw},
       {0, 2, 3, 1});
 
-  out->Resize({bs, ih, iw, ic});
   x->CopyDataFrom(input_trans);
-  x->Resize({bs, ih, iw, ic});
 
   LaunchOp(op, {x_var_name}, {out_var_name});
 
@@ -181,6 +177,4 @@
 }  // namespace lite
 }  // namespace paddle
 
-REGISTER_SUBGRAPH_BRIDGE(MLU,
-                         batch_norm,
-                         paddle::lite::subgraph::mlu::BatchNormConverter);
+USE_SUBGRAPH_BRIDGE(batch_norm, kMLU)
diff --git a/lite/kernels/mlu/bridges/concat_op.cc b/lite/kernels/mlu/bridges/concat_op.cc
new file mode 100644
index 0000000000..14f0da746a
--- /dev/null
+++ b/lite/kernels/mlu/bridges/concat_op.cc
@@ -0,0 +1,73 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/mlu/bridges/graph.h"
+#include "lite/kernels/mlu/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto scope = op->scope();
+  VLOG(3) << "[MLU] Converting " + op_type + "...";
+
+  auto x_var_name = op_info->Input("X");
+  auto out_var_name = op_info->Output("Out").front();
+  auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
+  auto output_dims = output->dims().Vectorize();
+  auto param_axis = op_info->GetAttr<int>("axis");
+
+  std::vector<cnmlTensor_t> input_tensor;
+  for (auto x_name : x_var_name) {
+    CHECK(graph->HasNode(x_name));
+    input_tensor.push_back(graph->GetNode(x_name)->mlu_tensor());
+  }
+
+  auto dims = output_dims.size();
+  int axis = (param_axis < 0) ? (param_axis + dims) : param_axis;
+  CHECK_LE(axis, 4) << "Unsupported dims in mlu concat";
+  int nchw_to_nhwc_axis_map[4] = {0, 3, 1, 2};
+  int nhwc_axis = nchw_to_nhwc_axis_map[axis];
+
+  auto output_tensor = graph->AddNode(
+      out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
+
+  cnmlBaseOp_t concat_op;
+  cnmlTensor_t outputs = output_tensor->mlu_tensor();
+  CNML_CALL(cnmlCreateNdConcatOp(&concat_op,
+                                 nhwc_axis,
+                                 input_tensor.data(),
+                                 x_var_name.size(),
+                                 &outputs,
+                                 1));
+  graph->FuseOp(concat_op);
+  return SUCCESS;
+}
+
+}  // namespace mlu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(concat,
+                         kMLU,
+                         paddle::lite::subgraph::mlu::ConcatConverter);
diff --git a/lite/kernels/mlu/bridges/concat_op_test.cc b/lite/kernels/mlu/bridges/concat_op_test.cc
new file mode 100644
index 0000000000..c4b48a9ef4
--- /dev/null
+++ b/lite/kernels/mlu/bridges/concat_op_test.cc
@@ -0,0 +1,154 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/concat_op.h"
+#include <gtest/gtest.h>
+#include <vector>
+#include "lite/core/op_lite.h"
+#include "lite/core/op_registry.h"
+#include "lite/kernels/mlu/bridges/test_helper.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+void concat_ref(const std::shared_ptr<operators::ConcatOpLite> op) {
+  Scope* scope = op->scope();
+  const OpInfo* op_info = op->op_info();
+  auto x = op_info->Input("X");
+  std::vector<lite::Tensor*> inputs;
+  for (auto var : x) {
+    inputs.push_back(scope->FindVar(var)->GetMutable<lite::Tensor>());
+  }
+  auto out =
+      scope->FindVar(op_info->Output("Out").front())->GetMutable<lite::Tensor>();
+  int axis = op_info->GetAttr<int>("axis");
+  std::vector<lite::Tensor*> inputs_concat(inputs.size());
+  for (int j = 0; j < inputs.size(); ++j) {
+    inputs_concat[j] = inputs[j];
+  }
+  size_t num = inputs.size();
+  int rows = 1;
+  auto dim_0 = inputs[0]->dims();
+  for (int i = 0; i < axis; ++i) {
+    rows *= dim_0[i];
+  }
+  int out_rows = rows, out_cols = 0;
+  std::vector<int64_t> inputs_cols(inputs.size());
+  for (int i = 0; i < num; ++i) {
+    int t_cols = inputs[i]->numel() / rows;
+    out_cols += t_cols;
+    inputs_cols[i] = t_cols;
+  }
+  for (int k = 0; k < out_rows; ++k) {
+    float* dst_ptr = out->mutable_data<float>() + k * out_cols;
+    int col_idx = 0;
+    for (int j = 0; j < num; ++j) {
+      int col_len = inputs_cols[j];
+      const float* src_prt = inputs[j]->data<float>() + k * col_len;
+      std::memcpy(dst_ptr + col_idx, src_prt, sizeof(float) * col_len);
+      col_idx += col_len;
+    }
+  }
+}
+
+void test_concat(std::vector<std::vector<int64_t>> input, int axis) {
+  std::string x_var_name = "x";
+  std::string y_var_name = "y";
+  std::string out_var_name = "out";
+  std::string out_ref_var_name = "out_ref";
+
+  // prepare input&output variables
+  Scope scope;
+  auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
+  auto* y = scope.Var(y_var_name)->GetMutable<Tensor>();
+  x->Resize(DDim(input[0]));
+  y->Resize(DDim(input[1]));
+  auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
+  auto* out_ref
= scope.Var(out_ref_var_name)->GetMutable(); + CHECK_EQ(out->dims(), out_ref->dims()); + + // initialize input&output data + FillTensor(x); + FillTensor(y); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("concat"); + opdesc.SetInput("X", {x_var_name, y_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axis", axis); + + auto op = CreateOp(opdesc, &scope); + concat_ref(op); + out_ref->CopyDataFrom(*out); + + Tensor input_x, input_y; + input_x.Resize(DDim(input[0])); + input_y.Resize(DDim(input[1])); + transpose(x->mutable_data(), + input_x.mutable_data(), + {static_cast(input[0][0]), + static_cast(input[0][1]), + static_cast(input[0][2]), + static_cast(input[0][3])}, + {0, 2, 3, 1}); + transpose(y->mutable_data(), + input_y.mutable_data(), + {static_cast(input[1][0]), + static_cast(input[1][1]), + static_cast(input[1][2]), + static_cast(input[1][3])}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_x); + y->CopyDataFrom(input_y); + + LaunchOp(op, {x_var_name, y_var_name}, {out_var_name}); + + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + + Tensor output_trans; + output_trans.Resize(out->dims()); + auto os = out->dims(); + transpose(out_data, + output_trans.mutable_data(), + {static_cast(os[0]), + static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}, + {0, 3, 1, 2}); + out_data = output_trans.mutable_data(); + + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], out_ref_data[i], 5e-4); + } +} + +TEST(MLUBridges, concat) { + test_concat({{3, 3, 5, 2}, {2, 3, 5, 2}}, 0); + test_concat({{3, 5, 5, 2}, {3, 1, 5, 2}}, 1); + test_concat({{3, 3, 2, 2}, {3, 3, 4, 2}}, 2); + test_concat({{3, 3, 5, 2}, {3, 3, 5, 6}}, 3); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(concat, kMLU); diff --git a/lite/kernels/mlu/bridges/conv_op.cc b/lite/kernels/mlu/bridges/conv_op.cc index e9fdacdca9..6a7ef408eb 100644 --- a/lite/kernels/mlu/bridges/conv_op.cc +++ b/lite/kernels/mlu/bridges/conv_op.cc @@ -31,15 +31,16 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { const auto* scope = op->scope(); VLOG(3) << "[MLU] Converting " << op_info->Type() << "... 
"; - // Get input, filter and op attributes + // get input, filter and op attributes const auto input_var_name = op_info->Input("Input").front(); - const auto& input_dims_nhwc = + const auto& input_dims = scope->FindVar(input_var_name)->GetMutable()->dims(); - const auto input_dims = DimNHWC2NCHW(input_dims_nhwc); const auto filter_var_name = op_info->Input("Filter").front(); auto* filter = scope->FindVar(filter_var_name)->GetMutable(); const auto& filter_dims = filter->dims(); const auto output_var_name = op_info->Output("Output").front(); + auto* output = scope->FindVar(output_var_name)->GetMutable(); + const auto output_shape = output->dims().Vectorize(); const auto bs = input_dims[0]; const auto oc = filter_dims[0]; CHECK_EQ(input_dims.size(), 4); @@ -70,24 +71,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { input_dims, filter_dims); - std::vector output_shape({bs, oc}); - for (size_t i = 0; i < 2; i++) { - const int dkernel = dilations[i] * (filter_dims[2 + i] - 1) + 1; - output_shape.push_back( - (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - dkernel) / - strides[i] + - 1); - } - - const auto output_shape_nhwc = DimNCHW2NHWC(output_shape); - const auto output_tensor = graph->AddNode(output_var_name, - output_shape_nhwc, - CNML_TENSOR, - CNML_NHWC, - graph->FPType()); - scope->FindVar(output_var_name) - ->GetMutable<::paddle::lite::Tensor>() - ->Resize(output_shape_nhwc); + const auto output_tensor = graph->AddNode( + output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType()); // Create filter node const auto filter_tensor = graph->AddNode(filter_var_name, @@ -119,14 +104,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { LOG(FATAL) << "UnSupported weight precision!"; } - cnmlConvOpParam_t conv_param; - CNML_CALL(cnmlCreateConvOpParam(&conv_param, - strides[0], - strides[1], - dilations[0], - dilations[1], - paddings[0] * 2, - paddings[2] * 2)); std::string bias_var_name; std::shared_ptr bias_tensor; if (HasInputArg(op_info, scope, "Bias")) { @@ -160,15 +137,75 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { graph->FPType()); graph->BindConstData(bias_var_name, bias); } - cnmlBaseOp_t conv_op; + const auto input_scale = op_info->GetAttr("input_scale"); - CNML_CALL(cnmlCreateConvOpForward( - &conv_op, - conv_param, - graph->GetNode(input_var_name)->mlu_tensor(), - output_tensor->mlu_tensor(), - filter_tensor->mlu_tensor(), - bias_tensor ? 
bias_tensor->mlu_tensor() : nullptr)); + + bool use_first_conv = false; + if (lite::DeviceInfo::Global().UseFirstConv() && input_dims[1] == 3) { + use_first_conv = true; + } + + cnmlBaseOp_t conv_op; + if (use_first_conv) { + cnmlConvFirstOpParam_t conv_param; + CNML_CALL(cnmlCreateConvFirstOpParam_V2(&conv_param, + strides[0], + strides[1], + dilations[0], + dilations[1], + paddings[2], + paddings[2], + paddings[0], + paddings[0])); + const auto mean_tensor = graph->AddNode("first_conv_mean_tensor", + std::vector{3}, + CNML_CONST, + CNML_CNHW, + graph->FPType()); + const auto std_tensor = graph->AddNode("first_conv_std_tensor", + std::vector{3}, + CNML_CONST, + CNML_CNHW, + graph->FPType()); + + graph->BindConstRawData("first_conv_mean_tensor", + lite::DeviceInfo::Global().MeanVec().data(), + 3, + false); + graph->BindConstRawData("first_conv_std_tensor", + lite::DeviceInfo::Global().StdVec().data(), + 3, + false); + + graph->GetNode(input_var_name)->set_mlu_dtype(CNML_DATA_UINT8); + CNML_CALL(cnmlCreateConvFirstOpForward( + &conv_op, + conv_param, + graph->GetNode(input_var_name)->mlu_tensor(), + mean_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + filter_tensor->mlu_tensor(), + bias_tensor ? bias_tensor->mlu_tensor() : nullptr, + std_tensor->mlu_tensor())); + CNML_CALL(cnmlDestroyConvFirstOpParam(&conv_param)); + } else { + cnmlConvOpParam_t conv_param; + CNML_CALL(cnmlCreateConvOpParam(&conv_param, + strides[0], + strides[1], + dilations[0], + dilations[1], + paddings[0] * 2, + paddings[2] * 2)); + CNML_CALL(cnmlCreateConvOpForward( + &conv_op, + conv_param, + graph->GetNode(input_var_name)->mlu_tensor(), + output_tensor->mlu_tensor(), + filter_tensor->mlu_tensor(), + bias_tensor ? bias_tensor->mlu_tensor() : nullptr)); + CNML_CALL(cnmlDestroyConvOpParam(&conv_param)); + } graph->SetComputingDataType( conv_op, graph->GetNode(input_var_name)->mlu_tensor(), 1 / input_scale); @@ -183,7 +220,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { } graph->BindConstData(filter_var_name, filter); graph->FuseOp(conv_op); - CNML_CALL(cnmlDestroyConvOpParam(&conv_param)); return REBUILD_WHEN_SHAPE_CHANGED; } diff --git a/lite/kernels/mlu/bridges/conv_op_test.cc b/lite/kernels/mlu/bridges/conv_op_test.cc index e8ef9ba04f..e34dd7c2a8 100644 --- a/lite/kernels/mlu/bridges/conv_op_test.cc +++ b/lite/kernels/mlu/bridges/conv_op_test.cc @@ -25,8 +25,6 @@ namespace lite { namespace subgraph { namespace mlu { -int ConvConverter(void* ctx, OpLite* op); - void conv_ref(const std::shared_ptr op) { Scope* scope = op->scope(); const OpInfo* op_info = op->op_info(); @@ -246,10 +244,6 @@ void test_conv(int bs, } } - input->Resize({bs, ih, iw, ic}); - output->Resize( - {output_shape[0], output_shape[2], output_shape[3], output_shape[1]}); - // create and convert op to MLU model, then run it on MLU auto op = CreateOp(opdesc_mlu, &scope); LaunchOp(op, {input_var_name}, {output_var_name}); @@ -342,9 +336,5 @@ TEST(MLUBridges, conv) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(MLU, - conv2d, - paddle::lite::subgraph::mlu::ConvConverter); -REGISTER_SUBGRAPH_BRIDGE(MLU, - depthwise_conv2d, - paddle::lite::subgraph::mlu::ConvConverter); +USE_SUBGRAPH_BRIDGE(conv2d, kMLU) +USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kMLU) diff --git a/lite/kernels/mlu/bridges/elementwise_ops.cc b/lite/kernels/mlu/bridges/elementwise_ops.cc index 4ef949925d..41526a0100 100644 --- a/lite/kernels/mlu/bridges/elementwise_ops.cc +++ b/lite/kernels/mlu/bridges/elementwise_ops.cc @@ -77,7 +77,7 @@ int 
ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto output_tensor = graph->AddNode(out_var_name, x->dims().Vectorize(), CNML_TENSOR, - CNML_NHWC, + CNML_NCHW, graph->FPType()); cnmlBaseOp_t elementwise_op; @@ -90,7 +90,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto mid_tensor = graph->AddNode(out_var_name + "_mid", x->dims().Vectorize(), CNML_TENSOR, - CNML_NHWC, + CNML_NCHW, graph->FPType()); CNML_CALL(cnmlCreateBroadcastAddOp(&elementwise_op, x_tensor->mlu_tensor(), diff --git a/lite/kernels/mlu/bridges/elementwise_ops_test.cc b/lite/kernels/mlu/bridges/elementwise_ops_test.cc index 388aa68600..e5087dd708 100644 --- a/lite/kernels/mlu/bridges/elementwise_ops_test.cc +++ b/lite/kernels/mlu/bridges/elementwise_ops_test.cc @@ -24,8 +24,6 @@ namespace lite { namespace subgraph { namespace mlu { -int ElementwiseConverter(void* ctx, OpLite* op); - template void elementwise_add_ref(const std::shared_ptr op) { Scope* scope = op->scope(); @@ -184,15 +182,7 @@ TEST(MLUBridges, elementwise_add) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(MLU, - elementwise_add, - paddle::lite::subgraph::mlu::ElementwiseConverter); -REGISTER_SUBGRAPH_BRIDGE(MLU, - elementwise_sub, - paddle::lite::subgraph::mlu::ElementwiseConverter); -REGISTER_SUBGRAPH_BRIDGE(MLU, - elementwise_mul, - paddle::lite::subgraph::mlu::ElementwiseConverter); -REGISTER_SUBGRAPH_BRIDGE(MLU, - elementwise_div, - paddle::lite::subgraph::mlu::ElementwiseConverter); +USE_SUBGRAPH_BRIDGE(elementwise_add, kMLU) +USE_SUBGRAPH_BRIDGE(elementwise_sub, kMLU) +USE_SUBGRAPH_BRIDGE(elementwise_mul, kMLU) +USE_SUBGRAPH_BRIDGE(elementwise_div, kMLU) diff --git a/lite/kernels/mlu/bridges/fc_op.cc b/lite/kernels/mlu/bridges/fc_op.cc index 43a75daa2b..286feec8d4 100644 --- a/lite/kernels/mlu/bridges/fc_op.cc +++ b/lite/kernels/mlu/bridges/fc_op.cc @@ -37,6 +37,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { // int in_num_col_dims = op_info->GetAttr("in_num_col_dims"); auto x = scope->FindVar(x_var_name)->GetMutable(); auto w = scope->FindVar(w_var_name)->GetMutable(); + auto output = scope->FindVar(output_var_name)->GetMutable(); auto x_dims = x->dims(); auto w_dims = w->dims(); @@ -50,15 +51,11 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto input_scale = op_info->GetAttr("input_scale"); - std::vector output_shape_nhwc({1, 1, 1, w_dims[1]}); auto output_tensor = graph->AddNode(output_var_name, - output_shape_nhwc, + output->dims().Vectorize(), CNML_TENSOR, - CNML_NHWC, + CNML_NCHW, graph->FPType()); - scope->FindVar(output_var_name) - ->GetMutable<::paddle::lite::Tensor>() - ->Resize(output_shape_nhwc); std::string bias_var_name; std::shared_ptr bias_tensor; diff --git a/lite/kernels/mlu/bridges/fc_op_test.cc b/lite/kernels/mlu/bridges/fc_op_test.cc index 7e5cfdb32e..8f92b6abad 100644 --- a/lite/kernels/mlu/bridges/fc_op_test.cc +++ b/lite/kernels/mlu/bridges/fc_op_test.cc @@ -24,8 +24,6 @@ namespace lite { namespace subgraph { namespace mlu { -int FCConverter(void* ctx, OpLite* op); - void fc_ref(const std::shared_ptr op) { Scope* scope = op->scope(); const OpInfo* op_info = op->op_info(); @@ -141,15 +139,34 @@ void test_fc(const std::vector& input_shape, } auto fc_op_mlu = CreateOp(fc_op_desc_mlu, &scope); - input->Resize({static_cast(input_shape[0]), - static_cast(input_shape[2]), - static_cast(input_shape[3]), - static_cast(input_shape[1])}); - out->Resize({static_cast(input_shape[0]), static_cast(w_shape[1])}); + + Tensor input_tmp, 
out_tmp; + input_tmp.Resize(input_shape); + transpose(input->mutable_data(), + input_tmp.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3])}, + {0, 2, 3, 1}); + input->CopyDataFrom(input_tmp); + LaunchOp(fc_op_mlu, {input_var_name}, {out_var_name}); - // compare results + auto os = out->dims(); + out_tmp.Resize(os); auto* out_data = out->mutable_data(); + // transpose(out_data, + // out_tmp.mutable_data(), + // {static_cast(os[0]), + // static_cast(os[2]), + // static_cast(os[3]), + // static_cast(os[1])}, + // {0, 3, 1, 2}); + // + // out_data = out_tmp.mutable_data(); + + // compare results auto* out_ref_data = out_ref->mutable_data(); for (int i = 0; i < out->dims().production(); i++) { EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); @@ -170,4 +187,4 @@ TEST(MLUBridges, fc) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(MLU, fc, paddle::lite::subgraph::mlu::FCConverter); +USE_SUBGRAPH_BRIDGE(fc, kMLU); diff --git a/lite/kernels/mlu/bridges/graph.cc b/lite/kernels/mlu/bridges/graph.cc index 27c6ab2597..65c2f8214c 100644 --- a/lite/kernels/mlu/bridges/graph.cc +++ b/lite/kernels/mlu/bridges/graph.cc @@ -25,12 +25,12 @@ namespace mlu { std::shared_ptr Graph::AddNode(const std::string& name, std::vector shape, cnmlTensorType_t tensor_type, - cnmlDataOrder_t data_order, + cnmlDataOrder_t shape_order, cnmlDataType_t mlu_dtype, void* raw_ptr) { CHECK(!HasNode(name)); auto node = std::shared_ptr( - new MLUTensor(shape, tensor_type, data_order, mlu_dtype)); + new MLUTensor(shape, tensor_type, shape_order, mlu_dtype)); node->set_mlu_ptr(raw_ptr); nodes_.insert(std::make_pair(name, node)); return node; diff --git a/lite/kernels/mlu/bridges/graph.h b/lite/kernels/mlu/bridges/graph.h index 140900a2dd..b846d15af0 100644 --- a/lite/kernels/mlu/bridges/graph.h +++ b/lite/kernels/mlu/bridges/graph.h @@ -23,6 +23,12 @@ #include "lite/core/tensor.h" #include "lite/kernels/mlu/bridges/tensor.h" +#define PRINT_HW_TIME false + +#if PRINT_HW_TIME +#include //NOLINT +#endif + namespace paddle { namespace lite { namespace subgraph { @@ -32,13 +38,30 @@ namespace mlu { // to the MLU IR graph class Graph { public: - Graph() { CNML_CALL(cnmlCreateFusionOp(&fusion_op_)); } + Graph() { + CNML_CALL(cnmlCreateFusionOp(&fusion_op_)); +#if PRINT_HW_TIME + CNRT_CALL(cnrtCreateNotifier(¬ifier_start_)); + CNRT_CALL(cnrtCreateNotifier(¬ifier_end_)); +#endif + } ~Graph() { + FreeConstData(); CNML_CALL(cnmlDestroyFusionOp(&fusion_op_)); for (auto op : ops_) { CNML_CALL(cnmlDestroyBaseOp(&op)); } +#if PRINT_HW_TIME + CNRT_CALL(cnrtDestroyNotifier(¬ifier_start_)); + CNRT_CALL(cnrtDestroyNotifier(¬ifier_end_)); + double total_time = 0; + for (auto& f : time_log_) { + total_time += f; + } + std::cout << "cnml hardware time for " << time_log_.size() + << " process:" << total_time / time_log_.size() << std::endl; +#endif } // Data node @@ -89,6 +112,10 @@ class Graph { } void Compute(cnrtInvokeFuncParam_t forward_param, cnrtQueue_t que) { +#if PRINT_HW_TIME + thread_local float hw_time; + CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que)); +#endif CNML_CALL(cnmlComputeFusionOpForward_V3(fusion_op_, input_addrs_.data(), input_addrs_.size(), @@ -96,7 +123,61 @@ class Graph { output_addrs_.size(), &forward_param, que)); +#if PRINT_HW_TIME + CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que)); +#endif + CNRT_CALL(cnrtSyncQueue(que)); +#if PRINT_HW_TIME + CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time)); 
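+    // cnrtNotifierDuration reports the elapsed time between the two
+    // notifiers in microseconds; the division below converts it to ms.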
+    hw_time /= 1000.0f;
+    DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl;
+    std::lock_guard<std::mutex> lk(time_mut_);
+    time_log_.push_back(hw_time);
+#endif
+  }
+
+  template <typename T>
+  void* RegisterConstData(size_t len) {
+    void* addr = malloc(len * sizeof(T));
+    const_data_storage_.push_back(addr);
+    return addr;
+  }
+
+  void FreeConstData() {
+    for (auto& addr : const_data_storage_) {
+      free(addr);
+    }
+  }
+
+  void BindConstRawData(std::string tensor_name,
+                        const float* data,
+                        size_t len,
+                        bool alloc = true) {
+    void* alloc_data;
+    if (fp_type_ == CNML_DATA_FLOAT32) {
+      if (alloc) {
+        alloc_data = RegisterConstData<float>(len);
+        memcpy(alloc_data, data, len * sizeof(float));
+      } else {
+        alloc_data = const_cast<void*>(static_cast<const void*>(data));
+      }
+      CNML_CALL(cnmlBindConstData_V2(
+          nodes_[tensor_name]->mlu_tensor(), alloc_data, false));
+    } else if (fp_type_ == CNML_DATA_FLOAT16) {
+      void* data_fp16 = RegisterConstData<::paddle::lite::fluid::float16>(len);
+      CNRT_CALL(
+          cnrtCastDataType(const_cast<void*>(static_cast<const void*>(data)),
+                           CNRT_FLOAT32,
+                           data_fp16,
+                           CNRT_FLOAT16,
+                           len,
+                           nullptr));
+      CNML_CALL(cnmlBindConstData_V2(
+          nodes_[tensor_name]->mlu_tensor(), data_fp16, false));
+    } else {
+      CHECK(0);
+    }
+  }
 
   void BindConstData(std::string tensor_name, ::paddle::lite::Tensor* tensor) {
@@ -158,6 +239,12 @@
   std::vector<std::shared_ptr<MLUTensor>> output_tensors_;
   std::vector<cnmlBaseOp_t> ops_;
   cnmlFusionOp_t fusion_op_;
+  std::vector<void*> const_data_storage_;
+#if PRINT_HW_TIME
+  cnrtNotifier_t notifier_start_{}, notifier_end_{};
+  std::mutex time_mut_;
+  std::vector<float> time_log_;
+#endif
 };
 
 }  // namespace mlu
diff --git a/lite/kernels/mlu/bridges/interpolate_op.cc b/lite/kernels/mlu/bridges/interpolate_op.cc
new file mode 100644
index 0000000000..e201199824
--- /dev/null
+++ b/lite/kernels/mlu/bridges/interpolate_op.cc
@@ -0,0 +1,99 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
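+
+// Converter for Paddle's nearest_interp op: out_h/out_w come from the op
+// attributes (recomputed from `scale` when scale > 0) and the op is lowered
+// to a cnmlNearestNeighborOp with align_corners carried over; the tensor
+// inputs SizeTensor, Scale and OutSize are rejected for now.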
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto out = scope->FindVar(out_var_name)->GetMutable(); + auto x_dims = x->dims(); + CHECK_EQ(x_dims.size(), 4); + auto scale = op_info->GetAttr("scale"); + auto out_w = op_info->GetAttr("out_w"); + auto out_h = op_info->GetAttr("out_h"); + auto align_corners = op_info->GetAttr("align_corners"); + + CHECK(graph->HasNode(x_var_name)); + auto input_tensor = graph->GetNode(x_var_name); + + auto in_h = x_dims[2]; + auto in_w = x_dims[3]; + + // Priority: SizeTensor > OutSize > Scale > scale > out_h/out_w + if (HasInputArg(op_info, scope, "SizeTensor")) { + LOG(ERROR) << "Not support SizeTensor input now"; + CHECK(0); + } else { + if (HasInputArg(op_info, scope, "Scale")) { + LOG(ERROR) << "Not support Scale input now"; + CHECK(0); + } + if (scale > 0) { + out_h = static_cast(in_h * scale); + out_w = static_cast(in_w * scale); + out_h = out_h > 0 ? out_h : -1; + out_w = out_w > 0 ? out_w : -1; + } + if (HasInputArg(op_info, scope, "OutSize")) { + LOG(ERROR) << "Not support OutSize input now"; + CHECK(0); + } + } + + auto output_tensor = graph->AddNode(out_var_name, + out->dims().Vectorize(), + CNML_TENSOR, + CNML_NCHW, + graph->FPType()); + + cnmlBaseOp_t interp_op; + cnmlNearestNeighborOpParam_t nn_param; + CNML_CALL(cnmlCreateNearestNeighborOpParam(&nn_param, out_w, out_h)); + CNML_CALL(cnmlSetNearestNeighborAlignCorner(&nn_param, align_corners)); + CNML_CALL(cnmlCreateNearestNeighborOp(&interp_op, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + nn_param)); + CNML_CALL(cnmlDestroyNearestNeighborOpParam(&nn_param)); + graph->FuseOp(interp_op); + + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(nearest_interp, + kMLU, + paddle::lite::subgraph::mlu::InterpolateConverter); diff --git a/lite/kernels/mlu/bridges/interpolate_op_test.cc b/lite/kernels/mlu/bridges/interpolate_op_test.cc new file mode 100644 index 0000000000..0e99da6435 --- /dev/null +++ b/lite/kernels/mlu/bridges/interpolate_op_test.cc @@ -0,0 +1,406 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
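+
+// The tests below compare the MLU bridge against NCHW host references
+// (ResizeNearestAlign for nearest, BilinearInterpRef for bilinear): inputs
+// are transposed to NHWC before LaunchOp, and outputs are transposed back
+// to NCHW before the element-wise comparison.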
+ +#include "lite/operators/interpolate_op.h" +#include +#include +#include "lite/core/device_info.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +template +void ResizeNearestAlign(const lite::Tensor* x, + lite::Tensor* out, + bool with_align) { + auto x_dims = x->dims(); + int num = x_dims[0]; + int channels = x_dims[1]; + int hin = x_dims[2]; + int win = x_dims[3]; + int hout = out->dims()[2]; + int wout = out->dims()[3]; + dtype scale_w = (with_align) ? (static_cast(win - 1) / (wout - 1)) + : (static_cast(win) / (wout)); + dtype scale_h = (with_align) ? (static_cast(hin - 1) / (hout - 1)) + : (static_cast(hin) / (hout)); + const dtype* src = x->data(); + dtype* dst = out->mutable_data(); + int dst_stride_w = 1; + int dst_stride_h = wout; + int dst_stride_c = wout * hout; + int dst_stride_batch = wout * hout * channels; + int src_stride_w = 1; + int src_stride_h = win; + int src_stride_c = win * hin; + int src_stride_batch = win * hin * channels; + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + int src_index = n * src_stride_batch + c * src_stride_c; + for (int h = 0; h < hout; ++h) { + for (int w = 0; w < wout; ++w) { + int fw = (with_align) ? static_cast(scale_w * w + 0.5) + : static_cast(scale_w * w); + fw = (fw < 0) ? 0 : fw; + int fh = (with_align) ? static_cast(scale_h * h + 0.5) + : static_cast(scale_h * h); + fh = (fh < 0) ? 0 : fh; + int w_start = static_cast(fw); + int h_start = static_cast(fh); + int dst_index = n * dst_stride_batch + c * dst_stride_c + + h * dst_stride_h + w * dst_stride_w; + dst[dst_index] = + src[src_index + w_start * src_stride_w + h_start * src_stride_h]; + } + } + } + } +} + +template +void BilinearInterpRef(const lite::Tensor* x, + lite::Tensor* out, + bool align_corners, + int align_mode) { + auto x_dims = x->dims(); + int batch_size = x_dims[0]; + int channel_size = x_dims[1]; + auto x_h = x_dims[2]; + auto x_w = x_dims[3]; + CHECK_EQ(x_dims.size(), 4); + + auto out_dims = out->dims(); + int out_h = out_dims[2]; + int out_w = out_dims[3]; + + // copy from x if no change + if (x_h == out_h && x_w == out_w) { + out->CopyDataFrom(*x); + return; + } + + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(x_h - 1) / (out_h - 1) + : static_cast(x_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? static_cast(x_w - 1) / (out_w - 1) + : static_cast(x_w) / out_w; + } + + // naive bilinear interpolation + auto x_data = x->data(); + auto out_data = out->mutable_data(); + bool align_flag = (align_mode == 0 && !align_corners); + + std::vector vy_n, vy_s; + std::vector vd_n, vd_s; + vy_n.reserve(out_h); + vy_s.reserve(out_h); + vd_n.reserve(out_h); + vd_s.reserve(out_h); + for (int k = 0; k < out_h; k++) { + int yn = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) + : static_cast(ratio_h * k); + yn = (yn > 0) ? yn : 0; + int ys = (yn + 1) < (x_h - 1) ? (yn + 1) : (x_h - 1); + float idx_src_y = ratio_h * (k + 0.5) - 0.5; + idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; + float dn = align_flag ? 
idx_src_y - yn : ratio_h * k - yn; + float ds = 1.f - dn; + { + vy_n[k] = yn; + vy_s[k] = ys; + vd_n[k] = dn; + vd_s[k] = ds; + } + } + + std::vector vx_w, vx_e; + std::vector vd_w, vd_e; + vx_w.reserve(out_w); + vx_e.reserve(out_w); + vd_w.reserve(out_w); + vd_e.reserve(out_w); + for (int l = 0; l < out_w; l++) { + int xw = align_flag ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + xw = (xw > 0) ? xw : 0; + int xe = (xw + 1) < (x_w - 1) ? (xw + 1) : (x_w - 1); + float idx_src_x = ratio_w * (l + 0.5) - 0.5; + idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; + float dw = align_flag ? idx_src_x - xw : ratio_w * l - xw; + float de = 1.f - dw; + { + vx_w[l] = xw; + vx_e[l] = xe; + vd_w[l] = dw; + vd_e[l] = de; + } + } + + std::vector x_strides(x_dims.size(), 1); + for (int idx = x_strides.size() - 2; idx >= 0; idx--) { + x_strides[idx] = x_strides[idx + 1] * x_dims[idx + 1]; + } + for (int i = 0; i < batch_size; i++) { + for (int j = 0; j < channel_size; j++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + DType x0 = x_data[i * x_strides[0] + j * x_strides[1] + + vy_n[k] * x_strides[2] + vx_w[l] * x_strides[3]]; + DType x1 = x_data[i * x_strides[0] + j * x_strides[1] + + vy_s[k] * x_strides[2] + vx_w[l] * x_strides[3]]; + DType x2 = x_data[i * x_strides[0] + j * x_strides[1] + + vy_n[k] * x_strides[2] + vx_e[l] * x_strides[3]]; + DType x3 = x_data[i * x_strides[0] + j * x_strides[1] + + vy_s[k] * x_strides[2] + vx_e[l] * x_strides[3]]; + *out_data = x0 * vd_s[k] * vd_e[l] + x1 * vd_n[k] * vd_e[l] + + x2 * vd_s[k] * vd_w[l] + x3 * vd_n[k] * vd_w[l]; + out_data++; + } + } + } + } +} + +class InterpComputeTester { + protected: + // common attributes for this op. + std::string x_var_name = "X"; + std::string outsize_var_name = "OutSize"; + std::string out_var_name = "Out"; + std::string out_ref_var_name = "out_ref"; + DDim dims_{{1, 2, 3, 4}}; + + Scope scope; + std::string interp_method_ = "nearest"; + float scale_ = -1.f; + int out_h_ = -1; + int out_w_ = -1; + bool align_corners_ = true; + int align_mode_ = 1; + bool use_outsize_ = false; + + public: + InterpComputeTester(const std::string& alias, + DDim dims, + std::string interp_method = "nearest", + float scale = -1.f, + int out_h = -1, + int out_w = -1, + bool align_corners = true, + int align_mode = 1, + bool use_outsize = false) + : dims_(dims), + interp_method_(interp_method), + scale_(scale), + out_h_(out_h), + out_w_(out_w), + align_corners_(align_corners), + align_mode_(align_mode), + use_outsize_(use_outsize) {} + + void Execute(float abs_error) { + cpp::OpDesc op_desc; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* outsize = scope.Var(outsize_var_name)->GetMutable(); + auto* outref = scope.Var(out_ref_var_name)->GetMutable(); + int out_h = out_h_; + int out_w = out_w_; + if (scale_ > 0) { + out_h = static_cast(dims_[2] * scale_); + out_w = static_cast(dims_[3] * scale_); + } + x->Resize(dims_); + /* printf("----output tensor dims: %ld, %d, %d, %ld\n", dims_[0], out_h, + * out_w, dims_[1]); */ + std::vector out_shape_nchw = {dims_[0], dims_[1], out_h, out_w}; + outref->Resize(out_shape_nchw); + outsize->Resize({2}); + + FillTensor(x, -1.f, 1.f); + + if (use_outsize_) { + outsize->mutable_data()[0] = out_h; + outsize->mutable_data()[1] = out_w; + outsize->set_persistable(true); + } + + if (interp_method_ == "nearest") { + op_desc.SetType("nearest_interp"); + } else if (interp_method_ == "bilinear") { + 
op_desc.SetType("bilinear_interp"); + } else { + LOG(FATAL) << "unsupport"; + } + op_desc.SetInput("X", {x_var_name}); + if (use_outsize_) { + op_desc.SetInput("OutSize", {outsize_var_name}); + } + op_desc.SetOutput("Out", {out_var_name}); + op_desc.SetAttr("scale", scale_); + op_desc.SetAttr("out_h", out_h_); + op_desc.SetAttr("out_w", out_w_); + op_desc.SetAttr("align_corners", align_corners_); + op_desc.SetAttr("align_mode", align_mode_); + op_desc.SetAttr("interp_method", interp_method_); + auto op = CreateOp(op_desc, &scope); + + if (interp_method_ == "nearest") { + ResizeNearestAlign(x, outref, align_corners_); + } else if (interp_method_ == "bilinear") { + BilinearInterpRef(x, outref, align_corners_, align_mode_); + } + + int in = dims_[0], ic = dims_[1], ih = dims_[2], iw = dims_[3]; + Tensor input_trans; + input_trans.Resize(dims_); + transpose(x->mutable_data(), + input_trans.mutable_data(), + {in, ic, ih, iw}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_trans); + if (use_outsize_) { + LaunchOp(op, {x_var_name, outsize_var_name}, {out_var_name}); + } else { + LaunchOp(op, {x_var_name}, {out_var_name}); + } + + auto* out_ref_data = outref->mutable_data(); + + Tensor output_trans; + output_trans.Resize(out_shape_nchw); + transpose( + out->mutable_data(), + output_trans.mutable_data(), + {static_cast(dims_[0]), out_h, out_w, static_cast(dims_[1])}, + {0, 3, 1, 2}); + auto* out_data = output_trans.mutable_data(); + for (int i = 0; i < out->dims().production(); ++i) { + EXPECT_NEAR(out_data[i], out_ref_data[i], abs_error); + } + } +}; + +void TestInterpOuthw(float abs_error = 2e-5) { + for (auto x_dims : std::vector>{{3, 4, 8, 9}}) { + /* for (auto interp_method : std::vector{"nearest", + * "bilinear"}) { */ + for (auto interp_method : std::vector{"nearest"}) { + for (int out_h : {6, 8, 12}) { + for (int out_w : {6, 9}) { + printf("testcase %s: out_w %d, out_h %d\n", + interp_method.c_str(), + out_w, + out_h); + InterpComputeTester tester( + "def", DDim(x_dims), interp_method, -1.f, out_h, out_w); + tester.Execute(abs_error); + } + } + } + } +} + +void TestInterpScale(float abs_error = 2e-5) { + for (auto x_dims : std::vector>{{3, 4, 8, 9}}) { + /* for (auto interp_method : std::vector{"nearest", + * "bilinear"}) { */ + for (auto interp_method : std::vector{"nearest"}) { + for (float scale : {0.3f, 1.f, 1.7f}) { + printf("testcase %s: scale: %f\n", interp_method.c_str(), scale); + InterpComputeTester tester("def", DDim(x_dims), interp_method, scale); + tester.Execute(abs_error); + } + } + } +} + +void TestInterpOutsize(float abs_error = 2e-5) { + for (auto x_dims : std::vector>{{3, 4, 8, 9}}) { + /* for (auto interp_method : std::vector{"nearest", + * "bilinear"}) { */ + for (auto interp_method : std::vector{"nearest"}) { + printf("testcase %s: outsize: %d %d\n", interp_method.c_str(), 4, 4); + InterpComputeTester tester( + "def", DDim(x_dims), interp_method, -1, 4, 4, true, 1, true); + tester.Execute(abs_error); + } + } +} + +void TestInterpAlignCorners(float abs_error = 2e-5) { + for (auto x_dims : std::vector>{{3, 4, 8, 9}}) { + for (bool align_corners : {true, false}) { + printf( + "testcase nearest: scale: 0.4, out_w -1 out_h -1, align_corners %d\n", + align_corners); + InterpComputeTester tester( + "def", DDim(x_dims), "nearest", 0.4, -1, -1, align_corners); + tester.Execute(abs_error); + } + } +} + +void TestInterpAlignMode(float abs_error = 2e-5) { + for (auto x_dims : std::vector>{{3, 4, 8, 9}}) { + for (bool align_corners : {true, false}) { + for (int align_mode : {0, 1}) { + 
printf( + "testcase bilinear: scale: 0.7, out_w -1 out_h -1, align_corners " + "%d, mode %d\n", + align_corners, + align_mode); + InterpComputeTester tester("def", + DDim(x_dims), + "bilinear", + 0.7, + -1, + -1, + align_corners, + align_mode); + tester.Execute(abs_error); + } + } + } +} + +TEST(MLUBridges, interpolate) { + float abs_error = 2e-5; + TestInterpOuthw(abs_error); + TestInterpScale(abs_error); + // bug, not usable + // TestInterpOutsize(abs_error); + TestInterpAlignCorners(abs_error); + // only for bilinear interp + // TestInterpAlignMode(abs_error); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(nearest_interp, kMLU); diff --git a/lite/kernels/mlu/bridges/paddle_use_bridges.h b/lite/kernels/mlu/bridges/paddle_use_bridges.h index 1b12970afa..d31ba0dd41 100644 --- a/lite/kernels/mlu/bridges/paddle_use_bridges.h +++ b/lite/kernels/mlu/bridges/paddle_use_bridges.h @@ -22,3 +22,7 @@ USE_SUBGRAPH_BRIDGE(pool2d, kMLU); USE_SUBGRAPH_BRIDGE(softmax, kMLU); USE_SUBGRAPH_BRIDGE(batch_norm, kMLU); USE_SUBGRAPH_BRIDGE(fc, kMLU); +USE_SUBGRAPH_BRIDGE(nearest_interp, kMLU); +USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU); +USE_SUBGRAPH_BRIDGE(concat, kMLU); +USE_SUBGRAPH_BRIDGE(scale, kMLU); diff --git a/lite/kernels/mlu/bridges/pool_op.cc b/lite/kernels/mlu/bridges/pool_op.cc index 3119b6c77d..f77c8084c7 100644 --- a/lite/kernels/mlu/bridges/pool_op.cc +++ b/lite/kernels/mlu/bridges/pool_op.cc @@ -47,9 +47,8 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Get input, and attributes auto x_var_name = op_info->Input("X").front(); auto x = scope->FindTensor(x_var_name); - auto input_dims_nhwc = x->dims(); - const auto input_dims = DimNHWC2NCHW(input_dims_nhwc); auto output_var_name = op_info->Output("Out").front(); + auto output_shape = scope->FindTensor(output_var_name)->dims().Vectorize(); auto pooling_type = op_info->GetAttr("pooling_type"); auto ceil_mode = op_info->GetAttr("ceil_mode"); auto paddings = op_info->GetAttr>("paddings"); @@ -81,23 +80,17 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { strides, ksize); - std::vector output_shape({input_dims[0], input_dims[1]}); - for (size_t i = 0; i < 2; i++) { - output_shape.push_back( - (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - ksize[0]) / - strides[i] + - 1); - } + // std::vector output_shape({input_dims[0], input_dims[1]}); + // for (size_t i = 0; i < 2; i++) { + // output_shape.push_back( + // (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - + // ksize[0]) / + // strides[i] + + // 1); + // } - auto output_shape_nhwc = DimNCHW2NHWC(output_shape); - auto output_tensor = graph->AddNode(output_var_name, - output_shape_nhwc, - CNML_TENSOR, - CNML_NHWC, - graph->FPType()); - scope->FindVar(output_var_name) - ->GetMutable<::paddle::lite::Tensor>() - ->Resize(output_shape_nhwc); + auto output_tensor = graph->AddNode( + output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType()); cnmlPoolOpParam_t pool_param; CNML_CALL( diff --git a/lite/kernels/mlu/bridges/pool_op_test.cc b/lite/kernels/mlu/bridges/pool_op_test.cc index 29ef68781f..8cee8dbe86 100644 --- a/lite/kernels/mlu/bridges/pool_op_test.cc +++ b/lite/kernels/mlu/bridges/pool_op_test.cc @@ -24,8 +24,6 @@ namespace lite { namespace subgraph { namespace mlu { -int PoolConverter(void* ctx, OpLite* op); - void pool_ref(const std::shared_ptr op) { Scope* scope = op->scope(); const OpInfo* op_info = op->op_info(); @@ -182,12 +180,7 @@ void test_pool(int bs, 
{0, 2, 3, 1}); auto os = out->dims(); - out->Resize({static_cast(os[0]), - static_cast(os[2]), - static_cast(os[3]), - static_cast(os[1])}); x->CopyDataFrom(input_trans); - x->Resize({bs, ih, iw, ic}); LaunchOp(op, {x_var_name}, {out_var_name}); @@ -275,6 +268,4 @@ TEST(MLUBridges, pool) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(MLU, - pool2d, - paddle::lite::subgraph::mlu::PoolConverter); +USE_SUBGRAPH_BRIDGE(pool2d, kMLU) diff --git a/lite/kernels/mlu/bridges/scale_op.cc b/lite/kernels/mlu/bridges/scale_op.cc new file mode 100644 index 0000000000..5557602bd7 --- /dev/null +++ b/lite/kernels/mlu/bridges/scale_op.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Create act node and set params from op + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + auto bias_after_scale = op_info->GetAttr("bias_after_scale"); + auto scale = op_info->GetAttr("scale"); + auto bias = op_info->GetAttr("bias"); + auto beta = bias_after_scale ? bias : bias * scale; + + std::vector shape = {1, 1, 1, 1}; + + std::string prefix = string_format("_%p", op); + auto alpha_tensor = graph->AddNode( + "Alpha" + prefix, shape, CNML_CONST, CNML_NHWC, graph->FPType()); + auto beta_tensor = graph->AddNode( + "Beta" + prefix, shape, CNML_CONST, CNML_NHWC, graph->FPType()); + + graph->BindConstRawData("Alpha" + prefix, &scale, 1); + graph->BindConstRawData("Beta" + prefix, &beta, 1); + + auto input_tensor = graph->GetNode(x_var_name); + cnmlBaseOp_t scale_op; + CNML_CALL(cnmlCreateScaleOp(&scale_op, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + alpha_tensor->mlu_tensor(), + beta_tensor->mlu_tensor())); + graph->FuseOp(scale_op); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(scale, + kMLU, + paddle::lite::subgraph::mlu::ScaleConverter); diff --git a/lite/kernels/mlu/bridges/scale_op_test.cc b/lite/kernels/mlu/bridges/scale_op_test.cc new file mode 100644 index 0000000000..e0ed975a84 --- /dev/null +++ b/lite/kernels/mlu/bridges/scale_op_test.cc @@ -0,0 +1,147 @@ +// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/scale_op.h" +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void scale_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + float scale = op_info->GetAttr("scale"); + float bias = op_info->GetAttr("bias"); + bool bias_after_scale = op_info->GetAttr("bias_after_scale"); + if (!bias_after_scale) { + bias *= scale; + } + auto x_data = x->data(); + auto out_data = out->mutable_data(); + DDim x_dims = x->dims(); + DDim out_dims = out->dims(); + CHECK_EQ(x_dims.production(), out_dims.production()); + for (int i = 0; i < out_dims.production(); i++) { + out_data[i] = x_data[i] * scale + bias; + } +} + +void test_scale(int bs, + int ic, + int ih, + int iw, + bool bias_after_scale, + float scale, + float bias) { + // prepare input&output variables + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + std::string out_ref_var_name("out_ref"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize({bs, ic, ih, iw}); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("scale"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("bias_after_scale", bias_after_scale); + opdesc.SetAttr("scale", scale); + opdesc.SetAttr("bias", bias); + + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + scale_ref(op); + out_ref->CopyDataFrom(*out); + + Tensor input_trans; + input_trans.Resize({bs, ic, ih, iw}); + transpose(x->mutable_data(), + input_trans.mutable_data(), + {bs, ic, ih, iw}, + {0, 2, 3, 1}); + auto os = out->dims(); + out->Resize({static_cast(os[0]), + static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}); + x->CopyDataFrom(input_trans); + x->Resize({bs, ih, iw, ic}); + + LaunchOp(op, {x_var_name}, {out_var_name}); + + // execute reference implementation and save to output tensor('out') + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + Tensor output_trans; + output_trans.Resize(os); + transpose(out_data, + output_trans.mutable_data(), + {static_cast(os[0]), + static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}, + {0, 3, 1, 2}); + out_data = output_trans.mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +TEST(MLUBridges, scale) { + for (auto 
+  for (auto bs : {1, 3}) {
+    for (auto ic : {1, 3}) {
+      for (auto ih : {3, 4}) {
+        for (auto iw : {4, 3}) {
+          for (auto bias_after_scale : {false, true}) {
+            for (auto scale : {-1.0f, 5.0f}) {
+              for (auto bias : {-2.0f, 30.0f}) {
+                VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih
+                        << " iw: " << iw
+                        // << " bias_after_scale: " << bias_after_scale
+                        << " scale: " << scale << " bias: " << bias;
+                test_scale(bs, ic, ih, iw, bias_after_scale, scale, bias);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace mlu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+USE_SUBGRAPH_BRIDGE(scale, kMLU);
diff --git a/lite/kernels/mlu/bridges/softmax_op.cc b/lite/kernels/mlu/bridges/softmax_op.cc
index b9e2b1116d..17c9116757 100644
--- a/lite/kernels/mlu/bridges/softmax_op.cc
+++ b/lite/kernels/mlu/bridges/softmax_op.cc
@@ -45,11 +45,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
       axis = output_dims.size() + axis;
     }
   }
-  int nhwc_axis = nchw_to_nhwc_aixs_map[axis];
 
   auto output_tensor = graph->AddNode(
-      out_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType());
+      out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
 
   cnmlBaseOp_t softmax_op;
 
   CNML_CALL(cnmlCreateNdSoftmaxOp(&softmax_op,
                                   nhwc_axis,
diff --git a/lite/kernels/mlu/bridges/softmax_op_test.cc b/lite/kernels/mlu/bridges/softmax_op_test.cc
index 7ceb050d80..a5251ed43c 100644
--- a/lite/kernels/mlu/bridges/softmax_op_test.cc
+++ b/lite/kernels/mlu/bridges/softmax_op_test.cc
@@ -23,8 +23,6 @@ namespace lite {
 namespace subgraph {
 namespace mlu {
 
-int SoftmaxConverter(void* ctx, OpLite* op);
-
 template <typename dtype>
 void softmax_ref(const std::shared_ptr<operators::SoftmaxOp> op) {
   Scope* scope = op->scope();
@@ -112,9 +110,7 @@ void test_softmax(const std::vector<int64_t>& input_shape, int axis) {
             {bs, ic, ih, iw},
             {0, 2, 3, 1});
 
-  out->Resize({bs, ih, iw, ic});
   x->CopyDataFrom(input_trans);
-  x->Resize({bs, ih, iw, ic});
 
   LaunchOp(op, {x_var_name}, {out_var_name});
 
@@ -171,6 +167,4 @@ TEST(MLUBridges, softmax) {
 }  // namespace lite
 }  // namespace paddle
 
-REGISTER_SUBGRAPH_BRIDGE(MLU,
-                         softmax,
-                         paddle::lite::subgraph::mlu::SoftmaxConverter);
+USE_SUBGRAPH_BRIDGE(softmax, kMLU)
diff --git a/lite/kernels/mlu/bridges/tensor.h b/lite/kernels/mlu/bridges/tensor.h
index 7bb2e1b203..12dc97a772 100644
--- a/lite/kernels/mlu/bridges/tensor.h
+++ b/lite/kernels/mlu/bridges/tensor.h
@@ -47,6 +47,8 @@ class MLUTensor {
     return mlu_ptr_;
   }
 
+  void set_mlu_dtype(cnmlDataType_t type) { mlu_dtype_ = type; }
+
   ~MLUTensor();
 
  private:
diff --git a/lite/kernels/mlu/bridges/test_helper.cc b/lite/kernels/mlu/bridges/test_helper.cc
index cf2d7bd6c1..377a00689e 100644
--- a/lite/kernels/mlu/bridges/test_helper.cc
+++ b/lite/kernels/mlu/bridges/test_helper.cc
@@ -28,7 +28,7 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
               const std::vector<std::string>& input_var_names,
               const std::vector<std::string>& output_var_names) {
   CNRT_CALL(cnrtInit(0));
-  SetMluDevice(0);
+  ::paddle::lite::SetMluDevice(0);
   cnrtQueue_t queue_;
   cnrtInvokeFuncParam_t forward_param;
   u32_t affinity = 1;
@@ -47,7 +47,7 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
   const auto& bridges = subgraph::Registry::Instance();
   CHECK(bridges.Exists(op_type, TARGET(kMLU)));
 
-  // Convert all of input data vars and added into the MLU IR graph
+  // Convert input data var and add it into the MLU IR graph
   for (auto& input_name : input_var_names) {
     auto input_tensor = scope->FindMutableTensor(input_name);
     CHECK(input_tensor);
@@ -58,7 +58,7 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
     graph.AddNode(input_name,
                  input_tensor->dims().Vectorize(),
                  CNML_TENSOR,
-                 CNML_NHWC,
+                 CNML_NCHW,
                  graph.FPType(),
                  reinterpret_cast<void*>(
                      input_tensor->mutable_data<float>(TARGET(kMLU))));
@@ -68,6 +68,8 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
                          sizeof(float) * input_tensor->dims().production(),
                          CNRT_MEM_TRANS_DIR_HOST2DEV));
   }
+  op->CheckShape();
+  op->InferShape();
   bridges.Select(op_type, TARGET(kMLU))(
       reinterpret_cast<void*>(&graph), const_cast<OpLite*>(op.get()), nullptr);
diff --git a/lite/kernels/mlu/bridges/utility.h b/lite/kernels/mlu/bridges/utility.h
index 2af8274e07..fa8fb1597c 100644
--- a/lite/kernels/mlu/bridges/utility.h
+++ b/lite/kernels/mlu/bridges/utility.h
@@ -84,7 +84,7 @@ struct FPTypeTraits {
 
 template <>
 struct FPTypeTraits<PRECISION(kFP16)> {
-  typedef ::paddle::lite::fluid::float16 T;
+  typedef paddle::lite::fluid::float16 T;
 };
 
 }  // namespace mlu
diff --git a/lite/kernels/mlu/io_copy_compute.cc b/lite/kernels/mlu/io_copy_compute.cc
index bc6e1838d7..02e4d8b28e 100644
--- a/lite/kernels/mlu/io_copy_compute.cc
+++ b/lite/kernels/mlu/io_copy_compute.cc
@@ -133,22 +133,3 @@ REGISTER_LITE_KERNEL(
     .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
     .Finalize();
-
-// kMLU,
-//                    kFloat,
-//                    kNHWC,
-//                    paddle::lite::kernels::mlu::IoCopyHostToMluCompute,
-//                    host_to_device)
-//     .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))})
-//     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))})
-//     .Finalize();
-//
-//
-// kMLU,
-//                    kFloat,
-//                    kNHWC,
-//                    paddle::lite::kernels::mlu::IoCopyMluToHostCompute,
-//                    device_to_host)
-//     .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
-//     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
-//     .Finalize();
diff --git a/lite/kernels/mlu/layout_compute.cc b/lite/kernels/mlu/layout_compute.cc
new file mode 100644
index 0000000000..d4e16734d6
--- /dev/null
+++ b/lite/kernels/mlu/layout_compute.cc
@@ -0,0 +1,108 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
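+// NOTE(editor, not part of the original patch): the kernels registered
+// below convert tensors between Paddle's NCHW layout and the NHWC layout
+// used by the MLU subgraphs. For a 4-D tensor the two permutations are
+//   NCHW -> NHWC: axis = {0, 2, 3, 1}, e.g. [1, 3, 4, 5] -> [1, 4, 5, 3]
+//   NHWC -> NCHW: axis = {0, 3, 1, 2}, e.g. [1, 4, 5, 3] -> [1, 3, 4, 5]
+// and they are mutual inverses, so a round trip restores the original
+// layout (the axes are set in lite/kernels/mlu/layout_compute.h).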
+
+#include "lite/kernels/mlu/layout_compute.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace mlu {}  // namespace mlu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(
+    layout,
+    kMLU,
+    kFloat,
+    kNHWC,
+    paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute<PRECISION(kFloat)>,
+    def_layout_nhwc2nchw_fp32)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kNHWC))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kHost),
+                                       PRECISION(kFloat),
+                                       DATALAYOUT(kNCHW))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    layout,
+    kMLU,
+    kFP16,
+    kNHWC,
+    paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute<PRECISION(kFP16)>,
+    def_layout_nhwc2nchw_fp16)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kNHWC))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kHost),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kNCHW))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    layout,
+    kMLU,
+    kFloat,
+    kNHWC,
+    paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute<PRECISION(kFloat)>,
+    def_layout_nchw2nhwc_fp32)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kNCHW))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kHost),
+                                       PRECISION(kFloat),
+                                       DATALAYOUT(kNHWC))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    layout,
+    kMLU,
+    kFP16,
+    kNHWC,
+    paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute<PRECISION(kFP16)>,
+    def_layout_nchw2nhwc_fp16)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kNCHW))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kHost),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kNHWC))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    layout,
+    kMLU,
+    kInt8,
+    kNHWC,
+    paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute<PRECISION(kInt8)>,
+    def_layout_nchw2nhwc_fp32_int8)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kInt8),
+                                      DATALAYOUT(kNCHW))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kHost),
+                                       PRECISION(kInt8),
+                                       DATALAYOUT(kNHWC))})
+    .Finalize();
diff --git a/lite/kernels/mlu/layout_compute.h b/lite/kernels/mlu/layout_compute.h
new file mode 100644
index 0000000000..edacdf8a98
--- /dev/null
+++ b/lite/kernels/mlu/layout_compute.h
@@ -0,0 +1,175 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
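+// NOTE(editor, not part of the original patch): FPTypeTraits below maps a
+// compile-time PrecisionType tag to the C++ element type used for the
+// transpose, e.g. (illustrative)
+//   FPTypeTraits<PRECISION(kFloat)>::T -> float
+//   FPTypeTraits<PRECISION(kFP16)>::T  -> paddle::lite::fluid::float16
+//   FPTypeTraits<PRECISION(kInt8)>::T  -> int8_t
+// so LayoutNchwToNhwcCompute<PRECISION(kFP16)> allocates float16 output
+// buffers with no runtime dispatch on the precision.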
+
+#pragma once
+
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "lite/backends/x86/math/math_function.h"
+#include "lite/core/kernel.h"
+#include "lite/core/op_lite.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/type_system.h"
+#include "lite/operators/layout_op.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace mlu {
+
+template <PrecisionType Ptype>
+struct FPTypeTraits {};
+
+template <>
+struct FPTypeTraits<PRECISION(kFloat)> {
+  typedef float T;
+};
+
+template <>
+struct FPTypeTraits<PRECISION(kFP16)> {
+  typedef paddle::lite::fluid::float16 T;
+};
+
+template <>
+struct FPTypeTraits<PRECISION(kInt8)> {
+  typedef int8_t T;
+};
+
+template <lite::TargetType Target, typename T>
+inline void LayoutTransCompute(const int dim,
+                               const lite::Context<Target>& context,
+                               const lite::Tensor& in,
+                               lite::Tensor* out,
+                               const std::vector<int>& axis) {
+  switch (dim) {
+    case 2:
+      paddle::lite::x86::math::Transpose<Target, T, 2> trans2;
+      trans2(context, in, out, axis);
+      break;
+    case 3:
+      paddle::lite::x86::math::Transpose<Target, T, 3> trans3;
+      trans3(context, in, out, axis);
+      break;
+    case 4:
+      paddle::lite::x86::math::Transpose<Target, T, 4> trans4;
+      trans4(context, in, out, axis);
+      break;
+    default:
+      CHECK(0) << ("Unsupport dim in mlu layout");
+  }
+}
+
+template <PrecisionType Precision>
+class LayoutNchwToNhwcCompute
+    : public KernelLite<TARGET(kMLU), Precision, DATALAYOUT(kNHWC)> {
+ public:
+  using param_t = operators::LayoutParam;
+
+  void Run() override {
+    auto& param = this->template Param<param_t>();
+    auto* x = param.x;
+    auto* out = param.y;
+    out->template mutable_data<typename FPTypeTraits<Precision>::T>();
+    auto x_dims = param.x->dims().size();
+    auto& context = this->ctx_->template As<X86Context>();
+
+    const auto origin_dims = out->dims().Vectorize();
+
+    std::vector<int> axis;
+    switch (x_dims) {
+      case 2:
+        axis = {0, 1};
+        break;
+      case 3:
+        axis = {0, 2, 1};
+        out->Resize(std::vector<int64_t>{
+            out->dims()[0], out->dims()[2], out->dims()[1]});
+        break;
+      case 4:
+        axis = {0, 2, 3, 1};
+        out->Resize(std::vector<int64_t>{
+            out->dims()[0], out->dims()[2], out->dims()[3], out->dims()[1]});
+        break;
+      default:
+        CHECK(0) << "Unsupport dim in mlu layout nchw to nhwc";
+    }
+
+    LayoutTransCompute<lite::TargetType::kX86,
+                       typename FPTypeTraits<Precision>::T>(
+        x_dims, context, *x, out, axis);
+
+    if (x_dims > 2) {
+      out->Resize(origin_dims);
+    }
+  }
+
+  std::string doc() const override {
+    return "Mlu layout transform nchw to nhwc";
+  }
+};
+
+template <PrecisionType Precision>
+class LayoutNhwcToNchwCompute
+    : public KernelLite<TARGET(kMLU), Precision, DATALAYOUT(kNHWC)> {
+ public:
+  using param_t = operators::LayoutParam;
+
+  void Run() override {
+    auto& param = this->template Param<param_t>();
+    auto* x = param.x;
+    auto* out = param.y;
+    out->template mutable_data<typename FPTypeTraits<Precision>::T>();
+    auto x_dims = param.x->dims().size();
+    auto& context = this->ctx_->template As<X86Context>();
+
+    const auto origin_dims = out->dims().Vectorize();
+
+    std::vector<int> axis;
+    switch (x_dims) {
+      case 2:
+        axis = {0, 1};
+        break;
+      case 3:
+        out->Resize(std::vector<int64_t>{
+            out->dims()[0], out->dims()[2], out->dims()[1]});
+        axis = {0, 2, 1};
+        break;
+      case 4:
+        out->Resize(std::vector<int64_t>{
+            out->dims()[0], out->dims()[3], out->dims()[1], out->dims()[2]});
+        axis = {0, 3, 1, 2};
+        break;
+      default:
+        CHECK(0) << "Unsupport dim in mlu layout nhwc to nchw";
+    }
+
+    LayoutTransCompute<lite::TargetType::kX86,
+                       typename FPTypeTraits<Precision>::T>(
+        x_dims, context, *x, out, axis);
+
+    if (x_dims > 2) {
+      out->Resize(origin_dims);
+    }
+  }
+
+  std::string doc() const override {
+    return "Mlu layout transform nhwc to nchw";
+  }
+};
+
+}  // namespace mlu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/mlu/subgraph_compute.h b/lite/kernels/mlu/subgraph_compute.h
index 06fc791fe7..3bfba33f4d 100644
--- a/lite/kernels/mlu/subgraph_compute.h
+++ b/lite/kernels/mlu/subgraph_compute.h
@@ -46,6 +46,32 @@ class SubgraphEngine : public subgraph::Engine {
     graph_.SetFPType(type);
   }
 
+  int Build() {
+    // In order to attach all of the ops of the block desc, build the
+    // original program first.
+    BuildOriginProgram();
+    // Run InferShape() on all ops, and convert the Paddle ops to the MLU
+    // IR graph
+    build_device_program_status_ = BuildDeviceProgram();
+    return build_device_program_status_;
+  }
+
+  int Launch() {
+    // Rebuild the device program when the shapes of the input tensors have
+    // changed.
+    if (subgraph::CHECK_SUCCESS(build_device_program_status_) &&
+        subgraph::CHECK_REBUILD_WHEN_SHAPE_CHANGED(
+            build_device_program_status_) &&
+        InputShapeChanged()) {
+      Build();
+    }
+    if (subgraph::CHECK_FAILED(build_device_program_status_)) {
+      LaunchOriginProgram();
+    } else {
+      LaunchDeviceProgram();
+    }
+    return 0;
+  }
+
 protected:
   int BuildDeviceProgram() override {
     int status = 0;
@@ -57,7 +83,7 @@ class SubgraphEngine : public subgraph::Engine {
       graph_.AddNode(input_name,
                      input_tensor->dims().Vectorize(),
                      CNML_TENSOR,
-                     CNML_NHWC,
+                     CNML_NCHW,
                      graph_.FPType(),
                      const_cast<void*>(input_tensor->raw_data()));
       CHECK(input_node);
@@ -71,9 +97,9 @@ class SubgraphEngine : public subgraph::Engine {
     for (auto& inst : origin_program_) {
       auto op = inst.op();
       CHECK(op);
-      op->CheckShape();
-      op->InferShape();
       std::string op_type = op->op_info()->Type();
+      op->CheckShape();
+      const_cast<OpLite*>(op)->InferShape();
       if (!bridges.Exists(op_type, TARGET(kMLU))) {
         LOG(INFO) << "MLU bridges doesn't support op_type: " << op_type;
         return subgraph::FAILED;
@@ -108,23 +134,23 @@ class SubgraphEngine : public subgraph::Engine {
       graph_.AddInput(graph_.GetNode(input_name));
     }
     CHECK(!valid_output_names.empty()) << "[MLU] no valid output names";
-    // auto& mlu_context = this->ctx_->template As<MLUContext>();
-    // auto core_version = mlu_context.MLUCoreVersion();
-    // auto core_number = mlu_context.MLUCoreNumber();
-    // graph_.Compile(core_version, core_number);
+    auto& mlu_context = this->ctx_->template As<MLUContext>();
+    auto core_version = mlu_context.MLUCoreVersion();
+    auto core_number = mlu_context.MLUCoreNumber();
+    graph_.Compile(core_version, core_number);
     return status;
   }
 
   int LaunchDeviceProgram() override {
-    // auto& mlu_context = this->ctx_->template As<MLUContext>();
-    // auto exec_queue = mlu_context.exec_queue();
-    // u32_t affinity = mlu_context.affinity();
-    // cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param();
-    // int data_param = 1;
-    // forward_param.data_parallelism = &data_param;
-    // forward_param.affinity = &affinity;
-    // forward_param.end = CNRT_PARAM_END;
-    // graph_.Compute(forward_param, exec_queue);
+    auto& mlu_context = this->ctx_->template As<MLUContext>();
+    auto exec_queue = mlu_context.exec_queue();
+    u32_t affinity = mlu_context.affinity();
+    cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param();
+    int data_param = 1;
+    forward_param.data_parallelism = &data_param;
+    forward_param.affinity = &affinity;
+    forward_param.end = CNRT_PARAM_END;
+    graph_.Compute(forward_param, exec_queue);
     return 0;
   }
diff --git a/lite/kernels/npu/bridges/CMakeLists.txt b/lite/kernels/npu/bridges/CMakeLists.txt
index 443dcbcc6d..e53bd60c6b 100644
--- a/lite/kernels/npu/bridges/CMakeLists.txt
+++ b/lite/kernels/npu/bridges/CMakeLists.txt
@@ -1,4 +1,4 @@
-if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM)
+if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU)
   return()
 endif()
 
diff --git a/lite/kernels/x86/cast_compute.cc b/lite/kernels/x86/cast_compute.cc
index d342056c7f..bbb63e5952 100644
--- a/lite/kernels/x86/cast_compute.cc
+++ b/lite/kernels/x86/cast_compute.cc
@@ -23,3 +23,14 @@ REGISTER_LITE_KERNEL(cast,
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
     .Finalize();
+
+REGISTER_LITE_KERNEL(
+    cast,
+    kX86,
+    kFloat,
+    kNCHW,
+    paddle::lite::kernels::x86::CastCompute<::paddle::lite::fluid::float16>,
+    fp16_to_any)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFP16))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
+    .Finalize();
diff --git a/lite/tests/cv/CMakeLists.txt b/lite/tests/cv/CMakeLists.txt
index 697c9874ef..1ab73792e7 100644
--- a/lite/tests/cv/CMakeLists.txt
+++ b/lite/tests/cv/CMakeLists.txt
@@ -1,3 +1,3 @@
-if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ARM)
+if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU) AND LITE_WITH_ARM)
   lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm)
 endif()
diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt
index 60e5a92833..cb454c4da5 100644
--- a/lite/tests/kernels/CMakeLists.txt
+++ b/lite/tests/kernels/CMakeLists.txt
@@ -1,4 +1,4 @@
-if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
+if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
   lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
   lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
   lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
diff --git a/lite/tests/math/CMakeLists.txt b/lite/tests/math/CMakeLists.txt
index 7dd4f522db..e02307aa73 100644
--- a/lite/tests/math/CMakeLists.txt
+++ b/lite/tests/math/CMakeLists.txt
@@ -1,4 +1,4 @@
-if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
+if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
   lite_cc_test(sgemm_compute_test SRCS sgemm_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
   lite_cc_test(sgemv_compute_test SRCS sgemv_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
   lite_cc_test(sgemm_c4_compute_test SRCS sgemm_c4_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
diff --git a/lite/tools/build_mlu.sh b/lite/tools/build_mlu.sh
index 1912efda5e..01d71aaf21 100755
--- a/lite/tools/build_mlu.sh
+++ b/lite/tools/build_mlu.sh
@@ -2,10 +2,10 @@ set -ex
 
 # global variables with default value
-NEUWARE_HOME="${NEUWARE_HOME}"    # XPU SDK
+NEUWARE_HOME="${NEUWARE_HOME}"
 TARGET_NAME="all"                 # default target
 BUILD_EXTRA=OFF                   # ON(with sequence ops)/OFF
-WITH_TESTING=OFF                  # ON/OFF
+WITH_TESTING=ON                   # ON/OFF
 
 function print_usage {
     echo -e "\nUSAGE:"
@@ -20,10 +20,9 @@ function print_usage {
 # readonly variables with default value
 readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \
                                -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \
-                               -DWITH_PYTHON=OFF \
                                -DLITE_WITH_ARM=OFF"
 
-readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THREADS:-1}
+readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THREADS:-8}
 readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
 
 readonly workspace=$(pwd)
 
@@ -37,8 +36,7 @@ function prepare_thirdparty {
     fi
     tar xzf third-party-05b862.tar.gz
   else
-    # git submodule update --init --recursive
-    echo "third-party is in ready"
+    git submodule update --init --recursive
   fi
 }
 
@@ -62,12 +60,12 @@ function prepare_workspace {
 }
 
 function build_mlu {
+    prepare_workspace
     build_dir=${workspace}/build.lite.mlu
     mkdir -p $build_dir
     cd $build_dir
 
     export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib"
-    prepare_workspace
     cmake .. \
         ${CMAKE_COMMON_OPTIONS} \
         -DWITH_GPU=OFF \
@@ -75,9 +73,10 @@ function build_mlu {
         -DLITE_WITH_X86=ON \
         -DWITH_MKL=ON \
         -DLITE_WITH_MLU=ON \
+        -DLITE_WITH_PYTHON=OFF \
         -DLITE_BUILD_EXTRA=${BUILD_EXTRA} \
         -DWITH_TESTING=${WITH_TESTING} \
-        -DMLU_SDK_ROOT=${XPU_SDK_ROOT}
+        -DNEUWARE_HOME=${NEUWARE_HOME}
 
     make $TARGET_NAME -j$NUM_CORES_FOR_COMPILE
 
-- 
GitLab
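Editor's note: the MLU bridge tests in this patch (scale above, and softmax) share one pattern: the reference result is computed in NCHW, the input tensor is transposed to NHWC before LaunchOp, and the device output is transposed back to NCHW before the EXPECT_NEAR comparison. The standalone sketch below illustrates just that transpose round trip; the transpose4d helper and the 1x3x4x5 shape are illustrative stand-ins for the repo's test utilities, not code from the patch.

#include <cassert>
#include <vector>

// Permute a contiguous 4-D buffer: output coordinate j is input coordinate
// axis[j], so axis = {0, 2, 3, 1} maps NCHW to NHWC.
static void transpose4d(const std::vector<float>& src,
                        std::vector<float>* dst,
                        const std::vector<int>& dims,    // source dims
                        const std::vector<int>& axis) {  // permutation
  const int d0 = dims[0], d1 = dims[1], d2 = dims[2], d3 = dims[3];
  // output dims are the source dims reordered by the permutation
  const std::vector<int> od = {
      dims[axis[0]], dims[axis[1]], dims[axis[2]], dims[axis[3]]};
  dst->resize(src.size());
  for (int i0 = 0; i0 < d0; ++i0)
    for (int i1 = 0; i1 < d1; ++i1)
      for (int i2 = 0; i2 < d2; ++i2)
        for (int i3 = 0; i3 < d3; ++i3) {
          const int in[4] = {i0, i1, i2, i3};
          const int out_idx =
              ((in[axis[0]] * od[1] + in[axis[1]]) * od[2] + in[axis[2]]) *
                  od[3] +
              in[axis[3]];
          (*dst)[out_idx] = src[((i0 * d1 + i1) * d2 + i2) * d3 + i3];
        }
}

int main() {
  const std::vector<int> nchw = {1, 3, 4, 5};  // N, C, H, W
  std::vector<float> x(60), nhwc_buf, back;
  for (int i = 0; i < 60; ++i) x[i] = static_cast<float>(i);
  transpose4d(x, &nhwc_buf, nchw, {0, 2, 3, 1});             // NCHW -> NHWC
  transpose4d(nhwc_buf, &back, {1, 4, 5, 3}, {0, 3, 1, 2});  // NHWC -> NCHW
  for (int i = 0; i < 60; ++i) assert(back[i] == x[i]);  // round trip is exact
  return 0;
}

Because the two permutations are mutual inverses, any mismatch the tests report comes from the kernel under test, not from the layout shuffling itself.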