From 0a42986c5465a561e9c7ac64ee4c11def5370a65 Mon Sep 17 00:00:00 2001 From: Wilber Date: Sun, 8 Nov 2020 20:21:12 -0600 Subject: [PATCH] Cherry-pick. (#28454) --- cmake/external/lite.cmake | 96 +++++++++++++----- cmake/inference_lib.cmake | 2 +- paddle/fluid/inference/analysis/argument.h | 4 + .../inference/analysis/ir_pass_manager.cc | 2 + .../analysis/ir_passes/lite_subgraph_pass.cc | 14 ++- paddle/fluid/inference/api/analysis_config.cc | 2 +- .../fluid/inference/api/analysis_predictor.cc | 36 ++++++- .../fluid/inference/api/analysis_predictor.h | 11 +++ paddle/fluid/inference/lite/CMakeLists.txt | 2 +- paddle/fluid/inference/lite/engine.cc | 50 ++++++---- paddle/fluid/inference/lite/engine.h | 27 ++--- paddle/fluid/inference/lite/tensor_utils.cc | 98 ++++++++++++++----- paddle/fluid/inference/lite/test_engine.cc | 8 +- .../fluid/inference/lite/test_tensor_utils.cc | 37 ++++++- .../inference/tests/api/lite_resnet50_test.cc | 8 +- paddle/fluid/operators/lite/lite_engine_op.h | 8 +- .../operators/lite/lite_engine_op_test.cc | 6 +- paddle/fluid/pybind/inference_api.cc | 4 +- python/setup.py.in | 4 + 19 files changed, 308 insertions(+), 111 deletions(-) diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake index b541d73bc6a..bc5442c7ab7 100644 --- a/cmake/external/lite.cmake +++ b/cmake/external/lite.cmake @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -if(NOT LINUX OR NOT WITH_MKL) - message("Paddle-lite will not build because the required Linux and MKL do not exist.") +if(NOT LINUX) + message("Paddle-lite will not build because the required Linux do not exist.") set(WITH_LITE OFF) return() endif() @@ -22,9 +22,11 @@ if(XPU_SDK_ROOT) set(LITE_WITH_XPU ON) include_directories("${XPU_SDK_ROOT}/XTDK/include") include_directories("${XPU_SDK_ROOT}/XTCL/include") - add_definitions(-DPADDLE_WITH_XPU) + add_definitions(-DLITE_SUBGRAPH_WITH_XPU) LINK_DIRECTORIES("${XPU_SDK_ROOT}/XTDK/shlib/") LINK_DIRECTORIES("${XPU_SDK_ROOT}/XTDK/runtime/shlib/") + set(XPURT_LIB ${XPU_SDK_ROOT}/XTDK/runtime/shlib/libxpurt.so) + set(XPUAPI_LIB ${XPU_SDK_ROOT}/XTDK/shlib/libxpuapi.so) endif() if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) @@ -42,30 +44,30 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) endif() # No quotes, so cmake can resolve it as a command with arguments. - set(LITE_BUILD_COMMAND $(MAKE) publish_inference -j) - set(LITE_OPTIONAL_ARGS -DWITH_MKL=ON - -DLITE_WITH_CUDA=${WITH_GPU} - -DWITH_MKLDNN=OFF - -DLITE_WITH_X86=ON - -DLITE_WITH_PROFILE=OFF - -DWITH_LITE=OFF - -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF - -DWITH_PYTHON=OFF - -DWITH_TESTING=OFF - -DLITE_BUILD_EXTRA=ON - -DCUDNN_ROOT=${CUDNN_ROOT} - -DLITE_WITH_STATIC_CUDA=OFF - -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME} - -DLITE_WITH_XPU=${LITE_WITH_XPU} - -DXPU_SDK_ROOT=${XPU_SDK_ROOT} - -DLITE_WITH_ARM=OFF) - - ExternalProject_Add( + if(WITH_ARM) + set(LITE_BUILD_COMMAND $(MAKE) publish_inference -j) + message(WARNING "BUILD_COMMAND: ${LITE_BUILD_COMMAND}") + set(LITE_OPTIONAL_ARGS -DWITH_MKL=OFF + -DLITE_WITH_CUDA=OFF + -DWITH_MKLDNN=OFF + -DLITE_WITH_X86=OFF + -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON + -DLITE_WITH_PROFILE=OFF + -DARM_TARGET_OS=armlinux + -DWITH_LITE=ON + -DWITH_PYTHON=OFF + -DWITH_TESTING=OFF + -DLITE_BUILD_EXTRA=ON + -DLITE_WITH_XPU=${LITE_WITH_XPU} + -DXPU_SDK_ROOT=${XPU_SDK_ROOT} + -DLITE_WITH_ARM=ON) + ExternalProject_Add( ${LITE_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/PaddlePaddle/Paddle-Lite.git" GIT_TAG ${LITE_GIT_TAG} PREFIX ${LITE_SOURCES_DIR} + PATCH_COMMAND mkdir -p ${LITE_SOURCES_DIR}/src/extern_lite-build/lite/gen_code && touch ${LITE_SOURCES_DIR}/src/extern_lite-build/lite/gen_code/__generated_code__.cc UPDATE_COMMAND "" BUILD_COMMAND ${LITE_BUILD_COMMAND} INSTALL_COMMAND "" @@ -81,7 +83,51 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} ${LITE_OPTIONAL_ARGS} - ) + ) + set(LITE_OUTPUT_BIN_DIR inference_lite_lib.armlinux.armv8) + else() + set(LITE_BUILD_COMMAND $(MAKE) publish_inference -j) + set(LITE_OUTPUT_BIN_DIR inference_lite_lib) + set(LITE_OPTIONAL_ARGS -DWITH_MKL=ON + -DLITE_WITH_CUDA=${WITH_GPU} + -DWITH_MKLDNN=OFF + -DLITE_WITH_X86=ON + -DLITE_WITH_PROFILE=OFF + -DWITH_LITE=OFF + -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF + -DWITH_PYTHON=OFF + -DWITH_TESTING=OFF + -DLITE_BUILD_EXTRA=ON + -DCUDNN_ROOT=${CUDNN_ROOT} + -DLITE_WITH_STATIC_CUDA=OFF + -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME} + -DLITE_WITH_XPU=${LITE_WITH_XPU} + -DXPU_SDK_ROOT=${XPU_SDK_ROOT} + -DLITE_WITH_ARM=OFF) + + ExternalProject_Add( + ${LITE_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/PaddlePaddle/Paddle-Lite.git" + GIT_TAG ${LITE_GIT_TAG} + PREFIX ${LITE_SOURCES_DIR} + UPDATE_COMMAND "" + BUILD_COMMAND ${LITE_BUILD_COMMAND} + INSTALL_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${LITE_CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + ${LITE_OPTIONAL_ARGS} + ) + endif() ExternalProject_Get_property(${LITE_PROJECT} BINARY_DIR) ExternalProject_Get_property(${LITE_PROJECT} SOURCE_DIR) set(LITE_BINARY_DIR ${BINARY_DIR}) @@ -103,8 +149,8 @@ function(external_lite_libs alias path) endif() endfunction() -external_lite_libs(lite_full_static ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so) -set(LITE_SHARED_LIB ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so) +external_lite_libs(lite_full_static ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libpaddle_full_api_shared.so) +set(LITE_SHARED_LIB ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libpaddle_full_api_shared.so) add_definitions(-DPADDLE_WITH_LITE) add_definitions(-DLITE_WITH_LOG) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 3ae3a87e3cf..96446aaaea7 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -125,7 +125,7 @@ function(copy_part_of_thrid_party TARGET DST) if (LITE_BINARY_DIR) set(dst_dir "${DST}/third_party/install/lite") copy(${TARGET} - SRCS ${LITE_BINARY_DIR}/inference_lite_lib/* + SRCS ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/* DSTS ${dst_dir}) endif() endfunction() diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index dc21f25da66..c1f35517d8a 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -219,6 +219,10 @@ struct Argument { DECL_ARGUMENT_FIELD(fusion_statis, FusionStatis, fusion_statis_t); + // Only used in paddle-lite subgraph. + DECL_ARGUMENT_FIELD(cpu_math_library_num_threads, CpuMathLibraryNumThreads, + int); + private: std::unordered_set valid_fields_; }; diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 7017cab5e3a..94c42d1433f 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -151,6 +151,8 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("use_xpu", new bool(argument->use_xpu())); pass->Set("xpu_l3_workspace_size", new int(argument->xpu_l3_workspace_size())); + pass->Set("cpu_math_library_num_threads", + new int(argument->cpu_math_library_num_threads())); } disable_logs_ = argument->disable_logs(); if (pass_name == "fc_fuse_pass") { diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc index 6b16a481dde..2c454893a62 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc @@ -244,6 +244,7 @@ void LiteSubgraphPass::SetUpEngine( bool enable_int8 = Get("enable_int8"); bool use_xpu = Get("use_xpu"); int xpu_l3_workspace_size = Get("xpu_l3_workspace_size"); + int cpu_math_library_num_threads = Get("cpu_math_library_num_threads"); lite_api::TargetType target_type; if (use_gpu) { @@ -251,7 +252,11 @@ void LiteSubgraphPass::SetUpEngine( } else if (use_xpu) { target_type = TARGET(kXPU); } else { +#ifdef PADDLE_WITH_ARM + target_type = TARGET(kARM); +#else target_type = TARGET(kX86); +#endif } paddle::lite_api::PrecisionType precision_type = @@ -263,11 +268,12 @@ void LiteSubgraphPass::SetUpEngine( // Notice: The ordering here determines the device where the // input tensor of the Lite engine is located, and then affects // whether tensor sharing is feasible. - paddle::lite::Place({target_type, precision_type}), - paddle::lite::Place({target_type, PRECISION(kInt64)}), - paddle::lite::Place({target_type, PRECISION(kFloat)}), - paddle::lite::Place({TARGET(kHost), PRECISION(kFloat)}), + paddle::lite_api::Place({target_type, precision_type}), + paddle::lite_api::Place({target_type, PRECISION(kInt64)}), + paddle::lite_api::Place({target_type, PRECISION(kFloat)}), + paddle::lite_api::Place({TARGET(kHost), PRECISION(kFloat)}), }; + config.cpu_math_library_num_threads = cpu_math_library_num_threads; config.xpu_l3_workspace_size = xpu_l3_workspace_size; if (dump_model) { lite::StrToBinaryFile("./model.bin", config.model); diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 9fd312de7e2..0dc0260a1d2 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -356,7 +356,7 @@ void AnalysisConfig::Update() { } if (use_xpu_) { -#ifndef PADDLE_WITH_XPU +#ifndef LITE_SUBGRAPH_WITH_XPU PADDLE_THROW(platform::errors::Unavailable( "You tried to use an XPU device, but Paddle was not compiled " "with XPU-runtime.")); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 12855d706c2..360f797af71 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -232,8 +232,17 @@ bool AnalysisPredictor::PrepareExecutor() { void AnalysisPredictor::MkldnnPreSet(const std::vector &inputs) { #ifdef PADDLE_WITH_MKLDNN - VLOG(2) << "AnalysisPredictor::Run get_cur_mkldnn_session_id=" - << platform::get_cur_mkldnn_session_id(); + std::vector> inputs_shape; + for (size_t i = 0; i < inputs.size(); ++i) { + inputs_shape.emplace_back(inputs[i].shape); + } + MkldnnPreSet(inputs_shape); +#endif +} + +void AnalysisPredictor::MkldnnPreSet( + const std::vector> &inputs_shape) { +#ifdef PADDLE_WITH_MKLDNN // In cache clearing mode. if (config_.mkldnn_cache_capacity_ > 0) { VLOG(2) << "In mkldnn cache clear mode."; @@ -243,9 +252,9 @@ void AnalysisPredictor::MkldnnPreSet(const std::vector &inputs) { config_.mkldnn_cache_capacity_); // Set current_input_shape for caching dynamic shape. std::stringstream ss; - for (size_t i = 0; i < inputs.size(); ++i) { - for (size_t j = 0; j < inputs[i].shape.size(); ++j) { - ss << inputs[i].shape[j] << "-"; + for (size_t i = 0; i < inputs_shape.size(); ++i) { + for (size_t j = 0; j < inputs_shape[i].size(); ++j) { + ss << inputs_shape[i][j] << "-"; } } VLOG(2) << "Set input shape=" << ss.str(); @@ -445,6 +454,8 @@ void AnalysisPredictor::PrepareArgument() { } if (config_.lite_engine_enabled()) { + argument_.SetCpuMathLibraryNumThreads( + config_.cpu_math_library_num_threads()); argument_.SetLitePrecisionMode(config_.lite_precision_mode_); argument_.SetLitePassesFilter(config_.lite_passes_filter_); argument_.SetLiteOpsFilter(config_.lite_ops_filter_); @@ -656,6 +667,18 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( bool AnalysisPredictor::ZeroCopyRun() { paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); +#ifdef PADDLE_WITH_MKLDNN + if (config_.use_mkldnn_) { + std::vector> shape_vector; + auto names = GetInputNames(); + for (size_t i = 0; i < names.size(); ++i) { + auto in_tensor = GetInputTensor(names[i]); + shape_vector.emplace_back(in_tensor->shape()); + } + MkldnnPreSet(shape_vector); + } +#endif + executor_->Run(); // Fix TensorArray reuse not cleaned bug. tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_); @@ -664,6 +687,9 @@ bool AnalysisPredictor::ZeroCopyRun() { // recover the cpu_math_library_num_threads to 1, in order to avoid thread // conflict when integrating it into deployment service. paddle::platform::SetNumThreads(1); +#ifdef PADDLE_WITH_MKLDNN + if (config_.use_mkldnn_) MkldnnPostReset(); +#endif #if defined(PADDLE_WITH_MKLML) && defined(_LINUX) // Frees unused memory allocated by the IntelĀ® MKL Memory Allocator to // avoid memory leak. See: diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 267817829ec..7ae045685e4 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -311,6 +311,17 @@ class AnalysisPredictor : public PaddlePredictor { /// \param[in] inputs tensors /// void MkldnnPreSet(const std::vector &inputs); + + /// + /// \brief PreSet for Mkldnn multi-thread and dynamic shape input. + /// + /// Used in AnalysisPredictor::Run(), do not support + /// AnalysisPredictor::ZeroCopyRun() now. + /// + /// \param[in] inputs tensor shape + /// + void MkldnnPreSet(const std::vector> &inputs_shape); + /// /// \brief PostReset for Mkldnn multi-thread and dynamic shape input. /// diff --git a/paddle/fluid/inference/lite/CMakeLists.txt b/paddle/fluid/inference/lite/CMakeLists.txt index fd513b59588..924d273a9fc 100644 --- a/paddle/fluid/inference/lite/CMakeLists.txt +++ b/paddle/fluid/inference/lite/CMakeLists.txt @@ -4,6 +4,6 @@ endif() cc_library(lite_op_teller SRCS op_teller.cc DEPS lite_full_static framework_proto device_context boost xxhash) cc_library(lite_engine SRCS engine.cc DEPS lite_full_static framework_proto ${XPU_DEPS}) -cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost device_context) +cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost device_context ${XPU_DEPS}) cc_test(test_lite_engine SRCS test_engine.cc DEPS lite_engine protobuf framework_proto glog gtest analysis) cc_test(test_lite_tensor_utils SRCS test_tensor_utils.cc DEPS lite_engine lite_tensor_utils) diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc index 8e88c944939..b8f6104780f 100644 --- a/paddle/fluid/inference/lite/engine.cc +++ b/paddle/fluid/inference/lite/engine.cc @@ -16,12 +16,16 @@ #define LITE_WITH_CUDA 1 #endif -#ifdef PADDLE_WITH_XPU +#ifdef LITE_SUBGRAPH_WITH_XPU #define LITE_WITH_XPU 1 #endif +#ifndef PADDLE_WITH_ARM +#define LITE_WITH_X86 1 +#endif + #include "paddle/fluid/inference/lite/engine.h" -#include "lite/api/paddle_use_passes.h" +#include namespace paddle { namespace inference { @@ -36,32 +40,40 @@ bool EngineManager::Has(const std::string& name) const { return engines_.at(name).get() != nullptr; } -paddle::lite::Predictor* EngineManager::Get(const std::string& name) const { +paddle::lite_api::PaddlePredictor* EngineManager::Get( + const std::string& name) const { return engines_.at(name).get(); } -paddle::lite::Predictor* EngineManager::Create(const std::string& name, - const EngineConfig& cfg) { - if (cfg.valid_places.front().target == TARGET(kCUDA)) { -#ifdef PADDLE_WITH_CUDA - paddle::lite::Env::Init(); +paddle::lite_api::PaddlePredictor* EngineManager::Create( + const std::string& name, const EngineConfig& cfg) { + // config info for predictor. + paddle::lite_api::CxxConfig lite_cxx_config; + lite_cxx_config.set_model_buffer(cfg.model.c_str(), cfg.model.size(), + cfg.param.c_str(), cfg.param.size()); + lite_cxx_config.set_valid_places(cfg.valid_places); +#ifdef PADDLE_WITH_ARM + set_threads.set_threads(cfg.cpu_math_library_num_threads); +#else + lite_cxx_config.set_x86_math_library_num_threads( + cfg.cpu_math_library_num_threads); #endif - } else if (cfg.valid_places.front().target == TARGET(kXPU)) { -#ifdef PADDLE_WITH_XPU - paddle::lite::TargetWrapper::workspace_l3_size_per_thread = - cfg.xpu_l3_workspace_size; + +#ifdef LITE_SUBGRAPH_WITH_XPU + lite_cxx_config.set_xpu_workspace_l3_size_per_thread( + cfg.xpu_l3_workspace_size); #endif - } - auto* p = new paddle::lite::Predictor(); - p->Build("", cfg.model, cfg.param, cfg.valid_places, cfg.neglected_passes, - cfg.model_type, cfg.model_from_memory); - engines_[name].reset(p); - return p; + + // create predictor + std::shared_ptr p = + paddle::lite_api::CreatePaddlePredictor(lite_cxx_config); + engines_[name] = std::move(p); + return engines_[name].get(); } void EngineManager::DeleteAll() { for (auto& item : engines_) { - item.second.reset(nullptr); + item.second.reset(); } } diff --git a/paddle/fluid/inference/lite/engine.h b/paddle/fluid/inference/lite/engine.h index 345eb682e9f..5ba487cc24d 100644 --- a/paddle/fluid/inference/lite/engine.h +++ b/paddle/fluid/inference/lite/engine.h @@ -23,12 +23,9 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wall" #include "lite/api/cxx_api.h" +#include "lite/api/paddle_api.h" #include "lite/api/paddle_place.h" -#include "lite/core/context.h" -#include "lite/core/device_info.h" -#include "lite/core/memory.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" +#include "lite/api/paddle_use_passes.h" #pragma GCC diagnostic pop namespace paddle { @@ -38,25 +35,33 @@ namespace lite { struct EngineConfig { std::string model; std::string param; - paddle::lite::Place prefer_place; - std::vector valid_places; + std::vector valid_places; std::vector neglected_passes; lite_api::LiteModelType model_type{lite_api::LiteModelType::kProtobuf}; bool model_from_memory{true}; + + // for xpu size_t xpu_l3_workspace_size; + + // for x86 or arm + int cpu_math_library_num_threads{1}; + + // for cuda + bool use_multi_stream{false}; }; class EngineManager { public: bool Empty() const; bool Has(const std::string& name) const; - paddle::lite::Predictor* Get(const std::string& name) const; - paddle::lite::Predictor* Create(const std::string& name, - const EngineConfig& cfg); + paddle::lite_api::PaddlePredictor* Get(const std::string& name) const; + paddle::lite_api::PaddlePredictor* Create(const std::string& name, + const EngineConfig& cfg); void DeleteAll(); private: - std::unordered_map> + std::unordered_map> engines_; }; diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index 0b738c1fb86..f108e7d3651 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/lite/tensor_utils.h" +#include #include #include #include "paddle/fluid/framework/data_type.h" @@ -45,6 +46,7 @@ platform::Place GetNativePlace(const TargetType& type, int id = 0) { switch (type) { case TargetType::kHost: case TargetType::kX86: + case TargetType::kARM: return platform::CPUPlace(); case TargetType::kCUDA: return platform::CUDAPlace(id); @@ -134,16 +136,55 @@ void MemoryCopyAsync(const platform::Place& dst_place, void* dst_data, } } -void InitDstTensor(paddle::lite::Tensor* dst, const framework::LoDTensor& src) { +void* GetLiteTensorDataPtr(paddle::lite_api::Tensor* src, + PrecisionType precision_type, + TargetType target_type) { + void* res{nullptr}; + switch (precision_type) { + case PrecisionType::kFloat: + res = static_cast(src->mutable_data(target_type)); + break; + case PrecisionType::kInt8: + res = static_cast(src->mutable_data(target_type)); + break; + case PrecisionType::kInt32: + res = static_cast(src->mutable_data(target_type)); + break; + case PrecisionType::kInt64: + res = static_cast(src->mutable_data(target_type)); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported precision type. Now only supports FP32, INT8, INT32 and " + "INT64.")); + break; + } + return res; +} + +int64_t GetLiteTensorNumel(const paddle::lite_api::Tensor& tensor) { + auto shape = tensor.shape(); + int64_t numel = std::accumulate(shape.begin(), shape.end(), 1, + std::multiplies()); + return numel; +} + +void InitDstTensor(paddle::lite_api::Tensor* dst, + const framework::LoDTensor& src) { // Currently, Lite needs to explicitly specify the target type of // the input tensor. constexpr int empty_size = 0; - dst->mutable_data(GetLiteTargetType(src.place()), empty_size); - dst->set_precision(GetLitePrecisionType(src.type())); - SetLoD(dst->mutable_lod(), src.lod()); + dst->Resize({empty_size}); + GetLiteTensorDataPtr(dst, GetLitePrecisionType(src.type()), + GetLiteTargetType(src.place())); + dst->SetPrecision(GetLitePrecisionType(src.type())); + paddle::lite::LoD lite_lod; + SetLoD(&lite_lod, src.lod()); + dst->SetLoD(lite_lod); } -void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) { +void InitDstTensor(framework::LoDTensor* dst, + const paddle::lite_api::Tensor& src) { constexpr framework::proto::VarType::Type dtype = framework::proto::VarType_Type_FP32; dst->mutable_data(inference::lite::utils::GetNativePlace(src.target()), @@ -152,7 +193,8 @@ void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) { } template <> -void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src, +void TensorCopyAsync(paddle::lite_api::Tensor* dst, + const framework::LoDTensor& src, const platform::DeviceContext& ctx) { InitDstTensor(dst, src); const platform::Place& src_place = src.place(); @@ -161,52 +203,56 @@ void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src, static_cast(src.numel()) * framework::SizeOfType(src.type()); dst->Resize(framework::vectorize(src.dims())); const void* src_data = src.data(); - void* dst_data = dst->mutable_data(bytes); + void* dst_data{nullptr}; + dst_data = GetLiteTensorDataPtr(dst, GetLitePrecisionType(src.type()), + GetLiteTargetType(src.place())); VLOG(3) << "[CopyAsync fluid -> lite] Bytes = " << bytes << ", src = " << &src << ", dst = " << dst << ", src_type = " << src.type(); MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx); - VLOG(3) << "[Lite memory size] Bytes = " << dst->memory_size(); + VLOG(3) << "[Lite memory size] Bytes = " << bytes; } template <> -void TensorCopyAsync(framework::LoDTensor* dst, const paddle::lite::Tensor& src, +void TensorCopyAsync(framework::LoDTensor* dst, + const paddle::lite_api::Tensor& src, const platform::DeviceContext& ctx) { - dst->Resize(paddle::framework::make_ddim(src.dims().Vectorize())); + dst->Resize(paddle::framework::make_ddim(src.shape())); InitDstTensor(dst, src); const platform::Place& src_place = GetNativePlace(src.target()); const platform::Place& dst_place = dst->place(); - const size_t bytes = - static_cast(src.numel()) * framework::SizeOfType(dst->type()); - const void* src_data = src.raw_data(); + int64_t src_numel = GetLiteTensorNumel(src); + const size_t bytes = src_numel * framework::SizeOfType(dst->type()); + const void* src_data = src.data(); // When Lite is ready, the source type needs to be modified here. void* dst_data = dst->mutable_data(dst_place, dst->type()); VLOG(3) << "[CopyAsync lite -> fluid] Bytes = " << bytes << ", src = " << &src << ", dst = " << dst << ", src_type = " << dst->type(); MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx); - VLOG(3) << "[Lite memory size] Bytes = " << src.memory_size(); + VLOG(3) << "[Lite memory size] Bytes = " << bytes; } template <> -void TensorDataShare(paddle::lite::Tensor* dst, framework::LoDTensor* src) { - const size_t bytes = - static_cast(src->numel()) * framework::SizeOfType(src->type()); - auto buf = std::make_shared(paddle::lite::Buffer( - src->data(), GetLiteTargetType(src->place()), src->memory_size())); +void TensorDataShare(paddle::lite_api::Tensor* dst, framework::LoDTensor* src) { dst->Resize(framework::vectorize(src->dims())); - dst->set_precision(GetLitePrecisionType(src->type())); - SetLoD(dst->mutable_lod(), src->lod()); - dst->ResetBuffer(buf, bytes); + dst->ShareExternalMemory(src->data(), src->memory_size(), + GetLiteTargetType(src->place())); + dst->SetPrecision(GetLitePrecisionType(src->type())); + paddle::lite::LoD lite_lod; + SetLoD(&lite_lod, src->lod()); + dst->SetLoD(lite_lod); } template <> -void TensorDataShare(framework::LoDTensor* dst, paddle::lite::Tensor* src) { +void TensorDataShare(framework::LoDTensor* dst, paddle::lite_api::Tensor* src) { constexpr framework::proto::VarType::Type dtype = framework::proto::VarType_Type_FP32; - void* src_raw_data = src->raw_data(); + void* src_raw_data = + GetLiteTensorDataPtr(src, GetLitePrecisionType(dtype), src->target()); + size_t memory_size = GetLiteTensorNumel(*src) * sizeof(float); std::shared_ptr holder( - new memory::allocation::Allocation(src_raw_data, src->memory_size(), + new memory::allocation::Allocation(src_raw_data, memory_size, GetNativePlace(src->target()))); - dst->Resize(paddle::framework::make_ddim(src->dims().Vectorize())); + dst->Resize(paddle::framework::make_ddim(src->shape())); SetLoD(dst->mutable_lod(), src->lod()); dst->ResetHolderWithType(holder, dtype); } diff --git a/paddle/fluid/inference/lite/test_engine.cc b/paddle/fluid/inference/lite/test_engine.cc index 325c7ab2539..1ac33f6bbf9 100644 --- a/paddle/fluid/inference/lite/test_engine.cc +++ b/paddle/fluid/inference/lite/test_engine.cc @@ -101,10 +101,10 @@ TEST(EngineManager, engine) { config.model_from_memory = true; config.valid_places = { #ifdef PADDLE_WITH_CUDA - paddle::lite::Place({TARGET(kCUDA), PRECISION(kFloat)}), + paddle::lite_api::Place({TARGET(kCUDA), PRECISION(kFloat)}), #endif - paddle::lite::Place({TARGET(kX86), PRECISION(kFloat)}), - paddle::lite::Place({TARGET(kHost), PRECISION(kAny)}), + paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}), + paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}), }; LOG(INFO) << "Create EngineManager"; @@ -117,7 +117,7 @@ TEST(EngineManager, engine) { ASSERT_EQ(inference::Singleton::Global().Has( unique_key), true); - paddle::lite::Predictor* engine_0 = + paddle::lite_api::PaddlePredictor* engine_0 = inference::Singleton::Global().Get( unique_key); CHECK_NOTNULL(engine_0); diff --git a/paddle/fluid/inference/lite/test_tensor_utils.cc b/paddle/fluid/inference/lite/test_tensor_utils.cc index eef7bfb68fe..a792fb77d6a 100644 --- a/paddle/fluid/inference/lite/test_tensor_utils.cc +++ b/paddle/fluid/inference/lite/test_tensor_utils.cc @@ -73,6 +73,33 @@ TEST(LiteEngineOp, GetNativeLayoutType) { EXPECT_ANY_THROW(GetNativeLayoutType(DataLayoutType::kNHWC)); } +template +void test_lite_tensor_data_ptr(PrecisionType precision_type) { + void* GetLiteTensorDataPtr(paddle::lite_api::Tensor * src, + PrecisionType precision_type, + TargetType target_type); + const int count = 4; + paddle::lite::Tensor lite_tensor; + lite_tensor.Resize({count}); + auto* lite_tensor_data = lite_tensor.mutable_data(); + for (size_t i = 0; i < count; ++i) { + lite_tensor_data[i] = i; + } + paddle::lite_api::Tensor lite_api_tensor(&lite_tensor); + T* data = static_cast(GetLiteTensorDataPtr( + &lite_api_tensor, precision_type, TargetType::kHost)); + for (size_t i = 0; i < count; ++i) { + CHECK_EQ(data[i], static_cast(i)) << "the i-th num is not correct."; + } +} + +TEST(LiteEngineOp, GetLiteTensorDataPtr) { + test_lite_tensor_data_ptr(PrecisionType::kInt64); + test_lite_tensor_data_ptr(PrecisionType::kInt32); + test_lite_tensor_data_ptr(PrecisionType::kInt8); + EXPECT_ANY_THROW(test_lite_tensor_data_ptr(PrecisionType::kUnk)); +} + void test_tensor_copy(const platform::DeviceContext& ctx) { // Create LoDTensor. std::vector vector({1, 2, 3, 4}); @@ -83,10 +110,11 @@ void test_tensor_copy(const platform::DeviceContext& ctx) { lod_tensor.set_lod(lod); // Create lite::Tensor and copy. paddle::lite::Tensor lite_tensor; - TensorCopyAsync(&lite_tensor, lod_tensor, ctx); + paddle::lite_api::Tensor lite_api_tensor(&lite_tensor); + TensorCopyAsync(&lite_api_tensor, lod_tensor, ctx); // Copy to LoDTensor. framework::LoDTensor lod_tensor_n; - TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx); + TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(ctx.GetPlace())) { platform::GpuStreamSync( @@ -108,10 +136,11 @@ void test_tensor_share(const platform::DeviceContext& ctx) { lod_tensor.set_lod(lod); // Create lite::Tensor and share. paddle::lite::Tensor lite_tensor; - TensorDataShare(&lite_tensor, &lod_tensor); + paddle::lite_api::Tensor lite_api_tensor(&lite_tensor); + TensorDataShare(&lite_api_tensor, &lod_tensor); // Copy to LoDTensor. framework::LoDTensor lod_tensor_n; - TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx); + TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx); std::vector result; TensorToVector(lod_tensor_n, ctx, &result); ASSERT_EQ(result, vector); diff --git a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc index a3fd4b4f6de..d5f186148cd 100644 --- a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc +++ b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc @@ -25,9 +25,13 @@ namespace inference { TEST(AnalysisPredictor, use_gpu) { std::string model_dir = FLAGS_infer_model + "/" + "model"; AnalysisConfig config; +#if defined(PADDLE_WITH_CUDA) config.EnableUseGpu(100, 0); +#elif defined(LITE_SUBGRAPH_WITH_XPU) + config.EnableXpu(100); +#endif config.SetModel(model_dir + "/model", model_dir + "/params"); - config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32); + config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32, true); std::vector inputs; auto predictor = CreatePaddlePredictor(config); @@ -39,7 +43,7 @@ TEST(AnalysisPredictor, use_gpu) { std::vector input(input_num, 1); PaddleTensor in; - in.shape = {1, 3, 318, 318}; + in.shape = {batch, channel, height, width}; in.data = PaddleBuf(static_cast(input.data()), input_num * sizeof(float)); in.dtype = PaddleDType::FLOAT32; diff --git a/paddle/fluid/operators/lite/lite_engine_op.h b/paddle/fluid/operators/lite/lite_engine_op.h index a920bf7c3f5..f6d65704388 100644 --- a/paddle/fluid/operators/lite/lite_engine_op.h +++ b/paddle/fluid/operators/lite/lite_engine_op.h @@ -39,7 +39,7 @@ class LiteEngineOp : public framework::OperatorBase { private: std::vector in_names_; std::vector out_names_; - paddle::lite::Predictor *engine_; + paddle::lite_api::PaddlePredictor *engine_; framework::proto::VarType::Type precision_; bool use_gpu_; bool zero_copy_; @@ -78,10 +78,10 @@ class LiteEngineOp : public framework::OperatorBase { framework::LoDTensor src_t = inference::analysis::GetFromScope(scope, in_names_[i]); - paddle::lite::Tensor *dst_t = engine_->GetInput(i); + paddle::lite_api::Tensor dst_t = *(engine_->GetInput(i)); VLOG(3) << "== fluid -> lite (" << in_names_[i] << " -> " << engine_->GetInputNames()[i] << ")"; - inference::lite::utils::TensorCopy(dst_t, &src_t, *ctx, zero_copy_); + inference::lite::utils::TensorCopy(&dst_t, &src_t, *ctx, zero_copy_); } #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(dev_place)) { @@ -93,7 +93,7 @@ class LiteEngineOp : public framework::OperatorBase { engine_->Run(); VLOG(3) << "lite engine run done"; for (size_t i = 0; i < out_names_.size(); i++) { - paddle::lite::Tensor src_t = *(engine_->GetOutput(i)); + paddle::lite_api::Tensor src_t = *(engine_->GetOutput(i)); framework::LoDTensor *dst_t = &inference::analysis::GetFromScope( scope, out_names_[i]); diff --git a/paddle/fluid/operators/lite/lite_engine_op_test.cc b/paddle/fluid/operators/lite/lite_engine_op_test.cc index fb5c0dcb351..76c963ac652 100644 --- a/paddle/fluid/operators/lite/lite_engine_op_test.cc +++ b/paddle/fluid/operators/lite/lite_engine_op_test.cc @@ -84,10 +84,10 @@ TEST(LiteEngineOp, engine_op) { inference::lite::EngineConfig config; config.valid_places = { #ifdef PADDLE_WITH_CUDA - paddle::lite::Place({TARGET(kCUDA), PRECISION(kFloat)}), + paddle::lite_api::Place({TARGET(kCUDA), PRECISION(kFloat)}), #endif - paddle::lite::Place({TARGET(kHost), PRECISION(kAny)}), - paddle::lite::Place({TARGET(kX86), PRECISION(kFloat)}), + paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}), + paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}), }; serialize_params(&(config.param), &scope, repetitive_params); config.model = program.Proto()->SerializeAsString(); diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 7d77ed80cb4..eb9c166d444 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -387,6 +387,8 @@ void BindAnalysisConfig(py::module *m) { .def("params_file", &AnalysisConfig::params_file) .def("enable_use_gpu", &AnalysisConfig::EnableUseGpu, py::arg("memory_pool_init_size_mb"), py::arg("device_id") = 0) + .def("enable_xpu", &AnalysisConfig::EnableXpu, + py::arg("l3_workspace_size")) .def("disable_gpu", &AnalysisConfig::DisableGpu) .def("use_gpu", &AnalysisConfig::use_gpu) .def("gpu_device_id", &AnalysisConfig::gpu_device_id) @@ -427,8 +429,8 @@ void BindAnalysisConfig(py::module *m) { .def("tensorrt_oss_enabled", &AnalysisConfig::tensorrt_oss_enabled) .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) .def("enable_lite_engine", &AnalysisConfig::EnableLiteEngine, - py::arg("zero_copy") = false, py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32, + py::arg("zero_copy") = false, py::arg("passes_filter") = std::vector(), py::arg("ops_filter") = std::vector()) .def("lite_engine_enabled", &AnalysisConfig::lite_engine_enabled) diff --git a/python/setup.py.in b/python/setup.py.in index 5658638854f..c391911bedc 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -261,6 +261,10 @@ else: if '${WITH_LITE}' == 'ON': shutil.copy('${LITE_SHARED_LIB}', libs_path) package_data['paddle.libs']+=['libpaddle_full_api_shared' + ext_name] + if '${XPU_SDK_ROOT}': + shutil.copy('${XPUAPI_LIB}', libs_path) + shutil.copy('${XPURT_LIB}', libs_path) + package_data['paddle.libs'] += ['libxpuapi.so', 'libxpurt.so'] if '${WITH_PSLIB}' == 'ON': shutil.copy('${PSLIB_LIB}', libs_path) -- GitLab