diff --git a/CMakeLists.txt b/CMakeLists.txt
index 04fbb3178ba7a0764e9e9ff68c5c25a02e82fbbd..e4d9060339c6fa9917024f06c9817b575c4eb774 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -154,6 +154,9 @@ if(WITH_BRPC_RDMA)
     endif()
 endif()
 
+# lite subgraph compilation depends on CUDNN_ROOT,
+# so include(cudnn) must come before include(third_party/lite)
+include(cudnn)              # set cudnn libraries, must before configure
 include(third_party)        # download, build, install third_party
 
 if(WITH_DISTRIBUTE)
@@ -173,7 +176,6 @@ if(NOT WIN32)
 endif()
 
 include(flags)              # set paddle compile flags
-include(cudnn)              # set cudnn libraries, must before configure
 
 if(WITH_GPU)
   include(cuda)
diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake
index 70c11d37f911da9225a8609de756438c9b74c596..b541d73bc6a633d8e6a77ff567d756f3b40bfce9 100644
--- a/cmake/external/lite.cmake
+++ b/cmake/external/lite.cmake
@@ -18,6 +18,15 @@ if(NOT LINUX OR NOT WITH_MKL)
   return()
 endif()
 
+if(XPU_SDK_ROOT)
+  set(LITE_WITH_XPU ON)
+  include_directories("${XPU_SDK_ROOT}/XTDK/include")
+  include_directories("${XPU_SDK_ROOT}/XTCL/include")
+  add_definitions(-DPADDLE_WITH_XPU)
+  LINK_DIRECTORIES("${XPU_SDK_ROOT}/XTDK/shlib/")
+  LINK_DIRECTORIES("${XPU_SDK_ROOT}/XTDK/runtime/shlib/")
+endif()
+
 if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   include(ExternalProject)
   set(LITE_PROJECT extern_lite)
@@ -25,7 +34,11 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite)
 
   if(NOT LITE_GIT_TAG)
-    set(LITE_GIT_TAG 34c29406c27ee00cef033a98887403443eb2565f)
+    set(LITE_GIT_TAG 42ab4d559f6659edfc35040fb30fdcec3dc3f8aa)
+  endif()
+
+  if(NOT CUDA_ARCH_NAME)
+    set(CUDA_ARCH_NAME "Auto")
   endif()
 
   # No quotes, so cmake can resolve it as a command with arguments.
@@ -43,6 +56,8 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
                         -DCUDNN_ROOT=${CUDNN_ROOT}
                         -DLITE_WITH_STATIC_CUDA=OFF
                         -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME}
+                        -DLITE_WITH_XPU=${LITE_WITH_XPU}
+                        -DXPU_SDK_ROOT=${XPU_SDK_ROOT}
                         -DLITE_WITH_ARM=OFF)
 
   ExternalProject_Add(
@@ -79,7 +94,7 @@ message(STATUS "Paddle-lite SOURCE_DIR: ${LITE_SOURCE_DIR}")
 include_directories(${LITE_SOURCE_DIR})
 include_directories(${LITE_BINARY_DIR})
 
-function(external_lite_static_libs alias path)
+function(external_lite_libs alias path)
   add_library(${alias} SHARED IMPORTED GLOBAL)
   SET_PROPERTY(TARGET ${alias} PROPERTY IMPORTED_LOCATION
                ${path})
@@ -88,7 +103,8 @@ function(external_lite_static_libs alias path)
   endif()
 endfunction()
 
-external_lite_static_libs(lite_full_static ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so)
+external_lite_libs(lite_full_static ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so)
+set(LITE_SHARED_LIB ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so)
 
 add_definitions(-DPADDLE_WITH_LITE)
 add_definitions(-DLITE_WITH_LOG)
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 2fc7f81bf8a59ca6dba3db36dfe7a9c074f03f9b..27bae7a71ea192ac08e4e87cb7bcdb8b84e29dc8 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -200,6 +200,10 @@ struct Argument {
   DECL_ARGUMENT_FIELD(lite_ops_filter, LiteOpsFilter, std::vector<std::string>);
   DECL_ARGUMENT_FIELD(lite_precision_mode, LitePrecisionMode,
                       AnalysisConfig::Precision);
+  DECL_ARGUMENT_FIELD(lite_zero_copy, LiteZeroCopy, bool);
+
+  DECL_ARGUMENT_FIELD(use_xpu, UseXpu, bool);
+  DECL_ARGUMENT_FIELD(xpu_l3_workspace_size, XpuL3WorkspaceSize, int);
 
   // Memory optimized related.
   DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 4a79a3cf3050380c920590355f10bb7a0d34f125..cd8d86d72938417112e17e86e5cc6dd12254a8d1 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -146,6 +146,10 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("predictor_id", new int(argument->predictor_id()));
       pass->Set("enable_int8", new bool(enable_int8));
       pass->Set("use_gpu", new bool(argument->use_gpu()));
+      pass->Set("zero_copy", new bool(argument->lite_zero_copy()));
+      pass->Set("use_xpu", new bool(argument->use_xpu()));
+      pass->Set("xpu_l3_workspace_size",
+                new int(argument->xpu_l3_workspace_size()));
     }
     disable_logs_ = argument->disable_logs();
     if (pass_name == "fc_fuse_pass") {
diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
index 91d0aec3f41fd90159958aa9035cfbf4d1c749fb..6b16a481ddedbad0956d1358de95842ea9a3a101 100644
--- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
@@ -242,16 +242,33 @@ void LiteSubgraphPass::SetUpEngine(
   bool use_gpu = Get<bool>("use_gpu");
   bool enable_int8 = Get<bool>("enable_int8");
-  lite_api::TargetType target_type = use_gpu ? TARGET(kCUDA) : TARGET(kX86);
+  bool use_xpu = Get<bool>("use_xpu");
+  int xpu_l3_workspace_size = Get<int>("xpu_l3_workspace_size");
+
+  lite_api::TargetType target_type;
+  if (use_gpu) {
+    target_type = TARGET(kCUDA);
+  } else if (use_xpu) {
+    target_type = TARGET(kXPU);
+  } else {
+    target_type = TARGET(kX86);
+  }
+
   paddle::lite_api::PrecisionType precision_type =
-      enable_int8 ? PRECISION(kInt8) : PRECISION(kInt64);
+      enable_int8 ? PRECISION(kInt8) : PRECISION(kFloat);
+
   serialize_params(&config.param, scope, repetitive_params);
   config.model = program->Proto()->SerializeAsString();
   config.valid_places = {
+      // Notice: The ordering here determines the device where the
+      // input tensor of the Lite engine is located, which in turn
+      // affects whether tensor sharing is feasible.
       paddle::lite::Place({target_type, precision_type}),
+      paddle::lite::Place({target_type, PRECISION(kInt64)}),
       paddle::lite::Place({target_type, PRECISION(kFloat)}),
       paddle::lite::Place({TARGET(kHost), PRECISION(kFloat)}),
   };
+  config.xpu_l3_workspace_size = xpu_l3_workspace_size;
 
   if (dump_model) {
     lite::StrToBinaryFile("./model.bin", config.model);
     lite::StrToBinaryFile("./param.bin", config.param);
@@ -283,6 +300,7 @@ void LiteSubgraphPass::BuildOperator(
   op_desc->SetAttr("engine_key", unique_key);
   op_desc->SetAttr("enable_int8", Get<bool>("enable_int8"));
   op_desc->SetAttr("use_gpu", Get<bool>("use_gpu"));
+  op_desc->SetAttr("zero_copy", Get<bool>("zero_copy"));
 }
 
 void LiteSubgraphPass::ApplyImpl(framework::ir::Graph* graph) const {
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 8a047e5296430fdd3755bfd31ac4a92522761c3b..39c5cbff1f4513026e23ea81e6e56806f7c84332 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -88,6 +88,12 @@ void AnalysisConfig::DisableFCPadding() {
   Update();
 }
 
+void AnalysisConfig::EnableXpu(int l3_workspace_size) {
+  use_xpu_ = true;
+  xpu_l3_workspace_size_ = l3_workspace_size;
+  Update();
+}
+
 AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
 #define CP_MEMBER(member__) member__ = other.member__;
 
@@ -132,6 +138,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(lite_precision_mode_);
   CP_MEMBER(lite_passes_filter_);
   CP_MEMBER(lite_ops_filter_);
+  CP_MEMBER(lite_zero_copy_);
+
+  CP_MEMBER(use_xpu_);
+  CP_MEMBER(xpu_l3_workspace_size_);
 
   // profile related.
   CP_MEMBER(with_profile_);
@@ -342,6 +352,22 @@ void AnalysisConfig::Update() {
     }
   }
 
+  if (use_xpu_) {
+#ifndef PADDLE_WITH_XPU
+    PADDLE_THROW(platform::errors::Unavailable(
+        "You tried to use an XPU device, but Paddle was not compiled "
+        "with XPU runtime support."));
+#endif
+    if (!use_lite_) {
+      LOG(WARNING) << "Because XPU currently only works in Paddle-Lite "
+                      "subgraph mode, please make sure you have enabled it.";
+    }
+    PADDLE_ENFORCE_EQ(use_gpu_, false,
+                      platform::errors::Unavailable(
+                          "Currently, XPU and GPU cannot be enabled in the "
+                          "same analysis configuration."));
+  }
+
   if (ir_debug_) {
     pass_builder()->TurnOnDebug();
   }
@@ -385,6 +411,8 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << cpu_math_library_num_threads_;
 
   ss << use_lite_;
+  ss << use_xpu_;
+  ss << xpu_l3_workspace_size_;
 
   return ss.str();
 }
@@ -460,13 +488,14 @@ void AnalysisConfig::DisableGlogInfo() {
 }
 
 void AnalysisConfig::EnableLiteEngine(
-    AnalysisConfig::Precision precision_mode,
+    AnalysisConfig::Precision precision_mode, bool zero_copy,
     const std::vector<std::string> &passes_filter,
     const std::vector<std::string> &ops_filter) {
   use_lite_ = true;
   lite_precision_mode_ = precision_mode;
   lite_passes_filter_ = passes_filter;
   lite_ops_filter_ = ops_filter;
+  lite_zero_copy_ = zero_copy;
   Update();
 }
 
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index d21f0292d9bbd267a73834a01f435a6ffe16f204..de3f9ab239cacb5dbc494fa4abea6c601cfa77f0 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -447,6 +447,9 @@ void AnalysisPredictor::PrepareArgument() {
     argument_.SetLitePrecisionMode(config_.lite_precision_mode_);
     argument_.SetLitePassesFilter(config_.lite_passes_filter_);
     argument_.SetLiteOpsFilter(config_.lite_ops_filter_);
+    argument_.SetLiteZeroCopy(config_.lite_zero_copy_);
+    argument_.SetUseXpu(config_.use_xpu_);
+    argument_.SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_);
     LOG(INFO) << "Lite subgraph engine is enabled";
   }
 
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 2002d1f76abfeb8c35fcad51c8c1bcc16db78336..39346414a8a0d62903f56280c638ca89eac833b0 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -176,6 +176,8 @@ struct AnalysisConfig {
   ///
   ///
   void DisableGpu();
+
+  void EnableXpu(int l3_workspace_size = 0xfffc00);
   ///
   /// \brief A boolean state telling whether the GPU is turned on.
   ///
@@ -319,6 +321,7 @@ struct AnalysisConfig {
   ///
   void EnableLiteEngine(
       AnalysisConfig::Precision precision_mode = Precision::kFloat32,
+      bool zero_copy = false,
       const std::vector<std::string>& passes_filter = {},
       const std::vector<std::string>& ops_filter = {});
 
@@ -562,6 +565,11 @@ struct AnalysisConfig {
   std::vector<std::string> lite_passes_filter_;
   std::vector<std::string> lite_ops_filter_;
   Precision lite_precision_mode_;
+  bool lite_zero_copy_;
+
+  bool thread_local_stream_{false};
+  bool use_xpu_{false};
+  int xpu_l3_workspace_size_;
 
   // mkldnn related.
   int mkldnn_cache_capacity_{0};
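As a rough illustration of the two configuration entry points added above (`EnableXpu` and the new `zero_copy` parameter of `EnableLiteEngine`), here is a hypothetical helper; the model paths and the helper name `MakeLiteXpuConfig` are placeholders, not part of this patch.

```cpp
#include "paddle/fluid/inference/api/paddle_analysis_config.h"

// Build an AnalysisConfig that routes supported subgraphs through the
// Paddle-Lite engine on an XPU device.
paddle::AnalysisConfig MakeLiteXpuConfig() {
  paddle::AnalysisConfig config;
  config.SetModel("/path/to/__model__", "/path/to/__params__");  // placeholders
  // zero_copy = true: share tensors with the Lite engine at the subgraph
  // boundary instead of copying them.
  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32,
                          /*zero_copy=*/true);
  // The argument is the per-thread XPU L3 workspace size in bytes
  // (0xfffc00 is the default declared above).
  config.EnableXpu(0xfffc00);
  return config;
}
```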
diff --git a/paddle/fluid/inference/lite/CMakeLists.txt b/paddle/fluid/inference/lite/CMakeLists.txt
index 1d957048148b59cd98b40ae1d95bd02481288b85..fd513b59588f82716900d4d48e9aac036085baa9 100644
--- a/paddle/fluid/inference/lite/CMakeLists.txt
+++ b/paddle/fluid/inference/lite/CMakeLists.txt
@@ -1,5 +1,9 @@
+if(XPU_SDK_ROOT)
+  set(XPU_DEPS xpuapi xpurt)
+endif()
+
 cc_library(lite_op_teller SRCS op_teller.cc DEPS lite_full_static framework_proto device_context boost xxhash)
-cc_library(lite_engine SRCS engine.cc DEPS lite_full_static framework_proto)
-cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost)
+cc_library(lite_engine SRCS engine.cc DEPS lite_full_static framework_proto ${XPU_DEPS})
+cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost device_context)
 cc_test(test_lite_engine SRCS test_engine.cc DEPS lite_engine protobuf framework_proto glog gtest analysis)
 cc_test(test_lite_tensor_utils SRCS test_tensor_utils.cc DEPS lite_engine lite_tensor_utils)
diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc
index edc4f5220aa968453e72ae8c8f80177f9c31131a..8e88c94493952ff257ef69bf73f8edebb6ba2eee 100644
--- a/paddle/fluid/inference/lite/engine.cc
+++ b/paddle/fluid/inference/lite/engine.cc
@@ -16,12 +16,11 @@
 #define LITE_WITH_CUDA 1
 #endif
 
-#include "paddle/fluid/inference/lite/engine.h"
-#include "lite/core/context.h"
-#include "lite/core/device_info.h"
+#ifdef PADDLE_WITH_XPU
+#define LITE_WITH_XPU 1
+#endif
 
-#include "lite/api/paddle_use_kernels.h"
-#include "lite/api/paddle_use_ops.h"
+#include "paddle/fluid/inference/lite/engine.h"
 #include "lite/api/paddle_use_passes.h"
 namespace paddle {
 namespace inference {
@@ -43,10 +42,17 @@ paddle::lite::Predictor* EngineManager::Get(const std::string& name) const {
 
 paddle::lite::Predictor* EngineManager::Create(const std::string& name,
                                                const EngineConfig& cfg) {
-  auto* p = new paddle::lite::Predictor();
+  if (cfg.valid_places.front().target == TARGET(kCUDA)) {
 #ifdef PADDLE_WITH_CUDA
-  paddle::lite::Env<TARGET(kCUDA)>::Init();
+    paddle::lite::Env<TARGET(kCUDA)>::Init();
 #endif
+  } else if (cfg.valid_places.front().target == TARGET(kXPU)) {
+#ifdef PADDLE_WITH_XPU
+    paddle::lite::TargetWrapper<TARGET(kXPU)>::workspace_l3_size_per_thread =
+        cfg.xpu_l3_workspace_size;
+#endif
+  }
+  auto* p = new paddle::lite::Predictor();
   p->Build("", cfg.model, cfg.param, cfg.valid_places, cfg.neglected_passes,
            cfg.model_type, cfg.model_from_memory);
   engines_[name].reset(p);
diff --git a/paddle/fluid/inference/lite/engine.h b/paddle/fluid/inference/lite/engine.h
index f29607490ed179317aebd4126c92f6a7ef50679b..345eb682e9fe81d4ec67a31082c1d347a694fd96 100644
--- a/paddle/fluid/inference/lite/engine.h
+++ b/paddle/fluid/inference/lite/engine.h
@@ -20,7 +20,16 @@
 #include <string>
 #include <vector>
 
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wall"
 #include "lite/api/cxx_api.h"
+#include "lite/api/paddle_place.h"
+#include "lite/core/context.h"
+#include "lite/core/device_info.h"
+#include "lite/core/memory.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/tensor.h"
+#pragma GCC diagnostic pop
 
 namespace paddle {
 namespace inference {
@@ -34,6 +43,7 @@ struct EngineConfig {
   std::vector<std::string> neglected_passes;
   lite_api::LiteModelType model_type{lite_api::LiteModelType::kProtobuf};
   bool model_from_memory{true};
+  size_t xpu_l3_workspace_size;
 };
 
 class EngineManager {
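A sketch of how the pieces in `engine.h` fit together, assuming the usage shown in the subgraph pass and the engine-op test: a caller fills an `EngineConfig`, registers a predictor under a key, and `LiteEngineOp` later fetches it by that key. The helper name, the key `"engine_0"`, and the literal sizes below are illustrative only.

```cpp
#include <string>

#include "paddle/fluid/inference/lite/engine.h"
#include "paddle/fluid/inference/utils/singleton.h"

// Illustrative only: register a Lite predictor for an XPU target.
void RegisterLiteEngine(const std::string& model_proto,
                        const std::string& serialized_params) {
  paddle::inference::lite::EngineConfig config;
  config.model = model_proto;        // serialized ProgramDesc of the subgraph
  config.param = serialized_params;  // serialized persistable parameters
  config.valid_places = {
      paddle::lite::Place({TARGET(kXPU), PRECISION(kFloat)}),
      paddle::lite::Place({TARGET(kHost), PRECISION(kFloat)}),
  };
  config.xpu_l3_workspace_size = 0xfffc00;  // per-thread L3 workspace, bytes
  paddle::inference::Singleton<paddle::inference::lite::EngineManager>::Global()
      .Create("engine_0", config);
}
```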
diff --git a/paddle/fluid/inference/lite/op_teller.cc b/paddle/fluid/inference/lite/op_teller.cc
index c5f1eccc3334a109aa0a6a21ae8c189d42e18447..3a162c3fde13f61fae5aba7a7da0bbfdc5f20801 100644
--- a/paddle/fluid/inference/lite/op_teller.cc
+++ b/paddle/fluid/inference/lite/op_teller.cc
@@ -16,10 +16,9 @@
 
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/lite/engine.h"
 #include "paddle/fluid/inference/lite/op_teller.h"
 
-#include "lite/core/op_registry.h"
-
 namespace paddle {
 namespace inference {
 namespace lite {
@@ -27,15 +26,14 @@ namespace lite {
 // Just tell by the op_types.
 struct SimpleOpTeller : public Teller {
   SimpleOpTeller() {
-    const std::map<std::string, std::string>& op2path =
-        paddle::lite::GetOp2PathDict();
+    std::vector<std::string> lite_ops = paddle::lite::GetAllOps();
     auto is_non_inst = [](const std::string& op) -> bool {
       const std::vector<std::string> ops = {"feed", "fetch", "while"};
       return std::find(ops.begin(), ops.end(), op) != ops.end();
     };
-    for (const auto& op : op2path) {
-      if (!is_non_inst(op.first)) {
-        ops_.insert(op.first);
+    for (const auto& op : lite_ops) {
+      if (!is_non_inst(op)) {
+        ops_.insert(op);
       }
     }
   }
diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc
index 6138e64e2db376a45ede4f13b3fb9df7d6e03461..0b738c1fb861432368c3d6ba71773db3e20c7a02 100644
--- a/paddle/fluid/inference/lite/tensor_utils.cc
+++ b/paddle/fluid/inference/lite/tensor_utils.cc
@@ -14,8 +14,10 @@
 
 #include "paddle/fluid/inference/lite/tensor_utils.h"
 #include <map>
+#include <memory>
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/inference/lite/engine.h"
+#include "paddle/fluid/memory/allocation/allocator.h"
 
 namespace paddle {
 namespace inference {
@@ -46,6 +48,9 @@ platform::Place GetNativePlace(const TargetType& type, int id = 0) {
       return platform::CPUPlace();
     case TargetType::kCUDA:
       return platform::CUDAPlace(id);
+    case TargetType::kXPU:
+      LOG(ERROR) << "No corresponding device for XPU yet.";
+      return platform::Place();
     default:
       LOG(FATAL) << "Error target type.";
       return platform::Place();
@@ -181,6 +186,31 @@ void TensorCopyAsync(framework::LoDTensor* dst, const paddle::lite::Tensor& src,
   VLOG(3) << "[Lite memory size] Bytes = " << src.memory_size();
 }
 
+template <>
+void TensorDataShare(paddle::lite::Tensor* dst, framework::LoDTensor* src) {
+  const size_t bytes =
+      static_cast<size_t>(src->numel()) * framework::SizeOfType(src->type());
+  auto buf = std::make_shared<paddle::lite::Buffer>(paddle::lite::Buffer(
+      src->data<void>(), GetLiteTargetType(src->place()), src->memory_size()));
+  dst->Resize(framework::vectorize(src->dims()));
+  dst->set_precision(GetLitePrecisionType(src->type()));
+  SetLoD(dst->mutable_lod(), src->lod());
+  dst->ResetBuffer(buf, bytes);
+}
+
+template <>
+void TensorDataShare(framework::LoDTensor* dst, paddle::lite::Tensor* src) {
+  constexpr framework::proto::VarType::Type dtype =
+      framework::proto::VarType_Type_FP32;
+  void* src_raw_data = src->raw_data();
+  std::shared_ptr<memory::allocation::Allocation> holder(
+      new memory::allocation::Allocation(src_raw_data, src->memory_size(),
+                                         GetNativePlace(src->target())));
+  dst->Resize(paddle::framework::make_ddim(src->dims().Vectorize()));
+  SetLoD(dst->mutable_lod(), src->lod());
+  dst->ResetHolderWithType(holder, dtype);
+}
+
 }  // namespace utils
 }  // namespace lite
 }  // namespace inference
diff --git a/paddle/fluid/inference/lite/tensor_utils.h b/paddle/fluid/inference/lite/tensor_utils.h
index 95fe8ae903ff66ef688b329a947dbc43c71e4fe8..1b2923bc28033934f5304a48c6a90f158a81a12e 100644
--- a/paddle/fluid/inference/lite/tensor_utils.h
+++ b/paddle/fluid/inference/lite/tensor_utils.h
@@ -14,9 +14,8 @@
 
 #pragma once
 
-#include "lite/api/paddle_place.h"
-#include "lite/core/tensor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/inference/lite/engine.h"
 
 namespace paddle {
 namespace inference {
@@ -27,6 +26,21 @@ template <typename DstTensor, typename SrcTensor>
 void TensorCopyAsync(DstTensor* dst, const SrcTensor& src,
                      const platform::DeviceContext& ctx);
 
+template <typename DstTensor, typename SrcTensor>
+void TensorDataShare(DstTensor* dst, SrcTensor* src);
+
+template <typename DstTensor, typename SrcTensor>
+void TensorCopy(DstTensor* dst, SrcTensor* src,
+                const platform::DeviceContext& ctx, bool shared = true) {
+  if (shared) {
+    VLOG(3) << "TensorDataShare is running";
+    TensorDataShare(dst, src);
+  } else {
+    VLOG(3) << "TensorCopyAsync is running";
+    TensorCopyAsync(dst, *src, ctx);
+  }
+}
+
 }  // namespace utils
 }  // namespace lite
 }  // namespace inference
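The `shared` flag introduced here chooses between wrapping and copying. A short illustrative sketch of the two modes follows (the function below is not part of the patch, and the shape is arbitrary): with `shared == true` the Lite tensor only borrows the fluid buffer, so the fluid tensor must outlive every use of it; with `shared == false` the data goes through `TensorCopyAsync` on the given device context.

```cpp
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/lite/tensor_utils.h"
#include "paddle/fluid/platform/device_context.h"

// Illustrative sketch of the two TensorCopy modes (not part of the patch).
void DemoTensorCopyModes() {
  namespace utils = paddle::inference::lite::utils;
  auto* ctx = paddle::platform::DeviceContextPool::Instance().Get(
      paddle::platform::CPUPlace());

  paddle::framework::LoDTensor fluid_t;
  fluid_t.Resize({4, 1});
  fluid_t.mutable_data<float>(paddle::platform::CPUPlace());

  paddle::lite::Tensor lite_t;
  // shared == true: lite_t wraps fluid_t's buffer, no copy is made, and
  // fluid_t must stay alive while lite_t is in use.
  utils::TensorCopy(&lite_t, &fluid_t, *ctx, /*shared=*/true);
  // shared == false: the data is copied asynchronously on ctx instead.
  utils::TensorCopy(&lite_t, &fluid_t, *ctx, /*shared=*/false);
}
```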
diff --git a/paddle/fluid/inference/lite/test_tensor_utils.cc b/paddle/fluid/inference/lite/test_tensor_utils.cc
index 48ae1bd71d8a4363c7b0f5af9222e92bcd7a3b1c..eef7bfb68fe06537d09f3f3e7e5c35283d4739ef 100644
--- a/paddle/fluid/inference/lite/test_tensor_utils.cc
+++ b/paddle/fluid/inference/lite/test_tensor_utils.cc
@@ -30,7 +30,7 @@ TEST(LiteEngineOp, GetNativePlace) {
   platform::Place GetNativePlace(const TargetType& type, int id = 0);
   EXPECT_TRUE(platform::is_cpu_place(GetNativePlace(TargetType::kHost)));
   EXPECT_TRUE(platform::is_gpu_place(GetNativePlace(TargetType::kCUDA)));
-  ASSERT_DEATH(GetNativePlace(TargetType::kUnk), "");
+  EXPECT_ANY_THROW(GetNativePlace(TargetType::kUnk));
 }
 
 TEST(LiteEngineOp, GetLiteTargetType) {
@@ -48,8 +48,8 @@ TEST(LiteEngineOp, GetLitePrecisionType) {
             PrecisionType::kInt8);
   ASSERT_EQ(GetLitePrecisionType(framework::proto::VarType_Type_INT32),
             PrecisionType::kInt32);
-  ASSERT_DEATH(
-      GetLitePrecisionType(framework::proto::VarType_Type_SELECTED_ROWS), "");
+  EXPECT_ANY_THROW(
+      GetLitePrecisionType(framework::proto::VarType_Type_SELECTED_ROWS));
 }
 
 TEST(LiteEngineOp, GetNativePrecisionType) {
@@ -62,7 +62,7 @@ TEST(LiteEngineOp, GetNativePrecisionType) {
             framework::proto::VarType_Type_INT8);
   ASSERT_EQ(GetNativePrecisionType(PrecisionType::kInt32),
             framework::proto::VarType_Type_INT32);
-  ASSERT_DEATH(GetNativePrecisionType(PrecisionType::kUnk), "");
+  EXPECT_ANY_THROW(GetNativePrecisionType(PrecisionType::kUnk));
 }
 
 TEST(LiteEngineOp, GetNativeLayoutType) {
@@ -70,14 +70,14 @@ TEST(LiteEngineOp, GetNativeLayoutType) {
   framework::DataLayout GetNativeLayoutType(const DataLayoutType& type);
   ASSERT_EQ(GetNativeLayoutType(DataLayoutType::kNCHW),
             framework::DataLayout::kNCHW);
-  ASSERT_DEATH(GetNativeLayoutType(DataLayoutType::kNHWC), "");
+  EXPECT_ANY_THROW(GetNativeLayoutType(DataLayoutType::kNHWC));
 }
 
 void test_tensor_copy(const platform::DeviceContext& ctx) {
   // Create LoDTensor.
   std::vector<float> vector({1, 2, 3, 4});
   framework::LoDTensor lod_tensor;
-  framework::TensorFromVector(vector, &lod_tensor);
+  framework::TensorFromVector(vector, ctx, &lod_tensor);
   framework::LoD lod({{0, 2, 4}});
   lod_tensor.Resize({4, 1});
   lod_tensor.set_lod(lod);
@@ -94,7 +94,26 @@ void test_tensor_copy(const platform::DeviceContext& ctx) {
   }
 #endif
   std::vector<float> result;
-  TensorToVector(lod_tensor_n, &result);
+  TensorToVector(lod_tensor_n, ctx, &result);
+  ASSERT_EQ(result, vector);
+  ASSERT_EQ(lod_tensor_n.lod(), lod_tensor.lod());
+}
+
+void test_tensor_share(const platform::DeviceContext& ctx) {
+  std::vector<float> vector({1, 2, 3, 4});
+  framework::LoDTensor lod_tensor;
+  framework::TensorFromVector(vector, ctx, &lod_tensor);
+  framework::LoD lod({{0, 2, 4}});
+  lod_tensor.Resize({4, 1});
+  lod_tensor.set_lod(lod);
+  // Create lite::Tensor and share.
+  paddle::lite::Tensor lite_tensor;
+  TensorDataShare(&lite_tensor, &lod_tensor);
+  // Copy to LoDTensor.
+  framework::LoDTensor lod_tensor_n;
+  TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx);
+  std::vector<float> result;
+  TensorToVector(lod_tensor_n, ctx, &result);
   ASSERT_EQ(result, vector);
   ASSERT_EQ(lod_tensor_n.lod(), lod_tensor.lod());
 }
@@ -110,6 +129,17 @@ TEST(LiteEngineOp, TensorCopyAsync) {
 #endif
 }
 
+TEST(LiteEngineOp, TensorShare) {
+  auto* ctx_cpu =
+      platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
+  test_tensor_share(*ctx_cpu);
+#ifdef PADDLE_WITH_CUDA
+  auto* ctx_gpu =
+      platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0));
+  test_tensor_share(*ctx_gpu);
+#endif
+}
+
 }  // namespace utils
 }  // namespace lite
 }  // namespace inference
diff --git a/paddle/fluid/operators/lite/lite_engine_op.h b/paddle/fluid/operators/lite/lite_engine_op.h
index 3b48615338f729a56db133a2072ceea5e8e94b22..a920bf7c3f505b839f8f1fd252c9f8505393f3a9 100644
--- a/paddle/fluid/operators/lite/lite_engine_op.h
+++ b/paddle/fluid/operators/lite/lite_engine_op.h
@@ -42,6 +42,7 @@ class LiteEngineOp : public framework::OperatorBase {
   paddle::lite::Predictor *engine_;
   framework::proto::VarType::Type precision_;
   bool use_gpu_;
+  bool zero_copy_;
 
  public:
   LiteEngineOp(const std::string &type,
@@ -60,6 +61,7 @@ class LiteEngineOp : public framework::OperatorBase {
       precision_ = framework::proto::VarType_Type_FP32;
     }
     use_gpu_ = Attr<bool>("use_gpu");
+    zero_copy_ = Attr<bool>("zero_copy");
   }
 
  protected:
@@ -73,13 +75,13 @@ class LiteEngineOp : public framework::OperatorBase {
     const platform::DeviceContext *ctx =
         platform::DeviceContextPool::Instance().Get(dev_place);
     for (size_t i = 0; i < in_names_.size(); i++) {
-      const framework::LoDTensor &src_t =
+      framework::LoDTensor src_t =
           inference::analysis::GetFromScope<framework::LoDTensor>(scope, in_names_[i]);
       paddle::lite::Tensor *dst_t = engine_->GetInput(i);
-      VLOG(3) << "[Copy] fluid -> lite (" << in_names_[i] << " -> "
+      VLOG(3) << "== fluid -> lite (" << in_names_[i] << " -> "
               << engine_->GetInputNames()[i] << ")";
-      inference::lite::utils::TensorCopyAsync(dst_t, src_t, *ctx);
+      inference::lite::utils::TensorCopy(dst_t, &src_t, *ctx, zero_copy_);
     }
 
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(dev_place)) {
@@ -91,13 +93,13 @@ class LiteEngineOp : public framework::OperatorBase {
     engine_->Run();
     VLOG(3) << "lite engine run done";
     for (size_t i = 0; i < out_names_.size(); i++) {
-      const paddle::lite::Tensor &src_t = *(engine_->GetOutput(i));
+      paddle::lite::Tensor src_t = *(engine_->GetOutput(i));
      framework::LoDTensor *dst_t = &inference::analysis::GetFromScope<framework::LoDTensor>(
           scope, out_names_[i]);
-      VLOG(3) << "[Copy] lite -> fluid (" << out_names_[i] << " -> "
+      VLOG(3) << "== lite -> fluid (" << out_names_[i] << " -> "
              << engine_->GetOutputNames()[i] << ")";
-      inference::lite::utils::TensorCopyAsync(dst_t, src_t, *ctx);
+      inference::lite::utils::TensorCopy(dst_t, &src_t, *ctx, zero_copy_);
     }
 
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(dev_place)) {
diff --git a/paddle/fluid/operators/lite/lite_engine_op_test.cc b/paddle/fluid/operators/lite/lite_engine_op_test.cc
index 3812911e915bc8ad03fd6f1c4ecaeda69b33971b..fb5c0dcb3514de815b97944d0fdbf3bd7853b628 100644
--- a/paddle/fluid/operators/lite/lite_engine_op_test.cc
+++ b/paddle/fluid/operators/lite/lite_engine_op_test.cc
@@ -100,6 +100,7 @@ TEST(LiteEngineOp, engine_op) {
   engine_op_desc.SetAttr("engine_key", engine_key);
   engine_op_desc.SetAttr("enable_int8", false);
   engine_op_desc.SetAttr("use_gpu", true);
+  engine_op_desc.SetAttr("zero_copy", true);
   engine_op_desc.SetBlockAttr("sub_block", &block_desc);
   inference::Singleton<inference::lite::EngineManager>::Global().Create(
       engine_key, config);
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index e4927977aa3bd9d848160b002eecb579d34bb658..a7d5b36bfc8d20e2fbbb34c15465d14668b65f95 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -425,6 +425,7 @@ void BindAnalysisConfig(py::module *m) {
            py::arg("disable_trt_plugin_fp16") = false)
       .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
       .def("enable_lite_engine", &AnalysisConfig::EnableLiteEngine,
            py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
+           py::arg("zero_copy") = false,
           py::arg("passes_filter") = std::vector<std::string>(),
           py::arg("ops_filter") = std::vector<std::string>())
diff --git a/python/setup.py.in b/python/setup.py.in
index 7370c38ecfbf0f7c910bd2c2684f71b507e76c81..5658638854fbff99670a62172407f092d6f02718 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -258,6 +258,10 @@ else:
         shutil.copy('${OPENBLAS_SHARED_LIB}', libs_path)
         package_data['paddle.libs'] += ['openblas' + ext_name]
 
+if '${WITH_LITE}' == 'ON':
+    shutil.copy('${LITE_SHARED_LIB}', libs_path)
+    package_data['paddle.libs'] += ['libpaddle_full_api_shared' + ext_name]
+
 if '${WITH_PSLIB}' == 'ON':
     shutil.copy('${PSLIB_LIB}', libs_path)
     if os.path.exists('${PSLIB_VERSION_PY}'):
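For completeness, a hypothetical end-to-end run that exercises the configuration surface added in this patch; the model paths, the feed name `"image"`, and the input shape are placeholders that depend on the model, and the block repeats the configuration from the earlier sketch so it stands alone.

```cpp
#include <vector>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("/path/to/__model__", "/path/to/__params__");  // placeholders
  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32,
                          /*zero_copy=*/true);
  config.EnableXpu();  // defaults to a 0xfffc00-byte per-thread L3 workspace

  auto predictor = paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(config);

  // Placeholder 1 x 3 x 224 x 224 float input.
  std::vector<float> input(1 * 3 * 224 * 224, 0.f);
  paddle::PaddleTensor in;
  in.name = "image";
  in.shape = {1, 3, 224, 224};
  in.dtype = paddle::PaddleDType::FLOAT32;
  in.data = paddle::PaddleBuf(input.data(), input.size() * sizeof(float));

  std::vector<paddle::PaddleTensor> outputs;
  predictor->Run({in}, &outputs);
  return 0;
}
```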