Unverified commit 45fa6861, authored by 石晓伟, committed by GitHub

Cherry-pick of lite engine, test=release/1.8 (#25817)

* ignore warnings of external libraries, test=develop (#24193)

* fix repeat definitions in liengine.cc, test=develop (#25020)

* remove paddle_use_kernel and paddle_use_op. test=develop (#25189)

* fix compile for lite subgraph. test=develop (#25285)

* [CI] [Lite-Subgraph] CI add lite subgraph check. (#25346)

* supports xpu runtime, test=develop (#25554)

* fix cmake of lite, test=develop (#25680)

* change commit files, test=release/1.8
Co-authored-by: Wilber <jiweibo@baidu.com>
Parent: 01fc84a1
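For orientation, here is a minimal sketch of how the options surfaced by this change (EnableXpu and the new zero_copy argument of EnableLiteEngine) might be used from the C++ inference API. It is not part of the commit; the model path, include path, and return-value handling are illustrative assumptions.

// Minimal usage sketch; the model directory and include path are placeholders.
#include "paddle/include/paddle_inference_api.h"

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./mobilenet_v1");  // hypothetical model directory
  // New in this change: the second argument enables zero-copy tensor
  // sharing between fluid and the Lite engine.
  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32,
                          /*zero_copy=*/true);
  // Route Lite subgraphs to an XPU device; 0xfffc00 bytes is the default
  // L3 workspace size declared in paddle_analysis_config.h below.
  config.EnableXpu(/*l3_workspace_size=*/0xfffc00);
  auto predictor = paddle::CreatePaddlePredictor(config);
  return predictor != nullptr ? 0 : 1;
}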
@@ -154,6 +154,9 @@ if(WITH_BRPC_RDMA)
   endif()
 endif()
+# lite subgraph compilation depends on CUDNN_ROOT,
+# so include(cudnn) needs to be in front of include(third_party/lite)
+include(cudnn)       # set cudnn libraries, must before configure
 include(third_party) # download, build, install third_party
 if(WITH_DISTRIBUTE)
@@ -173,7 +176,6 @@ if(NOT WIN32)
 endif()
 include(flags)       # set paddle compile flags
-include(cudnn)       # set cudnn libraries, must before configure
 if(WITH_GPU)
   include(cuda)
......
@@ -18,6 +18,15 @@ if(NOT LINUX OR NOT WITH_MKL)
   return()
 endif()
+if(XPU_SDK_ROOT)
+  set(LITE_WITH_XPU ON)
+  include_directories("${XPU_SDK_ROOT}/XTDK/include")
+  include_directories("${XPU_SDK_ROOT}/XTCL/include")
+  add_definitions(-DPADDLE_WITH_XPU)
+  LINK_DIRECTORIES("${XPU_SDK_ROOT}/XTDK/shlib/")
+  LINK_DIRECTORIES("${XPU_SDK_ROOT}/XTDK/runtime/shlib/")
+endif()
 if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   include(ExternalProject)
   set(LITE_PROJECT extern_lite)
@@ -25,7 +34,11 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite)
   if(NOT LITE_GIT_TAG)
-    set(LITE_GIT_TAG 34c29406c27ee00cef033a98887403443eb2565f)
+    set(LITE_GIT_TAG 42ab4d559f6659edfc35040fb30fdcec3dc3f8aa)
+  endif()
+  if(NOT CUDA_ARCH_NAME)
+    set(CUDA_ARCH_NAME "Auto")
   endif()
   # No quotes, so cmake can resolve it as a command with arguments.
@@ -43,6 +56,8 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
                      -DCUDNN_ROOT=${CUDNN_ROOT}
                      -DLITE_WITH_STATIC_CUDA=OFF
                      -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME}
+                     -DLITE_WITH_XPU=${LITE_WITH_XPU}
+                     -DXPU_SDK_ROOT=${XPU_SDK_ROOT}
                      -DLITE_WITH_ARM=OFF)
   ExternalProject_Add(
@@ -79,7 +94,7 @@ message(STATUS "Paddle-lite SOURCE_DIR: ${LITE_SOURCE_DIR}")
 include_directories(${LITE_SOURCE_DIR})
 include_directories(${LITE_BINARY_DIR})
-function(external_lite_static_libs alias path)
+function(external_lite_libs alias path)
   add_library(${alias} SHARED IMPORTED GLOBAL)
   SET_PROPERTY(TARGET ${alias} PROPERTY IMPORTED_LOCATION
                ${path})
@@ -88,7 +103,8 @@ function(external_lite_static_libs alias path)
   endif()
 endfunction()
-external_lite_static_libs(lite_full_static ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so)
+external_lite_libs(lite_full_static ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so)
+set(LITE_SHARED_LIB ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so)
 add_definitions(-DPADDLE_WITH_LITE)
 add_definitions(-DLITE_WITH_LOG)
@@ -200,6 +200,10 @@ struct Argument {
   DECL_ARGUMENT_FIELD(lite_ops_filter, LiteOpsFilter, std::vector<std::string>);
   DECL_ARGUMENT_FIELD(lite_precision_mode, LitePrecisionMode,
                       AnalysisConfig::Precision);
+  DECL_ARGUMENT_FIELD(lite_zero_copy, LiteZeroCopy, bool);
+  DECL_ARGUMENT_FIELD(use_xpu, UseXpu, bool);
+  DECL_ARGUMENT_FIELD(xpu_l3_workspace_size, XpuL3WorkspaceSize, int);
   // Memory optimized related.
   DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
......
@@ -146,6 +146,10 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("predictor_id", new int(argument->predictor_id()));
       pass->Set("enable_int8", new bool(enable_int8));
       pass->Set("use_gpu", new bool(argument->use_gpu()));
+      pass->Set("zero_copy", new bool(argument->lite_zero_copy()));
+      pass->Set("use_xpu", new bool(argument->use_xpu()));
+      pass->Set("xpu_l3_workspace_size",
+                new int(argument->xpu_l3_workspace_size()));
     }
     disable_logs_ = argument->disable_logs();
     if (pass_name == "fc_fuse_pass") {
......
@@ -242,16 +242,33 @@ void LiteSubgraphPass::SetUpEngine(
   bool use_gpu = Get<bool>("use_gpu");
   bool enable_int8 = Get<bool>("enable_int8");
-  lite_api::TargetType target_type = use_gpu ? TARGET(kCUDA) : TARGET(kX86);
+  bool use_xpu = Get<bool>("use_xpu");
+  int xpu_l3_workspace_size = Get<int>("xpu_l3_workspace_size");
+  lite_api::TargetType target_type;
+  if (use_gpu) {
+    target_type = TARGET(kCUDA);
+  } else if (use_xpu) {
+    target_type = TARGET(kXPU);
+  } else {
+    target_type = TARGET(kX86);
+  }
   paddle::lite_api::PrecisionType precision_type =
-      enable_int8 ? PRECISION(kInt8) : PRECISION(kInt64);
+      enable_int8 ? PRECISION(kInt8) : PRECISION(kFloat);
   serialize_params(&config.param, scope, repetitive_params);
   config.model = program->Proto()->SerializeAsString();
   config.valid_places = {
+      // Notice: The ordering here determines the device where the
+      // input tensor of the Lite engine is located, and then affects
+      // whether tensor sharing is feasible.
       paddle::lite::Place({target_type, precision_type}),
+      paddle::lite::Place({target_type, PRECISION(kInt64)}),
       paddle::lite::Place({target_type, PRECISION(kFloat)}),
       paddle::lite::Place({TARGET(kHost), PRECISION(kFloat)}),
   };
+  config.xpu_l3_workspace_size = xpu_l3_workspace_size;
   if (dump_model) {
     lite::StrToBinaryFile("./model.bin", config.model);
     lite::StrToBinaryFile("./param.bin", config.param);
@@ -283,6 +300,7 @@ void LiteSubgraphPass::BuildOperator(
   op_desc->SetAttr("engine_key", unique_key);
   op_desc->SetAttr("enable_int8", Get<bool>("enable_int8"));
   op_desc->SetAttr("use_gpu", Get<bool>("use_gpu"));
+  op_desc->SetAttr("zero_copy", Get<bool>("zero_copy"));
 }
 void LiteSubgraphPass::ApplyImpl(framework::ir::Graph* graph) const {
......
@@ -88,6 +88,12 @@ void AnalysisConfig::DisableFCPadding() {
   Update();
 }
+void AnalysisConfig::EnableXpu(int l3_workspace_size) {
+  use_xpu_ = true;
+  xpu_l3_workspace_size_ = l3_workspace_size;
+  Update();
+}
 AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
 #define CP_MEMBER(member__) member__ = other.member__;
@@ -132,6 +138,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(lite_precision_mode_);
   CP_MEMBER(lite_passes_filter_);
   CP_MEMBER(lite_ops_filter_);
+  CP_MEMBER(lite_zero_copy_);
+  CP_MEMBER(use_xpu_);
+  CP_MEMBER(xpu_l3_workspace_size_);
   // profile related.
   CP_MEMBER(with_profile_);
@@ -342,6 +352,22 @@ void AnalysisConfig::Update() {
     }
   }
+  if (use_xpu_) {
+#ifndef PADDLE_WITH_XPU
+    PADDLE_THROW(platform::errors::Unavailable(
+        "You tried to use an XPU device, but Paddle was not compiled "
+        "with XPU-runtime."));
+#endif
+    if (!use_lite_) {
+      LOG(WARNING) << "Because XPU currently only works in Paddle-Lite "
+                      "subgraph mode, please make sure you have enabled it.";
+    }
+    PADDLE_ENFORCE_EQ(use_gpu_, false,
+                      platform::errors::Unavailable(
+                          "Currently, XPU and GPU cannot be enabled in the "
+                          "same analysis configuration."));
+  }
   if (ir_debug_) {
     pass_builder()->TurnOnDebug();
   }
@@ -385,6 +411,8 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << cpu_math_library_num_threads_;
   ss << use_lite_;
+  ss << use_xpu_;
+  ss << xpu_l3_workspace_size_;
   return ss.str();
 }
@@ -460,13 +488,14 @@ void AnalysisConfig::DisableGlogInfo() {
 }
 void AnalysisConfig::EnableLiteEngine(
-    AnalysisConfig::Precision precision_mode,
+    AnalysisConfig::Precision precision_mode, bool zero_copy,
     const std::vector<std::string> &passes_filter,
     const std::vector<std::string> &ops_filter) {
   use_lite_ = true;
   lite_precision_mode_ = precision_mode;
   lite_passes_filter_ = passes_filter;
   lite_ops_filter_ = ops_filter;
+  lite_zero_copy_ = zero_copy;
   Update();
 }
......
@@ -447,6 +447,9 @@ void AnalysisPredictor::PrepareArgument() {
     argument_.SetLitePrecisionMode(config_.lite_precision_mode_);
     argument_.SetLitePassesFilter(config_.lite_passes_filter_);
     argument_.SetLiteOpsFilter(config_.lite_ops_filter_);
+    argument_.SetLiteZeroCopy(config_.lite_zero_copy_);
+    argument_.SetUseXpu(config_.use_xpu_);
+    argument_.SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_);
     LOG(INFO) << "Lite subgraph engine is enabled";
   }
......
@@ -176,6 +176,8 @@ struct AnalysisConfig {
   ///
   ///
   void DisableGpu();
+  void EnableXpu(int l3_workspace_size = 0xfffc00);
   ///
   /// \brief A boolean state telling whether the GPU is turned on.
   ///
@@ -319,6 +321,7 @@ struct AnalysisConfig {
   ///
   void EnableLiteEngine(
       AnalysisConfig::Precision precision_mode = Precision::kFloat32,
+      bool zero_copy = false,
       const std::vector<std::string>& passes_filter = {},
       const std::vector<std::string>& ops_filter = {});
@@ -562,6 +565,11 @@ struct AnalysisConfig {
   std::vector<std::string> lite_passes_filter_;
   std::vector<std::string> lite_ops_filter_;
   Precision lite_precision_mode_;
+  bool lite_zero_copy_;
+  bool thread_local_stream_{false};
+  bool use_xpu_{false};
+  int xpu_l3_workspace_size_;
   // mkldnn related.
   int mkldnn_cache_capacity_{0};
......
+if(XPU_SDK_ROOT)
+  set(XPU_DEPS xpuapi xpurt)
+endif()
 cc_library(lite_op_teller SRCS op_teller.cc DEPS lite_full_static framework_proto device_context boost xxhash)
-cc_library(lite_engine SRCS engine.cc DEPS lite_full_static framework_proto)
+cc_library(lite_engine SRCS engine.cc DEPS lite_full_static framework_proto ${XPU_DEPS})
-cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost)
+cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost device_context)
 cc_test(test_lite_engine SRCS test_engine.cc DEPS lite_engine protobuf framework_proto glog gtest analysis)
 cc_test(test_lite_tensor_utils SRCS test_tensor_utils.cc DEPS lite_engine lite_tensor_utils)
@@ -16,12 +16,11 @@
 #define LITE_WITH_CUDA 1
 #endif
-#include "paddle/fluid/inference/lite/engine.h"
-#include "lite/core/context.h"
-#include "lite/core/device_info.h"
-#include "lite/api/paddle_use_kernels.h"
-#include "lite/api/paddle_use_ops.h"
+#ifdef PADDLE_WITH_XPU
+#define LITE_WITH_XPU 1
+#endif
+#include "paddle/fluid/inference/lite/engine.h"
 #include "lite/api/paddle_use_passes.h"
 namespace paddle {
@@ -43,10 +42,17 @@ paddle::lite::Predictor* EngineManager::Get(const std::string& name) const {
 paddle::lite::Predictor* EngineManager::Create(const std::string& name,
                                                const EngineConfig& cfg) {
-  auto* p = new paddle::lite::Predictor();
+  if (cfg.valid_places.front().target == TARGET(kCUDA)) {
 #ifdef PADDLE_WITH_CUDA
   paddle::lite::Env<TARGET(kCUDA)>::Init();
 #endif
+  } else if (cfg.valid_places.front().target == TARGET(kXPU)) {
+#ifdef PADDLE_WITH_XPU
+    paddle::lite::TargetWrapper<TARGET(kXPU)>::workspace_l3_size_per_thread =
+        cfg.xpu_l3_workspace_size;
+#endif
+  }
+  auto* p = new paddle::lite::Predictor();
   p->Build("", cfg.model, cfg.param, cfg.valid_places, cfg.neglected_passes,
            cfg.model_type, cfg.model_from_memory);
   engines_[name].reset(p);
......
@@ -20,7 +20,16 @@
 #include <unordered_map>
 #include <vector>
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wall"
 #include "lite/api/cxx_api.h"
+#include "lite/api/paddle_place.h"
+#include "lite/core/context.h"
+#include "lite/core/device_info.h"
+#include "lite/core/memory.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/tensor.h"
+#pragma GCC diagnostic pop
 namespace paddle {
 namespace inference {
@@ -34,6 +43,7 @@ struct EngineConfig {
   std::vector<std::string> neglected_passes;
   lite_api::LiteModelType model_type{lite_api::LiteModelType::kProtobuf};
   bool model_from_memory{true};
+  size_t xpu_l3_workspace_size;
 };
 class EngineManager {
......
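To make the new EngineConfig field concrete, here is a small sketch of how a config for an XPU target could be assembled and handed to EngineManager::Create, mirroring what lite_subgraph_pass.cc does earlier in this diff. The engine key, the workspace size, and the singleton header path are assumptions, and config.model/config.param are assumed to be filled in elsewhere.

// Sketch only: field names come from the diff above; "my_engine", the
// workspace size, and the singleton header path are assumptions.
#include "paddle/fluid/inference/lite/engine.h"
#include "paddle/fluid/inference/utils/singleton.h"

paddle::lite::Predictor* CreateXpuEngine(
    paddle::inference::lite::EngineConfig config) {
  // The first valid place decides where the engine's input tensors live.
  config.valid_places = {
      paddle::lite::Place({TARGET(kXPU), PRECISION(kFloat)}),
      paddle::lite::Place({TARGET(kHost), PRECISION(kFloat)}),
  };
  config.xpu_l3_workspace_size = 0xfffc00;  // bytes of L3 reserved per thread
  return paddle::inference::Singleton<
             paddle::inference::lite::EngineManager>::Global()
      .Create("my_engine", config);
}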
@@ -16,10 +16,9 @@
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/lite/engine.h"
 #include "paddle/fluid/inference/lite/op_teller.h"
-#include "lite/core/op_registry.h"
 namespace paddle {
 namespace inference {
 namespace lite {
@@ -27,15 +26,14 @@ namespace lite {
 // Just tell by the op_types.
 struct SimpleOpTeller : public Teller {
   SimpleOpTeller() {
-    const std::map<std::string, std::string>& op2path =
-        paddle::lite::GetOp2PathDict();
+    std::vector<std::string> lite_ops = paddle::lite::GetAllOps();
     auto is_non_inst = [](const std::string& op) -> bool {
       const std::vector<std::string> ops = {"feed", "fetch", "while"};
       return std::find(ops.begin(), ops.end(), op) != ops.end();
     };
-    for (const auto& op : op2path) {
-      if (!is_non_inst(op.first)) {
-        ops_.insert(op.first);
+    for (const auto& op : lite_ops) {
+      if (!is_non_inst(op)) {
+        ops_.insert(op);
       }
     }
   }
......
@@ -14,8 +14,10 @@
 #include "paddle/fluid/inference/lite/tensor_utils.h"
 #include <map>
+#include <memory>
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/inference/lite/engine.h"
+#include "paddle/fluid/memory/allocation/allocator.h"
 namespace paddle {
 namespace inference {
@@ -46,6 +48,9 @@ platform::Place GetNativePlace(const TargetType& type, int id = 0) {
       return platform::CPUPlace();
     case TargetType::kCUDA:
       return platform::CUDAPlace(id);
+    case TargetType::kXPU:
+      LOG(ERROR) << "No corresponding device for XPU yet.";
+      return platform::Place();
     default:
       LOG(FATAL) << "Error target type.";
       return platform::Place();
@@ -181,6 +186,31 @@ void TensorCopyAsync(framework::LoDTensor* dst, const paddle::lite::Tensor& src,
   VLOG(3) << "[Lite memory size] Bytes = " << src.memory_size();
 }
+template <>
+void TensorDataShare(paddle::lite::Tensor* dst, framework::LoDTensor* src) {
+  const size_t bytes =
+      static_cast<size_t>(src->numel()) * framework::SizeOfType(src->type());
+  auto buf = std::make_shared<paddle::lite::Buffer>(paddle::lite::Buffer(
+      src->data<void>(), GetLiteTargetType(src->place()), src->memory_size()));
+  dst->Resize(framework::vectorize(src->dims()));
+  dst->set_precision(GetLitePrecisionType(src->type()));
+  SetLoD(dst->mutable_lod(), src->lod());
+  dst->ResetBuffer(buf, bytes);
+}
+
+template <>
+void TensorDataShare(framework::LoDTensor* dst, paddle::lite::Tensor* src) {
+  constexpr framework::proto::VarType::Type dtype =
+      framework::proto::VarType_Type_FP32;
+  void* src_raw_data = src->raw_data();
+  std::shared_ptr<memory::allocation::Allocation> holder(
+      new memory::allocation::Allocation(src_raw_data, src->memory_size(),
+                                         GetNativePlace(src->target())));
+  dst->Resize(paddle::framework::make_ddim(src->dims().Vectorize()));
+  SetLoD(dst->mutable_lod(), src->lod());
+  dst->ResetHolderWithType(holder, dtype);
+}
 } // namespace utils
 } // namespace lite
 } // namespace inference
......
@@ -14,9 +14,8 @@
 #pragma once
-#include "lite/api/paddle_place.h"
-#include "lite/core/tensor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/inference/lite/engine.h"
 namespace paddle {
 namespace inference {
@@ -27,6 +26,21 @@ template <typename DstTensor, typename SrcTensor>
 void TensorCopyAsync(DstTensor* dst, const SrcTensor& src,
                      const platform::DeviceContext& ctx);
+template <typename DstTensor, typename SrcTensor>
+void TensorDataShare(DstTensor* dst, SrcTensor* src);
+
+template <typename DstTensor, typename SrcTensor>
+void TensorCopy(DstTensor* dst, SrcTensor* src,
+                const platform::DeviceContext& ctx, bool shared = true) {
+  if (shared) {
+    VLOG(3) << "TensorDataShare is running";
+    TensorDataShare(dst, src);
+  } else {
+    VLOG(3) << "TensorCopyAsync is running";
+    TensorCopyAsync(dst, *src, ctx);
+  }
+}
 } // namespace utils
 } // namespace lite
 } // namespace inference
......
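The TensorCopy helper added above is what the lite engine op (further down in this diff) calls for both inputs and outputs. Below is a small sketch of the dispatch; the wrapper function name FeedInput is an illustrative assumption, not part of the commit.

// Sketch: TensorCopy (added above) either shares the buffer or copies,
// depending on the zero_copy flag; FeedInput is a hypothetical wrapper.
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/lite/tensor_utils.h"
#include "paddle/fluid/platform/device_context.h"

void FeedInput(paddle::lite::Tensor* dst, paddle::framework::LoDTensor* src,
               const paddle::platform::DeviceContext& ctx, bool zero_copy) {
  // zero_copy == true  -> TensorDataShare: dst reuses src's buffer, no copy.
  // zero_copy == false -> TensorCopyAsync: data is copied through ctx.
  paddle::inference::lite::utils::TensorCopy(dst, src, ctx, zero_copy);
}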
@@ -30,7 +30,7 @@ TEST(LiteEngineOp, GetNativePlace) {
   platform::Place GetNativePlace(const TargetType& type, int id = 0);
   EXPECT_TRUE(platform::is_cpu_place(GetNativePlace(TargetType::kHost)));
   EXPECT_TRUE(platform::is_gpu_place(GetNativePlace(TargetType::kCUDA)));
-  ASSERT_DEATH(GetNativePlace(TargetType::kUnk), "");
+  EXPECT_ANY_THROW(GetNativePlace(TargetType::kUnk));
 }
 TEST(LiteEngineOp, GetLiteTargetType) {
@@ -48,8 +48,8 @@ TEST(LiteEngineOp, GetLitePrecisionType) {
             PrecisionType::kInt8);
   ASSERT_EQ(GetLitePrecisionType(framework::proto::VarType_Type_INT32),
             PrecisionType::kInt32);
-  ASSERT_DEATH(
-      GetLitePrecisionType(framework::proto::VarType_Type_SELECTED_ROWS), "");
+  EXPECT_ANY_THROW(
+      GetLitePrecisionType(framework::proto::VarType_Type_SELECTED_ROWS));
 }
 TEST(LiteEngineOp, GetNativePrecisionType) {
@@ -62,7 +62,7 @@ TEST(LiteEngineOp, GetNativePrecisionType) {
             framework::proto::VarType_Type_INT8);
   ASSERT_EQ(GetNativePrecisionType(PrecisionType::kInt32),
             framework::proto::VarType_Type_INT32);
-  ASSERT_DEATH(GetNativePrecisionType(PrecisionType::kUnk), "");
+  EXPECT_ANY_THROW(GetNativePrecisionType(PrecisionType::kUnk));
 }
 TEST(LiteEngineOp, GetNativeLayoutType) {
@@ -70,14 +70,14 @@ TEST(LiteEngineOp, GetNativeLayoutType) {
   framework::DataLayout GetNativeLayoutType(const DataLayoutType& type);
   ASSERT_EQ(GetNativeLayoutType(DataLayoutType::kNCHW),
             framework::DataLayout::kNCHW);
-  ASSERT_DEATH(GetNativeLayoutType(DataLayoutType::kNHWC), "");
+  EXPECT_ANY_THROW(GetNativeLayoutType(DataLayoutType::kNHWC));
 }
 void test_tensor_copy(const platform::DeviceContext& ctx) {
   // Create LoDTensor.
   std::vector<float> vector({1, 2, 3, 4});
   framework::LoDTensor lod_tensor;
-  framework::TensorFromVector(vector, &lod_tensor);
+  framework::TensorFromVector(vector, ctx, &lod_tensor);
   framework::LoD lod({{0, 2, 4}});
   lod_tensor.Resize({4, 1});
   lod_tensor.set_lod(lod);
@@ -94,7 +94,26 @@ void test_tensor_copy(const platform::DeviceContext& ctx) {
   }
 #endif
   std::vector<float> result;
-  TensorToVector(lod_tensor_n, &result);
+  TensorToVector(lod_tensor_n, ctx, &result);
   ASSERT_EQ(result, vector);
   ASSERT_EQ(lod_tensor_n.lod(), lod_tensor.lod());
 }
+
+void test_tensor_share(const platform::DeviceContext& ctx) {
+  std::vector<float> vector({1, 2, 3, 4});
+  framework::LoDTensor lod_tensor;
+  framework::TensorFromVector(vector, ctx, &lod_tensor);
+  framework::LoD lod({{0, 2, 4}});
+  lod_tensor.Resize({4, 1});
+  lod_tensor.set_lod(lod);
+  // Create lite::Tensor and share.
+  paddle::lite::Tensor lite_tensor;
+  TensorDataShare(&lite_tensor, &lod_tensor);
+  // Copy to LoDTensor.
+  framework::LoDTensor lod_tensor_n;
+  TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx);
+  std::vector<float> result;
+  TensorToVector(lod_tensor_n, ctx, &result);
+  ASSERT_EQ(result, vector);
+  ASSERT_EQ(lod_tensor_n.lod(), lod_tensor.lod());
+}
@@ -110,6 +129,17 @@ TEST(LiteEngineOp, TensorCopyAsync) {
 #endif
 }
+
+TEST(LiteEngineOp, TensorShare) {
+  auto* ctx_cpu =
+      platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
+  test_tensor_share(*ctx_cpu);
+#ifdef PADDLE_WITH_CUDA
+  auto* ctx_gpu =
+      platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0));
+  test_tensor_share(*ctx_gpu);
+#endif
+}
 } // namespace utils
 } // namespace lite
 } // namespace inference
......
@@ -42,6 +42,7 @@ class LiteEngineOp : public framework::OperatorBase {
   paddle::lite::Predictor *engine_;
   framework::proto::VarType::Type precision_;
   bool use_gpu_;
+  bool zero_copy_;
  public:
   LiteEngineOp(const std::string &type,
@@ -60,6 +61,7 @@ class LiteEngineOp : public framework::OperatorBase {
       precision_ = framework::proto::VarType_Type_FP32;
     }
     use_gpu_ = Attr<bool>("use_gpu");
+    zero_copy_ = Attr<bool>("zero_copy");
   }
  protected:
@@ -73,13 +75,13 @@ class LiteEngineOp : public framework::OperatorBase {
     const platform::DeviceContext *ctx =
         platform::DeviceContextPool::Instance().Get(dev_place);
     for (size_t i = 0; i < in_names_.size(); i++) {
-      const framework::LoDTensor &src_t =
+      framework::LoDTensor src_t =
           inference::analysis::GetFromScope<framework::LoDTensor>(scope,
                                                                   in_names_[i]);
       paddle::lite::Tensor *dst_t = engine_->GetInput(i);
-      VLOG(3) << "[Copy] fluid -> lite (" << in_names_[i] << " -> "
+      VLOG(3) << "== fluid -> lite (" << in_names_[i] << " -> "
              << engine_->GetInputNames()[i] << ")";
-      inference::lite::utils::TensorCopyAsync(dst_t, src_t, *ctx);
+      inference::lite::utils::TensorCopy(dst_t, &src_t, *ctx, zero_copy_);
     }
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(dev_place)) {
@@ -91,13 +93,13 @@ class LiteEngineOp : public framework::OperatorBase {
     engine_->Run();
     VLOG(3) << "lite engine run done";
     for (size_t i = 0; i < out_names_.size(); i++) {
-      const paddle::lite::Tensor &src_t = *(engine_->GetOutput(i));
+      paddle::lite::Tensor src_t = *(engine_->GetOutput(i));
       framework::LoDTensor *dst_t =
           &inference::analysis::GetFromScope<framework::LoDTensor>(
               scope, out_names_[i]);
-      VLOG(3) << "[Copy] lite -> fluid (" << out_names_[i] << " -> "
+      VLOG(3) << "== lite -> fluid (" << out_names_[i] << " -> "
              << engine_->GetOutputNames()[i] << ")";
-      inference::lite::utils::TensorCopyAsync(dst_t, src_t, *ctx);
+      inference::lite::utils::TensorCopy(dst_t, &src_t, *ctx, zero_copy_);
     }
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(dev_place)) {
......
@@ -100,6 +100,7 @@ TEST(LiteEngineOp, engine_op) {
   engine_op_desc.SetAttr("engine_key", engine_key);
   engine_op_desc.SetAttr("enable_int8", false);
   engine_op_desc.SetAttr("use_gpu", true);
+  engine_op_desc.SetAttr("zero_copy", true);
   engine_op_desc.SetBlockAttr("sub_block", &block_desc);
   inference::Singleton<inference::lite::EngineManager>::Global().Create(
       engine_key, config);
......
@@ -425,6 +425,7 @@ void BindAnalysisConfig(py::module *m) {
            py::arg("disable_trt_plugin_fp16") = false)
       .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
       .def("enable_lite_engine", &AnalysisConfig::EnableLiteEngine,
+           py::arg("zero_copy") = false,
            py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
            py::arg("passes_filter") = std::vector<std::string>(),
            py::arg("ops_filter") = std::vector<std::string>())
......
@@ -258,6 +258,10 @@ else:
         shutil.copy('${OPENBLAS_SHARED_LIB}', libs_path)
         package_data['paddle.libs'] += ['openblas' + ext_name]
+if '${WITH_LITE}' == 'ON':
+    shutil.copy('${LITE_SHARED_LIB}', libs_path)
+    package_data['paddle.libs']+=['libpaddle_full_api_shared' + ext_name]
 if '${WITH_PSLIB}' == 'ON':
     shutil.copy('${PSLIB_LIB}', libs_path)
     if os.path.exists('${PSLIB_VERSION_PY}'):
......