Unverified commit 2ac4143b, authored by 石晓伟, committed by GitHub

support xpu with analysis predictor, test=develop (#30832)

* support xpu inference with analysis predictor, test=develop

* merge the cmake of the xpu toolchain, test=develop

* add c-apis, test=develop

* fix a bug in extern_xpu, test=develop
Parent 05d2b7a3
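Editor's note, a usage sketch rather than part of the commit: with a Paddle build configured with -DWITH_XPU=ON, the analysis predictor can now target an XPU through AnalysisConfig::EnableXpu(). The include path and model directory below are placeholders, not files from this repository.

#include <vector>

#include "paddle_inference_api.h"  // placeholder include path; use the header from your Paddle install

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./mul_model");  // placeholder model directory
  config.EnableXpu();              // turn on XPU inference; keeps the default L3 workspace size
  config.SwitchUseFeedFetchOps(false);

  auto predictor = paddle::CreatePaddlePredictor(config);

  // Feed one small float tensor through the zero-copy input API and run once.
  auto input_names = predictor->GetInputNames();
  auto input = predictor->GetInputTensor(input_names[0]);
  std::vector<float> data(2, 1.0f);
  input->Reshape({1, 2});
  input->copy_from_cpu(data.data());
  predictor->ZeroCopyRun();
  return 0;
}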
@@ -5,48 +5,53 @@ endif()
 INCLUDE(ExternalProject)
 SET(XPU_PROJECT "extern_xpu")

+if(NOT XPU_SDK_ROOT)
 if (WITH_AARCH64)
   SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/aarch64/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE)
 elseif(WITH_SUNWAY)
   SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE)
 else()
   SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE)
 endif()

 SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
 SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
 SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
 SET(XPU_API_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include")
 SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib")
 SET(XPU_API_LIB_NAME "libxpuapi.so")
 SET(XPU_RT_LIB_NAME "libxpurt.so")
 SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}")
 SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}")
 SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib")

-INCLUDE_DIRECTORIES(${XPU_API_INC_DIR})
-
 FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt
   "PROJECT(XPU)\n"
   "cmake_minimum_required(VERSION 3.0)\n"
   "install(DIRECTORY xpu/include xpu/lib \n"
   "        DESTINATION ${XPU_INSTALL_DIR})\n")

 ExternalProject_Add(
   ${XPU_PROJECT}
   ${EXTERNAL_PROJECT_LOG_ARGS}
   PREFIX                ${XPU_SOURCE_DIR}
   DOWNLOAD_DIR          ${XPU_DOWNLOAD_DIR}
   DOWNLOAD_COMMAND      wget --no-check-certificate ${XPU_URL} -c -q -O xpu.tar.gz
                         && tar xvf xpu.tar.gz
   DOWNLOAD_NO_PROGRESS  1
   UPDATE_COMMAND        ""
   CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT}
   CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT}
 )
+else()
+  SET(XPU_API_INC_DIR "${XPU_SDK_ROOT}/XTDK/include/")
+  SET(XPU_API_LIB "${XPU_SDK_ROOT}/XTDK/shlib/libxpuapi.so")
+  SET(XPU_RT_LIB "${XPU_SDK_ROOT}/XTDK/runtime/shlib/libxpurt.so")
+endif()
+
+INCLUDE_DIRECTORIES(${XPU_API_INC_DIR})

 ADD_LIBRARY(shared_xpuapi SHARED IMPORTED GLOBAL)
 set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}")
@@ -69,4 +74,14 @@ else(WITH_XPU_BKCL)
   TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} )
 endif(WITH_XPU_BKCL)

-ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})
+if(NOT XPU_SDK_ROOT)
+  ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})
+else()
+  ADD_CUSTOM_TARGET(extern_xpu DEPENDS xpulib)
+endif()
+
+# Ensure that xpu/api.h can be included without dependency errors.
+file(GENERATE OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc CONTENT "")
+add_library(xpu_headers_dummy STATIC ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc)
+add_dependencies(xpu_headers_dummy extern_xpu)
+link_libraries(xpu_headers_dummy)
@@ -33,6 +33,8 @@ PassStrategy *AnalysisConfig::pass_builder() const {
   if (use_gpu_) {
     LOG(INFO) << "Create GPU IR passes";
     pass_builder_.reset(new GpuPassStrategy);
+  } else if (use_xpu_) {
+    pass_builder_.reset(new XpuPassStrategy);
   } else {
     LOG(INFO) << "Create CPU IR passes";
     pass_builder_.reset(new CpuPassStrategy);
@@ -73,7 +75,7 @@ void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
   use_gpu_ = true;
   memory_pool_init_size_mb_ = memory_pool_init_size_mb;
   FLAGS_initial_gpu_memory_in_mb = memory_pool_init_size_mb_;
-  device_id_ = device_id;
+  gpu_device_id_ = device_id;
 #else
   LOG(ERROR) << "Please compile with gpu to EnableGpu()";
   use_gpu_ = false;
@@ -115,7 +117,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   // GPU related.
   CP_MEMBER(use_gpu_);
   CP_MEMBER(use_cudnn_);
-  CP_MEMBER(device_id_);
+  CP_MEMBER(gpu_device_id_);
+  CP_MEMBER(xpu_device_id_);
   CP_MEMBER(memory_pool_init_size_mb_);

   CP_MEMBER(enable_memory_optim_);
@@ -174,8 +177,14 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(thread_local_stream_);

   if (use_gpu_) {
+    PADDLE_ENFORCE_EQ(use_xpu_, false,
+                      platform::errors::InvalidArgument(
+                          "Only one choice can be made between CPU and XPU."));
     pass_builder_.reset(new GpuPassStrategy(
         *static_cast<GpuPassStrategy *>(other.pass_builder())));
+  } else if (use_xpu_) {
+    pass_builder_.reset(new XpuPassStrategy(
+        *static_cast<XpuPassStrategy *>(other.pass_builder())));
   } else {
     pass_builder_.reset(new CpuPassStrategy(
         *static_cast<CpuPassStrategy *>(other.pass_builder())));
@@ -333,6 +342,12 @@ void AnalysisConfig::Update() {
         // Append after the Affine_channel_conv_fuse pass.
         pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
       }
+    } else if (use_xpu()) {
+      PADDLE_ENFORCE_EQ(
+          use_gpu(), false,
+          platform::errors::InvalidArgument(
+              "Only one choice can be made between CPU and XPU."));
+      pass_builder_.reset(new XpuPassStrategy);
     } else {
       pass_builder_.reset(new CpuPassStrategy);
     }
@@ -341,7 +356,13 @@ void AnalysisConfig::Update() {
     if (use_gpu()) {
       pass_builder_.reset(new GpuPassStrategy(
           *static_cast<GpuPassStrategy *>(pass_builder_.get())));
+    } else if (use_xpu()) {
+      PADDLE_ENFORCE_EQ(
+          use_gpu(), false,
+          platform::errors::InvalidArgument(
+              "Only one choice can be made between CPU and XPU."));
+      pass_builder_.reset(new XpuPassStrategy(
+          *static_cast<XpuPassStrategy *>(pass_builder_.get())));
     } else {
       pass_builder_.reset(new CpuPassStrategy(
           *static_cast<CpuPassStrategy *>(pass_builder_.get())));
@@ -420,19 +441,16 @@ void AnalysisConfig::Update() {
   }

   if (use_xpu_) {
-#ifndef LITE_SUBGRAPH_WITH_XPU
-    PADDLE_THROW(platform::errors::Unavailable(
-        "You tried to use an XPU device, but Paddle was not compiled "
-        "with XPU-runtime."));
-#endif
-    if (!use_lite_) {
-      LOG(WARNING) << "Because XPU currently only works in Paddle-Lite "
-                      "subgraph mode, please make sure you have enabled it.";
-    }
+#if (defined LITE_SUBGRAPH_WITH_XPU) || (defined PADDLE_WITH_XPU)
     PADDLE_ENFORCE_EQ(use_gpu_, false,
                       platform::errors::Unavailable(
                           "Currently, XPU and GPU cannot be enabled in the "
                           "same analysis configuration."));
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "You tried to use an XPU device, but Paddle was not compiled "
+        "with XPU-runtime."));
+#endif
   }

   if (ir_debug_) {
@@ -448,7 +466,8 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << use_gpu_;
   ss << use_fc_padding_;
-  ss << device_id_;
+  ss << gpu_device_id_;
+  ss << xpu_device_id_;
   ss << memory_pool_init_size_mb_;

   ss << use_tensorrt_;
@@ -507,7 +526,7 @@ float AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
   // Get the GPU memory details and calculate the fraction of memory for the
   // GPU memory pool.
   size_t gpu_total, gpu_available;
-  platform::SetDeviceId(device_id_);
+  platform::SetDeviceId(gpu_device_id_);
   platform::GpuMemoryUsage(&gpu_available, &gpu_total);
   double total_gpu_memory = gpu_total / 1024. / 1024.;
   float fraction_of_gpu_memory =
@@ -548,7 +567,7 @@ NativeConfig AnalysisConfig::ToNativeConfig() const {
   config.prog_file = prog_file_;
   config.param_file = params_file_;
   config.use_gpu = use_gpu_;
-  config.device = device_id_;
+  config.device = gpu_device_id_;
   config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool();
   config.specify_input_name = specify_input_name_;
   return config;
...
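Editor's note, a hedged sketch of what the AnalysisConfig changes above mean for callers (not part of the commit): the copy constructor now carries the split gpu_device_id_/xpu_device_id_ members and recreates an XpuPassStrategy, so a copied config keeps its XPU setting while GPU and XPU remain mutually exclusive. Include path is a placeholder.

#include <cassert>

#include "paddle_inference_api.h"  // placeholder include path

int main() {
  paddle::AnalysisConfig config;
  config.EnableXpu();                     // XPU on, default L3 workspace size
  paddle::AnalysisConfig copied(config);  // CP_MEMBER now copies use_xpu_ and xpu_device_id_

  assert(copied.use_xpu());
  assert(copied.xpu_device_id() == 0);    // default XPU device id per the header change
  assert(!copied.use_gpu());              // only one of GPU / XPU may be enabled
  return 0;
}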
@@ -103,7 +103,10 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t,
     // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
     std::memcpy(static_cast<void *>(input_ptr), pt.data.data(),
                 pt.data.length());
-  } else {
+  } else if (platform::is_gpu_place(place)) {
+    PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), false,
+                      platform::errors::InvalidArgument(
+                          "Only one choice can be made between CPU and XPU."));
 #ifdef PADDLE_WITH_CUDA
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto *dev_ctx =
@@ -116,6 +119,18 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t,
     PADDLE_THROW(paddle::platform::errors::Fatal(
         "Not compile with CUDA, should not reach here."));
 #endif
+  } else if (platform::is_xpu_place(place)) {
+#ifdef PADDLE_WITH_XPU
+    auto dst_xpu_place = BOOST_GET_CONST(platform::XPUPlace, place);
+    memory::Copy(dst_xpu_place, static_cast<void *>(input_ptr),
+                 platform::CPUPlace(), pt.data.data(), pt.data.length());
+#else
+    PADDLE_THROW(paddle::platform::errors::Fatal(
+        "Not compile with XPU, should not reach here."));
+#endif
+  } else {
+    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+        "The analysis predictor supports CPU, GPU and XPU now."));
   }
   // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
   framework::LoD lod;
@@ -182,6 +197,12 @@ bool AnalysisPredictor::PrepareScope(
            ++dev_id) {
         memory::Release(platform::CUDAPlace(dev_id));
       }
 #endif
+#ifdef PADDLE_WITH_XPU
+      for (int dev_id = 0; dev_id < paddle::platform::GetXPUDeviceCount();
+           ++dev_id) {
+        memory::Release(platform::XPUPlace(dev_id));
+      }
+#endif
       memory::Release(platform::CPUPlace());
     });
@@ -219,7 +240,9 @@ bool AnalysisPredictor::PrepareProgram(
 }

 bool AnalysisPredictor::CreateExecutor() {
   if (config_.use_gpu()) {
-    status_use_gpu_ = true;
+    PADDLE_ENFORCE_EQ(config_.use_xpu(), false,
+                      platform::errors::InvalidArgument(
+                          "Only one choice can be made between CPU and XPU."));
     place_ = paddle::platform::CUDAPlace(config_.gpu_device_id());
 #ifdef PADDLE_WITH_CUDA
     if (config_.thread_local_stream_enabled()) {
@@ -230,6 +253,8 @@ bool AnalysisPredictor::CreateExecutor() {
       ctx->ResetThreadContext(platform::stream::Priority::kNormal);
     }
 #endif
+  } else if (config_.use_xpu()) {
+    place_ = paddle::platform::XPUPlace(config_.xpu_device_id());
   } else {
     place_ = paddle::platform::CPUPlace();
   }
@@ -734,11 +759,16 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
   res->SetName(name);
   if (platform::is_cpu_place(place_)) {
     res->SetPlace(PaddlePlace::kCPU);
+  } else if (platform::is_xpu_place(place_)) {
+    PADDLE_ENFORCE_EQ(config_.use_gpu(), false,
+                      platform::errors::InvalidArgument(
+                          "Only one choice can be made between CPU and XPU."));
+    auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
+    res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
   } else {
     auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
     res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
   }
   return res;
 }
@@ -755,6 +785,9 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
   res->SetName(name);
   if (platform::is_cpu_place(place_)) {
     res->SetPlace(PaddlePlace::kCPU);
+  } else if (platform::is_xpu_place(place_)) {
+    auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
+    res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
   } else {
     auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
     res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
...
@@ -415,7 +415,6 @@ class AnalysisPredictor : public PaddlePredictor {
  private:
   // Some status here that help to determine the status inside the predictor.
   bool status_is_cloned_{false};
-  bool status_use_gpu_{false};
 };

 }  // namespace paddle
@@ -80,7 +80,12 @@ bool NativePaddlePredictor::Init(
   paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());

   if (config_.use_gpu) {
+    PADDLE_ENFORCE_EQ(config_.use_xpu, false,
+                      platform::errors::InvalidArgument(
+                          "Only one choice can be made between CPU and XPU."));
     place_ = paddle::platform::CUDAPlace(config_.device);
+  } else if (config_.use_xpu) {
+    place_ = paddle::platform::XPUPlace(config_.device);
   } else {
     place_ = paddle::platform::CPUPlace();
   }
@@ -240,7 +245,11 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
       // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
       std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
                   inputs[i].data.length());
-    } else {
+    } else if (platform::is_gpu_place(place_)) {
+      PADDLE_ENFORCE_EQ(
+          platform::is_xpu_place(place_), false,
+          platform::errors::InvalidArgument(
+              "Only one choice can be made between CPU and XPU."));
 #ifdef PADDLE_WITH_CUDA
       platform::DeviceContextPool &pool =
           platform::DeviceContextPool::Instance();
@@ -253,6 +262,16 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
 #else
       PADDLE_THROW(platform::errors::Unavailable(
           "Not compile with CUDA, should not reach here."));
+#endif
+    } else {
+#ifdef PADDLE_WITH_XPU
+      auto dst_xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
+      memory::Copy(dst_xpu_place, static_cast<void *>(input_ptr),
+                   platform::CPUPlace(), inputs[i].data.data(),
+                   inputs[i].data.length());
+#else
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Not compile with XPU, should not reach here."));
 #endif
     }
...
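Editor's note, a hedged sketch (not part of the commit): the NativePaddlePredictor path above now honours the new NativeConfig::use_xpu flag, with NativeConfig::device reused as the XPU device id. The model directory and include path are placeholders.

#include "paddle_inference_api.h"  // placeholder include path

int main() {
  paddle::NativeConfig config;
  config.model_dir = "./word2vec.inference.model";  // placeholder model directory
  config.use_gpu = false;
  config.use_xpu = true;  // new field introduced by this commit
  config.device = 0;      // interpreted as the XPU device id when use_xpu is set

  auto predictor = paddle::CreatePaddlePredictor<paddle::NativeConfig>(config);
  return predictor != nullptr ? 0 : 1;
}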
@@ -58,19 +58,15 @@ NativeConfig GetConfig() {
   config.model_dir = FLAGS_word2vec_dirname;
   LOG(INFO) << "dirname  " << config.model_dir;
   config.fraction_of_gpu_memory = 0.15;
-#ifdef PADDLE_WITH_CUDA
-  config.use_gpu = true;
-#else
-  config.use_gpu = false;
-#endif
   config.device = 0;
   return config;
 }

-void MainWord2Vec(bool use_gpu) {
+void MainWord2Vec(const paddle::PaddlePlace& place) {
   NativeConfig config = GetConfig();
   auto predictor = CreatePaddlePredictor<NativeConfig>(config);
-  config.use_gpu = use_gpu;
+  config.use_gpu = paddle::gpu_place_used(place);
+  config.use_xpu = paddle::xpu_place_used(place);

   framework::LoDTensor first_word, second_word, third_word, fourth_word;
   framework::LoD lod{{0, 1}};
@@ -117,11 +113,12 @@ void MainWord2Vec(bool use_gpu) {
   }
 }

-void MainImageClassification(bool use_gpu) {
+void MainImageClassification(const paddle::PaddlePlace& place) {
   int batch_size = 2;
   bool repeat = false;
   NativeConfig config = GetConfig();
-  config.use_gpu = use_gpu;
+  config.use_gpu = paddle::gpu_place_used(place);
+  config.use_xpu = paddle::xpu_place_used(place);

   config.model_dir =
       FLAGS_book_dirname + "/image_classification_resnet.inference.model";
@@ -162,9 +159,10 @@ void MainImageClassification(bool use_gpu) {
   }
 }

-void MainThreadsWord2Vec(bool use_gpu) {
+void MainThreadsWord2Vec(const paddle::PaddlePlace& place) {
   NativeConfig config = GetConfig();
-  config.use_gpu = use_gpu;
+  config.use_gpu = paddle::gpu_place_used(place);
+  config.use_xpu = paddle::xpu_place_used(place);
   auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);

   // prepare inputs data and reference results
@@ -223,11 +221,12 @@ void MainThreadsWord2Vec(bool use_gpu) {
   }
 }

-void MainThreadsImageClassification(bool use_gpu) {
+void MainThreadsImageClassification(const paddle::PaddlePlace& place) {
   constexpr int num_jobs = 4;  // each job run 1 batch
   constexpr int batch_size = 1;
   NativeConfig config = GetConfig();
-  config.use_gpu = use_gpu;
+  config.use_gpu = paddle::gpu_place_used(place);
+  config.use_xpu = paddle::xpu_place_used(place);

   config.model_dir =
       FLAGS_book_dirname + "/image_classification_resnet.inference.model";
@@ -276,29 +275,42 @@ void MainThreadsImageClassification(bool use_gpu) {
   }
 }

-TEST(inference_api_native, word2vec_cpu) { MainWord2Vec(false /*use_gpu*/); }
+TEST(inference_api_native, word2vec_cpu) {
+  MainWord2Vec(paddle::PaddlePlace::kCPU);
+}
 TEST(inference_api_native, word2vec_cpu_threads) {
-  MainThreadsWord2Vec(false /*use_gpu*/);
+  MainThreadsWord2Vec(paddle::PaddlePlace::kCPU);
 }
 TEST(inference_api_native, image_classification_cpu) {
-  MainImageClassification(false /*use_gpu*/);
+  MainImageClassification(paddle::PaddlePlace::kCPU);
 }
 TEST(inference_api_native, image_classification_cpu_threads) {
-  MainThreadsImageClassification(false /*use_gpu*/);
+  MainThreadsImageClassification(paddle::PaddlePlace::kCPU);
 }

+#ifdef PADDLE_WITH_XPU
+TEST(inference_api_native, word2vec_xpu) {
+  MainWord2Vec(paddle::PaddlePlace::kXPU);
+}
+TEST(inference_api_native, image_classification_xpu) {
+  MainImageClassification(paddle::PaddlePlace::kXPU);
+}
+#endif
+
 #ifdef PADDLE_WITH_CUDA
-TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(true /*use_gpu*/); }
+TEST(inference_api_native, word2vec_gpu) {
+  MainWord2Vec(paddle::PaddlePlace::kGPU);
+}
 // Turn off temporarily for the unstable result.
 // TEST(inference_api_native, word2vec_gpu_threads) {
-//   MainThreadsWord2Vec(true /*use_gpu*/);
+//   MainThreadsWord2Vec(paddle::PaddlePlace::kGPU);
 // }
 TEST(inference_api_native, image_classification_gpu) {
-  MainImageClassification(true /*use_gpu*/);
+  MainImageClassification(paddle::PaddlePlace::kGPU);
 }
 // Turn off temporarily for the unstable result.
 // TEST(inference_api_native, image_classification_gpu_threads) {
-//   MainThreadsImageClassification(true /*use_gpu*/);
+//   MainThreadsImageClassification(paddle::PaddlePlace::kGPU);
 // }
 #endif
...
@@ -185,11 +185,23 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   bool use_gpu() const { return use_gpu_; }
   ///
+  /// \brief A boolean state telling whether the XPU is turned on.
+  ///
+  /// \return bool Whether the XPU is turned on.
+  ///
+  bool use_xpu() const { return use_xpu_; }
+  ///
+  /// \brief Get the GPU device id.
+  ///
+  /// \return int The GPU device id.
+  ///
+  int gpu_device_id() const { return gpu_device_id_; }
+  ///
   /// \brief Get the GPU device id.
   ///
   /// \return int The GPU device id.
   ///
-  int gpu_device_id() const { return device_id_; }
+  int xpu_device_id() const { return xpu_device_id_; }
   ///
   /// \brief Get the initial size in MB of the GPU memory pool.
   ///
@@ -579,7 +591,8 @@ struct PD_INFER_DECL AnalysisConfig {
   // GPU related.
   bool use_gpu_{false};
-  int device_id_{0};
+  int gpu_device_id_{0};
+  int xpu_device_id_{0};
   uint64_t memory_pool_init_size_mb_{100};  // initial size is 100MB.

   bool use_cudnn_{false};
...
@@ -162,7 +162,7 @@ struct PD_INFER_DECL PaddleTensor {
   std::vector<std::vector<size_t>> lod;  ///< Tensor+LoD equals LoDTensor
 };

-enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
+enum class PaddlePlace { kUNK = -1, kCPU, kGPU, kXPU };

 /// \brief Represents an n-dimensional array of values.
 /// The ZeroCopyTensor is used to store the input or output of the network.
@@ -361,6 +361,7 @@ class PD_INFER_DECL PaddlePredictor {
 struct PD_INFER_DECL NativeConfig : public PaddlePredictor::Config {
   NativeConfig();
   /// GPU related fields.
+  bool use_xpu{false};
   bool use_gpu{false};
   int device{0};
   float fraction_of_gpu_memory{
...
@@ -140,11 +140,16 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
   /// \return A bool variable implying whether we are in gpu mode.
   bool use_gpu() const { return use_gpu_; }

+  /// \brief Check if we are using xpu.
+  /// \return A bool variable implying whether we are in xpu mode.
+  bool use_xpu() const { return use_xpu_; }
+
   /// \brief Default destructor.
   virtual ~PassStrategy() = default;

  protected:
   /// \cond Protected
+  bool use_xpu_{false};
   bool use_gpu_{false};
   bool use_mkldnn_{false};
   /// \endcond
@@ -226,6 +231,14 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
   /// \endcond
 };

+/// \class XpuPassStrategy
+/// \brief The XPU passes controller, it is used in AnalysisPredictor with XPU
+/// mode.
+class PD_INFER_DECL XpuPassStrategy final : public PassStrategy {
+ public:
+  XpuPassStrategy() : PassStrategy({}) {}
+};
+
 /// \brief List of tensorRT subgraph passes.
 PD_INFER_DECL extern const std::vector<std::string> kTRTSubgraphPasses;
...
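Editor's note, a hedged sketch (not part of the commit): XpuPassStrategy starts from an empty pass list, but it is still a PaddlePassBuilder, so passes can be appended and inspected through the existing builder interface once use_xpu is selected. Whether a manually appended pass survives the later Update() call is not shown here; "fc_fuse_pass" is used purely as an illustrative pass name.

#include <iostream>

#include "paddle_inference_api.h"  // placeholder include path

int main() {
  paddle::AnalysisConfig config;
  config.EnableXpu();

  // pass_builder() now hands back an XpuPassStrategy-backed builder.
  auto *builder = config.pass_builder();
  builder->AppendPass("fc_fuse_pass");  // existing IR pass name, appended only as an illustration

  for (const auto &pass : builder->AllPasses()) {
    std::cout << pass << std::endl;
  }
  return 0;
}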
@@ -165,12 +165,19 @@ PADDLE_CAPI_EXPORT extern void PD_EnableUseGpu(PD_AnalysisConfig* config,
                                                int memory_pool_init_size_mb,
                                                int device_id);

+PADDLE_CAPI_EXPORT extern void PD_EnableXpu(PD_AnalysisConfig* config,
+                                            int l3_workspace_size);
+
 PADDLE_CAPI_EXPORT extern void PD_DisableGpu(PD_AnalysisConfig* config);

 PADDLE_CAPI_EXPORT extern bool PD_UseGpu(const PD_AnalysisConfig* config);

+PADDLE_CAPI_EXPORT extern bool PD_UseXpu(const PD_AnalysisConfig* config);
+
 PADDLE_CAPI_EXPORT extern int PD_GpuDeviceId(const PD_AnalysisConfig* config);

+PADDLE_CAPI_EXPORT extern int PD_XpuDeviceId(const PD_AnalysisConfig* config);
+
 PADDLE_CAPI_EXPORT extern int PD_MemoryPoolInitSizeMb(
     const PD_AnalysisConfig* config);
...
@@ -111,6 +111,14 @@ void PD_EnableUseGpu(PD_AnalysisConfig* config, int memory_pool_init_size_mb,
                        device_id);
 }

+void PD_EnableXpu(PD_AnalysisConfig* config, int l3_workspace_size) {
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
+  config->config.EnableXpu(l3_workspace_size);
+}
+
 void PD_DisableGpu(PD_AnalysisConfig* config) {
   PADDLE_ENFORCE_NOT_NULL(
       config,
@@ -127,6 +135,14 @@ bool PD_UseGpu(const PD_AnalysisConfig* config) {
   return config->config.use_gpu();
 }

+bool PD_UseXpu(const PD_AnalysisConfig* config) {
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
+  return config->config.use_xpu();
+}
+
 int PD_GpuDeviceId(const PD_AnalysisConfig* config) {
   PADDLE_ENFORCE_NOT_NULL(
       config,
@@ -135,6 +151,14 @@ int PD_GpuDeviceId(const PD_AnalysisConfig* config) {
   return config->config.gpu_device_id();
 }

+int PD_XpuDeviceId(const PD_AnalysisConfig* config) {
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
+  return config->config.xpu_device_id();
+}
+
 int PD_MemoryPoolInitSizeMb(const PD_AnalysisConfig* config) {
   PADDLE_ENFORCE_NOT_NULL(
       config,
...
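Editor's note, a hedged sketch of the new C entry points (not part of the commit), mirroring the new analyzer_capi_xpu_tester.cc added further down; the header path and model directory are placeholders.

#include <stdio.h>

#include "paddle_c_api.h"  // placeholder include path; in-tree it lives under paddle/fluid/inference/capi/

int main() {
  PD_AnalysisConfig* config = PD_NewAnalysisConfig();
  PD_SetModel(config, "./mobilenet", NULL);  // placeholder model directory
  PD_EnableXpu(config, 0xfffc00);            // l3_workspace_size, the same value the new test passes

  printf("use_xpu: %d, xpu_device_id: %d\n", PD_UseXpu(config) ? 1 : 0,
         PD_XpuDeviceId(config));

  PD_DeleteAnalysisConfig(config);
  return 0;
}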
@@ -510,7 +510,10 @@ if(WITH_GPU AND TENSORRT_FOUND)
   inference_analysis_test(test_analyzer_capi_gpu SRCS analyzer_capi_gpu_tester.cc
               EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_fluid_c
               ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
+  inference_analysis_test(test_analyzer_capi_xpu SRCS analyzer_capi_xpu_tester.cc
+              EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_fluid_c
+              ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)

   set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model")
   if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR}/small_quant_model.tgz)
     inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "small_quant_model.tgz")
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string>
#include <vector>
#include "paddle/fluid/inference/capi/paddle_c_api.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
#ifdef PADDLE_WITH_XPU
TEST(PD_AnalysisConfig, use_xpu) {
std::string model_dir = FLAGS_infer_model + "/mobilenet";
PD_AnalysisConfig *config = PD_NewAnalysisConfig();
PD_SwitchUseFeedFetchOps(config, false);
PD_SwitchSpecifyInputNames(config, true);
PD_SwitchIrDebug(config, true);
PD_SetModel(config, model_dir.c_str(), nullptr);
PD_SetOptimCacheDir(config, (FLAGS_infer_model + "/OptimCacheDir").c_str());
const char *model_dir_ = PD_ModelDir(config);
LOG(INFO) << model_dir_;
PD_EnableXpu(config, 0xfffc00);
bool use_xpu = PD_UseXpu(config);
CHECK(use_xpu) << "NO";
int device = PD_XpuDeviceId(config);
CHECK(0 == device) << "NO";
PD_SwitchIrOptim(config, true);
bool ir_optim = PD_IrOptim(config);
CHECK(ir_optim) << "NO";
PD_EnableMemoryOptim(config);
bool memory_optim_enable = PD_MemoryOptimEnabled(config);
CHECK(memory_optim_enable) << "NO";
PD_EnableProfile(config);
bool profiler_enable = PD_ProfileEnabled(config);
CHECK(profiler_enable) << "NO";
PD_SetInValid(config);
bool is_valid = PD_IsValid(config);
CHECK(!is_valid) << "NO";
PD_DeleteAnalysisConfig(config);
}
#endif
} // namespace analysis
} // namespace inference
} // namespace paddle
@@ -58,6 +58,24 @@ int test_main(const AnalysisConfig& config, Barrier* barrier = nullptr) {
   return 0;
 }

+#ifdef PADDLE_WITH_XPU
+TEST(AnalysisPredictor, native_xpu) {
+  AnalysisConfig config;
+  config.EnableXpu();
+  config.SetModel(FLAGS_infer_model + "/" + "mul_model");
+  test_main(config);
+}
+#endif
+
+#ifdef LITE_SUBGRAPH_WITH_XPU
+TEST(AnalysisPredictor, lite_xpu) {
+  AnalysisConfig config;
+  config.EnableXpu();
+  config.SetModel(FLAGS_infer_model + "/" + "mul_model");
+  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
+}
+#endif
+
 #ifdef PADDLE_WITH_CUDA
 TEST(AnalysisPredictor, thread_local_stream) {
   const size_t thread_num = 5;
...
@@ -27,6 +27,18 @@ limitations under the License. */
 DECLARE_bool(use_mkldnn);

+namespace paddle {
+bool gpu_place_used(const paddle::PaddlePlace& place) {
+  return place == paddle::PaddlePlace::kGPU;
+}
+bool xpu_place_used(const paddle::PaddlePlace& place) {
+  return place == paddle::PaddlePlace::kXPU;
+}
+bool cpu_place_used(const paddle::PaddlePlace& place) {
+  return place == paddle::PaddlePlace::kCPU;
+}
+}  // namespace paddle
+
 template <typename T>
 void SetupTensor(paddle::framework::LoDTensor* input,
                  paddle::framework::DDim dims, T lower, T upper) {
...
@@ -197,12 +197,12 @@ void Free<platform::XPUPlace>(const platform::XPUPlace &place, void *p,
 template <>
 uint64_t Release<platform::XPUPlace>(const platform::XPUPlace &place) {
 #ifdef PADDLE_WITH_XPU
-  PADDLE_THROW(
-      platform::errors::PermissionDenied("Release XPU pool is not supported."));
+  LOG(WARNING) << "Release XPU pool is not supported now, no action here.";
 #else
   PADDLE_THROW(
       platform::errors::PermissionDenied("'XPUPlace' is not supported."));
 #endif
+  return -1;
 }

 template <>
...
@@ -58,7 +58,7 @@ cc_library(place SRCS place.cc DEPS enforce boost)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)

 if(WITH_XPU)
-  cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce)
+  cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib)
 endif()

 add_subdirectory(dynload)
...
@@ -378,7 +378,8 @@ void BindPaddlePlace(py::module *m) {
   py::enum_<PaddlePlace>(*m, "PaddlePlace")
       .value("UNK", PaddlePlace::kUNK)
       .value("CPU", PaddlePlace::kCPU)
-      .value("GPU", PaddlePlace::kGPU);
+      .value("GPU", PaddlePlace::kGPU)
+      .value("XPU", PaddlePlace::kXPU);
 }

 void BindPaddlePredictor(py::module *m) {
@@ -407,6 +408,7 @@ void BindNativeConfig(py::module *m) {
   py::class_<NativeConfig, PaddlePredictor::Config>(*m, "NativeConfig")
       .def(py::init<>())
       .def_readwrite("use_gpu", &NativeConfig::use_gpu)
+      .def_readwrite("use_xpu", &NativeConfig::use_xpu)
       .def_readwrite("device", &NativeConfig::device)
       .def_readwrite("fraction_of_gpu_memory",
                      &NativeConfig::fraction_of_gpu_memory)
@@ -468,7 +470,9 @@ void BindAnalysisConfig(py::module *m) {
            py::arg("l3_workspace_size"))
       .def("disable_gpu", &AnalysisConfig::DisableGpu)
       .def("use_gpu", &AnalysisConfig::use_gpu)
+      .def("use_xpu", &AnalysisConfig::use_xpu)
       .def("gpu_device_id", &AnalysisConfig::gpu_device_id)
+      .def("xpu_device_id", &AnalysisConfig::xpu_device_id)
       .def("memory_pool_init_size_mb",
            &AnalysisConfig::memory_pool_init_size_mb)
       .def("fraction_of_gpu_memory_for_pool",
...
@@ -26,7 +26,20 @@ import sys
 paddle.enable_static()


-def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
+def get_place(target):
+    if target == "cuda":
+        return fluid.CUDAPlace(0)
+    elif target == "xpu":
+        return fluid.XPUPlace(0)
+    elif target == "cpu":
+        return fluid.CPUPlace()
+    else:
+        raise ValueError(
+            "Target `{0}` is not on the support list: `cuda`, `xpu` and `cpu`.".
+            format(target))
+
+
+def train(target, is_sparse, is_parallel, save_dirname, is_local=True):
     PASS_NUM = 100
     EMBED_SIZE = 32
     HIDDEN_SIZE = 256
@@ -93,7 +106,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
     train_reader = paddle.batch(
         paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)

-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    place = get_place(target)
     exe = fluid.Executor(place)
     feeder = fluid.DataFeeder(
         feed_list=[first_word, second_word, third_word, forth_word, next_word],
@@ -143,13 +156,12 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
             train_loop(t.get_trainer_program())


-def infer(use_cuda, save_dirname=None):
+def infer(target, save_dirname=None):
     if save_dirname is None:
         return

-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    place = get_place(target)
     exe = fluid.Executor(place)
     inference_scope = fluid.core.Scope()
     with fluid.scope_guard(inference_scope):
         # Use fluid.io.load_inference_model to obtain the inference program desc,
@@ -211,10 +223,12 @@ def infer(use_cuda, save_dirname=None):
             infer_config = fluid.core.NativeConfig()
             infer_config.model_dir = 'word2vec.inference.model'
-            infer_config.use_gpu = use_cuda
-            if use_cuda:
+            if target == "cuda":
+                infer_config.use_gpu = True
                 infer_config.device = 0
                 infer_config.fraction_of_gpu_memory = 0.15
+            elif target == "xpu":
+                infer_config.use_xpu = True
             compiled_program = fluid.compiler.CompiledProgram(inference_program)
             compiled_program._with_inference_optimize(infer_config)
             assert compiled_program._is_inference is True
@@ -222,11 +236,13 @@ def infer(use_cuda, save_dirname=None):
     np_data = np.array(results[0])
     infer_out = infer_outputs[0].data.float_data()
     for a, b in zip(np_data[0], infer_out):
-        assert np.isclose(a, b), "a: {}, b: {}".format(a, b)
+        assert np.isclose(a, b, rtol=5e-5), "a: {}, b: {}".format(a, b)


-def main(use_cuda, is_sparse, is_parallel):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
+def main(target, is_sparse, is_parallel):
+    if target == "cuda" and not fluid.core.is_compiled_with_cuda():
+        return
+    if target == "xpu" and not fluid.core.is_compiled_with_xpu():
         return

     if not is_parallel:
@@ -234,8 +250,13 @@ def main(use_cuda, is_sparse, is_parallel):
     else:
         save_dirname = None

-    train(use_cuda, is_sparse, is_parallel, save_dirname)
-    infer(use_cuda, save_dirname)
+    if target == "xpu":
+        # This model cannot be trained with xpu temporarily,
+        # so only inference is turned on.
+        train("cpu", is_sparse, is_parallel, save_dirname)
+    else:
+        train(target, is_sparse, is_parallel, save_dirname)
+    infer(target, save_dirname)


 FULL_TEST = os.getenv('FULL_TEST',
@@ -247,8 +268,8 @@ class W2VTest(unittest.TestCase):
     pass


-def inject_test_method(use_cuda, is_sparse, is_parallel):
-    fn_name = "test_{0}_{1}_{2}".format("cuda" if use_cuda else "cpu", "sparse"
+def inject_test_method(target, is_sparse, is_parallel):
+    fn_name = "test_{0}_{1}_{2}".format(target, "sparse"
                                         if is_sparse else "dense", "parallel"
                                         if is_parallel else "normal")
@@ -259,11 +280,10 @@ def inject_test_method(use_cuda, is_sparse, is_parallel):
             with fluid.scope_guard(scope):
                 with fluid.program_guard(prog, startup_prog):
                     main(
-                        use_cuda=use_cuda,
-                        is_sparse=is_sparse,
-                        is_parallel=is_parallel)
+                        target=target, is_sparse=is_sparse, is_parallel=is_parallel)

-    if (not fluid.core.is_compiled_with_cuda() or use_cuda) and is_sparse:
+    if (not fluid.core.is_compiled_with_cuda() or
+            target == "cuda") and is_sparse:
         fn = __impl__
     else:
         # skip the other test when on CI server
@@ -273,10 +293,10 @@ def inject_test_method(use_cuda, is_sparse, is_parallel):
     setattr(W2VTest, fn_name, fn)


-for use_cuda in (False, True):
+for target in ("cuda", "cpu", "xpu"):
     for is_sparse in (False, True):
         for is_parallel in (False, ):
-            inject_test_method(use_cuda, is_sparse, is_parallel)
+            inject_test_method(target, is_sparse, is_parallel)

 if __name__ == '__main__':
     unittest.main()