未验证 提交 2ac4143b 编写于 作者: 石晓伟 提交者: GitHub

support xpu with analysis predictor, test=develop (#30832)

* support xpu inference with analysis predictor, test=develop

* merge the cmake of the xpu toolchain, test=develop

* add c-apis, test=develop

* fix a bug in extern_xpu, test=develop
上级 05d2b7a3
......@@ -5,36 +5,35 @@ endif()
INCLUDE(ExternalProject)
SET(XPU_PROJECT "extern_xpu")
if (WITH_AARCH64)
if(NOT XPU_SDK_ROOT)
if (WITH_AARCH64)
SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/aarch64/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE)
elseif(WITH_SUNWAY)
elseif(WITH_SUNWAY)
SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE)
else()
else()
SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE)
endif()
endif()
SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
SET(XPU_API_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include")
SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib")
SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
SET(XPU_API_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include")
SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib")
SET(XPU_API_LIB_NAME "libxpuapi.so")
SET(XPU_RT_LIB_NAME "libxpurt.so")
SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}")
SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}")
SET(XPU_API_LIB_NAME "libxpuapi.so")
SET(XPU_RT_LIB_NAME "libxpurt.so")
SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}")
SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}")
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib")
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib")
INCLUDE_DIRECTORIES(${XPU_API_INC_DIR})
FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt
FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(XPU)\n"
"cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY xpu/include xpu/lib \n"
" DESTINATION ${XPU_INSTALL_DIR})\n")
ExternalProject_Add(
ExternalProject_Add(
${XPU_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${XPU_SOURCE_DIR}
......@@ -45,8 +44,14 @@ ExternalProject_Add(
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT}
)
)
else()
SET(XPU_API_INC_DIR "${XPU_SDK_ROOT}/XTDK/include/")
SET(XPU_API_LIB "${XPU_SDK_ROOT}/XTDK/shlib/libxpuapi.so")
SET(XPU_RT_LIB "${XPU_SDK_ROOT}/XTDK/runtime/shlib/libxpurt.so")
endif()
INCLUDE_DIRECTORIES(${XPU_API_INC_DIR})
ADD_LIBRARY(shared_xpuapi SHARED IMPORTED GLOBAL)
set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}")
......@@ -69,4 +74,14 @@ else(WITH_XPU_BKCL)
TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} )
endif(WITH_XPU_BKCL)
ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})
# Make the `extern_xpu` target name usable in both configurations.
if(NOT XPU_SDK_ROOT)
# No pre-installed SDK was supplied, so the SDK comes from the
# ${XPU_PROJECT} external project; xpulib has to wait for it.
ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})
else()
# A pre-installed SDK (XPU_SDK_ROOT) is used, so no external project exists.
# Define `extern_xpu` as a custom target instead, so that references to that
# target name elsewhere still resolve.
ADD_CUSTOM_TARGET(extern_xpu DEPENDS xpulib)
endif()
# Ensure that xpu/api.h can be included without dependency errors.
# An empty translation unit is generated and compiled into a static library
# that depends on extern_xpu. link_libraries() is directory-scoped, so every
# target created after this point links the dummy library and is therefore
# built after extern_xpu.
# The generated-file path is quoted so that a build directory containing
# spaces or semicolons is still passed as a single argument.
file(GENERATE OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc" CONTENT "")
add_library(xpu_headers_dummy STATIC "${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc")
add_dependencies(xpu_headers_dummy extern_xpu)
link_libraries(xpu_headers_dummy)
......@@ -33,6 +33,8 @@ PassStrategy *AnalysisConfig::pass_builder() const {
if (use_gpu_) {
LOG(INFO) << "Create GPU IR passes";
pass_builder_.reset(new GpuPassStrategy);
} else if (use_xpu_) {
pass_builder_.reset(new XpuPassStrategy);
} else {
LOG(INFO) << "Create CPU IR passes";
pass_builder_.reset(new CpuPassStrategy);
......@@ -73,7 +75,7 @@ void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
use_gpu_ = true;
memory_pool_init_size_mb_ = memory_pool_init_size_mb;
FLAGS_initial_gpu_memory_in_mb = memory_pool_init_size_mb_;
device_id_ = device_id;
gpu_device_id_ = device_id;
#else
LOG(ERROR) << "Please compile with gpu to EnableGpu()";
use_gpu_ = false;
......@@ -115,7 +117,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
// GPU related.
CP_MEMBER(use_gpu_);
CP_MEMBER(use_cudnn_);
CP_MEMBER(device_id_);
CP_MEMBER(gpu_device_id_);
CP_MEMBER(xpu_device_id_);
CP_MEMBER(memory_pool_init_size_mb_);
CP_MEMBER(enable_memory_optim_);
......@@ -174,8 +177,14 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(thread_local_stream_);
if (use_gpu_) {
PADDLE_ENFORCE_EQ(use_xpu_, false,
platform::errors::InvalidArgument(
"Only one choice can be made between CPU and XPU."));
pass_builder_.reset(new GpuPassStrategy(
*static_cast<GpuPassStrategy *>(other.pass_builder())));
} else if (use_xpu_) {
pass_builder_.reset(new XpuPassStrategy(
*static_cast<XpuPassStrategy *>(other.pass_builder())));
} else {
pass_builder_.reset(new CpuPassStrategy(
*static_cast<CpuPassStrategy *>(other.pass_builder())));
......@@ -333,6 +342,12 @@ void AnalysisConfig::Update() {
// Append after the Affine_channel_conv_fuse pass.
pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
}
} else if (use_xpu()) {
PADDLE_ENFORCE_EQ(
use_gpu(), false,
platform::errors::InvalidArgument(
"Only one choice can be made between CPU and XPU."));
pass_builder_.reset(new XpuPassStrategy);
} else {
pass_builder_.reset(new CpuPassStrategy);
}
......@@ -341,7 +356,13 @@ void AnalysisConfig::Update() {
if (use_gpu()) {
pass_builder_.reset(new GpuPassStrategy(
*static_cast<GpuPassStrategy *>(pass_builder_.get())));
} else if (use_xpu()) {
PADDLE_ENFORCE_EQ(
use_gpu(), false,
platform::errors::InvalidArgument(
"Only one choice can be made between CPU and XPU."));
pass_builder_.reset(new XpuPassStrategy(
*static_cast<XpuPassStrategy *>(pass_builder_.get())));
} else {
pass_builder_.reset(new CpuPassStrategy(
*static_cast<CpuPassStrategy *>(pass_builder_.get())));
......@@ -420,19 +441,16 @@ void AnalysisConfig::Update() {
}
if (use_xpu_) {
#ifndef LITE_SUBGRAPH_WITH_XPU
PADDLE_THROW(platform::errors::Unavailable(
"You tried to use an XPU device, but Paddle was not compiled "
"with XPU-runtime."));
#endif
if (!use_lite_) {
LOG(WARNING) << "Because XPU currently only works in Paddle-Lite "
"subgraph mode, please make sure you have enabled it.";
}
#if (defined LITE_SUBGRAPH_WITH_XPU) || (defined PADDLE_WITH_XPU)
PADDLE_ENFORCE_EQ(use_gpu_, false,
platform::errors::Unavailable(
"Currently, XPU and GPU cannot be enabled in the "
"same analysis configuration."));
#else
PADDLE_THROW(platform::errors::Unavailable(
"You tried to use an XPU device, but Paddle was not compiled "
"with XPU-runtime."));
#endif
}
if (ir_debug_) {
......@@ -448,7 +466,8 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << use_gpu_;
ss << use_fc_padding_;
ss << device_id_;
ss << gpu_device_id_;
ss << xpu_device_id_;
ss << memory_pool_init_size_mb_;
ss << use_tensorrt_;
......@@ -507,7 +526,7 @@ float AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
// Get the GPU memory details and calculate the fraction of memory for the
// GPU memory pool.
size_t gpu_total, gpu_available;
platform::SetDeviceId(device_id_);
platform::SetDeviceId(gpu_device_id_);
platform::GpuMemoryUsage(&gpu_available, &gpu_total);
double total_gpu_memory = gpu_total / 1024. / 1024.;
float fraction_of_gpu_memory =
......@@ -548,7 +567,7 @@ NativeConfig AnalysisConfig::ToNativeConfig() const {
config.prog_file = prog_file_;
config.param_file = params_file_;
config.use_gpu = use_gpu_;
config.device = device_id_;
config.device = gpu_device_id_;
config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool();
config.specify_input_name = specify_input_name_;
return config;
......
......@@ -103,7 +103,10 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t,
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr), pt.data.data(),
pt.data.length());
} else {
} else if (platform::is_gpu_place(place)) {
PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), false,
platform::errors::InvalidArgument(
"Only one choice can be made between CPU and XPU."));
#ifdef PADDLE_WITH_CUDA
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *dev_ctx =
......@@ -116,6 +119,18 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t,
PADDLE_THROW(paddle::platform::errors::Fatal(
"Not compile with CUDA, should not reach here."));
#endif
} else if (platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU
auto dst_xpu_place = BOOST_GET_CONST(platform::XPUPlace, place);
memory::Copy(dst_xpu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), pt.data.data(), pt.data.length());
#else
PADDLE_THROW(paddle::platform::errors::Fatal(
"Not compile with XPU, should not reach here."));
#endif
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"The analysis predictor supports CPU, GPU and XPU now."));
}
// TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
framework::LoD lod;
......@@ -182,6 +197,12 @@ bool AnalysisPredictor::PrepareScope(
++dev_id) {
memory::Release(platform::CUDAPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_XPU
for (int dev_id = 0; dev_id < paddle::platform::GetXPUDeviceCount();
++dev_id) {
memory::Release(platform::XPUPlace(dev_id));
}
#endif
memory::Release(platform::CPUPlace());
});
......@@ -219,7 +240,9 @@ bool AnalysisPredictor::PrepareProgram(
}
bool AnalysisPredictor::CreateExecutor() {
if (config_.use_gpu()) {
status_use_gpu_ = true;
PADDLE_ENFORCE_EQ(config_.use_xpu(), false,
platform::errors::InvalidArgument(
"Only one choice can be made between CPU and XPU."));
place_ = paddle::platform::CUDAPlace(config_.gpu_device_id());
#ifdef PADDLE_WITH_CUDA
if (config_.thread_local_stream_enabled()) {
......@@ -230,6 +253,8 @@ bool AnalysisPredictor::CreateExecutor() {
ctx->ResetThreadContext(platform::stream::Priority::kNormal);
}
#endif
} else if (config_.use_xpu()) {
place_ = paddle::platform::XPUPlace(config_.xpu_device_id());
} else {
place_ = paddle::platform::CPUPlace();
}
......@@ -734,11 +759,16 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
res->SetName(name);
if (platform::is_cpu_place(place_)) {
res->SetPlace(PaddlePlace::kCPU);
} else if (platform::is_xpu_place(place_)) {
PADDLE_ENFORCE_EQ(config_.use_gpu(), false,
platform::errors::InvalidArgument(
"Only one choice can be made between CPU and XPU."));
auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
} else {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
}
return res;
}
......@@ -755,6 +785,9 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
res->SetName(name);
if (platform::is_cpu_place(place_)) {
res->SetPlace(PaddlePlace::kCPU);
} else if (platform::is_xpu_place(place_)) {
auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
} else {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
......
......@@ -415,7 +415,6 @@ class AnalysisPredictor : public PaddlePredictor {
private:
// Some status here that help to determine the status inside the predictor.
bool status_is_cloned_{false};
bool status_use_gpu_{false};
};
} // namespace paddle
......@@ -80,7 +80,12 @@ bool NativePaddlePredictor::Init(
paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
if (config_.use_gpu) {
PADDLE_ENFORCE_EQ(config_.use_xpu, false,
platform::errors::InvalidArgument(
"Only one choice can be made between CPU and XPU."));
place_ = paddle::platform::CUDAPlace(config_.device);
} else if (config_.use_xpu) {
place_ = paddle::platform::XPUPlace(config_.device);
} else {
place_ = paddle::platform::CPUPlace();
}
......@@ -240,7 +245,11 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
inputs[i].data.length());
} else {
} else if (platform::is_gpu_place(place_)) {
PADDLE_ENFORCE_EQ(
platform::is_xpu_place(place_), false,
platform::errors::InvalidArgument(
"Only one choice can be made between CPU and XPU."));
#ifdef PADDLE_WITH_CUDA
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
......@@ -253,6 +262,16 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
#else
PADDLE_THROW(platform::errors::Unavailable(
"Not compile with CUDA, should not reach here."));
#endif
} else {
#ifdef PADDLE_WITH_XPU
auto dst_xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
memory::Copy(dst_xpu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), inputs[i].data.data(),
inputs[i].data.length());
#else
PADDLE_THROW(platform::errors::Unavailable(
"Not compile with XPU, should not reach here."));
#endif
}
......
......@@ -58,19 +58,15 @@ NativeConfig GetConfig() {
config.model_dir = FLAGS_word2vec_dirname;
LOG(INFO) << "dirname " << config.model_dir;
config.fraction_of_gpu_memory = 0.15;
#ifdef PADDLE_WITH_CUDA
config.use_gpu = true;
#else
config.use_gpu = false;
#endif
config.device = 0;
return config;
}
void MainWord2Vec(bool use_gpu) {
void MainWord2Vec(const paddle::PaddlePlace& place) {
NativeConfig config = GetConfig();
auto predictor = CreatePaddlePredictor<NativeConfig>(config);
config.use_gpu = use_gpu;
config.use_gpu = paddle::gpu_place_used(place);
config.use_xpu = paddle::xpu_place_used(place);
framework::LoDTensor first_word, second_word, third_word, fourth_word;
framework::LoD lod{{0, 1}};
......@@ -117,11 +113,12 @@ void MainWord2Vec(bool use_gpu) {
}
}
void MainImageClassification(bool use_gpu) {
void MainImageClassification(const paddle::PaddlePlace& place) {
int batch_size = 2;
bool repeat = false;
NativeConfig config = GetConfig();
config.use_gpu = use_gpu;
config.use_gpu = paddle::gpu_place_used(place);
config.use_xpu = paddle::xpu_place_used(place);
config.model_dir =
FLAGS_book_dirname + "/image_classification_resnet.inference.model";
......@@ -162,9 +159,10 @@ void MainImageClassification(bool use_gpu) {
}
}
void MainThreadsWord2Vec(bool use_gpu) {
void MainThreadsWord2Vec(const paddle::PaddlePlace& place) {
NativeConfig config = GetConfig();
config.use_gpu = use_gpu;
config.use_gpu = paddle::gpu_place_used(place);
config.use_xpu = paddle::xpu_place_used(place);
auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
// prepare inputs data and reference results
......@@ -223,11 +221,12 @@ void MainThreadsWord2Vec(bool use_gpu) {
}
}
void MainThreadsImageClassification(bool use_gpu) {
void MainThreadsImageClassification(const paddle::PaddlePlace& place) {
constexpr int num_jobs = 4; // each job run 1 batch
constexpr int batch_size = 1;
NativeConfig config = GetConfig();
config.use_gpu = use_gpu;
config.use_gpu = paddle::gpu_place_used(place);
config.use_xpu = paddle::xpu_place_used(place);
config.model_dir =
FLAGS_book_dirname + "/image_classification_resnet.inference.model";
......@@ -276,29 +275,42 @@ void MainThreadsImageClassification(bool use_gpu) {
}
}
TEST(inference_api_native, word2vec_cpu) { MainWord2Vec(false /*use_gpu*/); }
TEST(inference_api_native, word2vec_cpu) {
MainWord2Vec(paddle::PaddlePlace::kCPU);
}
TEST(inference_api_native, word2vec_cpu_threads) {
MainThreadsWord2Vec(false /*use_gpu*/);
MainThreadsWord2Vec(paddle::PaddlePlace::kCPU);
}
TEST(inference_api_native, image_classification_cpu) {
MainImageClassification(false /*use_gpu*/);
MainImageClassification(paddle::PaddlePlace::kCPU);
}
TEST(inference_api_native, image_classification_cpu_threads) {
MainThreadsImageClassification(false /*use_gpu*/);
MainThreadsImageClassification(paddle::PaddlePlace::kCPU);
}
// XPU variants of the native-API tests. They are compiled only when
// PADDLE_WITH_XPU is defined and pass PaddlePlace::kXPU to the shared drivers.
#ifdef PADDLE_WITH_XPU
// Runs the word2vec driver with the XPU place.
TEST(inference_api_native, word2vec_xpu) {
MainWord2Vec(paddle::PaddlePlace::kXPU);
}
// Runs the image-classification driver with the XPU place.
TEST(inference_api_native, image_classification_xpu) {
MainImageClassification(paddle::PaddlePlace::kXPU);
}
#endif
#ifdef PADDLE_WITH_CUDA
TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(true /*use_gpu*/); }
TEST(inference_api_native, word2vec_gpu) {
MainWord2Vec(paddle::PaddlePlace::kGPU);
}
// Turn off temporarily for the unstable result.
// TEST(inference_api_native, word2vec_gpu_threads) {
// MainThreadsWord2Vec(true /*use_gpu*/);
// MainThreadsWord2Vec(paddle::PaddlePlace::kGPU);
// }
TEST(inference_api_native, image_classification_gpu) {
MainImageClassification(true /*use_gpu*/);
MainImageClassification(paddle::PaddlePlace::kGPU);
}
// Turn off temporarily for the unstable result.
// TEST(inference_api_native, image_classification_gpu_threads) {
// MainThreadsImageClassification(true /*use_gpu*/);
// MainThreadsImageClassification(paddle::PaddlePlace::kGPU);
// }
#endif
......
......@@ -185,11 +185,23 @@ struct PD_INFER_DECL AnalysisConfig {
///
bool use_gpu() const { return use_gpu_; }
///
/// \brief A boolean state telling whether the XPU is turned on.
///
/// \return bool Whether the XPU is turned on.
///
bool use_xpu() const { return use_xpu_; }
///
/// \brief Get the GPU device id.
///
/// \return int The GPU device id.
///
int gpu_device_id() const { return gpu_device_id_; }
///
/// \brief Get the XPU device id.
///
/// \return int The XPU device id.
///
int gpu_device_id() const { return device_id_; }
int xpu_device_id() const { return xpu_device_id_; }
///
/// \brief Get the initial size in MB of the GPU memory pool.
///
......@@ -579,7 +591,8 @@ struct PD_INFER_DECL AnalysisConfig {
// GPU related.
bool use_gpu_{false};
int device_id_{0};
int gpu_device_id_{0};
int xpu_device_id_{0};
uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB.
bool use_cudnn_{false};
......
......@@ -162,7 +162,7 @@ struct PD_INFER_DECL PaddleTensor {
std::vector<std::vector<size_t>> lod; ///< Tensor+LoD equals LoDTensor
};
enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
enum class PaddlePlace { kUNK = -1, kCPU, kGPU, kXPU };
/// \brief Represents an n-dimensional array of values.
/// The ZeroCopyTensor is used to store the input or output of the network.
......@@ -361,6 +361,7 @@ class PD_INFER_DECL PaddlePredictor {
struct PD_INFER_DECL NativeConfig : public PaddlePredictor::Config {
NativeConfig();
/// GPU related fields.
bool use_xpu{false};
bool use_gpu{false};
int device{0};
float fraction_of_gpu_memory{
......
......@@ -140,11 +140,16 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
/// \return A bool variable implying whether we are in gpu mode.
bool use_gpu() const { return use_gpu_; }
/// \brief Check if we are using xpu.
/// \return A bool variable implying whether we are in xpu mode.
bool use_xpu() const { return use_xpu_; }
/// \brief Default destructor.
virtual ~PassStrategy() = default;
protected:
/// \cond Protected
bool use_xpu_{false};
bool use_gpu_{false};
bool use_mkldnn_{false};
/// \endcond
......@@ -226,6 +231,14 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
/// \endcond
};
/// \class XpuPassStrategy
/// \brief The XPU passes controller, it is used in AnalysisPredictor with XPU
/// mode.
class PD_INFER_DECL XpuPassStrategy final : public PassStrategy {
 public:
  /// \brief Create an XPU strategy with an empty pass list.
  ///
  /// The inherited use_xpu_ flag defaults to false, so it is set here;
  /// otherwise use_xpu() would report false even on the XPU strategy.
  XpuPassStrategy() : PassStrategy({}) { use_xpu_ = true; }
};
/// \brief List of tensorRT subgraph passes.
PD_INFER_DECL extern const std::vector<std::string> kTRTSubgraphPasses;
......
......@@ -165,12 +165,19 @@ PADDLE_CAPI_EXPORT extern void PD_EnableUseGpu(PD_AnalysisConfig* config,
int memory_pool_init_size_mb,
int device_id);
PADDLE_CAPI_EXPORT extern void PD_EnableXpu(PD_AnalysisConfig* config,
int l3_workspace_size);
PADDLE_CAPI_EXPORT extern void PD_DisableGpu(PD_AnalysisConfig* config);
PADDLE_CAPI_EXPORT extern bool PD_UseGpu(const PD_AnalysisConfig* config);
PADDLE_CAPI_EXPORT extern bool PD_UseXpu(const PD_AnalysisConfig* config);
PADDLE_CAPI_EXPORT extern int PD_GpuDeviceId(const PD_AnalysisConfig* config);
PADDLE_CAPI_EXPORT extern int PD_XpuDeviceId(const PD_AnalysisConfig* config);
PADDLE_CAPI_EXPORT extern int PD_MemoryPoolInitSizeMb(
const PD_AnalysisConfig* config);
......
......@@ -111,6 +111,14 @@ void PD_EnableUseGpu(PD_AnalysisConfig* config, int memory_pool_init_size_mb,
device_id);
}
// Turns on XPU mode for the analysis configuration wrapped by `config`.
// `l3_workspace_size` is forwarded unchanged to AnalysisConfig::EnableXpu.
// Fails the PADDLE_ENFORCE_NOT_NULL check when `config` is null.
void PD_EnableXpu(PD_AnalysisConfig* config, int l3_workspace_size) {
  // Validate the handle before dereferencing it.
  PADDLE_ENFORCE_NOT_NULL(
      config,
      paddle::platform::errors::InvalidArgument(
          "The pointer of analysis configuration shouldn't be nullptr"));
  auto& wrapped_config = config->config;
  wrapped_config.EnableXpu(l3_workspace_size);
}
void PD_DisableGpu(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(
config,
......@@ -127,6 +135,14 @@ bool PD_UseGpu(const PD_AnalysisConfig* config) {
return config->config.use_gpu();
}
// Reports whether XPU mode is enabled on the analysis configuration wrapped
// by `config`. Fails the PADDLE_ENFORCE_NOT_NULL check when `config` is null.
bool PD_UseXpu(const PD_AnalysisConfig* config) {
  // Validate the handle before dereferencing it.
  PADDLE_ENFORCE_NOT_NULL(
      config,
      paddle::platform::errors::InvalidArgument(
          "The pointer of analysis configuration shouldn't be nullptr"));
  const auto& wrapped_config = config->config;
  return wrapped_config.use_xpu();
}
int PD_GpuDeviceId(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(
config,
......@@ -135,6 +151,14 @@ int PD_GpuDeviceId(const PD_AnalysisConfig* config) {
return config->config.gpu_device_id();
}
// Returns the XPU device id stored in the analysis configuration wrapped by
// `config`. Fails the PADDLE_ENFORCE_NOT_NULL check when `config` is null.
int PD_XpuDeviceId(const PD_AnalysisConfig* config) {
  // Validate the handle before dereferencing it.
  PADDLE_ENFORCE_NOT_NULL(
      config,
      paddle::platform::errors::InvalidArgument(
          "The pointer of analysis configuration shouldn't be nullptr"));
  const auto& wrapped_config = config->config;
  return wrapped_config.xpu_device_id();
}
int PD_MemoryPoolInitSizeMb(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(
config,
......
......@@ -510,6 +510,9 @@ if(WITH_GPU AND TENSORRT_FOUND)
inference_analysis_test(test_analyzer_capi_gpu SRCS analyzer_capi_gpu_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_fluid_c
ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
inference_analysis_test(test_analyzer_capi_xpu SRCS analyzer_capi_xpu_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_fluid_c
ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model")
if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR}/small_quant_model.tgz)
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string>
#include <vector>
#include "paddle/fluid/inference/capi/paddle_c_api.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
#ifdef PADDLE_WITH_XPU
// Exercises the XPU-related C API of PD_AnalysisConfig: each setter is
// followed by the matching getter and a CHECK on the value read back.
// No predictor is created; only the configuration object is inspected.
TEST(PD_AnalysisConfig, use_xpu) {
std::string model_dir = FLAGS_infer_model + "/mobilenet";
PD_AnalysisConfig *config = PD_NewAnalysisConfig();
PD_SwitchUseFeedFetchOps(config, false);
PD_SwitchSpecifyInputNames(config, true);
PD_SwitchIrDebug(config, true);
PD_SetModel(config, model_dir.c_str(), nullptr);
PD_SetOptimCacheDir(config, (FLAGS_infer_model + "/OptimCacheDir").c_str());
// Log the model directory as reported back by the configuration.
const char *model_dir_ = PD_ModelDir(config);
LOG(INFO) << model_dir_;
// Enable XPU; the second argument is the l3_workspace_size parameter.
PD_EnableXpu(config, 0xfffc00);
bool use_xpu = PD_UseXpu(config);
CHECK(use_xpu) << "NO";
// No device id is set in this test, so the expected value is 0.
int device = PD_XpuDeviceId(config);
CHECK(0 == device) << "NO";
PD_SwitchIrOptim(config, true);
bool ir_optim = PD_IrOptim(config);
CHECK(ir_optim) << "NO";
PD_EnableMemoryOptim(config);
bool memory_optim_enable = PD_MemoryOptimEnabled(config);
CHECK(memory_optim_enable) << "NO";
PD_EnableProfile(config);
bool profiler_enable = PD_ProfileEnabled(config);
CHECK(profiler_enable) << "NO";
// Mark the configuration invalid last and confirm that PD_IsValid reports it.
PD_SetInValid(config);
bool is_valid = PD_IsValid(config);
CHECK(!is_valid) << "NO";
// Release the configuration created by PD_NewAnalysisConfig above.
PD_DeleteAnalysisConfig(config);
}
#endif
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -58,6 +58,24 @@ int test_main(const AnalysisConfig& config, Barrier* barrier = nullptr) {
return 0;
}
// Compiled only when PADDLE_WITH_XPU is defined.
#ifdef PADDLE_WITH_XPU
// Runs the shared test_main driver on the "mul_model" model with XPU enabled
// on the analysis configuration.
TEST(AnalysisPredictor, native_xpu) {
AnalysisConfig config;
config.EnableXpu();
config.SetModel(FLAGS_infer_model + "/" + "mul_model");
test_main(config);
}
#endif
// Compiled only when LITE_SUBGRAPH_WITH_XPU is defined.
#ifdef LITE_SUBGRAPH_WITH_XPU
// Builds a configuration with both XPU and the Lite engine (float32) enabled.
// NOTE(review): unlike native_xpu, this test never calls test_main or creates
// a predictor, so it only checks that the configuration calls succeed --
// confirm that this is intentional.
TEST(AnalysisPredictor, lite_xpu) {
AnalysisConfig config;
config.EnableXpu();
config.SetModel(FLAGS_infer_model + "/" + "mul_model");
config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
}
#endif
#ifdef PADDLE_WITH_CUDA
TEST(AnalysisPredictor, thread_local_stream) {
const size_t thread_num = 5;
......
......@@ -27,6 +27,18 @@ limitations under the License. */
DECLARE_bool(use_mkldnn);
namespace paddle {
// Predicates that classify a PaddlePlace value.
// NOTE(review): these helpers are called from other test sources, so this
// file is presumably included as a header. They are marked `inline` so that
// including it from more than one translation unit of the same binary does
// not produce multiple-definition link errors.

// Returns true when `place` is PaddlePlace::kGPU.
inline bool gpu_place_used(const paddle::PaddlePlace& place) {
  return place == paddle::PaddlePlace::kGPU;
}

// Returns true when `place` is PaddlePlace::kXPU.
inline bool xpu_place_used(const paddle::PaddlePlace& place) {
  return place == paddle::PaddlePlace::kXPU;
}

// Returns true when `place` is PaddlePlace::kCPU.
inline bool cpu_place_used(const paddle::PaddlePlace& place) {
  return place == paddle::PaddlePlace::kCPU;
}
}  // namespace paddle
template <typename T>
void SetupTensor(paddle::framework::LoDTensor* input,
paddle::framework::DDim dims, T lower, T upper) {
......
......@@ -197,12 +197,12 @@ void Free<platform::XPUPlace>(const platform::XPUPlace &place, void *p,
template <>
uint64_t Release<platform::XPUPlace>(const platform::XPUPlace &place) {
#ifdef PADDLE_WITH_XPU
PADDLE_THROW(
platform::errors::PermissionDenied("Release XPU pool is not supported."));
LOG(WARNING) << "Release XPU pool is not supported now, no action here.";
#else
PADDLE_THROW(
platform::errors::PermissionDenied("'XPUPlace' is not supported."));
#endif
return -1;
}
template <>
......
......@@ -58,7 +58,7 @@ cc_library(place SRCS place.cc DEPS enforce boost)
cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
if(WITH_XPU)
cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce)
cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib)
endif()
add_subdirectory(dynload)
......
......@@ -378,7 +378,8 @@ void BindPaddlePlace(py::module *m) {
py::enum_<PaddlePlace>(*m, "PaddlePlace")
.value("UNK", PaddlePlace::kUNK)
.value("CPU", PaddlePlace::kCPU)
.value("GPU", PaddlePlace::kGPU);
.value("GPU", PaddlePlace::kGPU)
.value("XPU", PaddlePlace::kXPU);
}
void BindPaddlePredictor(py::module *m) {
......@@ -407,6 +408,7 @@ void BindNativeConfig(py::module *m) {
py::class_<NativeConfig, PaddlePredictor::Config>(*m, "NativeConfig")
.def(py::init<>())
.def_readwrite("use_gpu", &NativeConfig::use_gpu)
.def_readwrite("use_xpu", &NativeConfig::use_xpu)
.def_readwrite("device", &NativeConfig::device)
.def_readwrite("fraction_of_gpu_memory",
&NativeConfig::fraction_of_gpu_memory)
......@@ -468,7 +470,9 @@ void BindAnalysisConfig(py::module *m) {
py::arg("l3_workspace_size"))
.def("disable_gpu", &AnalysisConfig::DisableGpu)
.def("use_gpu", &AnalysisConfig::use_gpu)
.def("use_xpu", &AnalysisConfig::use_xpu)
.def("gpu_device_id", &AnalysisConfig::gpu_device_id)
.def("xpu_device_id", &AnalysisConfig::xpu_device_id)
.def("memory_pool_init_size_mb",
&AnalysisConfig::memory_pool_init_size_mb)
.def("fraction_of_gpu_memory_for_pool",
......
......@@ -26,7 +26,20 @@ import sys
paddle.enable_static()
def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
def get_place(target):
    """Return the fluid place object selected by `target`.

    `target` is compared against "cuda", "xpu" and "cpu" in that order; the
    first two map to device 0 of the respective place type. Any other value
    raises ValueError.
    """
    # Factories are stored instead of instances so that only the requested
    # place is ever constructed.
    place_factories = (
        ("cuda", lambda: fluid.CUDAPlace(0)),
        ("xpu", lambda: fluid.XPUPlace(0)),
        ("cpu", lambda: fluid.CPUPlace()), )
    for supported_name, make_place in place_factories:
        if target == supported_name:
            return make_place()
    raise ValueError(
        "Target `{0}` is not on the support list: `cuda`, `xpu` and `cpu`.".
        format(target))
def train(target, is_sparse, is_parallel, save_dirname, is_local=True):
PASS_NUM = 100
EMBED_SIZE = 32
HIDDEN_SIZE = 256
......@@ -93,7 +106,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
train_reader = paddle.batch(
paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
place = get_place(target)
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(
feed_list=[first_word, second_word, third_word, forth_word, next_word],
......@@ -143,13 +156,12 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
train_loop(t.get_trainer_program())
def infer(use_cuda, save_dirname=None):
def infer(target, save_dirname=None):
if save_dirname is None:
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
place = get_place(target)
exe = fluid.Executor(place)
inference_scope = fluid.core.Scope()
with fluid.scope_guard(inference_scope):
# Use fluid.io.load_inference_model to obtain the inference program desc,
......@@ -211,10 +223,12 @@ def infer(use_cuda, save_dirname=None):
infer_config = fluid.core.NativeConfig()
infer_config.model_dir = 'word2vec.inference.model'
infer_config.use_gpu = use_cuda
if use_cuda:
if target == "cuda":
infer_config.use_gpu = True
infer_config.device = 0
infer_config.fraction_of_gpu_memory = 0.15
elif target == "xpu":
infer_config.use_xpu = True
compiled_program = fluid.compiler.CompiledProgram(inference_program)
compiled_program._with_inference_optimize(infer_config)
assert compiled_program._is_inference is True
......@@ -222,11 +236,13 @@ def infer(use_cuda, save_dirname=None):
np_data = np.array(results[0])
infer_out = infer_outputs[0].data.float_data()
for a, b in zip(np_data[0], infer_out):
assert np.isclose(a, b), "a: {}, b: {}".format(a, b)
assert np.isclose(a, b, rtol=5e-5), "a: {}, b: {}".format(a, b)
def main(use_cuda, is_sparse, is_parallel):
if use_cuda and not fluid.core.is_compiled_with_cuda():
def main(target, is_sparse, is_parallel):
if target == "cuda" and not fluid.core.is_compiled_with_cuda():
return
if target == "xpu" and not fluid.core.is_compiled_with_xpu():
return
if not is_parallel:
......@@ -234,8 +250,13 @@ def main(use_cuda, is_sparse, is_parallel):
else:
save_dirname = None
train(use_cuda, is_sparse, is_parallel, save_dirname)
infer(use_cuda, save_dirname)
if target == "xpu":
# This model cannot be trained with xpu temporarily,
# so only inference is turned on.
train("cpu", is_sparse, is_parallel, save_dirname)
else:
train(target, is_sparse, is_parallel, save_dirname)
infer(target, save_dirname)
FULL_TEST = os.getenv('FULL_TEST',
......@@ -247,8 +268,8 @@ class W2VTest(unittest.TestCase):
pass
def inject_test_method(use_cuda, is_sparse, is_parallel):
fn_name = "test_{0}_{1}_{2}".format("cuda" if use_cuda else "cpu", "sparse"
def inject_test_method(target, is_sparse, is_parallel):
fn_name = "test_{0}_{1}_{2}".format(target, "sparse"
if is_sparse else "dense", "parallel"
if is_parallel else "normal")
......@@ -259,11 +280,10 @@ def inject_test_method(use_cuda, is_sparse, is_parallel):
with fluid.scope_guard(scope):
with fluid.program_guard(prog, startup_prog):
main(
use_cuda=use_cuda,
is_sparse=is_sparse,
is_parallel=is_parallel)
target=target, is_sparse=is_sparse, is_parallel=is_parallel)
if (not fluid.core.is_compiled_with_cuda() or use_cuda) and is_sparse:
if (not fluid.core.is_compiled_with_cuda() or
target == "cuda") and is_sparse:
fn = __impl__
else:
# skip the other test when on CI server
......@@ -273,10 +293,10 @@ def inject_test_method(use_cuda, is_sparse, is_parallel):
setattr(W2VTest, fn_name, fn)
for use_cuda in (False, True):
for target in ("cuda", "cpu", "xpu"):
for is_sparse in (False, True):
for is_parallel in (False, ):
inject_test_method(use_cuda, is_sparse, is_parallel)
inject_test_method(target, is_sparse, is_parallel)
if __name__ == '__main__':
unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册