From 2ac4143b6ce7a43fd0c549f9d25cfda61f72ed09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Wed, 3 Feb 2021 14:55:01 +0800 Subject: [PATCH] support xpu with analysis predictor, test=develop (#30832) * support xpu inference with analysis predictor, test=develop * merge the cmake of the xpu toolchain, test=develop * add c-apis, test=develop * fix a bug in extern_xpu, test=develop --- cmake/external/xpu.cmake | 89 +++++++++++-------- paddle/fluid/inference/api/analysis_config.cc | 49 ++++++---- .../fluid/inference/api/analysis_predictor.cc | 39 +++++++- .../fluid/inference/api/analysis_predictor.h | 1 - paddle/fluid/inference/api/api_impl.cc | 21 ++++- paddle/fluid/inference/api/api_impl_tester.cc | 54 ++++++----- .../inference/api/paddle_analysis_config.h | 17 +++- paddle/fluid/inference/api/paddle_api.h | 3 +- .../fluid/inference/api/paddle_pass_builder.h | 13 +++ paddle/fluid/inference/capi/paddle_c_api.h | 7 ++ paddle/fluid/inference/capi/pd_config.cc | 24 +++++ .../fluid/inference/tests/api/CMakeLists.txt | 5 +- .../tests/api/analyzer_capi_xpu_tester.cc | 61 +++++++++++++ .../tests/api/lite_mul_model_test.cc | 18 ++++ paddle/fluid/inference/tests/test_helper.h | 12 +++ .../allocation/naive_best_fit_allocator.cc | 4 +- paddle/fluid/platform/CMakeLists.txt | 2 +- paddle/fluid/pybind/inference_api.cc | 6 +- .../fluid/tests/book/test_word2vec_book.py | 60 ++++++++----- 19 files changed, 379 insertions(+), 106 deletions(-) create mode 100644 paddle/fluid/inference/tests/api/analyzer_capi_xpu_tester.cc diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index a07d845d70..af20663a00 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -5,48 +5,53 @@ endif() INCLUDE(ExternalProject) SET(XPU_PROJECT "extern_xpu") -if (WITH_AARCH64) - SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/aarch64/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE) -elseif(WITH_SUNWAY) - SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE) -else() - SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE) -endif() +if(NOT XPU_SDK_ROOT) + if (WITH_AARCH64) + SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/aarch64/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE) + elseif(WITH_SUNWAY) + SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE) + else() + SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE) + endif() -SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu") -SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}") -SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu") -SET(XPU_API_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include") -SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib") + SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu") + SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}") + SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu") + SET(XPU_API_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include") + SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib") -SET(XPU_API_LIB_NAME "libxpuapi.so") -SET(XPU_RT_LIB_NAME "libxpurt.so") -SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}") -SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}") + SET(XPU_API_LIB_NAME "libxpuapi.so") + SET(XPU_RT_LIB_NAME 
"libxpurt.so") + SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}") + SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}") -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib") + SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib") -INCLUDE_DIRECTORIES(${XPU_API_INC_DIR}) + FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(XPU)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY xpu/include xpu/lib \n" + " DESTINATION ${XPU_INSTALL_DIR})\n") -FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt - "PROJECT(XPU)\n" - "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY xpu/include xpu/lib \n" - " DESTINATION ${XPU_INSTALL_DIR})\n") - -ExternalProject_Add( - ${XPU_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${XPU_SOURCE_DIR} - DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${XPU_URL} -c -q -O xpu.tar.gz - && tar xvf xpu.tar.gz - DOWNLOAD_NO_PROGRESS 1 - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT} -) + ExternalProject_Add( + ${XPU_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${XPU_SOURCE_DIR} + DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${XPU_URL} -c -q -O xpu.tar.gz + && tar xvf xpu.tar.gz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT} + ) +else() + SET(XPU_API_INC_DIR "${XPU_SDK_ROOT}/XTDK/include/") + SET(XPU_API_LIB "${XPU_SDK_ROOT}/XTDK/shlib/libxpuapi.so") + SET(XPU_RT_LIB "${XPU_SDK_ROOT}/XTDK/runtime/shlib/libxpurt.so") +endif() +INCLUDE_DIRECTORIES(${XPU_API_INC_DIR}) ADD_LIBRARY(shared_xpuapi SHARED IMPORTED GLOBAL) set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}") @@ -69,4 +74,14 @@ else(WITH_XPU_BKCL) TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ) endif(WITH_XPU_BKCL) -ADD_DEPENDENCIES(xpulib ${XPU_PROJECT}) +if(NOT XPU_SDK_ROOT) + ADD_DEPENDENCIES(xpulib ${XPU_PROJECT}) +else() + ADD_CUSTOM_TARGET(extern_xpu DEPENDS xpulib) +endif() + +# Ensure that xpu/api.h can be included without dependency errors. +file(GENERATE OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc CONTENT "") +add_library(xpu_headers_dummy STATIC ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc) +add_dependencies(xpu_headers_dummy extern_xpu) +link_libraries(xpu_headers_dummy) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 3b422fe98c..167d083f3d 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -33,6 +33,8 @@ PassStrategy *AnalysisConfig::pass_builder() const { if (use_gpu_) { LOG(INFO) << "Create GPU IR passes"; pass_builder_.reset(new GpuPassStrategy); + } else if (use_xpu_) { + pass_builder_.reset(new XpuPassStrategy); } else { LOG(INFO) << "Create CPU IR passes"; pass_builder_.reset(new CpuPassStrategy); @@ -73,7 +75,7 @@ void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, use_gpu_ = true; memory_pool_init_size_mb_ = memory_pool_init_size_mb; FLAGS_initial_gpu_memory_in_mb = memory_pool_init_size_mb_; - device_id_ = device_id; + gpu_device_id_ = device_id; #else LOG(ERROR) << "Please compile with gpu to EnableGpu()"; use_gpu_ = false; @@ -115,7 +117,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { // GPU related. 
CP_MEMBER(use_gpu_); CP_MEMBER(use_cudnn_); - CP_MEMBER(device_id_); + CP_MEMBER(gpu_device_id_); + CP_MEMBER(xpu_device_id_); CP_MEMBER(memory_pool_init_size_mb_); CP_MEMBER(enable_memory_optim_); @@ -174,8 +177,14 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(thread_local_stream_); if (use_gpu_) { + PADDLE_ENFORCE_EQ(use_xpu_, false, + platform::errors::InvalidArgument( + "Only one choice can be made between CPU and XPU.")); pass_builder_.reset(new GpuPassStrategy( *static_cast(other.pass_builder()))); + } else if (use_xpu_) { + pass_builder_.reset(new XpuPassStrategy( + *static_cast(other.pass_builder()))); } else { pass_builder_.reset(new CpuPassStrategy( *static_cast(other.pass_builder()))); @@ -333,6 +342,12 @@ void AnalysisConfig::Update() { // Append after the Affine_channel_conv_fuse pass. pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); } + } else if (use_xpu()) { + PADDLE_ENFORCE_EQ( + use_gpu(), false, + platform::errors::InvalidArgument( + "Only one choice can be made between CPU and XPU.")); + pass_builder_.reset(new XpuPassStrategy); } else { pass_builder_.reset(new CpuPassStrategy); } @@ -341,7 +356,13 @@ void AnalysisConfig::Update() { if (use_gpu()) { pass_builder_.reset(new GpuPassStrategy( *static_cast(pass_builder_.get()))); - + } else if (use_xpu()) { + PADDLE_ENFORCE_EQ( + use_gpu(), false, + platform::errors::InvalidArgument( + "Only one choice can be made between CPU and XPU.")); + pass_builder_.reset(new XpuPassStrategy( + *static_cast(pass_builder_.get()))); } else { pass_builder_.reset(new CpuPassStrategy( *static_cast(pass_builder_.get()))); @@ -420,19 +441,16 @@ void AnalysisConfig::Update() { } if (use_xpu_) { -#ifndef LITE_SUBGRAPH_WITH_XPU - PADDLE_THROW(platform::errors::Unavailable( - "You tried to use an XPU device, but Paddle was not compiled " - "with XPU-runtime.")); -#endif - if (!use_lite_) { - LOG(WARNING) << "Because XPU currently only works in Paddle-Lite " - "subgraph mode, please make sure you have enabled it."; - } +#if (defined LITE_SUBGRAPH_WITH_XPU) || (defined PADDLE_WITH_XPU) PADDLE_ENFORCE_EQ(use_gpu_, false, platform::errors::Unavailable( "Currently, XPU and GPU cannot be enabled in the " "same analysis configuration.")); +#else + PADDLE_THROW(platform::errors::Unavailable( + "You tried to use an XPU device, but Paddle was not compiled " + "with XPU-runtime.")); +#endif } if (ir_debug_) { @@ -448,7 +466,8 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << use_gpu_; ss << use_fc_padding_; - ss << device_id_; + ss << gpu_device_id_; + ss << xpu_device_id_; ss << memory_pool_init_size_mb_; ss << use_tensorrt_; @@ -507,7 +526,7 @@ float AnalysisConfig::fraction_of_gpu_memory_for_pool() const { // Get the GPU memory details and calculate the fraction of memory for the // GPU memory pool. size_t gpu_total, gpu_available; - platform::SetDeviceId(device_id_); + platform::SetDeviceId(gpu_device_id_); platform::GpuMemoryUsage(&gpu_available, &gpu_total); double total_gpu_memory = gpu_total / 1024. 
/ 1024.; float fraction_of_gpu_memory = @@ -548,7 +567,7 @@ NativeConfig AnalysisConfig::ToNativeConfig() const { config.prog_file = prog_file_; config.param_file = params_file_; config.use_gpu = use_gpu_; - config.device = device_id_; + config.device = gpu_device_id_; config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool(); config.specify_input_name = specify_input_name_; return config; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 2fe1b64fcc..274ae8afa1 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -103,7 +103,10 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t, // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. std::memcpy(static_cast(input_ptr), pt.data.data(), pt.data.length()); - } else { + } else if (platform::is_gpu_place(place)) { + PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), false, + platform::errors::InvalidArgument( + "Only one choice can be made between CPU and XPU.")); #ifdef PADDLE_WITH_CUDA platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = @@ -116,6 +119,18 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t, PADDLE_THROW(paddle::platform::errors::Fatal( "Not compile with CUDA, should not reach here.")); #endif + } else if (platform::is_xpu_place(place)) { +#ifdef PADDLE_WITH_XPU + auto dst_xpu_place = BOOST_GET_CONST(platform::XPUPlace, place); + memory::Copy(dst_xpu_place, static_cast(input_ptr), + platform::CPUPlace(), pt.data.data(), pt.data.length()); +#else + PADDLE_THROW(paddle::platform::errors::Fatal( + "Not compile with XPU, should not reach here.")); +#endif + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "The analysis predictor supports CPU, GPU and XPU now.")); } // TODO(Superjomn) Low performance, need optimization for heavy LoD copy. 
framework::LoD lod; @@ -182,6 +197,12 @@ bool AnalysisPredictor::PrepareScope( ++dev_id) { memory::Release(platform::CUDAPlace(dev_id)); } +#endif +#ifdef PADDLE_WITH_XPU + for (int dev_id = 0; dev_id < paddle::platform::GetXPUDeviceCount(); + ++dev_id) { + memory::Release(platform::XPUPlace(dev_id)); + } #endif memory::Release(platform::CPUPlace()); }); @@ -219,7 +240,9 @@ bool AnalysisPredictor::PrepareProgram( } bool AnalysisPredictor::CreateExecutor() { if (config_.use_gpu()) { - status_use_gpu_ = true; + PADDLE_ENFORCE_EQ(config_.use_xpu(), false, + platform::errors::InvalidArgument( + "Only one choice can be made between CPU and XPU.")); place_ = paddle::platform::CUDAPlace(config_.gpu_device_id()); #ifdef PADDLE_WITH_CUDA if (config_.thread_local_stream_enabled()) { @@ -230,6 +253,8 @@ bool AnalysisPredictor::CreateExecutor() { ctx->ResetThreadContext(platform::stream::Priority::kNormal); } #endif + } else if (config_.use_xpu()) { + place_ = paddle::platform::XPUPlace(config_.xpu_device_id()); } else { place_ = paddle::platform::CPUPlace(); } @@ -734,11 +759,16 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( res->SetName(name); if (platform::is_cpu_place(place_)) { res->SetPlace(PaddlePlace::kCPU); + } else if (platform::is_xpu_place(place_)) { + PADDLE_ENFORCE_EQ(config_.use_gpu(), false, + platform::errors::InvalidArgument( + "Only one choice can be made between CPU and XPU.")); + auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_); + res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId()); } else { auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_); res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); } - return res; } @@ -755,6 +785,9 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( res->SetName(name); if (platform::is_cpu_place(place_)) { res->SetPlace(PaddlePlace::kCPU); + } else if (platform::is_xpu_place(place_)) { + auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_); + res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId()); } else { auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_); res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 35b52fa56d..b55d08dda5 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -415,7 +415,6 @@ class AnalysisPredictor : public PaddlePredictor { private: // Some status here that help to determine the status inside the predictor. bool status_is_cloned_{false}; - bool status_use_gpu_{false}; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 9a5b301fdd..91b18ae00c 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -80,7 +80,12 @@ bool NativePaddlePredictor::Init( paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); if (config_.use_gpu) { + PADDLE_ENFORCE_EQ(config_.use_xpu, false, + platform::errors::InvalidArgument( + "Only one choice can be made between CPU and XPU.")); place_ = paddle::platform::CUDAPlace(config_.device); + } else if (config_.use_xpu) { + place_ = paddle::platform::XPUPlace(config_.device); } else { place_ = paddle::platform::CPUPlace(); } @@ -240,7 +245,11 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. 
std::memcpy(static_cast(input_ptr), inputs[i].data.data(), inputs[i].data.length()); - } else { + } else if (platform::is_gpu_place(place_)) { + PADDLE_ENFORCE_EQ( + platform::is_xpu_place(place_), false, + platform::errors::InvalidArgument( + "Only one choice can be made between CPU and XPU.")); #ifdef PADDLE_WITH_CUDA platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); @@ -253,6 +262,16 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, #else PADDLE_THROW(platform::errors::Unavailable( "Not compile with CUDA, should not reach here.")); +#endif + } else { +#ifdef PADDLE_WITH_XPU + auto dst_xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_); + memory::Copy(dst_xpu_place, static_cast(input_ptr), + platform::CPUPlace(), inputs[i].data.data(), + inputs[i].data.length()); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Not compile with XPU, should not reach here.")); #endif } diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index 1e19046d6a..00efbb528a 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -58,19 +58,15 @@ NativeConfig GetConfig() { config.model_dir = FLAGS_word2vec_dirname; LOG(INFO) << "dirname " << config.model_dir; config.fraction_of_gpu_memory = 0.15; -#ifdef PADDLE_WITH_CUDA - config.use_gpu = true; -#else - config.use_gpu = false; -#endif config.device = 0; return config; } -void MainWord2Vec(bool use_gpu) { +void MainWord2Vec(const paddle::PaddlePlace& place) { NativeConfig config = GetConfig(); auto predictor = CreatePaddlePredictor(config); - config.use_gpu = use_gpu; + config.use_gpu = paddle::gpu_place_used(place); + config.use_xpu = paddle::xpu_place_used(place); framework::LoDTensor first_word, second_word, third_word, fourth_word; framework::LoD lod{{0, 1}}; @@ -117,11 +113,12 @@ void MainWord2Vec(bool use_gpu) { } } -void MainImageClassification(bool use_gpu) { +void MainImageClassification(const paddle::PaddlePlace& place) { int batch_size = 2; bool repeat = false; NativeConfig config = GetConfig(); - config.use_gpu = use_gpu; + config.use_gpu = paddle::gpu_place_used(place); + config.use_xpu = paddle::xpu_place_used(place); config.model_dir = FLAGS_book_dirname + "/image_classification_resnet.inference.model"; @@ -162,9 +159,10 @@ void MainImageClassification(bool use_gpu) { } } -void MainThreadsWord2Vec(bool use_gpu) { +void MainThreadsWord2Vec(const paddle::PaddlePlace& place) { NativeConfig config = GetConfig(); - config.use_gpu = use_gpu; + config.use_gpu = paddle::gpu_place_used(place); + config.use_xpu = paddle::xpu_place_used(place); auto main_predictor = CreatePaddlePredictor(config); // prepare inputs data and reference results @@ -223,11 +221,12 @@ void MainThreadsWord2Vec(bool use_gpu) { } } -void MainThreadsImageClassification(bool use_gpu) { +void MainThreadsImageClassification(const paddle::PaddlePlace& place) { constexpr int num_jobs = 4; // each job run 1 batch constexpr int batch_size = 1; NativeConfig config = GetConfig(); - config.use_gpu = use_gpu; + config.use_gpu = paddle::gpu_place_used(place); + config.use_xpu = paddle::xpu_place_used(place); config.model_dir = FLAGS_book_dirname + "/image_classification_resnet.inference.model"; @@ -276,29 +275,42 @@ void MainThreadsImageClassification(bool use_gpu) { } } -TEST(inference_api_native, word2vec_cpu) { MainWord2Vec(false /*use_gpu*/); } +TEST(inference_api_native, word2vec_cpu) { + MainWord2Vec(paddle::PaddlePlace::kCPU); 
+} TEST(inference_api_native, word2vec_cpu_threads) { - MainThreadsWord2Vec(false /*use_gpu*/); + MainThreadsWord2Vec(paddle::PaddlePlace::kCPU); } TEST(inference_api_native, image_classification_cpu) { - MainImageClassification(false /*use_gpu*/); + MainImageClassification(paddle::PaddlePlace::kCPU); } TEST(inference_api_native, image_classification_cpu_threads) { - MainThreadsImageClassification(false /*use_gpu*/); + MainThreadsImageClassification(paddle::PaddlePlace::kCPU); } +#ifdef PADDLE_WITH_XPU +TEST(inference_api_native, word2vec_xpu) { + MainWord2Vec(paddle::PaddlePlace::kXPU); +} +TEST(inference_api_native, image_classification_xpu) { + MainImageClassification(paddle::PaddlePlace::kXPU); +} +#endif + #ifdef PADDLE_WITH_CUDA -TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(true /*use_gpu*/); } +TEST(inference_api_native, word2vec_gpu) { + MainWord2Vec(paddle::PaddlePlace::kGPU); +} // Turn off temporarily for the unstable result. // TEST(inference_api_native, word2vec_gpu_threads) { -// MainThreadsWord2Vec(true /*use_gpu*/); +// MainThreadsWord2Vec(paddle::PaddlePlace::kGPU); // } TEST(inference_api_native, image_classification_gpu) { - MainImageClassification(true /*use_gpu*/); + MainImageClassification(paddle::PaddlePlace::kGPU); } // Turn off temporarily for the unstable result. // TEST(inference_api_native, image_classification_gpu_threads) { -// MainThreadsImageClassification(true /*use_gpu*/); +// MainThreadsImageClassification(paddle::PaddlePlace::kGPU); // } #endif diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index c02af5d9f8..c892284d91 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -185,11 +185,23 @@ struct PD_INFER_DECL AnalysisConfig { /// bool use_gpu() const { return use_gpu_; } /// + /// \brief A boolean state telling whether the XPU is turned on. + /// + /// \return bool Whether the XPU is turned on. + /// + bool use_xpu() const { return use_xpu_; } + /// + /// \brief Get the GPU device id. + /// + /// \return int The GPU device id. + /// + int gpu_device_id() const { return gpu_device_id_; } + /// /// \brief Get the GPU device id. /// /// \return int The GPU device id. /// - int gpu_device_id() const { return device_id_; } + int xpu_device_id() const { return xpu_device_id_; } /// /// \brief Get the initial size in MB of the GPU memory pool. /// @@ -579,7 +591,8 @@ struct PD_INFER_DECL AnalysisConfig { // GPU related. bool use_gpu_{false}; - int device_id_{0}; + int gpu_device_id_{0}; + int xpu_device_id_{0}; uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB. bool use_cudnn_{false}; diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 11f362504b..c5893a23a4 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -162,7 +162,7 @@ struct PD_INFER_DECL PaddleTensor { std::vector> lod; ///< Tensor+LoD equals LoDTensor }; -enum class PaddlePlace { kUNK = -1, kCPU, kGPU }; +enum class PaddlePlace { kUNK = -1, kCPU, kGPU, kXPU }; /// \brief Represents an n-dimensional array of values. /// The ZeroCopyTensor is used to store the input or output of the network. @@ -361,6 +361,7 @@ class PD_INFER_DECL PaddlePredictor { struct PD_INFER_DECL NativeConfig : public PaddlePredictor::Config { NativeConfig(); /// GPU related fields. 
+ bool use_xpu{false}; bool use_gpu{false}; int device{0}; float fraction_of_gpu_memory{ diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index b10c290b22..a725ebab35 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -140,11 +140,16 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { /// \return A bool variable implying whether we are in gpu mode. bool use_gpu() const { return use_gpu_; } + /// \brief Check if we are using xpu. + /// \return A bool variable implying whether we are in xpu mode. + bool use_xpu() const { return use_xpu_; } + /// \brief Default destructor. virtual ~PassStrategy() = default; protected: /// \cond Protected + bool use_xpu_{false}; bool use_gpu_{false}; bool use_mkldnn_{false}; /// \endcond @@ -226,6 +231,14 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy { /// \endcond }; +/// \class XpuPassStrategy +/// \brief The XPU passes controller, it is used in AnalysisPredictor with XPU +/// mode. +class PD_INFER_DECL XpuPassStrategy final : public PassStrategy { + public: + XpuPassStrategy() : PassStrategy({}) {} +}; + /// \brief List of tensorRT subgraph passes. PD_INFER_DECL extern const std::vector kTRTSubgraphPasses; diff --git a/paddle/fluid/inference/capi/paddle_c_api.h b/paddle/fluid/inference/capi/paddle_c_api.h index 32129890d0..c7d53c8d6f 100644 --- a/paddle/fluid/inference/capi/paddle_c_api.h +++ b/paddle/fluid/inference/capi/paddle_c_api.h @@ -165,12 +165,19 @@ PADDLE_CAPI_EXPORT extern void PD_EnableUseGpu(PD_AnalysisConfig* config, int memory_pool_init_size_mb, int device_id); +PADDLE_CAPI_EXPORT extern void PD_EnableXpu(PD_AnalysisConfig* config, + int l3_workspace_size); + PADDLE_CAPI_EXPORT extern void PD_DisableGpu(PD_AnalysisConfig* config); PADDLE_CAPI_EXPORT extern bool PD_UseGpu(const PD_AnalysisConfig* config); +PADDLE_CAPI_EXPORT extern bool PD_UseXpu(const PD_AnalysisConfig* config); + PADDLE_CAPI_EXPORT extern int PD_GpuDeviceId(const PD_AnalysisConfig* config); +PADDLE_CAPI_EXPORT extern int PD_XpuDeviceId(const PD_AnalysisConfig* config); + PADDLE_CAPI_EXPORT extern int PD_MemoryPoolInitSizeMb( const PD_AnalysisConfig* config); diff --git a/paddle/fluid/inference/capi/pd_config.cc b/paddle/fluid/inference/capi/pd_config.cc index af8d4a69ec..2316396672 100644 --- a/paddle/fluid/inference/capi/pd_config.cc +++ b/paddle/fluid/inference/capi/pd_config.cc @@ -111,6 +111,14 @@ void PD_EnableUseGpu(PD_AnalysisConfig* config, int memory_pool_init_size_mb, device_id); } +void PD_EnableXpu(PD_AnalysisConfig* config, int l3_workspace_size) { + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); + config->config.EnableXpu(l3_workspace_size); +} + void PD_DisableGpu(PD_AnalysisConfig* config) { PADDLE_ENFORCE_NOT_NULL( config, @@ -127,6 +135,14 @@ bool PD_UseGpu(const PD_AnalysisConfig* config) { return config->config.use_gpu(); } +bool PD_UseXpu(const PD_AnalysisConfig* config) { + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); + return config->config.use_xpu(); +} + int PD_GpuDeviceId(const PD_AnalysisConfig* config) { PADDLE_ENFORCE_NOT_NULL( config, @@ -135,6 +151,14 @@ int PD_GpuDeviceId(const PD_AnalysisConfig* config) { return config->config.gpu_device_id(); } +int PD_XpuDeviceId(const 
PD_AnalysisConfig* config) { + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); + return config->config.xpu_device_id(); +} + int PD_MemoryPoolInitSizeMb(const PD_AnalysisConfig* config) { PADDLE_ENFORCE_NOT_NULL( config, diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index f8c7c420eb..2fa076b002 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -510,7 +510,10 @@ if(WITH_GPU AND TENSORRT_FOUND) inference_analysis_test(test_analyzer_capi_gpu SRCS analyzer_capi_gpu_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_fluid_c ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) - + inference_analysis_test(test_analyzer_capi_xpu SRCS analyzer_capi_xpu_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_fluid_c + ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) + set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model") if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR}/small_quant_model.tgz) inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "small_quant_model.tgz") diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_xpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_xpu_tester.cc new file mode 100644 index 0000000000..33a67d8140 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_xpu_tester.cc @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi/paddle_c_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +#ifdef PADDLE_WITH_XPU +TEST(PD_AnalysisConfig, use_xpu) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_AnalysisConfig *config = PD_NewAnalysisConfig(); + PD_SwitchUseFeedFetchOps(config, false); + PD_SwitchSpecifyInputNames(config, true); + PD_SwitchIrDebug(config, true); + PD_SetModel(config, model_dir.c_str(), nullptr); + PD_SetOptimCacheDir(config, (FLAGS_infer_model + "/OptimCacheDir").c_str()); + const char *model_dir_ = PD_ModelDir(config); + LOG(INFO) << model_dir_; + PD_EnableXpu(config, 0xfffc00); + bool use_xpu = PD_UseXpu(config); + CHECK(use_xpu) << "NO"; + int device = PD_XpuDeviceId(config); + CHECK(0 == device) << "NO"; + PD_SwitchIrOptim(config, true); + bool ir_optim = PD_IrOptim(config); + CHECK(ir_optim) << "NO"; + PD_EnableMemoryOptim(config); + bool memory_optim_enable = PD_MemoryOptimEnabled(config); + CHECK(memory_optim_enable) << "NO"; + PD_EnableProfile(config); + bool profiler_enable = PD_ProfileEnabled(config); + CHECK(profiler_enable) << "NO"; + PD_SetInValid(config); + bool is_valid = PD_IsValid(config); + CHECK(!is_valid) << "NO"; + PD_DeleteAnalysisConfig(config); +} +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc index 205898a6fd..ab49cd12bb 100644 --- a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc +++ b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc @@ -58,6 +58,24 @@ int test_main(const AnalysisConfig& config, Barrier* barrier = nullptr) { return 0; } +#ifdef PADDLE_WITH_XPU +TEST(AnalysisPredictor, native_xpu) { + AnalysisConfig config; + config.EnableXpu(); + config.SetModel(FLAGS_infer_model + "/" + "mul_model"); + test_main(config); +} +#endif + +#ifdef LITE_SUBGRAPH_WITH_XPU +TEST(AnalysisPredictor, lite_xpu) { + AnalysisConfig config; + config.EnableXpu(); + config.SetModel(FLAGS_infer_model + "/" + "mul_model"); + config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32); +} +#endif + #ifdef PADDLE_WITH_CUDA TEST(AnalysisPredictor, thread_local_stream) { const size_t thread_num = 5; diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 1457f5337e..1f6c821352 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -27,6 +27,18 @@ limitations under the License. 
*/ DECLARE_bool(use_mkldnn); +namespace paddle { +bool gpu_place_used(const paddle::PaddlePlace& place) { + return place == paddle::PaddlePlace::kGPU; +} +bool xpu_place_used(const paddle::PaddlePlace& place) { + return place == paddle::PaddlePlace::kXPU; +} +bool cpu_place_used(const paddle::PaddlePlace& place) { + return place == paddle::PaddlePlace::kCPU; +} +} // namespace paddle + template void SetupTensor(paddle::framework::LoDTensor* input, paddle::framework::DDim dims, T lower, T upper) { diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 9ae63e74f4..dbea74e7e0 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -197,12 +197,12 @@ void Free(const platform::XPUPlace &place, void *p, template <> uint64_t Release(const platform::XPUPlace &place) { #ifdef PADDLE_WITH_XPU - PADDLE_THROW( - platform::errors::PermissionDenied("Release XPU pool is not supported.")); + LOG(WARNING) << "Release XPU pool is not supported now, no action here."; #else PADDLE_THROW( platform::errors::PermissionDenied("'XPUPlace' is not supported.")); #endif + return -1; } template <> diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 73add8ea06..fc57fbe220 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -58,7 +58,7 @@ cc_library(place SRCS place.cc DEPS enforce boost) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) if(WITH_XPU) -cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce) +cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib) endif() add_subdirectory(dynload) diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 0027181189..3c6d1926d1 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -378,7 +378,8 @@ void BindPaddlePlace(py::module *m) { py::enum_(*m, "PaddlePlace") .value("UNK", PaddlePlace::kUNK) .value("CPU", PaddlePlace::kCPU) - .value("GPU", PaddlePlace::kGPU); + .value("GPU", PaddlePlace::kGPU) + .value("XPU", PaddlePlace::kXPU); } void BindPaddlePredictor(py::module *m) { @@ -407,6 +408,7 @@ void BindNativeConfig(py::module *m) { py::class_(*m, "NativeConfig") .def(py::init<>()) .def_readwrite("use_gpu", &NativeConfig::use_gpu) + .def_readwrite("use_xpu", &NativeConfig::use_xpu) .def_readwrite("device", &NativeConfig::device) .def_readwrite("fraction_of_gpu_memory", &NativeConfig::fraction_of_gpu_memory) @@ -468,7 +470,9 @@ void BindAnalysisConfig(py::module *m) { py::arg("l3_workspace_size")) .def("disable_gpu", &AnalysisConfig::DisableGpu) .def("use_gpu", &AnalysisConfig::use_gpu) + .def("use_xpu", &AnalysisConfig::use_xpu) .def("gpu_device_id", &AnalysisConfig::gpu_device_id) + .def("xpu_device_id", &AnalysisConfig::xpu_device_id) .def("memory_pool_init_size_mb", &AnalysisConfig::memory_pool_init_size_mb) .def("fraction_of_gpu_memory_for_pool", diff --git a/python/paddle/fluid/tests/book/test_word2vec_book.py b/python/paddle/fluid/tests/book/test_word2vec_book.py index aae4de70ac..e33b1cc514 100644 --- a/python/paddle/fluid/tests/book/test_word2vec_book.py +++ b/python/paddle/fluid/tests/book/test_word2vec_book.py @@ -26,7 +26,20 @@ import sys paddle.enable_static() -def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): +def get_place(target): + if target == "cuda": + return 
fluid.CUDAPlace(0) + elif target == "xpu": + return fluid.XPUPlace(0) + elif target == "cpu": + return fluid.CPUPlace() + else: + raise ValueError( + "Target `{0}` is not on the support list: `cuda`, `xpu` and `cpu`.". + format(target)) + + +def train(target, is_sparse, is_parallel, save_dirname, is_local=True): PASS_NUM = 100 EMBED_SIZE = 32 HIDDEN_SIZE = 256 @@ -93,7 +106,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): train_reader = paddle.batch( paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + place = get_place(target) exe = fluid.Executor(place) feeder = fluid.DataFeeder( feed_list=[first_word, second_word, third_word, forth_word, next_word], @@ -143,13 +156,12 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): train_loop(t.get_trainer_program()) -def infer(use_cuda, save_dirname=None): +def infer(target, save_dirname=None): if save_dirname is None: return - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + place = get_place(target) exe = fluid.Executor(place) - inference_scope = fluid.core.Scope() with fluid.scope_guard(inference_scope): # Use fluid.io.load_inference_model to obtain the inference program desc, @@ -211,10 +223,12 @@ def infer(use_cuda, save_dirname=None): infer_config = fluid.core.NativeConfig() infer_config.model_dir = 'word2vec.inference.model' - infer_config.use_gpu = use_cuda - if use_cuda: + if target == "cuda": + infer_config.use_gpu = True infer_config.device = 0 infer_config.fraction_of_gpu_memory = 0.15 + elif target == "xpu": + infer_config.use_xpu = True compiled_program = fluid.compiler.CompiledProgram(inference_program) compiled_program._with_inference_optimize(infer_config) assert compiled_program._is_inference is True @@ -222,11 +236,13 @@ def infer(use_cuda, save_dirname=None): np_data = np.array(results[0]) infer_out = infer_outputs[0].data.float_data() for a, b in zip(np_data[0], infer_out): - assert np.isclose(a, b), "a: {}, b: {}".format(a, b) + assert np.isclose(a, b, rtol=5e-5), "a: {}, b: {}".format(a, b) -def main(use_cuda, is_sparse, is_parallel): - if use_cuda and not fluid.core.is_compiled_with_cuda(): +def main(target, is_sparse, is_parallel): + if target == "cuda" and not fluid.core.is_compiled_with_cuda(): + return + if target == "xpu" and not fluid.core.is_compiled_with_xpu(): return if not is_parallel: @@ -234,8 +250,13 @@ def main(use_cuda, is_sparse, is_parallel): else: save_dirname = None - train(use_cuda, is_sparse, is_parallel, save_dirname) - infer(use_cuda, save_dirname) + if target == "xpu": + # This model cannot be trained with xpu temporarily, + # so only inference is turned on. 
+ train("cpu", is_sparse, is_parallel, save_dirname) + else: + train(target, is_sparse, is_parallel, save_dirname) + infer(target, save_dirname) FULL_TEST = os.getenv('FULL_TEST', @@ -247,8 +268,8 @@ class W2VTest(unittest.TestCase): pass -def inject_test_method(use_cuda, is_sparse, is_parallel): - fn_name = "test_{0}_{1}_{2}".format("cuda" if use_cuda else "cpu", "sparse" +def inject_test_method(target, is_sparse, is_parallel): + fn_name = "test_{0}_{1}_{2}".format(target, "sparse" if is_sparse else "dense", "parallel" if is_parallel else "normal") @@ -259,11 +280,10 @@ def inject_test_method(use_cuda, is_sparse, is_parallel): with fluid.scope_guard(scope): with fluid.program_guard(prog, startup_prog): main( - use_cuda=use_cuda, - is_sparse=is_sparse, - is_parallel=is_parallel) + target=target, is_sparse=is_sparse, is_parallel=is_parallel) - if (not fluid.core.is_compiled_with_cuda() or use_cuda) and is_sparse: + if (not fluid.core.is_compiled_with_cuda() or + target == "cuda") and is_sparse: fn = __impl__ else: # skip the other test when on CI server @@ -273,10 +293,10 @@ def inject_test_method(use_cuda, is_sparse, is_parallel): setattr(W2VTest, fn_name, fn) -for use_cuda in (False, True): +for target in ("cuda", "cpu", "xpu"): for is_sparse in (False, True): for is_parallel in (False, ): - inject_test_method(use_cuda, is_sparse, is_parallel) + inject_test_method(target, is_sparse, is_parallel) if __name__ == '__main__': unittest.main() -- GitLab
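
A minimal C++ sketch of how the XPU path added by this patch is intended to be used. It is illustrative only and not part of the change: the model directory is a placeholder, the 0xfffc00 L3 workspace size is the value exercised in analyzer_capi_xpu_tester.cc, and the calls mirror the APIs this patch adds or tests (AnalysisConfig::EnableXpu, use_xpu(), xpu_device_id(), CreatePaddlePredictor).

    #include <memory>

    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    int main() {
      paddle::AnalysisConfig config;
      // Placeholder model directory; any saved inference model works here.
      config.SetModel("/path/to/inference_model");
      // Enable the Kunlun XPU backend; 0xfffc00 is the L3 workspace size (in
      // bytes) that the new C-API test passes to PD_EnableXpu.
      config.EnableXpu(0xfffc00);

      // Query helpers introduced in paddle_analysis_config.h by this patch.
      bool on_xpu = config.use_xpu();          // true after EnableXpu()
      int device_id = config.xpu_device_id();  // 0 by default

      // With this patch the analysis predictor places tensors on
      // XPUPlace(device_id) instead of falling through to the GPU branch.
      auto predictor = paddle::CreatePaddlePredictor(config);
      return (on_xpu && device_id == 0 && predictor != nullptr) ? 0 : 1;
    }

The equivalent flow through the C API uses PD_EnableXpu(config, l3_workspace_size), PD_UseXpu(config), and PD_XpuDeviceId(config), as shown in the new analyzer_capi_xpu_tester.cc.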